% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
% \usepackage{microtype}
\usepackage{graphicx}
\usepackage{float}
\usepackage[caption = false]{subfig}
\usepackage{booktabs} % for professional tables
\usepackage{bm}
%\usepackage{hyperref}
\usepackage{diagbox}
\usepackage{algorithm}
\usepackage[noend]{algorithmic}
\usepackage{thm-restate}
%\usepackage{algorithmic}
\usepackage[algo2e,ruled,vlined]{algorithm2e}
% Attempt to make hyperref and algorithmic work together better:
%\newcommand{\theHalgorithm}{\arabic{algorithm}}

%\algnewcommand\algorithmicinput{\textbf{Input:}}
%\algnewcommand\algorithmicoutput{\textbf{Output:}}
%\algnewcommand\INPUT{\item[\algorithmicinput]}
%\algnewcommand\OUTPUT{\item[\algorithmicoutput]}
%\algnewcommand{\LineComment}[1]{\Statex \(\triangleright\) #1}

% \usepackage{geometry}
% \geometry{top=1in,bottom=1in,left=0.94in,right=0.94in}

\usepackage{amssymb,amsthm}
\usepackage{tikz}
\usepackage{url}
\usepackage{setspace}
% \usepackage[pdftex,bookmarksnumbered,bookmarksopen,
% colorlinks,citecolor=blue,linkcolor=blue,urlcolor=blue]{hyperref}
%\usepackage{tablefootnote}
% \usepackage{framed}
% \usepackage{xcolor}
% \usepackage{soul}
\usepackage{longtable}

\usepackage{times}
% \usepackage{enumitem}
\usepackage{varwidth}
\usepackage{graphicx}
\usepackage{wrapfig}
% \usepackage{enumerate}
\usepackage{caption}
% \usepackage{subcaption}
%\usepackage{subcaption}
%\usepackage{mwe}


\usepackage{amssymb}
\usepackage{multirow}
\usepackage{bbm}
\usepackage{graphicx}
\usepackage{url}
\usepackage{setspace}
%\usepackage[pdftex,bookmarksnumbered,bookmarksopen,
%colorlinks,citecolor=blue,linkcolor=blue]{hyperref}
%\usepackage{tablefootnote}
\usepackage{framed}
\usepackage{xcolor}
\usepackage{soul}
\usepackage{longtable}

\usepackage{times}
% \usepackage{enumitem}
\usepackage{varwidth}
\usepackage{graphicx}
\usepackage{wrapfig}
% \usepackage{enumerate}
%\usepackage{subcaption}
%\usepackage{mwe}




% \usepackage[utf8]{inputenc} % allow utf-8 input
% \usepackage[T1]{fontenc}    % use 8-bit T1 fonts
% %\usepackage{hyperref}       % hyperlinks
% \usepackage{url}            % simple URL typesetting
% \usepackage{booktabs}       % professional-quality tables
% \usepackage{amsfonts}       % blackboard math symbols
% \usepackage{nicefrac}       % compact symbols for 1/2, etc.
% \usepackage{microtype}      % microtypography




%\usepackage[tight]{subfigure}
% % \usepackage{graphicx}
% % \usepackage{appendix}
% \usepackage{amsmath,amsfonts,amsthm}
%\usepackage{algorithmic}
%\usepackage[algo2e,ruled,vlined]{algorithm2e}
%\setlength{\Algomargin}{-0.05em}
% \usepackage{mdwlist}
\usepackage{xspace}
%\usepackage{enumitem}
\usepackage{color}
\usepackage{mathrsfs}

\usepackage{booktabs}
\usepackage{comment}
%\usepackage{geometry}

\usepackage{multirow}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\ind}{\mathbb{I}}

% Functions using mathrm
\renewcommand{\dim}{\mathrm{dim}}
\newcommand{\OPT}{\textup{\textsf{OPT}}}
\newcommand{\range}{\mathrm{range}}
\newcommand{\CR}{\text{CR}}
\newcommand{\sign}{\textup{\textsf{sign}}}
\newcommand{\sgn}{\textup{\textsf{sign}}}
\newcommand{\diag}{\textsf{Diag}}
\newcommand{\ber}{\textup{\textsf{Ber}}}
\newcommand{\err}{\mathrm{err}}
\newcommand{\adv}{\mathrm{adv}}
\newcommand{\nat}{\mathrm{nat}}
\newcommand{\greedy}{\mathrm{greedy}}
\newcommand{\opt}{\mathrm{opt}}
\newcommand{\abstain}{\mathrm{abstain}}
\newcommand{\gen}{(\frac{\nu}{12})}
\newcommand{\error}{\mathrm{err}}
\newcommand{\hinge}{\mathrm{hinge}}
\newcommand{\minimax}{\mathrm{minimax}}
\newcommand{\boundary}{\mathrm{DB}}
\newcommand{\erf}{\mathrm{erf}}
\newcommand{\ERM}{\mathrm{ERM}}
\newcommand{\Appendix}[1]{the full version for}
\newcommand\numberthis{\addtocounter{equation}{1}\tag{\theequation}}

\newcommand{\swap}[3][-]{#3#1#2} % just an example
\newcommand{\ltwonorm}[1]{\left\| #1 \right\|_2}

\newtheorem{theorem}{Theorem}[section]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{remark}{Remark}
\newtheorem{claim}{Claim}
\newtheorem{fact}{Fact}
\newtheorem{assumption}{Assumption}
\newtheorem{definition}{Definition}
\newtheorem{conjecture}{Conjecture}
\newtheorem{condition}{Condition}
\newcommand{\fix}{\marginpar{FIX}}
\newcommand{\new}{\marginpar{NEW}}
\renewcommand{\a}{\mathbf{a}}
\renewcommand{\b}{\mathbf{b}}
\renewcommand{\c}{\mathbf{c}}
\newcommand{\e}{\mathbf{e}}
\newcommand{\g}{\mathbf{g}}
\renewcommand{\u}{\bm{u}}
\renewcommand{\v}{\mathbf{v}}
\newcommand{\w}{\bm{w}}
\newcommand{\x}{\bm{x}}
\newcommand{\y}{\bm{y}}
\newcommand{\z}{\mathbf{z}}
\newcommand{\A}{\mathbf{A}}
\newcommand{\B}{\mathbf{B}}
\newcommand{\C}{\mathcal{C}}
\newcommand{\D}{\mathbf{D}}
\newcommand{\E}{\mathbb{E}}
\newcommand{\F}{\mathbf{F}}
\newcommand{\G}{\mathcal{G}}
\renewcommand{\H}{\mathbf{H}}
\newcommand{\I}{\mathbf{I}}
\newcommand{\bI}{\mathbb{I}}
\newcommand{\K}{\mathcal{K}}
\renewcommand{\L}{\mathbf{L}}
\newcommand{\M}{\mathbf{M}}
\newcommand{\N}{\mathcal{N}}
\renewcommand{\P}{\mathcal{P}}
\newcommand{\Q}{\mathbf{Q}}
\newcommand{\R}{\mathbb{R}}
\renewcommand{\S}{\mathbf{S}}
\newcommand{\T}{\mathbf{T}}
\newcommand{\U}{\mathbf{U}}
\newcommand{\V}{\mathbf{V}}
\newcommand{\W}{\mathbf{W}}
\newcommand{\X}{\mathbf{X}}
\newcommand{\Y}{\mathbf{Y}}
\newcommand{\Z}{\mathbb{Z}}
\newcommand{\rank}{\textup{\textsf{rank}}}
\newcommand{\orthc}{\mathbf{orth}_c}
\newcommand{\orthr}{\mathbf{orth}_r}
\newcommand{\bLambda}{\mathbf{\Lambda}}
\newcommand{\RS}{\mathcal{R}}
\newcommand{\0}{\mathbf{0}}
\newcommand{\1}{\mathbf{1}}
\renewcommand{\comment}[1]{}
\newcommand{\red}[1]{{\color{red}#1}}
% \newcommand{\red}[1]{{\color{white}#1}}
\newcommand{\blue}[1]{{\color{blue}#1}}
\newcommand{\tr}{\textsf{tr}}
\newcommand{\cA}{\mathcal{A}}
\newcommand{\cB}{\mathcal{B}}
\newcommand{\cC}{\mathcal{C}}
\newcommand{\cD}{\mathcal{D}}
\newcommand{\cF}{\mathcal{F}}
\newcommand{\cG}{\mathcal{G}}
\newcommand{\cH}{\mathcal{H}}
\newcommand{\cI}{\mathcal{I}}
\newcommand{\cK}{\mathcal{K}}
\newcommand{\cL}{\mathcal{L}}
\newcommand{\cS}{\mathcal{S}}
\newcommand{\cT}{\mathcal{T}}
\newcommand{\cU}{\mathcal{U}}
\newcommand{\cV}{\mathcal{V}}
\newcommand{\cN}{\mathcal{N}}
\newcommand{\cO}{\mathcal{O}}
\newcommand{\cP}{\mathcal{P}}
\newcommand{\cQ}{\mathcal{Q}}
\newcommand{\cR}{\mathcal{R}}
\newcommand{\cY}{\mathcal{Y}}
\newcommand{\cX}{\mathcal{X}}
\newcommand{\cZ}{\mathcal{Z}}
\newcommand{\bbB}{\mathbb{B}}
\newcommand{\bbE}{\mathbb{E}}
\newcommand{\bbN}{\mathbb{N}}
\newcommand{\bbS}{\mathbb{S}}
\newcommand{\Pro}{\text{Pro}}
\newcommand{\imperceptible}{\mathsf{imperceptible}}
\newcommand{\dist}{\mathsf{dist}}
\newcommand{\spann}{\mathsf{span}}
\newcommand{\vol}{\mathsf{vol}}
\newcommand{\Null}{\mathsf{null}}
\newcommand{\Area}{\mathsf{Area}}
\newcommand{\Agree}{\mathsf{Agree}}

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{./sharma_554}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Efficiently Learning the Graph for Semi-supervised Learning\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<dravyans@cs.cmu.edu>?Subject=Your UAI 2023 paper}{Dravyansh Sharma}{}}
\author[1]{\href{mailto:<mjones2@andrew.cmu.edu>?Subject=Your UAI 2023 paper}{Maxwell Jones}{}}
% Add affiliations after the authors
\affil[1]{%
    School of Computer Science.\\
    Carnegie Mellon University\\
    Pittsburgh, PA, 15213
}
  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

% This Supplementary Material should be submitted as a separate file. Please do not append the Supplementary Material to the main paper. 

% Fig. \ref{fig:pitt} and Eq \ref{eq:example} in the main paper can be cross referenced using \texttt{xr}. 

\appendix

\section{Proofs from Sections \ref{sec:addag} and \ref{sec:sparse}}\label{app:proof of approx feedback}


\textbf{Theorem 3.1 (restated).} {\it 
Suppose $l_1,\dots,l_T:\cP\rightarrow[0,1]$ is a sequence of $\beta$-dispersed loss functions, and the domain  $\cP\subset\R^d$ is contained in a ball of radius $R$. The Approximate Continuous Exp3-Set algorithm (Algorithm \ref{algorithm: semibandit}) achieves expected regret $\tilde{O}(\sqrt{dMT\log(RT)}+T^{1-\min\{\beta,\beta'\}})$ with access to $(\epsilon,\gamma)$-approximate semi-bandit feedback with system size $M$, provided $\gamma \le T^{-\beta'},\epsilon\le {\vol(\cB(T^{-\beta}))} T^{-\beta'}$, where $\cB(r)$ is a $d$-ball of radius $r$.
}


\begin{proof}[Proof of Theorem \ref{thm:approx-feedback}]
We adapt the $\textsc{Continuous-Exp3-SET}$ analysis of \cite{alon2017nonstochastic,dick2020semi}. %, and uses similar arguments to the robust meta-learning guarantees in \cite{balcan2021learning} where the learner observes a perturbed version of the true loss function although in the full information setting. 
Define weights $w_t(\rho)$ over the parameter space $\cP$ as $w_{1}(\rho)=1$ and $w_{t+1}(\rho)=w_{t}(\rho)\exp(-\eta\hat{l}_t(\rho))$, and the normalizing constants $W_t=\int_{\cP}w_t(\rho)d\rho$. Note that $p_t(\rho)=\frac{w_t(\rho)}{W_t}$. We will give upper and lower bounds on the quantity $\E[\log W_{T+1}/W_{1}]$, i.e.\ the expected value of the log-ratio of the normalizing constants.

\noindent Using $\exp(-x) \le 1 - x + x^2/2$ for all $x \ge 0$, we get
\begin{align*}
    \frac{W_{t+1}}{W_t} &= \int_\cP p_t(\rho)\exp(-\eta\hat{l}_t(\rho))d\rho \\&\le 1-\eta \int_\cP p_t(\rho)\hat{l}_t(\rho)d\rho + \frac{\eta^2}{2}\int_\cP p_t(\rho)\hat{l}_t^2(\rho)d\rho.
\end{align*}
Computing the telescoping product and using $1-x\le \exp(-x)$ for all $x\in\R$, we get
\begin{align*}
    \frac{W_{T+1}}{W_{1}}\le \exp\Bigg(&-\eta \sum_{t=1}^T\int_\cP p_t(\rho)\hat{l}_t(\rho)d\rho + \frac{\eta^2}{2}\sum_{t=1}^T\int_\cP p_t(\rho)\hat{l}_t^2(\rho)d\rho\Bigg).
\end{align*}


\noindent Taking logarithm and expectations on both sides we get

\begin{align*}
    \E\left[\log \frac{W_{T+1}}{W_{1}}\right]\le &-\eta\sum_{t=1}^T\E\left[\int_\cP p_t(\rho)\hat{l}_t(\rho)d\rho\right]+\frac{\eta^2}{2}\sum_{t=1}^T\E\left[\int_\cP p_t(\rho)\hat{l}_t^2(\rho)d\rho\right].
\end{align*}

\noindent %Define $\bar{l}_t(\rho)=\frac{\I[\rho\in A_t(\rho_t)]}{\int_{A_t(\rho_t)}p_t(\rho)d\rho}l_t(\rho)$. 
We have, by the definitions of expectation and approximate semi-bandit feedback,

% \iffalse
% \begin{align*}
%     \E_t\left[l_t(\rho_t)\right]=
% \E\left[\int_{\cP}p_t(\rho)\bar{l}_t(\rho)d\rho\right]
% &=
% \E\left[\int_{A_t(\rho_t)}p_t(\rho)\bar{l}_t(\rho)d\rho\right]\\
% &=
% \E\left[\int_{\Tilde{A}_t(\rho_t)}p_t(\rho)\bar{l}_t(\rho)d\rho\right] + \E\left[\int_{A_t(\rho_t)\setminus\Tilde{A}_t(\rho_t)}p_t(\rho)\bar{l}_t(\rho)d\rho\right]\\
% &\le \E\left[\int_{A_t(\rho_t)\setminus\Tilde{A}_t(\rho_t)}p_t(\rho)d\rho\right]
% \\
% &\le \E\left[\int_{\Tilde{A}_t(\rho_t)}p_t(\rho)\hat{l}_t(\rho)d\rho\right]+\epsilon
% \end{align*}
% \fi

\begin{align*}\E_t\left[l_t(\rho_t)\right]&=
\int_{\cP}p_t(\rho){l}_t(\rho)d\rho\\
&=
\sum_{i=1}^M \int_{\tilde{A}_t^{(i)}}p_t(\rho){l}_t(\rho)d\rho
\\
&=
\sum_{i=1}^M \left[\int_{\hat{A}_t^{(i)}}p_t(\rho){l}_t(\rho)d\rho+\int_{\tilde{A}_t^{(i)}\setminus \hat{A}_t^{(i)}}p_t(\rho){l}_t(\rho)d\rho\right]\\
&\le \sum_{i=1}^M \int_{\hat{A}_t^{(i)}}p_t(\rho)(\tilde{l}_t(\rho)+\gamma)d\rho+M\epsilon \qquad\qquad\qquad\qquad\qquad(\because\; p_t(\rho){l}_t(\rho)\le 1 \;\forall\;\rho)\\
&\le \sum_{i=1}^M \int_{\tilde{A}_t^{(i)}}p_t(\rho)(\tilde{l}_t(\rho)+\gamma)d\rho+M\epsilon \\
&=\int_{\cP}p_t(\rho)\tilde{l}_t(\rho)d\rho+\gamma  + M\epsilon \qquad\qquad\qquad\qquad\qquad\qquad\quad(\because\; \int_{\cP}p_t(\rho)d\rho=1).%\\
% &=\E_t\left[\tilde{l}_t(\rho_t)\right]+\gamma  + M\epsilon 
\end{align*}

Moreover, \begin{align*}\E\left[\int_\cP p_t(\rho)\hat{l}_t(\rho)d\rho\right]&=\E_{<t}\E_t\left[\int_\cP p_t(\rho)\hat{l}_t(\rho)d\rho\right]\\&=\E_{<t}\left[\int_\cP p_t(\rho)\tilde{l}_t(\rho)d\rho\right],\end{align*} using the definition of $\hat{l}_t$ in Algorithm \ref{algorithm: semibandit}. Plugging this in above, we get

\begin{align*}\E\left[l_t(\rho_t)\right]&=\E_{<t}\E_t\left[l_t(\rho_t)\right]\\&\le \E_{<t}\left[\int_\cP p_t(\rho)\tilde{l}_t(\rho)d\rho\right] + \gamma+M\epsilon \\&= \E\left[\int_\cP p_t(\rho)\hat{l}_t(\rho)d\rho\right]+\gamma +M\epsilon,\end{align*}


% \noindent Applying the definition of $(\epsilon,\gamma)$-approximate feedback we can simplify the first term using $\tilde{l}_t(\rho)\ge  l_t(\rho) - \gamma $ for all $\rho\in\Tilde{A}_t^{(i)}$, or

% \begin{align*}\int_\cP p_t(\rho)\tilde{l}_t(\rho)d\rho=\sum_{i=1}^M \int_{A_t^{(i)}} p_t(\rho)\tilde{l}_t(\rho)d\rho&\ge \sum_{i=1}^M \int_{\Tilde{A}_t^{(i)}} p_t(\rho)\tilde{l}_t(\rho)d\rho\\&\ge \sum_{i=1}^M \int_{\Tilde{A}_t^{(i)}} p_t(\rho)l_t(\rho)d\rho-\gamma \sum_{i=1}^M \int_{\Tilde{A}_t^{(i)}} p_t(\rho)d\rho\\
% &\ge \sum_{i=1}^M \int_{\Tilde{A}_t^{(i)}} p_t(\rho)l_t(\rho)d\rho-\gamma
% \end{align*}

% \noindent Putting together, $\E[\int_\cP p_t(\rho)\tilde{l}_t(\rho)d\rho]=\E_t\E_{<t}[\int_\cP p_t(\rho)\tilde{l}_t(\rho)d\rho]\ge \E_t\E_{<t}[\sum_{i=1}^M \int_{\Tilde{A}_t^{(i)}} p_t(\rho)l_t(\rho)d\rho]-\gamma \ge \E\left[l_t(\rho_t)\right]-M\epsilon-\gamma$. For the second term we use,
% \begin{align*}
%     \E[\tilde{l}_t(\rho)^2]&\le
%     \E[l_t(\rho)^2 + \gamma^2 + 2\gamma l_t(\rho)] = \E[{l}_t(\rho)^2]+ \gamma^2 + 2\gamma\E[l_t(\rho)].
% \end{align*}
\noindent and, further,
\begin{align*}
   \E_t[\hat{l}_t(\rho)^2] &=
    \int_\cP p_t(\rho')\left(\frac{\I[\rho\in \Tilde{A}_t(\rho')]}{p_t(\Tilde{A}_t(\rho'))}\tilde{l}_t(\rho)\right)^2d\rho'\\
    &= \left(\frac{\tilde{l}_t(\rho)}{p_t(\Tilde{A}_t(\rho))}\right)^2\int_{\Tilde{A}_t(\rho)}p_t(\rho')d\rho'\\
    &\le\frac{1}{p_t(\Tilde{A}_t(\rho))}.
\end{align*}

\noindent Therefore, $$\E[\int_{\cP}p_t(\rho)\hat{l}_t(\rho)^2d\rho]\le \E\left[\int_{\cP}p_t(\rho)\cdot \frac{1}{p_t(\Tilde{A}_t(\rho))}d\rho\right]=M.$$ Putting together, we get

$$\E\left[\log \frac{W_{T+1}}{W_{1}}\right]\le -\eta\E\left[\sum_{t=1}^Tl_t(\rho_t)\right] + \eta T(M\epsilon+\gamma)+ \frac{\eta^2MT}{2}.$$


% \noindent Here we use that, by the construction of the loss estimator and linearity of expectation, $\E[\tilde{l}_t(\rho)]\ge \E[\hat{l}_t(\rho)] - \epsilon = \E[l_t(\rho)] - \epsilon $ to get the first term and for the second term we use,
% \begin{align*}
%     \E[\tilde{l}_t(\rho)^2]&\le
%     \E[\hat{l}_t(\rho)^2 + \epsilon^2 + 2\epsilon\hat{l}_t(\rho)] = \E[\hat{l}_t(\rho)^2]+ \epsilon^2 + 2\epsilon\E[\hat{l}_t(\rho)].
% \end{align*}
% and, further,
% \begin{align*}
%    \E[\hat{l}_t(\rho)^2] &=
%     \int_\cP p_t(\rho')\left(\frac{\I[\rho\in \Tilde{A}_t(\rho')]}{p_t(\Tilde{A}_t(\rho'))}l_t(\rho)\right)^2d\rho'\\
%     &= \left(\frac{l_t(\rho)}{p_t(\Tilde{A}_t(\rho))}\right)^2\int_{\Tilde{A}_t(\rho)}p_t(\rho')d\rho'\\
%     &\le\frac{1}{p_t(\Tilde{A}_t(\rho'))}.
% \end{align*}
% Plugging into the sum $\E\int_\cP p_t(\rho)\hat{l}_t^2(\rho)d\rho$ and evaluating the integral by writing it as the sum of integrals over the $M$ possible feedback sets $A_t$, we get the upper bound on the second term. 
\noindent We can also adapt the argument of \cite{dick2020semi} to obtain a lower bound for $\frac{W_{T+1}}{W_1}$ in terms of $D_r$, the number of $L$-Lipschitz violations between the worst pair of points within distance $r$ across the $T$ loss functions. We have

\begin{align*}\frac{W_{T+1}}{W_1} &=\frac{1}{\vol(\cP)}\int_{\cP}w_{T+1}(\rho)d\rho \\&\ge \frac{1}{\vol(\cP)}\int_{\cB(\rho^*,r)}w_{T+1}(\rho)d\rho. \end{align*}

\noindent Taking the log and applying Jensen's inequality gives

\begin{align*}\log \frac{W_{T+1}}{W_1}\ge& \log \frac{\vol(\cB(\rho^*,r))}{\vol(\cP)}-\frac{\eta}{\vol(\cB(\rho^*,r))} \int_{\cB(\rho^*,r)}\sum_{t=1}^T\hat{l}_t(\rho)d\rho.\end{align*}

\noindent Taking expectations w.r.t.\ the randomness in Algorithm \ref{algorithm: semibandit} (but for any loss sequence $l_1,\dots,l_T$) and using the fact that $\cP$ is contained in a ball of radius $R$, we get


\begin{align*}\E\left[\log \frac{W_{T+1}}{W_1}\right]
&\ge d\log \frac{r}{R}-\frac{\eta}{\vol(\cB(\rho^*,r))} \sum_{t=1}^T\E\left[\int_{\cB(\rho^*,r)}\hat{l}_t(\rho)d\rho\right].\end{align*}

\noindent Using $\E[\hat{l}_t(\rho)]=\Tilde{l}_t(\rho)$, and noting that for any fixed $t$ and $r$

\begin{align*}
\int_{\cB(\rho^*,r)}\tilde{l}_t(\rho)d\rho&=\sum_{i=1}^M\int_{\cB(\rho^*,r)\cap \Tilde{A}_t^{(i)}}\tilde{l}_t(\rho)d\rho \\&\le \sum_{i=1}^M\int_{\cB(\rho^*,r)\cap \hat{A}_t^{(i)}}\tilde{l}_t(\rho)d\rho+M\epsilon\\
&\le \sum_{i=1}^M\int_{\cB(\rho^*,r)\cap \hat{A}_t^{(i)}}(l_t(\rho)+\gamma)d\rho+M\epsilon\\&\le \sum_{i=1}^M\int_{\cB(\rho^*,r)\cap \tilde{A}_t^{(i)}}(l_t(\rho)+\gamma)d\rho+M\epsilon\\&
=\int_{\cB(\rho^*,r)}l_t(\rho)d\rho + \vol(\cB(\rho^*,r))\gamma + M\epsilon,
\end{align*}

\noindent we get that

\begin{align*}\E\left[\log \frac{W_{T+1}}{W_1}\right]
\ge &d\log \frac{r}{R}-\frac{\eta}{\vol(\cB(\rho^*,r))} \sum_{t=1}^T\int_{\cB(\rho^*,r)}{l}_t(\rho)d\rho-\eta\gamma T - \frac{\eta M\epsilon T}{\vol(\cB(\rho^*,r))}.\end{align*}

\noindent By the above definition of $D_r$ as the number of $L$-Lipschitz violations, for every $\rho\in\cB(\rho^*,r)$ we get $\sum_tl_t(\rho)\le \sum_tl_t(\rho^*)+TLr+D_r$, or

\begin{align*}\E\left[\log\frac{W_{T+1}}{W_1}\right] \ge &d\log\frac{r}{R}-\eta\sum_{t=1}^Tl_t(\rho^*)-\eta TLr- \eta D_r-\eta\gamma T-\frac{\eta M\epsilon T}{\vol(\cB(\rho^*,r))}.\end{align*}

\noindent Combining the lower and upper bounds gives

\begin{align*}\E\left[\sum_{t=1}^Tl_t(\rho_t)\right] &-\sum_{t=1}^Tl_t(\rho^*) \le \frac{d}{\eta}\log\frac{R}{r} + \frac{\eta MT}{2} +D_r  +T\left(M\epsilon+2\gamma +Lr+\frac{ M\epsilon }{\vol(\cB(\rho^*,r))}\right). \end{align*}

\noindent Finally, setting $r=T^{-\beta}$, $\eta=\sqrt{\frac{2d\log (RT^\beta)}{TM}}, \gamma\le T^{-\beta'}$ and $\epsilon\le \vol(\cB(r))T^{-\beta'}$, and using that the loss sequence is $\beta$-dispersed, we get the desired regret bound

\begin{align*}\E\left[\sum_{t=1}^Tl_t(\rho_t)-\sum_{t=1}^Tl_t(\rho^*)\right]\le O(\sqrt{dMT\log(RT)}+T^{1-\beta}+T^{1-\beta'})  = O(\sqrt{dMT\log(RT)}+T^{1-\min\{\beta,\beta'\}}).\end{align*}

\noindent In particular, we have used $\vol(\cB(T^{-\beta}))\le \vol(\cB(1{}))\le \frac{8\pi^2}{15}$ for any $d$, $T\ge 1$ and $\beta\ge 0$.
\iffalse
The proof adapts the analysis of the exponential forecaster in \cite{balcan2018dispersion}. Let $W_t = \int_Cw_t(\rho) d\rho$ be
the normalizing constant and $P_t = \E_{\rho\sim p_t}
[u_t(\rho)]$ be the expected payoff at round $t$. Also let $U_t(\rho)=\sum_{j=1}^{t}u_j(\rho)$ denote the sum of true payoffs at round $t$. Further let $\Tilde{u_j}(\rho)$ be an $\epsilon$-uniform-approximation for $u_j(\rho)$, i.e. $|\Tilde{u_j}(\rho)-u_j(\rho)|\le\epsilon$ for all $\rho\in\C$. We seek to bound $R_T=OPT-P(T)$, where $OPT=U_{T}(\rho^*)$ for optimal parameter $\rho^*$ and $P(T)=\sum_{t=1}^{T}P_t$ is the expected utility of the exponential forecaster algorithm in $T$ rounds.
We will do this by lower bounding $P(T)$ and upper bounding $OPT$ by analyzing the normalizing constant  $W_t$.

{\it Lower bound for $P(T)$}: Using the definitions in Algorithm, it follows that
\begin{align*}\frac{W_{t+1}}{W_{t}} &= \frac{\int_{\C}e^{\rho \tilde{u}_t(\rho)}w_{t}(\rho)d\rho}{W_{t}} = \int_{\C}e^{\rho \tilde{u}_t(\rho)}\frac{w_{t}(\rho)}{W_{t}}d\rho = \int_{\C}e^{\rho \tilde{u}_t(\rho)}p_{t}(\rho)d\rho.\end{align*}
Use inequalities $e^{\rho x}\le1+(e^{\rho}-1)x$ for $x\in[0,1]$ and $1+x\le e^x$ to conclude
\begin{align*}\frac{W_{t+1}}{W_{t}} \le\int_{\C}p_{t}(\rho)\left(1+(e^{\rho}-1)\tilde{u}_t(\rho)\right)d\rho = 1+(e^{H\rho}-1)({P_t}+\epsilon) \le \exp\left((e^{\rho}-1)({P_t}+\epsilon)\right).\end{align*}
Finally, we can write $W_{T+1}/W_1$ as a telescoping product to obtain
\[\frac{W_{T+1}}{W_{1}}=\prod_{t=1}^{T}\frac{W_{t+1}}{W_{t}}\le \exp\left((e^{\rho}-1){\sum_t({P_t}+\epsilon)}\right) = \exp\left({(P(T)+\epsilon T)(e^{\rho}-1)}\right),\]
or, $W_{T+1}\le \exp\left({(P(T)+\epsilon T)(e^{\rho}-1)}\right)\text{Vol}(\C)$.

% Observe that $w_t(\rho)=w_1(\rho)\exp(\rho U_{t-1}(\rho))$.

{\it Upper bound for $OPT$}: Let $\B^* (r)$ be the ball of radius $r$ around $\rho^*$. If there are at most $k$ discontinuities in any ball of radius $r$, we can conclude that for all $\rho\in\B^* (r)$, $U_{T}(\rho) \ge OPT - k-LTr$. Now, since $W_{T+1}=\int_Cw_1(\rho)\exp(\rho \Tilde{U}_{T}(\rho))d\rho$, we have

\begin{align*}
     W_{T+1}
&\ge \int_{\B^* (r)}w_1(\rho)e^{\rho\Tilde {U}_{T}(\rho)}d\rho\\&\ge \int_{\B^* (r)}w_1(\rho)e^{\rho(OPT - k-LTr)}d\rho \\&=e^{\rho(OPT - k-LTr-T\epsilon)}\int_{\B^* (r)}w_1(\rho)d\rho.
\end{align*}


Putting together with the lower bound, and rearranging, gives
\begin{align*}OPT-P_T&\le \frac{P(T)(e^{\rho}-1-\rho)}{\rho}+\frac{\log (1/Z)}{\rho}+k+LTr\\
&\le T\rho +\frac{\log (1/Z)}{\rho}+k+(Lr+2\epsilon)T,\end{align*}
where we use that $P(T)\le T$ and for all $x\in[0,1], e^x \le 1 + x + (e-2)x^2$. Take expectation over the sequence of utility functions, set $\epsilon=\frac{1}{T^{\beta'}}$ and apply dispersion to conclude the result.

--- We can also adapt the semi-bandit setting analysis of \cite{dick2020semi} in a similar way.

\fi
\end{proof}

\textbf{Theorem 4.1 (restated).} {\it
The pseudo-dimension of $\mathcal{H}_{k,\sigma}$ is $O(K+\log n)$ when the labeling algorithm $A$ is the mincut approach of \citet{blum2001learning}.
}







\begin{proof}[Proof of Theorem \ref{thm:sigma-pdim}]
Consider an arbitrary node $u$ in any fixed problem instance. Also fix $k\in [K]$. Since $f(d)=\exp(-d^2/\sigma^2)$ is monotonic in $d$ for any $\sigma>0$, the set $N_k(u)$ of $k$ nearest neighbors of $u$ is the same for all values of $\sigma$. This is true for any $u$; therefore $N_k$, and hence also the set of mutual nearest neighbors $N'_k(u)=\{v\in N_k(u)\mid (u,v)\in N_k\}$, is fixed given the pairwise distances for the instance.

We can show that the label of $u$ can flip for at most $O(K2^{2K})$ distinct values of $\sigma$ for the given instance. Suppose that the label of $u$ flips for $\sigma=\sigma_0$ (as $\sigma$ is increased from 0 to infinity), say from positive to negative (WLOG). Let $S_+,S_+'\subseteq N'_k$ for $G(k,\sigma_0^-)$ and $G(k,\sigma_0^+)$ respectively denote the positively labeled neighbors of $u$ just before and after $\sigma=\sigma_0$. Note that $\sigma_0$ is the root of an exponential equation in at most $2k$ terms and therefore has at most $2k$ possible values (Lemma 26 in \citet{balcan2021data}) obtained by comparing the total weights of edges in $\delta(u,N_k'\setminus S_+)$  and $\delta(u,S_+')$, where $\delta(v,V)$ denotes the set of edges with one end-point $v$ and the other end point in vertex set $V$. Over all possible pairs of $S_+,S_+'$ we have at most $2k{2^k\choose 2}=O(K2^{2K})$ possibilities for $\sigma_0$.

The above bound holds for any fixed $k$. For all $k\in[K]$ there are at most $O(K^22^{2K})$ label flips for any fixed node $u$ (as $\sigma$ is varied). Summing up over all $n$ possible choices of $u$ and over all $m$ problem instances, we have at most $O(mnK^22^{2K})$ intervals of $\sigma$ such that the labelings of all nodes are identical for all instances, for all values of $k$, within a fixed interval. Using Lemma 2.3 of \citet{balcan2020data} (proof of which involves arguments similar to those used in the proof of Theorem \ref{thm:threshold}), the pseudo-dimension $m$ satisfies $2^m\le O(mnK^22^{2K})$, or $m=O(K+\log n)$.
%Let $C_-$ and $C_+$ be min-cuts for $\lim_{\sigma\rightarrow\sigma_0-} G(k,\sigma)$ and $\lim_{\sigma\rightarrow\sigma_0+} G(k,\sigma)$ respectively.
%TODO add proof for harmonic objective.
\end{proof}


\textbf{Theorem 4.2 (restated).} {\it 
The pseudo-dimension of $\mathcal{H}_{k,r}$ is $O(\log n)$ for any labeling algorithm $A$.
}

\begin{proof}[Proof of Theorem \ref{thm:threshold}]
Consider any fixed problem instance with $n$ examples. For any fixed choice of parameter $k$, there are at most $\frac{nk}{2}$ (unweighted) edges in $G(k,r)$ for any value of $r$. Therefore, as $r$ is increased from 0 to infinity, the graph changes only when $r$ corresponds to one of $\frac{nk}{2}$ distinct distances between pairs of data points, and so at most $\frac{nk}{2}+1$ distinct graphs may be obtained for any $k$. Summing over all possible values of $k\in[n]$, we have at most $O(n^3)$ distinct graphs. \looseness-1

\noindent Thus given set $\mathcal{S}$ of $m$ instances $(d^{(i)},L^{(i)},U^{(i)})$, we can partition the real line into $O(mn^3)$ intervals such that all values of $r$ behave identically for all instances, and for all values of $k$, within any fixed interval. Since $A$ and therefore its loss is deterministic once the graph $G$ is fixed, the loss function is identical in each interval. Each piece can have a {\it witness} above or below it as $r$ is varied for the corresponding interval, and so the binary labeling of $\mathcal{S}$ is fixed in that interval. The pseudo-dimension $m$ satisfies $2^m\le O(mn^3)$ and is therefore $O(\log n)$.
\end{proof}

\subsection{Sample complexity for uniform learning.}

Let $h^*:\cX\rightarrow\{0,1\}$ denote the target concept. We say $\cH$ is {\it $(\epsilon,\delta)$-uniformly learnable} with sample complexity $n$ if, for every distribution $\cD$, given a sample $S\sim\cD^n$ of size $n$, with probability at least $1 - \delta$, $\big\lvert \frac{1}{n}\sum_{s\in S}|h(s)-h^*(s)| - \bbE_{s\sim\cD}[|h(s)-h^*(s)|] \big\rvert < \epsilon$ for every $h\in\cH$. It is well-known that $(\epsilon,\delta)$-uniform learnability with $n$ samples implies $(\epsilon,\delta)$-PAC learnability with $n$ samples \citep{anthony1999neural}.


\section{Approximate Soft Label and Gradient Computation}\label{app:conjugate gradient}


The piecewise constant interval computation in Algorithm \ref{algorithm: semi harmonic} needs computation of soft labels $f(\sigma)$ as well as gradients $\frac{\partial f}{\partial \sigma}$ for all unlabeled nodes. Typically, one computes a matrix inverse to exactly compute these quantities, and the exact matrix inverted is different for different approaches. In this section, we provide approximate but more efficient procedures for computing these quantities, both for the Harmonic objective approach of \cite{zhu2003semi} and for the scalable approach of \cite{delalleau2005efficient}. We also provide convergence guarantees for our algorithms, in terms of the number of conjugate gradient iterations needed for obtaining an $\epsilon$-approximation to the above quantities. Note that replacing $\text{CG}(A, b, t)$ by the computation $A^{-1}b$ recovers the algorithm from \cite{balcan2021data}, which is more precise but takes longer ($O(n^3)$ time or $O(n^\omega)$, where $\omega$ is the matrix multiplication exponent, for the matrix inversion step).

\subsection{Approximate Efficient Soft-labeling of \cite{zhu2003semi}}

% \subsection{Harmonic Function Approximation}
% Algorithm \ref{algorithm: harmonic approx} computes the soft label that optimizes the harmonic function objective \cite{zhu2003semi} and gradient for a given value of graph parameter $\sigma$ for a fixed unlabeled node $u$, by running the conjugate gradient for given number of iterations.

% \begin{algorithm}
% \caption{$\textsc{HarmonicApproximation}(G,f_L,u, \sigma,\epsilon)$}
% \label{algorithm: harmonic approx}
% \begin{algorithmic}[1]
% \STATE {\bfseries Input:} Graph $G$ with labeled nodes $f_L$, unlabeled node $u$, query parameter $\sigma$, error tolerance $\epsilon$.
% \STATE {\bfseries Output:} approximate soft label $f_{u, \epsilon}$ and approximate gradient $\frac{\partial f_u}{\partial \sigma}_\epsilon$.
% \STATE{Let $\text{CG}(A, b, t)$ represent running the conjugate gradient method for $t$ iterations to solve equation $Ax = b$}
% \STATE{Let $t_\epsilon$ indicate the number of iterations sufficient for $\epsilon$-approximation (Theorem \ref{thm:harmapprox}).}
% \STATE {Let $f_{U, \epsilon}(\sigma)=\text{CG}\left((I - P_{UU}), P_{UL}f_L, t_\epsilon\right)$, where $D_{ij}:=\bI[i=j]\sum_{k}W_{ik}, P=D^{-1}W$}.
% \STATE{Let $\frac{\partial f}{\partial \sigma}_{\epsilon} =\text{CG}\left((I-P_{UU}),\left(\frac{\partial P_{UU}}{\partial \sigma}f_{U,\epsilon}+\frac{\partial P_{UL}}{\partial \sigma}f_L\right), t_\epsilon\right)$}, where
% \begin{align*}
%     \frac{\partial P_{ij}}{\partial \sigma}&=\frac{\frac{\partial w(i,j)}{\partial \sigma}-P_{ij}\sum_{k\in L+U}\frac{\partial w(i,k)}{\partial \sigma}}{\sum_{k\in L+U}w(i,k)},\\
%     \frac{\partial w(i,j)}{\partial \sigma}&=\frac{2w(i,j)d(i,j)^2}{\sigma^3}.
% \end{align*}
% \RETURN{$f_{u, \epsilon}(\sigma), \frac{\partial f_u}{\partial \sigma}_\epsilon$.}
% \end{algorithmic}
% \end{algorithm}


\noindent We provide an approximation guarantee for Algorithm \ref{algorithm: harmonic approx}. We first need a simple lemma to upper bound matrix vector products for positive definite matrices.

\begin{lemma} \label{lem:maxeigen} Suppose matrix $A \in \mathbb{R}^{n \times n}$ is symmetric positive definite, with $x \in \mathbb{R}^n$. Then $\|Ax\|_2 \leq \lambda_{\text{max}}\|x\|_2$ where $\lambda_{\text{max}}$ is the maximum eigenvalue of $A$.\end{lemma}
\begin{proof}
    The idea is to normalize vector $x$, then consider the eigendecomposition of $A$. Since the eigenvectors are orthonormal, we will be able to simplify to a form that can be upper bounded by $\lambda_{\text{max}}\|x\|_2$. Letting $\hat{x} = \frac{x}{\|x\|}$ and $\{\phi_i\}_{i \in [n]}$ be an orthonormal basis of eigenvectors of $A$ with corresponding eigenvalues $\{\lambda_i\}_{i \in [n]}$, we can write $\hat{x}$ as a linear combination of $\{\phi_i\}$: 
    $$\hat{x} = \sum_{i \in [n]} \alpha_i \phi_i.$$ 
    % Since $\|\hat{x}\| = 1$ and $\{\phi_i\}$ is an orthonormal basis, it must be that each $\alpha_i \leq 1$. From here we see
    Now,
    \begin{align*}
         \ltwonorm{A\hat{x}}^2
         &= \ltwonorm{ A\sum_{i \in [n]} \alpha_i \phi_i}^2 \\
         &= \ltwonorm{\sum_{i \in [n]} \alpha_i \lambda_i \phi_i}^2 \\
         &= \sum_{i \in [n]} \alpha_i^2 \lambda_i^2 &&(\phi_i \text{ orthonormal}) \\
        &\leq  \lambda_{\text{max}}^2 &&\text{($\lambda_i\le \lambda_{\text{max}}\forall i$; $\hat{x}$ is a unit vector).}
    \end{align*}
    Taking square roots and substituting $\hat{x} = \frac{x}{\|x\|}$, we obtain
    $\|Ax\| \leq \lambda_{\text{max}} \|x\|$.
\end{proof}

Equipped with this lemma, we are ready to prove our approximation guarantee for Algorithm \ref{algorithm: harmonic approx}.

\begin{theorem} \label{thm:harmapprox}
    \textit{Suppose the function } $f : \mathbb{R} \rightarrow \mathbb{R}$ \textit{is convex and differentiable, and that its gradient is Lipschitz continuous with constant }$L > 0$\textit{, i.e. we have that }$|f'(x) - f'(y)| \leq L |x - y|$\textit{ for any }$x,y$. \textit{Then for some $\sigma \in [\sigma_{\min}, \sigma_{\max}]$, where $\left|\frac{\partial f}{\partial \sigma}\right| < \frac{1}{\epsilon \lambda_{\min}(I - P_{UU})}$ on $[\sigma_{\min}, \sigma_{\max}]$, $\kappa(A)$ is condition number of matrix $A$ and $\lambda_{\min}(A)$ is the minimum eigenvalue of $A$, we can find an $\epsilon$ approximation of $f_{u}(\sigma)\frac{\partial f_u}{\partial \sigma}$} \textit{achieving $\left|f_{u}(\sigma)\frac{\partial f_u}{\partial \sigma} - \left(f_{u}(\sigma)\frac{\partial f_u}{\partial \sigma}\right)_\epsilon\right| < \epsilon$, where $f_{u}(\sigma), \frac{\partial f_u}{\partial \sigma}$ are as described in Algorithm \ref{algorithm: harmonic approx} using } $O\left(\sqrt{\kappa(I - P_{UU})}\log\left(\frac{n}{\epsilon \lambda_{\min}(I - P_{UU})}\right)\right)$ conjugate gradient iterations.
\end{theorem}
\iffalse
\begin{proofoutline}
    Here we show that the quadratic objective introduced in \cite{zhu2003semi} as well as its derivative can be solved efficiently using the conjugate gradient method under standard assumptions, given a well conditioned graph (i.e. reasonable minimum/maximum eigenvalues). We apply \cite{AXELSSON1976123} which states that an $\epsilon$ approximation of the solution $x$ in equation $Ax = b$ can be found in $$O(\sqrt{\kappa(A)} \log\frac{1}{\epsilon})$$ time using the CG method. In order to find an $\epsilon$-approximation of the product $f \frac{\partial f}{\partial \sigma}$ given we can find approximations of $f$ and $\frac{\partial f}{\partial \sigma}$, we show that $f$ and $\frac{\partial f}{\partial \sigma}$ are bounded as well. This gives theoretical justification for use of the conjugate gradient method with some fixed number of steps to find good approximate intervals in Algorithm \ref{algorithm: semi harmonic}, which uses $f \frac{\partial f}{\partial \sigma}$ on lines 7 and 8.\\\\
    First, we note that $I - P_{UU}$ is positive definite, as this is required to apply the CG method to solve equation $(I - P_{UU})x = b$ for some vector $b$ \cite{hestenes1952methods}. Next, we bound $f$ by $\frac{1}{\lambda_{\min}(I - P_{UU})}$ and note $\frac{\partial f}{\partial \sigma}$ is bounded by $\frac{1}{\epsilon \lambda_{\min}(I - P_{UU})}$. 
    An $\epsilon'$ approximation of the CG method gives us $|f_{\epsilon'} - f| \leq \epsilon' f, |\frac{\partial f}{\partial \sigma}_{\epsilon'} - \frac{\partial f}{\partial \sigma}| \leq \epsilon' \frac{\partial f}{\partial \sigma}$. Setting $\epsilon' = \frac{\epsilon^2 \lambda_{\min}(I - P_{UU})}{3}$, we achieve the desired bound of $\left|f_{u}(\sigma)\frac{\partial f_u}{\partial \sigma} - \left(f_{u}(\sigma)\frac{\partial f_u}{\partial \sigma}\right)_\epsilon\right| < \epsilon$\\\\
    By \cite{AXELSSON1976123}, finding $\epsilon'$ approximations using the CG method on positive definite matrix $G$ be done in $$O(\sqrt{\kappa(G)} \log \frac{1}{\epsilon})$$
iterations. Plugging in our $\epsilon'$ value and matrix $I - P_{UU}$, this takes
 $$O\left(\sqrt{\kappa(I - P_{UU})}\log\left(\frac{1}{\epsilon \lambda_{\min}(I - P_{UU})}\right)\right)$$ iterations of the CG method.
\end{proofoutline}
\fi
\begin{proof}

\iffalse
First, we prove that the laplacian $I - P_{UU}$ is positive definite: 
\begin{align*}
    & x^{\top} (I - P_{UU}) x \\
    = & x^{\top} (I - D^{-1}W_{UU}) x \\
    = & x^{\top}D^{-\frac{1}{2}} (D - W_{UU}) D^{-\frac{1}{2}}x \\
    = &  x^{\top}D^{-\frac{1}{2}} \left(\sum_{i \in U, j \not \in U}W_{ij}e_{i - |L|}e^\top_{i - |L|} + \sum_{i,k \in U} W_{ik}(e_{i - |L|} - e_{k - |L|})(e_{i - |L|}i - e_{k - |L|})^\top \right)D^{-\frac{1}{2}}x 
    \end{align*}
    The first sum comes from the fact that matrix $D$ contains weights between labeled and unlabeled nodes, but $W_{UU}$ does not. The second sum describes all weights between nodes in the unlabeled set. \\
    \\Also note that indices in $P_{UU}$ and $I - P_{UU}$ are offset by the number of labels $|L|$ from indices in $W$ \\

    \begin{align*}
    = & \sum_{i \in U, j \not \in U}\frac{W_{ij}}{\sum_{k}W_{ik}}x^\top e_{i - |L|}e^\top_{i - |L|}x + \sum_{j, k \in U} \frac{W_{ij}}{\sum_{k}W_{ik}}x^\top (e_{i - |L|} - e_{k - |L|})(e_{i - |L|}i - e_{k - |L|})^\top x \\
   = & \sum_{i \in U, j \not \in U}\frac{W_{ij}}{\sum_{k}W_{ik}}x_{i - |L|}^2 + \sum_{j, k \in U} \frac{W_{ij}}{\sum_{k}W_{ik}}(x_{i - |L|} - x_{k - |L|})^2 \\
\end{align*}
Note that all weights $W_{ij}$ are positive, and the first sum is a weighted sum of squares of indices of $x$. As a result, if $x$ is nonzero, then the first sum is nonzero, and since the second sum is a sum of squares it must be nonnegative. This shows that the overall sum is positive, and thus the matrix $I - P_{UU}$ is positive definite. 
\fi
A \emph{grounded} Laplacian (a.k.a.\ Dirichlet Laplacian) matrix is obtained by ``grounding'' some subset of graph nodes, i.e.\ removing the rows and columns corresponding to those nodes from the Laplacian matrix $L=D-W$.
It is well known that the grounded Laplacian matrix is positive definite \cite{varga1962matrix,miekkala1993graph}. In particular, $\cL_{UU}=D_{UU}-W_{UU}$ and therefore $I-P_{UU}=D^{-1/2}_{UU}\cL_{UU}D^{-1/2}_{UU}$ are positive definite.
This implies $(I - P_{UU})^{-1}$ is also positive definite with maximum eigenvalue $\frac{1}{\lambda_{\min}}$, where $\lambda_{\min}$ is the minimum eigenvalue of $I - P_{UU}$. From here, note that all elements of $ P_{UL}f_L$ are at most one, as all labels are 0 or 1 and $P$ has nonnegative entries and is row normalized to have row sums of 1. Therefore, %for any $x\in%\\\\

%Since $A$ is positive definite, so is $A^{-1}$, and the max eigenvalue of $A^{-1}$ is $\frac{1}{\lambda_{\text{min}}}$. this implies $$A^{-1}x \leq \frac{x}{\lambda_{\min}}$$

% Therefore,
$$\|f(\sigma)\| = \|(I - P_{UU})^{-1} P_{UL}f_L\|%\leq \kappa_G P_{UL}f_L 
\leq \frac{1}{\lambda_{\min}} \|P_{UL}f_L\| \leq \frac{\sqrt{n}}{\lambda_{\min}}$$ 
where the first inequality holds via Lemma \ref{lem:maxeigen}. \\\\
We now have that $\|f(\sigma)\|$ is bounded by $\frac{\sqrt{n}}{\lambda_{\min}(I - P_{UU})}$ 
 on $[\sigma_{\text{min}}, \sigma_{\text{max}}]$. To find an $\epsilon$ approximation in the sense $\|f - f_\epsilon\| \leq  \epsilon$, we set 
$$\epsilon' = \epsilon\lambda_{\min}(I - P_{UU}) \leq \frac{\sqrt{n}\epsilon}{\max_{\sigma \in [\sigma_{\text{min}}, \sigma_{\text{max}}]}f(\sigma)}$$
and note
$$\|f - f_{\epsilon'}\| \leq  \epsilon' \|f\| \leq  \epsilon$$
We consider this process for  $\frac{\partial f}{\partial \sigma}$ as well since $\frac{\partial f}{\partial \sigma}$ is bounded by $\frac{1}{\epsilon\lambda_{\min}(I - P_{UU})}$. Setting $\epsilon' = \epsilon^2 \lambda_{\min}(I - P_{UU})$, $$\left|\frac{\partial f}{\partial \sigma} - \frac{\partial f}{\partial \sigma}_\epsilon\right| \leq \epsilon' \frac{\partial f}{\partial \sigma} \leq \epsilon$$ holds. Finally, letting $$\epsilon' = \frac{\sqrt{n}\epsilon^2 \lambda_{\min}(I - P_{UU})}{3}$$ we achieve the desired result 
$$\left|f \frac{\partial f}{\partial \sigma} - f_{\epsilon'} \frac{\partial f}{\partial \sigma}_{\epsilon'}\right| < \epsilon' f + \epsilon' \frac{\partial f}{\partial \sigma} + \epsilon'^2 < \epsilon.$$
Next we analyze the number of iterations of the CG method used. By \cite{AXELSSON1976123}, finding $\epsilon'$ approximations using the CG method on a positive definite matrix $G$ can be done in $$O\left(\sqrt{\kappa(G)} \log \frac{1}{\epsilon'}\right)$$
iterations. Here we need an $\epsilon' = \frac{\sqrt{n}\epsilon^2 \lambda_{\min}(I - P_{UU})}{3}$ approximation for matrix $I - P_{UU}$, so this takes
 $$O\left(\sqrt{\kappa(I - P_{UU})}\log\left(\frac{n}{\epsilon \lambda_{\min}(I - P_{UU})}\right)\right)$$ iterations of the CG method.
\end{proof}

\subsection{Approximate Efficient Soft-labeling of \cite{delalleau2005efficient}}
Algorithm~\ref{algorithm: delalleau approx} computes the soft label corresponding to the efficient algorithm of \cite{delalleau2005efficient}, together with its gradient, for a given value of the graph parameter $\sigma$ and a fixed unlabeled node $i$, by running the conjugate gradient method for a given number of iterations.


\begin{algorithm}[h]
\caption{$\textsc{NonParametricApproximation}(G,f_L, i, \sigma, \epsilon)[\tilde{U},\lambda]$}
\label{algorithm: delalleau approx}
\begin{algorithmic}[1]
\STATE {\bfseries Input:} Graph $G$ with labeled nodes $f_L$ and set of unlabeled nodes $U$, unlabeled node $i\in U$, query parameter $\sigma$, error tolerance $\epsilon$.
\STATE {\bfseries Hyperparameters:} Small subset $\tilde{U} \subset U$ (e.g. chosen by the greedy approach of \cite{delalleau2005efficient}, or via \cite{wang2016scalable}), labeled loss regularization coefficient $\lambda$ (see \cite{delalleau2005efficient}). 
\STATE {\bfseries Output:} approximate soft label $\tilde{f}_{i, \epsilon}$ and approximate gradient $\frac{\partial \tilde{f}_i}{\partial \sigma}_\epsilon$.
\STATE{Let $\text{CG}(A, b, t)$ represent running the conjugate gradient method for $t$ iterations to solve equation $Ax = b$.}
\STATE{Let $t_\epsilon$ indicate the number of iterations sufficient for $\epsilon$-approximation (Theorem \ref{thm:delapprox}).}
\STATE {\label{AlgDelapproxLineWeighted} Let $\tilde{f}_{i, \epsilon}(\sigma)=\frac{\sum_{j \in \tilde{U} \cup L}W_{ij}(\sigma) f_j(\sigma)_\epsilon}{\sum_{j \in \tilde{U} \cup L}W_{ij}(\sigma)}$, where}
\begin{align*}
    f(\sigma)_\epsilon & = \text{CG}(A, \lambda \overrightarrow{y}, t_\epsilon), \\
    A & = \lambda \Delta_L + \text{Diag}(W \mathbf{1}_n) - W, \\
    \left(\Delta_L\right)_{ij} &= \delta_{ij}\delta_{i \in L}, \\
    \overrightarrow{y} &= (y_1,...,y_l,0,...,0)^\top.
\end{align*}
\STATE{Let $\frac{\partial \tilde{f}_i}{\partial \sigma}_{\epsilon} = \frac{\sum_{j \in \tilde{U} \cup L}\frac{\partial W_{ij}}{\partial \sigma}f_j(\sigma) + \sum_{j \in \tilde{U} \cup L}W_{ij}(\sigma) \frac{\partial f_j}{\partial \sigma}_\epsilon - \tilde{f}_{i, \epsilon}(\sigma)\sum_{j \in \tilde{U} \cup L}\frac{\partial W_{ij}}{\partial \sigma}}{\sum_{j \in \tilde{U} \cup L}W_{ij}}$}, where
\begin{align*}
\frac{\partial f}{\partial \sigma}_\epsilon &= -\text{CG}(A, \frac{\partial A}{\partial \sigma} f, t_\epsilon), \\
\frac{\partial A}{\partial \sigma} &= \text{Diag}\left(\frac{\partial W}{\partial \sigma}\mathbf{1}_n \right) - \frac{\partial W}{\partial \sigma}, \\
\frac{\partial W_{ij}}{\partial \sigma} &= \frac{2 W_{ij}d_{ij}^2}{\sigma^3}.
\end{align*}
\RETURN{$\tilde{f}_{i, \epsilon}(\sigma), \frac{\partial \tilde{f}_i}{\partial \sigma}_\epsilon$.}
\end{algorithmic}
\end{algorithm}

\noindent We again provide an approximation guarantee for the algorithm.

\begin{theorem} \label{thm:delapprox}
    Suppose the function  $f : \mathbb{R} \rightarrow \mathbb{R}$ is convex and differentiable, and that its gradient is Lipschitz continuous with constant $L > 0$, i.e. we have that $|f'(x) - f'(y)| \leq L |x - y|$ for any $x,y$. Then for some $\sigma \in [\sigma_{\min}, \sigma_{\max}]$,
    where $\left|\frac{\partial f}{\partial \sigma}\right| \in O\left(\frac{1}{\epsilon \lambda_{\min}(A)}\right)$ on $[\sigma_{\min}, \sigma_{\max}]$, $\kappa(A)$ is the
    condition number of matrix $A$ and $\lambda_{\min}(A)$ is the minimum eigenvalue of $A$, we can find an $\epsilon$ 
    approximation of $\tilde{f}_{u}(\sigma)\cdot\frac{\partial \tilde{f}_u}{\partial \sigma}$ achieving 
    $\left|\tilde{f}_{u}(\sigma)\frac{\partial \tilde{f}_u}{\partial \sigma} - \left(\tilde{f}_{u}(\sigma)\frac{\partial \tilde{f}_u}{\partial \sigma}\right)_\epsilon\right| < \epsilon$, where $\tilde{f}_{u}(\sigma), \frac{\partial \tilde{f}_u}{\partial \sigma}$ are as described in Algorithm \ref{algorithm: delalleau approx} using \\
    $O\left(\sqrt{\kappa(A)}\log\left(\frac{\lambda (|L_{\text{Labels}}| + |\tilde{U}_{\text{Labels}}|)}{\epsilon \sigma_{\min}\lambda_{\min}(A)}\right)\right)$  conjugate gradient iterations. Here $L_{\text{Labels}}$ and $\tilde{U}_{\text{Labels}}$ are sets of labels as described in Algorithm \ref{algorithm: delalleau approx}, and $\lambda$ is the parameter passed into Algorithm \ref{algorithm: delalleau approx}.
\end{theorem}
\iffalse
\begin{proofoutline}
    The proof follows similarly to \ref{thm:harmapprox}, except we now reference Theorem \ref{thm:delapprox} for a bound on the number of iterations of the CG method. In \cite{delalleau2005efficient}, a matrix inverse is used to calculate labels for some small subset $U$ of our data. From here, a pseudolabel $\Tilde{f}_x(\sigma)$ for any other point $x \not \in U$ is found by taking an average sum of labels in $U$ weighted by their distance to $x$. We find an epsilon approximation for both $\Tilde{f}$ and $\frac{\partial \Tilde{f}}{\partial \sigma}$ with only an extra $\log \frac{|L + U|}{\sigma_{\min}}$ term being applied from \ref{thm:harmapprox}. To do so, we find a slightly tighter $\epsilon'$ approximation of the inverse of the graph matrix corresponding to subset $U$, then show that this gives us $\epsilon$ approximations of the pseudolabels and pseudlabel derivatives. As a result, we show that the number of CG iterations necessary for good approximation 
of pseudolabels/derivatives scales with log of the size of subset $U$. This motivates use of the CG-method with some fixed number of iterations to approximate matrix inverses when using the method from \cite{delalleau2005efficient} to find labels in Theorem \ref{algorithm: semi harmonic}. \\\\

First, we show matrix $A$ is positive definite. In this version, we show $f$ is bounded by $\frac{\lambda}{\lambda_{\min}(A)}$, where $\lambda$ is the parameter from algorithm \ref{algorithm: delalleau approx}. Since $\frac{\partial f}{\partial \sigma}$ is bounded by $\frac{1}{\epsilon\lambda_{\min}(A)}$ We can then find $\epsilon$ approximations of $f$ and $\frac{\partial f}{\partial \sigma}$ satisfying $|f_{\epsilon'} - f| < \epsilon, |\frac{\partial f}{\partial \sigma}_{\epsilon'} - \frac{\partial f}{\partial \sigma}| < \epsilon$ by setting $\epsilon' = \frac{\epsilon^2 \lambda_{\min}(A)}{3\lambda}$. \\\\
    This only solves the subproblem on $|U| \ll |V|$ nodes however, as for an arbitrary vertex $v$, we defined $\tilde{f}_v(\sigma) = \frac{\sum_{j \in \tilde{U} \cup L}W_{ij}(\sigma) f_j(\sigma)_\epsilon}{\sum_{j \in \tilde{U} \cup L}W_{ij}(\sigma)}$. We now need to find similar $\epsilon$ approximations of $\Tilde{f}, \frac{\partial \Tilde{f}}{\partial \sigma}$. In order to do this, we show that an $\epsilon'' =\frac{\epsilon^2 \lambda_{\min}(A)}{3\lambda} \frac{\sigma^2_{\min}}{(|L_{\text{Labels}}| + |\tilde{U}_{\text{Labels}}|)}$ approximation is sufficient to achieve $$\left|\tilde{f}_{u}(\sigma)\frac{\partial \tilde{f}_u}{\partial \sigma} - \left(\tilde{f}_{u}(\sigma)\frac{\partial \tilde{f}_u}{\partial \sigma}\right)_{\epsilon''}\right| < \epsilon$$ noting that $|L_{\text{Labels}}| + |\tilde{U}_{\text{Labels}}| \ll |V|$ \\\\
    Applying \cite{AXELSSON1976123} again, finding $\epsilon''$ approximations using the CG method on positive definite matrix $G$ be done in $$O(\sqrt{\kappa(G)}) \log \frac{1}{\epsilon''}$$
iterations. Putting it all together, this takes
 $$O\left(\sqrt{\kappa(A)}\log\left(\frac{\lambda (|L_{\text{Labels}}| + |\tilde{U}_{\text{Labels}}|)}{\epsilon \sigma_{\min}\lambda_{\min}(A)}\right)\right)$$ iterations of the CG method.
\end{proofoutline}
\fi
\begin{proof}
As noted in the proof of Theorem~\ref{thm:harmapprox}, the grounded Laplacian $A$ is positive definite.
    % Similar to \ref{thm:harmapprox}, we show that if $x$ has some nonzero value in the first $|L|$ positions, then $x^\top A x > 0$ for $$A = \lambda \Delta_L + Diag(W \mathbf{1}_n) - W$$. 
    % \begin{align*}
    %     &  x^\top A x \\
    %     = & x^\top (\lambda \Delta_L + Diag(W \mathbf{1}_n) - W) x \\
    %     = & x^\top \lambda \Delta_L x + x^\top (Diag(W \mathbf{1}_n - W)) x \\
    %     = & \lambda \sum_{i \in |L|} x_i^2 + \sum_{i, j} (x_i - x_j)^2 \\ 
    % \end{align*}
    % Here we see that if $x$ has a nonzero term in the first $|L|$ terms, then clearly $x^\top A x > 0$ as the first of the two sum terms is nonzero. Next, consider all of the first $|L|$ terms are zero, but some $j \not \in L$ is nonzero. Then set $i = 0$ and note that $(x_0 - x_j)^2 = x_j^2$ as $x_0$ is a labeled term and thus the second sum is nonzero. In both cases the result $x^\top A x$ is nonzero, and thus the matrix $A$ is positive definite. \\\\
    We can now bound  $A^{-1}$ as in Theorem~\ref{thm:harmapprox} and note that 
    $$\left\|A^{-1} \lambda \overrightarrow{y}\right\| \leq \frac{\lambda \sqrt{|L_\text{Labels}|}}{\lambda_{\min}(A)} $$ via Lemma~\ref{lem:maxeigen}, as the vector $\overrightarrow{y}$ contains at most $|L_{\text{Labels}}|$  elements with value 1. Note that $\lambda$ is the constant passed in to Algorithm~\ref{algorithm: delalleau approx}, and $\lambda_{\min}(A)$ is the smallest eigenvalue of $A$. 
    
    Next, we argue that we can find $\epsilon$ approximations of $f, \frac{\partial f}{\partial \sigma}$ with $\epsilon' = \frac{\sqrt{|L_\text{Labels}|}\epsilon^2\lambda_{\min}(A)}{\lambda}$ similarly to Theorem~\ref{thm:harmapprox} as well. 
    From here we consider $\Tilde{f}(\sigma)$ and note that
    \begin{align*}
         \left|\frac{\sum_{j \in \tilde{U} \cup L}W_{ij}(\sigma)f(\sigma)}{\sum_{j \in \tilde{U} \cup L}W_{ij}} - \frac{\sum_{j \in \tilde{U} \cup L}W_{ij}(\sigma)f(\sigma)_\epsilon}{\sum_{j \in \tilde{U} \cup L}W_{ij}}\right| 
        <  \left|\frac{\sum_{j \in \tilde{U} \cup L}W_{ij}}{\sum_{j \in \tilde{U} \cup L}W_{ij}}\epsilon\right| 
        =  \epsilon
    \end{align*}
    Finally, we show that the result holds for $\frac{\partial \tilde{f}_i}{\partial \sigma}$, noting we have proven the result for both $\tilde{f}_{i, \epsilon}$ and $\frac{\partial f}{\partial \sigma}_\epsilon$, and noting that we have exact values for $W_{ij}$ and $\frac{\partial W_{ij}}{\partial \sigma}$
    \begin{align*}
        \left|\frac{\partial \tilde{f}_i}{\partial \sigma}_{\epsilon} - \frac{\partial \tilde{f}_i}{\partial \sigma}\right| 
        &= \frac{\sum_{j \in \tilde{U} \cup L} W_{ij}(\sigma) \epsilon + \epsilon \sum_{j \in \tilde{U} \cup L}{\frac{\partial W_{ij}}{\partial \sigma}}}{\sum_{j \in \tilde{U} \cup L}W_{ij}(\sigma)} \\
        &=  \epsilon + \epsilon \frac{\sum_{j \in \tilde{U} \cup L}\frac{\partial W_{ij}}{\partial \sigma}}{\sum_{j \in \tilde{U} \cup L}W_{ij}(\sigma)} \\
        &= \epsilon + \frac{2\epsilon}{\sigma^3} \frac{\sum_{j \in \tilde{U} \cup L}e^{-\frac{d_{ij}^2}{\sigma^2}
        }d_{ij}^2}{\sum_{j \in \tilde{U} \cup L}e^{-\frac{d_{ij}^2}{\sigma^2}
        }} \\
        &\leq  \epsilon + \frac{2 \epsilon}{\sigma^3} \sum_{j \in \tilde{U} \cup L} e^{-\frac{d_{ij}^2}{2\sigma^2}}d_{ij} &&\text{(Cauchy--Schwarz inequality)} \\
        &\leq  \epsilon + \frac{2 \epsilon}{\sigma^3} \sum_{j \in \tilde{U} \cup L}\sigma e^{-\frac{1}{2}} &&\text{(maximum of $f(x) = xe^{-\frac{x^2}{2c^2}}$ attained at $x = c$)} \\
        &\leq  \epsilon\left(1 + \frac{2 (|L_{\text{Labels}}| + |\tilde{U}_{\text{Labels}}|)}{\sigma^2}\right) \\
        &\leq  \epsilon\left(1 + \frac{2 (|L_{\text{Labels}}| + |\tilde{U}_{\text{Labels}}|)}{\sigma_{\min}^2}\right). 
    \end{align*}
In a similar manner to Theorem~\ref{thm:harmapprox}, we need 
$$\epsilon' = \frac{\sqrt{|L_\text{Labels}|}\epsilon^2 \lambda_{\min}(A)}{\lambda}$$ to achieve $\epsilon$ approximations of $\frac{\partial f}{\partial \sigma}$ and $\Tilde{f}$.
Setting $$\epsilon'' = \frac{\epsilon^2 \sigma_{\min}^2\lambda_{\min}(A)}{(2(|L_{\text{Labels}}| + |\tilde{U}_{\text{Labels}}|) + \sigma^2_{\min})\sqrt{|L_\text{Labels}|}\lambda}$$
we also achieve
$$\left|\frac{\partial \tilde{f}_i}{\partial \sigma}_{\epsilon''} - \frac{\partial \tilde{f}_i}{\partial \sigma}\right| < \epsilon.$$
As a result, we obtain the desired bound
$$\left|\tilde{f}_{u}(\sigma)\frac{\partial \tilde{f}_u}{\partial \sigma} - \left(\tilde{f}_{u}(\sigma)\frac{\partial \tilde{f}_u}{\partial \sigma}\right)_\epsilon\right| < \epsilon.$$
Since we have that $A$ is positive definite, via \cite{AXELSSON1976123}, this can be achieved in 
$$O\left(\sqrt{\kappa(A)}\log \frac{1}{\epsilon''}\right)=O\left(\sqrt{\kappa(A)}\log\left(\frac{\lambda (|L_{\text{Labels}}| + |\tilde{U}_{\text{Labels}}|)}{\epsilon \sigma_{\min}\lambda_{\min}(A)}\right)\right)$$ iterations of the CG method.
\end{proof}


\section{Convergence of Nesterov's Gradient Descent and Newton's Method}

In this section we provide useful lemmas that provide convergence analysis for Nesterov's gradient descent and Newton's method, when working with approximate gradients. First we provide a guarantee for Nesterov's method in Theorem \ref{thm:nst}, which uses the result of \cite{d2008smooth} to analyse our algorithm.

\begin{theorem} \label{thm:nst}
    \textit{Suppose the function } $f : \mathbb{R} \rightarrow \mathbb{R}$ \textit{is convex and differentiable, and that its gradient is Lipschitz continuous with constant }$L > 0$\textit{, i.e. we have that }$|f'(x) - f'(y)| \leq L |x - y|$\textit{ for any }$x,y$. \textit{Then if we run Nesterov's method to minimize }$g(\sigma) = (f(\sigma) - \frac{1}{2})^2$ \textit{ on some range $[\sigma_{\text{min}}, \sigma_{\text{max}}]$ where $\left|\frac{\partial f}{\partial \sigma}\right| < \frac{1}{\epsilon \lambda_{\min}(G_A)}$ using $\frac{\partial g}{ \partial \sigma}$ as defined in Algorithm~\ref{algorithm: semi harmonic} and finding soft labels and derivatives as defined by some algorithm $A$, we can achieve an $\epsilon$ approximation $\sigma^*_{\epsilon}$ of the optimal result $\sigma^*$ satisfying $|\sigma^*_{\epsilon} - \sigma^*| < \epsilon$ in } $O(\log \log \frac{1}{\epsilon})$ \textit{ iterations of Nesterov's method.} We use $O(\text{CG}_A(\frac{\epsilon}{42(\sigma_{\max} - \sigma_{\min})})\log \log \frac{1}{\epsilon})$ conjugate gradient iterations overall, where $\text{CG}_A(\epsilon')$ is the number of conjugate gradient iterations used by algorithm $A$ to achieve $\epsilon'$ approximations of $f, \frac{\partial f}{\partial \sigma}$ satisfying $|f_{u, \epsilon}(\sigma)\frac{\partial f}{\partial \sigma}_\epsilon - f_{u}(\sigma) \frac{\partial f}{\partial \sigma}| < \epsilon'$.
\end{theorem}
% \begin{proofoutline}
%     Here, we argue that Nesterov's method \cite{nesterov1983method} with epsilon approximations for the derivative of $g$ allow us to converge to within $\epsilon$ of the optimal minima $\sigma^*$ with the same quadratic convergence of exact Nesterov's method. To do so, we employ \cite{d2008smooth} on compact set $[\sigma_{\text{min}}, \sigma_{\text{max}}]$, which states that if we have $\epsilon$ approximations of $\frac{\partial g}{\partial \sigma}$ satisfying 
%     $$\left|\left\langle \left(\frac{\partial g}{\partial \sigma}\right)_{\epsilon'} - \left(\frac{\partial g}{\partial \sigma}\right), y - z \right\rangle\right| \leq \frac{\epsilon}{6} \ \forall y,z \in [\sigma_{\text{min}}, \sigma_{\text{max}}]$$ then Nesterov's method with approximate gradients will find our desired $\sigma^*_{\epsilon}$. We write this bound in terms of the number of CG steps needed to calculate $f$ and $\frac{\partial f}{\partial \sigma}$ with enough accuracy to achieve this bound for $\frac{\partial g}{\partial \sigma}$. This will allow us to apply Theorems \ref{thm:harmapprox} and \ref{thm:delapprox} later on. Here we consider the case of a bounded derivative on our range, and show later that if this is not the case, Algorithm \ref{algorithm: semi harmonic} will still converge as a result of bounds on the Newton's method step size. This theorem motivates the use of Nesterov's accelerated gradient descent succesfully finding approximate minima in Algorithm \ref{algorithm: harmonic approx} \\\\
%     First we find an $\epsilon' = \frac{\epsilon}{7 * 6(\sigma_{\max} - \sigma_{\min})}$ approximations of $f\frac{\partial f}{\partial \sigma}$, and use it to find $g$ and $\frac{\partial g}{\partial \sigma}_\epsilon$ satisfying $$\left|\left(\frac{\partial g}{\partial \sigma}\right)_{\epsilon'} - \left(\frac{\partial g}{\partial \sigma}\right)\right| \leq \frac{\epsilon}{6 (\sigma_{\text{max}} - \sigma_{\text{min}})}$$
%  On compact set $[\sigma_{\text{min}}, \sigma_{\text{max}}]$ with this bound, we then have that 
%  $$\left|\left\langle \left(\frac{\partial g}{\partial \sigma}\right)_{\epsilon'} - \left(\frac{\partial g}{\partial \sigma}\right), y - z \right\rangle\right| \leq \frac{\epsilon}{6} \ \forall y,z \in [\sigma_{\text{min}}, \sigma_{\text{max}}]$$
%     We apply \cite{d2008smooth} to argue that Nesterov's gradient descent with terms $\left(\frac{\partial g}{\partial \sigma}\right)_{\epsilon'}$ as updates to produce a solution within $\epsilon$ of the optimal value $\sigma^*$ with quadratic convergence. This takes $\log \log \frac{1}{\epsilon}$ iterations, and at every step we required an $\epsilon' = \frac{\epsilon}{7 * 6(\sigma_{\max} - \sigma_{\min})$ approximation of $f\frac{\partial f}{\partial \sigma}$. This yeilds $$O(\text{CG}_A(\frac{\epsilon}{42(\sigma_{\max} - \sigma_{\min})})\log \log \frac{1}{\epsilon})$$ iterations overall
% \end{proofoutline}
\begin{proof}
 First, note that 
 \begin{align*}
     \left|\frac{\partial g_u}{\partial \sigma} - \frac{\partial g_u}{\partial \sigma}_{\epsilon'}\right|
 &= \left|2\left(f_u(\sigma) - \frac{1}{2}\right)\left(\frac{\partial f_u}{\partial \sigma}\right) - 2\left(f_u(\sigma)_{\epsilon'} - \frac{1}{2}\right)\left(\frac{\partial f_u}{\partial \sigma}_{\epsilon'}\right)\right|\\
&\leq 4 \epsilon' f_u(\sigma) \frac{\partial f_u(\sigma)}{\partial \sigma} + 2 (\epsilon')^2 f_u(\sigma) \frac{\partial f_u(\sigma)}{\partial \sigma} + \epsilon' \frac{\partial f_u(\sigma)}{\partial \sigma}\\
&\leq 7 \left|f_{u}(\sigma)\frac{\partial f_u}{\partial \sigma} - \left(f_{u}(\sigma)\frac{\partial f_u}{\partial \sigma}\right)_\epsilon\right|.
 \end{align*}

 
 \noindent Letting $$\epsilon' = \frac{\epsilon}{42 (\sigma_{\text{max}} - \sigma_{\text{min}})}$$
 we find $\epsilon'$ approximations of $f$ and $\frac{\partial f_u}{\partial \sigma}$ in $\text{CG}_A(\epsilon')$ steps. 
 We can then bound $$\left|\left(\frac{\partial g}{\partial \sigma}\right)_{\epsilon'} - \left(\frac{\partial g}{\partial \sigma}\right)\right| \leq \frac{\epsilon}{6 (\sigma_{\text{max}} - \sigma_{\text{min}})}.$$
 On compact set $[\sigma_{\text{min}}, \sigma_{\text{max}}]$ with this bound, we then have that 
 $$\left|\left\langle \left(\frac{\partial g}{\partial \sigma}\right)_{\epsilon'} - \left(\frac{\partial g}{\partial \sigma}\right), y - z \right\rangle\right| \leq \frac{\epsilon}{6} \ \forall y,z \in [\sigma_{\text{min}}, \sigma_{\text{max}}].$$
 With this, \cite{d2008smooth} shows that Nesterov's accelerated gradient descent using an approximate gradient will converge to within $\epsilon$ of the optimal $\sigma^* \in [\sigma_{\text{min}}, \sigma_{\text{max}}]$ in $O(\frac{1}{\sqrt{\epsilon}})$ complexity. This yields $O(\log \log \frac{1}{\epsilon})$ steps until convergence.\\\\
 Next we analyze the number of iterations of the CG method used. We called algorithm $A$ $O(\log \log \frac{1}{\epsilon})$ times, each time using $\text{CG}_A(\epsilon') = \text{CG}_A\left(\frac{\epsilon}{42(\sigma_{\max} - \sigma_{\min})}\right)$ iterations. This yields
 $$O\left(\text{CG}_A\left(\frac{\epsilon}{42(\sigma_{\max} - \sigma_{\min})}\right)\log \log \frac{1}{\epsilon}\right)$$
 overall iterations of the CG method to find $\sigma^*$.
\end{proof}

\noindent We also provide an analysis for convergence of Newton's method using approximate gradients in Theorem \ref{thm:ns}.

\begin{theorem}\label{thm:ns}
\textit{Suppose the function $f: \mathbb{R} \rightarrow \mathbb{R}$ has a root of multiplicity 2 at the optimal point $x^*$, with $f(x^*) = 0$. If Newton's accelerated method $x_{n + 1} = x_n - 2\frac{f(x_n)}{f'(x_n)}$ converges quadratically, then so does an epsilon approximation $x_{n + 1} = x_n  - 2\frac{f(x_n)}{f'(x_n)_\epsilon}$} satisfying $|f'(x)_\epsilon - f'(x)| \leq \epsilon |f'(x)|$ for all $x \in \mathbb{R}$.
\end{theorem}
% \begin{proofoutline}
% We use the proof of accelerated Newton's method and the definition of a point with multiplicity 2 to prove the result updated with epsilon approximations, and show that the Newton's step of Algorithm \ref{algorithm: semi harmonic} will converge quadratically as well under standard assumptions. This theorem motivates the use of Newton's method succesfully finding approximate minima in Algorithm \ref{algorithm: harmonic approx} \\\\\

% First, quadratic convergence of accelerated Newton's method gives us $e_{n + 1} \leq L e_n^2$ for some constant $L$, where $e_n = x^* - x_n$ is the error for the accelerated Newton's method update, and $x_{n + 1} = x_n - 2\frac{f(x_n)}{f'(x_n)}$\\\\
% Using the Lagrange form of the Taylor series expansion, we see: 
% $$f(x_n) = f(x^*) + f'(x^*)(x^* - x_n) + (x^* - x_n)f''(\xi)$$
% with $\xi$ between $x^k$ and $x^*$. 
% Letting $x^*$ be the optimal point with $f(x^*) = 0, f'(x^*) = 0$ by multiplicity 2, we see that $f(x_k) =  (x^* - x_n)f''(\xi)$. Using this, we arrive at 
% $$  e_{n + 1} \leq & (L + 2\frac{\epsilon}{1+ \epsilon}f''(\xi))e_n^2$$
% as $x_n \rightarrow x^*$, we see that this is quadratic convergence if we are sufficiently close to $x^*$ ($f''(\xi) < C f''(x^*) \forall \xi \in [x_n, x^*]$). 
% \end{proofoutline}
\begin{proof}
First, quadratic convergence of accelerated Newton's method gives us $e_{n + 1} \leq L e_n^2$ for some constant $L$, where $e_n = x^* - x_n$ is the error for the accelerated Newton's method update, and $x_{n + 1} = x_n - 2\frac{f(x_n)}{f'(x_n)}$.

Using the Lagrange form of the Taylor series expansion, we see that
$$f(x_n) = f(x^*) + f'(x^*)(x_n - x^*) + \frac{1}{2}(x_n - x^*)^2f''(\xi)$$
with $\xi$ between $x_n$ and $x^*$. 
Letting $x^*$ be the optimal point with $f(x^*) = 0, f'(x^*) = 0$ by multiplicity 2, we see that $f(x_n) =  \frac{1}{2}(x^* - x_n)^2f''(\xi)$. 
Now to handle the $\epsilon$-approximate case note that
\begin{align*}
     e_{n + 1} 
     &= x^* - x_n - 2\frac{f(x_n)}{f'(x_n)_\epsilon} \\
    &\leq  x^* - x_n - 2\frac{f(x_n)}{f'(x_n)(1 + \epsilon)} \\
    &= x^* - x_n - \frac{2f(x_n)}{f'(x_n)} + \frac{2\epsilon}{1 + \epsilon}f(x_n) \\
    &\leq  L e_n^2 + \frac{2\epsilon}{1 + \epsilon}f(x_n) \\
    &\leq  L e_n^2 + \frac{2\epsilon}{1 + \epsilon} (x^* - x_n)^2f''(\xi)\\
    &\leq  \left(L + \frac{2\epsilon}{1+ \epsilon}f''(\xi)\right)e_n^2
\end{align*}
As $x_n \rightarrow x^*$, we see that this is quadratic convergence if we are sufficiently close to $x^*$ (i.e.\ $f''(\xi) < C f''(x^*)$ for all $\xi \in [x_n, x^*]$). 
\end{proof}



\section{Full Proof Details from Section 5}

\begin{theorem}\label{thm:harmbound}
    Given an algorithm for computing $\epsilon$-approximate soft labels and gradients for the harmonic objective of \citet{zhu2003semi} (\hyperref[summary: harmonic approx]{\textsc{ZGL03Approx}}), if the soft label function $f_u(\sigma)$ is convex and smooth, Algorithm \ref{algorithm: semi harmonic} computes $(\epsilon,\epsilon)$-approximate semi-bandit feedback for the semi-supervised loss $l(\sigma)$ %outputs the interval $[\sigma_{\min, \epsilon}, \sigma_{\max, \epsilon}]$ containing $\sigma$ (of the piecewise constant loss function $l(\sigma)$) up to $\epsilon$ accuracy (i.e., $|\sigma_{\min, \epsilon} - \sigma_{\min}| < \epsilon, |\sigma_{\max, \epsilon} - \sigma_{\max}| < \epsilon$) 
    in time $O\left(|E_G|n\sqrt{\kappa(\cL_{UU})}\log\left(\frac{n\Delta}{\epsilon \lambda_{\min}(\cL_{UU})}\right)\log\log\frac{1}{\epsilon}\right)$, where $|E_G|$ is the number of edges in graph $G$, $\cL_{UU}=I - P_{UU}$ is the normalized grounded graph Laplacian (with labeled nodes grounded), $\Delta=\sigma_{\max} - \sigma_{\min}$ is the size of the parameter range and $\kappa(M)=\frac{\lambda_{\max}(M)}{\lambda_{\min}(M)}$ denotes the condition number of matrix $M$.
\end{theorem}
% \red{TODO: assumes f(sigma) is convex; can we get a weaker-than-quadratic convergence for non-convex f?}
\begin{proof}
% We present a proof similar to Theorem 14 in \citet{balcan2021data}, using Theorems \ref{thm:harmapprox}, \ref{thm:nst}, and \ref{thm:ns} to provide an overall time bound on the interval finding in Algorithm \ref{algorithm: semi harmonic} using $\epsilon$-approximations of the harmonic objective. We argue that Nesterov's accelerated gradient \citep{nesterov1983method} will converge given a bounded derivative, and if the derivative is very large, then the Newton's step will provide an upper bound of $\epsilon$ in the update step on either line \ref{algSemiHarmonic-linenestupdate} or \ref{algSemiHarmonic-linenewtupdate} of Algorithm \ref{algorithm: semi harmonic}, causing line \ref{algSemiHarmonic-linewhile} to return false and the algorithm to terminate. %Further, we define it in terms of the number of edges $E_G|$ to highlight time savings for kNN graphs. \\\\

As in \citet{balcan2021data}, note that any boundary $\sigma_{\min}$ or $\sigma_{\max}$ must have some $f_{u}(\sigma) = \frac{1}{2}$. Algorithm \ref{algorithm: semi harmonic} finds these boundary pieces by finding roots/zeros of $g_u(\sigma) = \left(f_u(\sigma) - \frac{1}{2}\right)^2$. As noted in Theorems C.1 and C.2, both Nesterov's and Newton's descent methods have quadratic convergence, so at every update step in Algorithm \ref{algorithm: semi harmonic} (lines 12 and 15), we converge quadratically, leading to $\log \log (\frac{1}{\epsilon})$ update steps needed to satisfy $|\sigma_\epsilon^* - \sigma^*| < \epsilon$, where $\sigma^*$ is the root with $g_u(\sigma^*) = 0$.\\\\
In Theorems \ref{thm:harmapprox} and \ref{thm:delapprox}, we assumed that $|\frac{\partial f}{\partial \sigma}| < \frac{1}{\epsilon \lambda_{\min}(G)}$ for some graph $G$. Suppose this is not the case. We examine the Newton update, which is an upper bound on the size of the update step used, as our update uses the minimum of the Newton and Nesterov steps: 
\begin{align*}
     2 \cdot &\frac{g_u(\sigma)}{g'_u(\sigma)} 
     = 2 \cdot \frac{(f_u(\sigma) - 1/2)^2}{2 \cdot (\partial f/\partial \sigma)(f_u(\sigma) - 1/2)} \\
      &= \frac{(f_u(\sigma) - 1/2)}{(\partial f / \partial \sigma)} \\
     &< \epsilon \lambda_{\min}(G)(f_u(\sigma) - 1/2) &&(\because |{\partial f}/{\partial \sigma}| > {1}/{\epsilon \lambda_{\min}(G)})\\
     &< \epsilon &&(\because\text{$f_u(\sigma)\le1/\lambda_{\min}$, cf. Thms \ref{thm:harmapprox} and \ref{thm:delapprox}}).
\end{align*}
Thus in this case the update step is less than $\epsilon$, and we will terminate after one subsequent step.

 As noted in Theorem \ref{thm:harmapprox}, we need $O\left(\sqrt{\kappa(\cL_{UU})}\log\left(\frac{n}{\epsilon' \lambda_{\min}(\cL_{UU})}\right)\right)$ CG steps to reach an $\epsilon'$ approximation of $f \frac{\partial f}{\partial \sigma}$. Theorem \ref{thm:nst} states that we need $\epsilon' = O\left(\frac{\epsilon}{\Delta}\right)$ to find an $\epsilon$ approximation of the root $\sigma^*$, so this takes complexity 

$$O\left(\sqrt{\kappa(\cL_{UU})}\log\left(\frac{n\Delta}{\epsilon \lambda_{\min}(\cL_{UU})}\right)\right).$$

% $$O(\sqrt{\kappa(\cL_{UU})}\left(\log\left(\frac{1}{\epsilon \lambda_{\min}(\cL_{UU})(\sigma_{\max} - \sigma_{\min})}\right)\right))$$
Running a single iteration of the conjugate gradient method requires a constant number of matrix-vector products of form $Ax$, where $A$ is the weighted adjacency matrix for graph $G$. This computation takes $O(|E_G|)$ time. %, given the matrix is stored in some sparse format and the number of edges is in $\Omega(n)$. If not, the $E_G$ term is replaced with an $n$ term.\\\\
Finally, we run this algorithm for each of the $n$ points, leading to an overall time complexity of 

$$O\left(|E_G|n\sqrt{\kappa(\cL_{UU})} \log\left(\frac{n\Delta}{\epsilon \lambda_{\min}(\cL_{UU})}\right)\log\log\frac{1}{\epsilon}\right).$$

If $G$ is the complete graph, $|E_G| \in O(n^2)$. If $G$ is a kNN graph for some fixed $k$, then $|E_G| = kn \in O(n)$.  
\end{proof}


\section{Experiment Details and Insights}

We include further experimental details below, including implementation and insights into further challenges as well as potential future work.

\begin{figure}
\centering
\subfloat[Algorithm \ref{algorithm: harmonic approx} \\ $|U| = 100$]{\includegraphics[width = 2in]{images/full_accuracys/normal.png}}
\subfloat[Algorithm \ref{algorithm: harmonic approx} (kNN) \\ $|U| = 100$]{\includegraphics[width = 2in]{images/full_accuracys/kNN.png}}
\subfloat[Algorithm \ref{algorithm: delalleau approx} (kNN) \\ $|\Tilde{U}| = 100, |U| = 1000$]{\includegraphics[width = 2in]{images/full_accuracys/delalleau.png}}
\caption{Accuracy values across $\sigma$ for different approaches using the CG Method with 20 iterations.}
\label{fig:intervals}
\end{figure}

\begin{figure}
\centering
\subfloat[MNIST]{\includegraphics[width = 2in]{images/intervals/MNIST_100_10_labels_seed_7_CG_20.png}}
\subfloat[Fashion-MNIST]{\includegraphics[width = 2in]{images/intervals/FashionMNIST_100_10_labels_seed_5_CG_20.png}}
\subfloat[USPS]{\includegraphics[width = 2in]{images/intervals/USPS_100_10_labels_seed_6_CG_20.png}}
\caption{Interval calculation with labeling via Algorithm \ref{algorithm: harmonic approx} (complete graph) $|U| = 100$.}
\label{fig:naive}
\end{figure}

\begin{figure}
\centering
\subfloat[MNIST]{\includegraphics[width = 2in]{images/intervals/MNIST_100_10_labels_seed_6_CG_20_kNN_6.png}}
\subfloat[Fashion-MNIST]{\includegraphics[width = 2in]{images/intervals/FashionMNIST_100_10_labels_seed_14_CG_20_kNN_6.png}}
\subfloat[USPS]{\includegraphics[width = 2in]{images/intervals/USPS_100_10_labels_seed_25_CG_20_kNN_6.png}}
\caption{Interval calculation with labeling via Algorithm \ref{algorithm: harmonic approx}, kNN with $k=6$, $|U| = 100$.}
\label{fig:kNN}
\end{figure}

\begin{figure}[h]
\centering
\subfloat[MNIST]{\includegraphics[width = 2in]{images/intervals/delalleau_MNIST_100_10_labels_seed_4_CG_20_kNN_6.png}}
\subfloat[Fashion-MNIST]{\includegraphics[width = 2in]{images/intervals/delalleau_FashionMNIST_100_10_labels_seed_2_CG_20_kNN_6.png}}
\subfloat[USPS]{\includegraphics[width = 2in]{images/intervals/delalleau_USPS_100_10_labels_seed_2_CG_20_kNN_6.png}}
\caption{Interval calculation with labeling via Algorithm \ref{algorithm: delalleau approx}. 
$|L| = 10, |U| = 100, |\Tilde{U}| = 1000$.}
\label{fig:del}
\end{figure}

\begin{figure}[b]
\subfloat[$\sqrt{\kappa(I - P_{UU})}$ (\ref{thm:harmbound})]{\includegraphics[width = 1.6in, height=1.32in]{images/condition_numbers/MNIST.png}} 
\subfloat[$\sqrt{\kappa(I - P_{UU})}$ (\ref{thm:harmbound}, kNN)]{\includegraphics[width = 1.6in, height=1.32in]{images/condition_numbers/MNIST_kNN.png}} 
\subfloat[$\sqrt{\kappa(A)}$ (\ref{thm:delbound})]{\includegraphics[width = 1.6in]{images/condition_numbers/delalleau_MNIST.png}} 
\subfloat[$\sqrt{\kappa(A)}$ (\ref{thm:delbound}, kNN)]{\includegraphics[width = 1.6in]{images/condition_numbers/delalleau_MNIST_kNN.png}} 
\caption{Condition numbers for matrices from Theorems \ref{thm:harmbound} and \ref{thm:delbound} for MNIST subsets size 100.}
\label{fig:kappa}
\end{figure}

\subsection{Implementation Details} 
For all experiments, we consider 10 random subsets of datasets MNIST, FashionMNIST, and USPS. We will consider a bounded parameter domain to avoid highly ill-conditioned graph matrices (Figure \ref{fig:kappa}). We pick $\sigma_{\text{min}}$ based on behavior of graph condition numbers for low $\sigma$ values, where $\sigma_{\text{min}}$ is 1 for MNIST and FashionMNIST using the CG method, and .4 for USPS using the CG method. We keep $\sigma_{\text{max}} = 7$ for all experiments. We find that Algorithm \ref{algorithm: semi harmonic} does not produce valid intervals when condition number is high, and note that ill-conditioned graphs lead to low accuracy. In Figure \ref{fig:intervals}, we see that USPS has higher accuracy values in range $[.4,1]$ while MNIST and FashionMNIST do not display optima until later $\sigma$ values. We find that computing a full matrix inverse is less stable than the CG method for low $\sigma$ values, and use $\sigma_{\text{min}} = 2$ for full inverse interval calculation. We keep $\sigma_{\text{min}} = 1$ always when calculating average number of intervals overall in Table \ref{table:regularIntervals} %, \ref{table:kNNintervals}, and \ref{table:delIntervals} 
in order to compare number of intervals on the same range ([1,7]) for all problem instances. 

   \noindent In order to find intervals, we begin with $\sigma_0 = \sigma_{\text{min}}$. Once interval $[\sigma_{l}, \sigma_{u}]$ is calculated for $\sigma_0 = \sigma_{\text{min}}$, we let $\sigma_0^{(1)} = \sigma_{u} + \text{step}$ as the next initial $\sigma$. Here we use $\text{step} = .05, \epsilon = 10^{-4}, \eta = 1$, where $\epsilon$ and $\eta$ are used as in Algorithm \ref{algorithm: semi harmonic}. We also consider algorithmic optimizations to speed up runtime and improve performance of Algorithm \ref{algorithm: semi harmonic}, which can be found in Appendix \ref{sec:algopt}.  

\begin{figure}[b]
\centering
\subfloat[MNIST]{\includegraphics[width = 2in]{./images/failure_cases/MNIST_100_10_labels_seed_11_CG_20_kNN_6.png}}
\subfloat[Fashion-MNIST]{\includegraphics[width = 2in]{./images/failure_cases/FashionMNIST_100_10_labels_seed_2_CG_20.png}}
\subfloat[USPS]{\includegraphics[width = 2in]{./images/failure_cases/USPS_300_10_labels_seed_5_CG_20.png}}
\caption{Challenging cases for algorithm}
\label{fig:failure}
\end{figure}

\subsection{Algorithm Optimization}\label{sec:algopt}
\noindent A few optimizations of Algorithm \ref{algorithm: semi harmonic} were used in practice. First, note that if we update the left endpoint $\sigma_l$ of the piecewise constant interval containing $\sigma_0$ on line \ref{algSemiHarmonic-lineupdateint}, then we need not consider any root $\sigma_{l'}$ with $\sigma_{l'} < \sigma_l$, as it will not change our current left endpoint of the interval. As a result, we can stop the while loop on line \ref{algSemiHarmonic-linewhile} of Algorithm \ref{algorithm: semi harmonic} if we leave current interval range $[\sigma_l, \sigma_h]$ at any point in the algorithm. Second, we change the while loop on line \ref{algSemiHarmonic-linewhile} to:
$$|\sigma_{n + 1} - \sigma_n| \ge \epsilon \text{ OR } |f_u(\sigma_u)_{n + 1} - f_u(\sigma_u)_n| \ge \epsilon $$
Noting that $f$ values can go through very short periods of high change as evidenced by Figure \ref{fig: gd+ns}, if $f$ values have some large change in a given step, then we may be making progress towards a critical point even if $\sigma$ has not changed drastically. Further, if both quantities are under $\epsilon$ yet we have a label that is not close to .5, we stop the algorithm prematurely without having found a critical point.

\noindent Finally, for $k$-nearest neighbor graphs, we note that the Gaussian kernel preserves order across $\sigma$, i.e. 
$$d(a,b) < d(c,d) \implies e^{-\frac{d(a,b)^2}{\sigma^2}} > e^{-\frac{d(c,d)^2}{\sigma^2}} \quad\forall\; \sigma \in (0, \infty).$$ 




\begin{table}[t]
\centering
\begin{tabular}{ |c|c|c|c| } 
\hline
\multirow{2}{4em}{Dataset} & \multirow{2}{2em}{Size} & 
\multirow{2}{9em}{Time per Inverse (s), Full Inverse} & 
\multirow{2}{8em}{Optimal Accuracy, Full  Inverse} \\
&&& \\\hline
\multirow{3}{4em}{MNIST}
& 500 & 0.1285 & 0.9988 \\
& 1000 & 0.2248 & 0.9991 \\
& 2000 & 0.5528 & 0.9986  \\ 
\hline
\multirow{3}{4em}{Fashion-MNIST}
& 500 & 0.1275 & 0.9502  \\
& 1000 & 0.2312 & 0.9775 \\
& 2000 & 0.5454 & 0.9570   \\
\hline
\multirow{3}{4em}{USPS}
& 500 & 0.1445 & 0.9998   \\
& 1000 & 0.2230 & 0.9997  \\
& 2000 & 0.5437 & 1.0   \\
\hline
\end{tabular}   
\caption{Optimal Accuracy/Average Time computing matrix inverse via Algorithm \ref{algorithm: harmonic approx} (Averaged over 10 samples).}
\label{table:invtimefull}
\end{table}

\begin{table}[t]
\centering
\begin{tabular}{ |c|c|c|c| } 
\hline
\multirow{2}{4em}{Dataset} & \multirow{2}{2em}{Size} & 
\multirow{2}{9em}{Time per Inverse (s), Full Inverse} & 
\multirow{2}{8em}{Optimal Accuracy, Full  Inverse} \\
&&& \\
\hline
\multirow{3}{4em}{MNIST}
& 500 & 0.1235 & 0.999\\
& 1000 & 0.2181 & 0.9993\\
& 2000 & 0.5354 & 0.9992\\
\hline
\multirow{3}{4em}{Fashion-MNIST}
& 500 & 0.1299 & 0.9637\\
& 1000 & 0.2244 & 0.9638\\
& 2000 & 0.5337 & 0.9683\\
\hline
\multirow{3}{4em}{USPS}
& 500 & 0.1254 & 0.9998\\
& 1000 & 0.2189 & 0.9998\\
& 2000 & 0.5411 & 1.0\\
\hline
\end{tabular}   
\caption{Optimal Accuracy/Average Time computing matrix inverse \textbf{with kNN=6} via Algorithm \ref{algorithm: harmonic approx} (Averaged over 10 samples).}
\label{table:invtimekNN}
\end{table}

As a result, we only need to calculate $k$-nearest neighbors once when finding an interval centered around $\sigma_0$. After a kNN mask is computed for $W_{\sigma}$, it can then be used for any subsequent $W_{\sigma'}$. When analyzing the time to compute all intervals in a given range, this could be computed once for all starting points, but since we were interested in time for each interval, we calculated the mask for every interval.



\subsection{Challenging Cases}

There were certain challenging problem instances associated with Algorithm \ref{algorithm: semi harmonic}. First, we considered using gradient descent methods with adaptive step size \citep{VRAHATIS2000367} to combat the issue of very different gradient values at different $\sigma$ values, but we found this method to be ineffective for our specific task. Some of the datasets would return singular matrices or non-convergent matrices for very low $\sigma$ values, leading to interval calculation needing to be started at some later point, as mentioned in Section \ref{sec:FeedbackSet}. Similarly, we find that the algorithm was more likely to miss intervals for very high values of $\sigma$ ($\geq 6$) as seen in Figure \ref{fig:failure}. This could be due to very different or lower condition numbers as compared to earlier $\sigma$ values as evidenced by Figure \ref{fig:kappa}. One solution could be updating the learning rate $\eta$ as a function of condition number or $\sigma$ value. We also find an outlier graph in the kNN Delalleau graph family displayed in Figure \ref{fig:kappa} that behaved similarly to the harmonic minimizer graphs for low $\sigma$. In addition, we find that the algorithm may not be able to correctly find the rightmost point of very long piecewise intervals (size 3 or more), as the descent algorithms have trouble finding critical points that are very far from the initial $\sigma$.

\begin{table}[t]
\centering
\begin{tabular}{ |c|c|c|c|c|c|c|c| } 
\hline
\multirow{2}{4em}{Dataset} & \multirow{2}{2em}{Size} & 
\multirow{2}{5em}{Time,\\ CG, $t=5$ } & 
\multirow{2}{5em}{Time,\\ CG, $t=10$} & 
\multirow{2}{5em}{Time,\\ CG, $t=20$} & 
\multirow{2}{5em}{Accuracy, CG, $t=5$} &
\multirow{2}{5em}{Accuracy, CG, $t=10$} &
\multirow{2}{5em}{Accuracy, CG, $t=20$} \\
&&&&&&& \\
\hline
\multirow{3}{4em}{MNIST}
& 500 & 0.004 & 0.0041 & 0.004 & 0.9971 & 0.9971 & 0.9971\\
& 1000 & 0.0058 & 0.0058 & 0.006 & 0.9958 & 0.9958 & 0.9958\\
& 2000 & 0.0238 & 0.0234 & 0.0234 & 0.9956 & 0.9956 & 0.9956\\
\hline
\multirow{3}{4em}{Fashion-MNIST}
& 500 & 0.0041 & 0.004 & 0.004 & 0.9561 & 0.9561 & 0.9561\\
& 1000 & 0.0058 & 0.0059 & 0.0059 & 0.9544 & 0.9544 & 0.9544\\
& 2000 & 0.0235 & 0.0256 & 0.0241 & 0.9579 & 0.9579 & 0.9579\\
\hline
\multirow{3}{4em}{USPS}
& 500 & 0.0041 & 0.0041 & 0.004 & 0.9945 & 0.9945 & 0.9945\\
& 1000 & 0.0058 & 0.0056 & 0.006 & 0.9989 & 0.9989 & 0.9989\\
& 2000 & 0.0234 & 0.0234 & 0.0235 & 0.9792 & 0.9792 & 0.9792\\
\hline
\end{tabular}   
\caption{Optimal Accuracy/Average Time computing approximate matrix inverse via Algorithm \ref{algorithm: harmonic approx} (Averaged over 10 samples).}
\label{table:CGtimefull}
\end{table}

\begin{table}[h]
\centering
\begin{tabular}{ |c|c|c|c|c|c|c|c| } 
\hline
\multirow{2}{4em}{Dataset} & \multirow{2}{2em}{Size} & 
\multirow{2}{5em}{Time,\\ CG, $t=5$ } & 
\multirow{2}{5em}{Time,\\ CG, $t=10$} & 
\multirow{2}{5em}{Time,\\ CG, $t=20$} & 
\multirow{2}{5em}{Accuracy, CG, $t=5$} &
\multirow{2}{5em}{Accuracy, CG, $t=10$} &
\multirow{2}{5em}{Accuracy, CG, $t=20$} \\
&&&&&&& \\
\hline
\multirow{3}{4em}{MNIST}
& 500 & 0.0036 & 0.0034 & 0.0034 & 0.9988 & 0.9988 & 0.9988\\
& 1000 & 0.0052 & 0.005 & 0.0049 & 0.9865 & 0.9865 & 0.9865\\
& 2000 & 0.0224 & 0.0225 & 0.0225 & 0.9754 & 0.9754 & 0.9754\\
\hline
\multirow{3}{4em}{Fashion-MNIST}
& 500 & 0.0033 & 0.0034 & 0.0034 & 0.9692 & 0.9692 & 0.9692\\
& 1000 & 0.0053 & 0.0053 & 0.0052 & 0.9714 & 0.9714 & 0.9714\\
& 2000 & 0.0224 & 0.0224 & 0.0225 & 0.9723 & 0.9723 & 0.9723\\
\hline
\multirow{3}{4em}{USPS}
& 500 & 0.0033 & 0.0034 & 0.0035 & 1.0 & 1.0 & 1.0\\
& 1000 & 0.0048 & 0.0049 & 0.0048 & 1.0 & 1.0 & 1.0\\
& 2000 & 0.0225 & 0.0225 & 0.0226 & 0.9943 & 0.9943 & 0.9943\\
\hline
\end{tabular}   
\caption{Optimal Accuracy/Average Time computing approximate matrix inverse via Algorithm \ref{algorithm: harmonic approx} \textbf{with kNN=6} (Averaged over 10 samples)}
\label{table:CGtimekNN}
\end{table}


\subsection{Further Directions}
While we used our method on two algorithms, namely \cite{delalleau2005efficient} and \cite{zhu2003semi}, our method could be extended to other SSL labeling schemes that allow for a derivative $\frac{\partial f}{\partial \sigma}$ to be taken, where $f$ is the labeling function and $\sigma$ is a hyperparameter. \\
One technique not explored in this work is anchor graph regularization \citep{liu2010large}. In this method, a set of points is chosen to be ``anchor points''. These points are then labeled, and all other points are labeled via some weighted combination of the labels of the anchor points. Since these anchor points are chosen via a k-means clustering idea, in order to calculate $\frac{\partial f}{\partial \sigma}$, it is necessary to determine how the cluster centers, or anchor points, of a graph change as the parameter $\sigma$ changes. We leave this as a further direction. Another SSL technique that can be explored as a further direction is the use of leading eigenvectors as ``anchor points'' \citep{sinha2009semi}. In this method, a lasso least squares approximation is used with respect to certain eigenvectors of the kernel matrix to create a point classifier. While the lasso least squares method has a closed form solution, it is necessary to determine how these eigenvectors change as a function of $\sigma$. It is possible that this change could be approximated quickly using eigenvector approximation techniques like the Lanczos algorithm; implementation and verification of this idea is an interesting candidate for further research. 

While we use the time saved to run Algorithm \ref{algorithm: harmonic approx} faster by computing inverses from \cite{delalleau2005efficient} and \cite{zhu2003semi} quickly, it could also be used directly with results from any graph based SSL technique that employs radial basis kernels and takes inverses. Further, since it is standard to use kNN graphs when doing matrix inverses as in \cite{delalleau2005efficient} and \cite{zhu2003semi}, this would speed up an inverse on an $m \times m$ matrix from $O(m^3)$ to $O(m)$ for an $\epsilon$ approximation. As a result, larger subsets of data could be inverted. It has been shown that accuracy increases as more data is used for inversion \citep{delalleau2005efficient, zhu2003semi}, and we have shown that we can match optimal accuracy of full matrix inversion across hyperparameters with the CG-method. This signifies that using a larger dataset and finding approximate inverses with the CG method could lead to higher accuracy than using a smaller dataset and taking exact inverses.

We notice that larger datasets lead to much smaller interval sizes when using Algorithm \ref{algorithm: delalleau approx}. One way to combat this could be to find $(\epsilon, \epsilon)$-approximate intervals whose accuracy values all fall within some $\delta$ of each other, as opposed to being piecewise constant. In this way, we could find larger accuracy-approximate intervals, which then allow for a faster search of range $[\sigma_{\min}, \sigma_{\max}]$. 



% \section{Additional Simulation Results}
% Table~\ref{tab:supp-data} lists additional simulation results; see also \citet{einstein} for a comparison. 

% \begin{table}[!h]
%     \centering
%     \caption{An Interesting Table.} \label{tab:supp-data}
%     \begin{tabular}{rl}
%       \toprule % from booktabs package
%       \bfseries Dataset & \bfseries Result\\
%       \midrule % from booktabs package
%       Data1 & 0.12345\\
%       Data2 & 0.67890\\
%       Data3 & 0.54321\\
%       Data4 & 0.09876\\
%       \bottomrule % from booktabs package
%     \end{tabular}
% \end{table}

% \section{Math font exposition}
% % NOTE: necessary when ptmx or no mathfont class option is given
% \providecommand{\upGamma}{\Gamma}
% \providecommand{\uppi}{\pi}
% How math looks in equations is important:
% \begin{equation*}
%   F_{\alpha,\beta}^\eta(z) = \upGamma(\tfrac{3}{2}) \prod_{\ell=1}^\infty\eta \frac{z^\ell}{\ell} + \frac{1}{2\uppi}\int_{-\infty}^z\alpha \sum_{k=1}^\infty x^{\beta k}\mathrm{d}x.
% \end{equation*}
% However, one should not ignore how well math mixes with text:
% The frobble function \(f\) transforms zabbies \(z\) into yannies \(y\).
% It is a polynomial \(f(z)=\alpha z + \beta z^2\), where \(-n<\alpha<\beta/n\leq\gamma\), with \(\gamma\) a positive real number.
\clearpage
% \red{Ideally all citations must appear in main body, and we shouldn't have references here.}
\bibliography{sharma_554}

\end{document}
