% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
% \usepackage{microtype}
\usepackage{graphicx}
\usepackage{float}
\usepackage[caption = false]{subfig}
\usepackage{booktabs} % for professional tables
\usepackage{bm}
%\usepackage{hyperref}
\usepackage{diagbox}
\usepackage{algorithm}
\usepackage[noend]{algorithmic}
\usepackage{thm-restate}
%\usepackage{algorithmic}
\usepackage[algo2e,ruled,vlined]{algorithm2e}
% Attempt to make hyperref and algorithmic work together better:
%\newcommand{\theHalgorithm}{\arabic{algorithm}}

%\algnewcommand\algorithmicinput{\textbf{Input:}}
%\algnewcommand\algorithmicoutput{\textbf{Output:}}
%\algnewcommand\INPUT{\item[\algorithmicinput]}
%\algnewcommand\OUTPUT{\item[\algorithmicoutput]}
%\algnewcommand{\LineComment}[1]{\Statex \(\triangleright\) #1}

% \usepackage{geometry}
% \geometry{top=1in,bottom=1in,left=0.94in,right=0.94in}

\usepackage{amssymb,amsthm}
\usepackage{hyperref}

\usepackage{tikz}
\usepackage{url}
\usepackage{setspace}
% \usepackage[pdftex,bookmarksnumbered,bookmarksopen,
% colorlinks,citecolor=blue,linkcolor=blue,urlcolor=blue]{hyperref}
%\usepackage{tablefootnote}
% \usepackage{framed}
\usepackage{xcolor}
\usepackage{soul}
\usepackage{longtable}

\usepackage{times}
% \usepackage{enumitem}
\usepackage{varwidth}
\usepackage{graphicx}
\usepackage{wrapfig}
% \usepackage{enumerate}
\usepackage{caption}
% \usepackage{subcaption}
%\usepackage{subcaption}
%\usepackage{mwe}


\usepackage{amssymb}
\usepackage{multirow}
\usepackage{bbm}
\usepackage{graphicx}
\usepackage{url}
\usepackage{setspace}
%\usepackage[pdftex,bookmarksnumbered,bookmarksopen,
%colorlinks,citecolor=blue,linkcolor=blue]{hyperref}
%\usepackage{tablefootnote}
\usepackage{framed}
\usepackage{xcolor}
\usepackage{soul}
\usepackage{longtable}

\usepackage{times}
% \usepackage{enumitem}
\usepackage{varwidth}
\usepackage{graphicx}
\usepackage{wrapfig}
% \usepackage{enumerate}
%\usepackage{subcaption}
%\usepackage{mwe}




\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
%\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
% \usepackage{microtype}      % microtypography




%\usepackage[tight]{subfigure}
% \usepackage{graphicx}
% \usepackage{appendix}
% \usepackage{amsmath,amsfonts,amsthm}
%\usepackage{algorithmic}
%\usepackage[algo2e,ruled,vlined]{algorithm2e}
%\setlength{\Algomargin}{-0.05em}
% \usepackage{mdwlist}
\usepackage{xspace}
%\usepackage{enumitem}
\usepackage{color}
\usepackage{mathrsfs}

\usepackage{booktabs}
\usepackage{comment}
%\usepackage{geometry}

\usepackage{multirow}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \ind: blackboard-bold I (same expansion as \bI defined further below).
\newcommand{\ind}{\mathbb{I}}

% Functions using mathrm
% \dim: replaces the standard \dim operator with a plain upright "dim"
% (set via \mathrm, so it no longer carries operator spacing).
\renewcommand{\dim}{\mathrm{dim}}
% \OPT: upright sans-serif "OPT".
\newcommand{\OPT}{\textup{\textsf{OPT}}}
% \range: upright "range". \mathcal only provides uppercase glyphs, so a
% lowercase argument would print unrelated symbols; use \mathrm like the
% other named functions in this group.
\newcommand{\range}{\mathrm{range}}
% Upright text / sans-serif labels.
\newcommand{\CR}{\text{CR}}
% \sign and \sgn are aliases: both print sans-serif "sign".
\newcommand{\sign}{\textup{\textsf{sign}}}
\newcommand{\sgn}{\textup{\textsf{sign}}}
\newcommand{\diag}{\textsf{Diag}}
\newcommand{\ber}{\textup{\textsf{Ber}}}
% Upright (roman) labels, typically used as subscripts or function names.
\newcommand{\err}{\mathrm{err}}
\newcommand{\adv}{\mathrm{adv}}
\newcommand{\nat}{\mathrm{nat}}
\newcommand{\greedy}{\mathrm{greedy}}
\newcommand{\opt}{\mathrm{opt}}
\newcommand{\abstain}{\mathrm{abstain}}
% \gen: the fixed expression (nu/12) in ordinary-size parentheses.
\newcommand{\gen}{(\frac{\nu}{12})}
% \error prints the same "err" as \err above.
\newcommand{\error}{\mathrm{err}}
\newcommand{\hinge}{\mathrm{hinge}}
\newcommand{\minimax}{\mathrm{minimax}}
% \boundary prints "DB".
\newcommand{\boundary}{\mathrm{DB}}
\newcommand{\erf}{\mathrm{erf}}
\newcommand{\ERM}{\mathrm{ERM}}
% \Appendix takes one argument but ignores it and always prints the fixed
% phrase "the full version for".
\newcommand{\Appendix}[1]{the full version for}
% \numberthis: steps the equation counter and tags the current line with it;
% intended for numbering a single line inside a starred amsmath display.
\newcommand\numberthis{\addtocounter{equation}{1}\tag{\theequation}}

% \swap[sep]{a}{b}: prints b, then the optional separator (default "-"), then a.
\newcommand{\swap}[3][-]{#3#1#2} % just an example
% \ltwonorm{x}: auto-sized double bars around x with subscript 2.
\newcommand{\ltwonorm}[1]{\left\| #1 \right\|_2}

% Theorem-like environments. theorem is numbered within each section;
% lemma, proposition and corollary share the theorem counter.
\newtheorem{theorem}{Theorem}[section]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{corollary}[theorem]{Corollary}
% Each environment below has its own independent, document-wide counter.
\newtheorem{remark}{Remark}
\newtheorem{claim}{Claim}
\newtheorem{fact}{Fact}
\newtheorem{assumption}{Assumption}
\newtheorem{definition}{Definition}
\newtheorem{conjecture}{Conjecture}
\newtheorem{condition}{Condition}
% Margin markers: put "FIX" / "NEW" in the margin.
\newcommand{\fix}{\marginpar{FIX}}
\newcommand{\new}{\marginpar{NEW}}
% Single-letter math shorthands: lowercase letters expand to bold symbols
% (\mathbf or \bm), uppercase letters to bold, calligraphic or
% blackboard-bold symbols.
% NOTE(review): the \renewcommand lines below overwrite standard LaTeX text
% commands (\a, \b, \c, \u, \v, \H are accent-related commands; \L, \P, \S
% are text symbols). Accented characters in the text or bibliography that
% rely on them will no longer typeset correctly -- confirm none are needed.
\renewcommand{\a}{\mathbf{a}}
\renewcommand{\b}{\mathbf{b}}
\renewcommand{\c}{\mathbf{c}}
\newcommand{\e}{\mathbf{e}}
\newcommand{\g}{\mathbf{g}}
\renewcommand{\u}{\bm{u}}
\renewcommand{\v}{\mathbf{v}}
\newcommand{\w}{\bm{w}}
\newcommand{\x}{\bm{x}}
\newcommand{\y}{\bm{y}}
\newcommand{\z}{\mathbf{z}}
\newcommand{\A}{\mathbf{A}}
\newcommand{\B}{\mathbf{B}}
\newcommand{\C}{\mathcal{C}}
\newcommand{\D}{\mathbf{D}}
\newcommand{\E}{\mathbb{E}}
\newcommand{\F}{\mathbf{F}}
\newcommand{\G}{\mathcal{G}}
\renewcommand{\H}{\mathbf{H}}
\newcommand{\I}{\mathbf{I}}
% \bI: blackboard-bold I (same expansion as \ind).
\newcommand{\bI}{\mathbb{I}}
\newcommand{\K}{\mathcal{K}}
\renewcommand{\L}{\mathbf{L}}
\newcommand{\M}{\mathbf{M}}
\newcommand{\N}{\mathcal{N}}
\renewcommand{\P}{\mathcal{P}}
\newcommand{\Q}{\mathbf{Q}}
\newcommand{\R}{\mathbb{R}}
\renewcommand{\S}{\mathbf{S}}
\newcommand{\T}{\mathbf{T}}
\newcommand{\U}{\mathbf{U}}
\newcommand{\V}{\mathbf{V}}
\newcommand{\W}{\mathbf{W}}
\newcommand{\X}{\mathbf{X}}
\newcommand{\Y}{\mathbf{Y}}
\newcommand{\Z}{\mathbb{Z}}
% \rank: upright sans-serif "rank".
\newcommand{\rank}{\textup{\textsf{rank}}}
% \orthc / \orthr: bold "orth" with subscript c / r.
\newcommand{\orthc}{\mathbf{orth}_c}
\newcommand{\orthr}{\mathbf{orth}_r}
\newcommand{\bLambda}{\mathbf{\Lambda}}
% \RS: calligraphic R (same expansion as \cR defined further below).
\newcommand{\RS}{\mathcal{R}}
% \0 and \1: bold digits 0 and 1.
\newcommand{\0}{\mathbf{0}}
\newcommand{\1}{\mathbf{1}}
% \comment{...}: swallows its argument and prints nothing.
% NOTE(review): this overrides the \comment macro of the `comment' package
% loaded above, so \begin{comment}...\end{comment} will presumably stop
% hiding its contents -- confirm that environment is not used.
\renewcommand{\comment}[1]{}
% \red{...}: currently discards its argument; the commented-out tail of the
% line is the former definition that printed the argument in red.
\newcommand{\red}[1]{}%{\color{red}#1}}
% \newcommand{\red}[1]{{\color{white}#1}}
% \blue{...}: prints its argument in blue.
\newcommand{\blue}[1]{{\color{blue}#1}}
% \tr: sans-serif "tr".
\newcommand{\tr}{\textsf{tr}}
% \cX: calligraphic letter X.
\newcommand{\cA}{\mathcal{A}}
\newcommand{\cB}{\mathcal{B}}
\newcommand{\cC}{\mathcal{C}}
\newcommand{\cD}{\mathcal{D}}
\newcommand{\cF}{\mathcal{F}}
\newcommand{\cG}{\mathcal{G}}
\newcommand{\cH}{\mathcal{H}}
\newcommand{\cI}{\mathcal{I}}
\newcommand{\cK}{\mathcal{K}}
\newcommand{\cL}{\mathcal{L}}
\newcommand{\cS}{\mathcal{S}}
\newcommand{\cT}{\mathcal{T}}
\newcommand{\cU}{\mathcal{U}}
\newcommand{\cV}{\mathcal{V}}
\newcommand{\cN}{\mathcal{N}}
\newcommand{\cO}{\mathcal{O}}
\newcommand{\cP}{\mathcal{P}}
\newcommand{\cQ}{\mathcal{Q}}
\newcommand{\cR}{\mathcal{R}}
\newcommand{\cY}{\mathcal{Y}}
\newcommand{\cX}{\mathcal{X}}
\newcommand{\cZ}{\mathcal{Z}}
% \bbX: blackboard-bold letter X.
\newcommand{\bbB}{\mathbb{B}}
\newcommand{\bbE}{\mathbb{E}}
\newcommand{\bbN}{\mathbb{N}}
\newcommand{\bbS}{\mathbb{S}}
% \Pro: upright text "Pro".
\newcommand{\Pro}{\text{Pro}}
% Sans-serif math labels; the printed word is the \mathsf argument.
\newcommand{\imperceptible}{\mathsf{imperceptible}}
\newcommand{\dist}{\mathsf{dist}}
% \spann prints "span" and \Null prints "null" (the macro names differ from
% the printed words).
\newcommand{\spann}{\mathsf{span}}
\newcommand{\vol}{\mathsf{vol}}
\newcommand{\Null}{\mathsf{null}}
\newcommand{\Area}{\mathsf{Area}}
\newcommand{\Agree}{\mathsf{Agree}}

% Named colour colorY, given as an RGB triple (0.7, 0.7, 0.2).
\definecolor{colorY}{rgb}{0.7 , 0.7 , 0.2}

% Starred operators: subscripts are set underneath in display style.
\DeclareMathOperator*{\argmax}{argmax}
\DeclareMathOperator*{\argmin}{argmin}

% proofoutline: unindented block opening with an italic "Proof Sketch. " and
% closing with a right-flushed square followed by a medium vertical skip.
\newenvironment{proofoutline}{\noindent{\emph{Proof Sketch. }}}{\hfill$\square$\medskip}

\title{Efficiently Learning the Graph for Semi-supervised Learning}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<dravyans@cs.cmu.edu>?Subject=Your UAI 2023 paper}{Dravyansh Sharma}{}}
\author[1]{\href{mailto:<mjones2@andrew.cmu.edu>?Subject=Your UAI 2023 paper}{Maxwell Jones}{}}
% Add affiliations after the authors
\affil[1]{%
    School of Computer Science\\
    Carnegie Mellon University\\
    Pittsburgh, PA, 15213
}
  
  \begin{document}
\maketitle

\begin{abstract}
  Computational efficiency is a major bottleneck in using classic graph-based approaches for semi-supervised learning on datasets with a large number of unlabeled examples. Known techniques to improve efficiency typically involve an approximation of the graph regularization objective, but suffer from two major drawbacks: first, the graph is assumed to be known or constructed with heuristic hyperparameter values; second, they do not provide a principled approximation guarantee for learning over the full unlabeled dataset. Building on recent work on learning graphs for semi-supervised learning from multiple datasets for problems from the same domain, and leveraging techniques for fast approximations for solving linear systems in the graph Laplacian matrix, we propose algorithms that overcome both the above limitations. 
  
  We show a formal separation in the learning-theoretic complexity of sparse and dense graph families. We further show how to approximately learn the best graphs from the sparse families efficiently using the conjugate gradient method. 
    Our approach can also be used to learn the graph efficiently online with sub-linear regret, under mild smoothness assumptions. Our online learning results are stated generally, and may be useful for approximate and efficient parameter tuning in other problems. We implement our approach and demonstrate significant ($\sim$10-100x) speedups over prior work on semi-supervised learning with learned graphs on benchmark datasets.
\end{abstract}

\section{Introduction}\label{sec:intro}

As machine learning finds applications in new domains like healthcare, finance and a variety of industrial sectors \citep{vamathevan2019applications,kumar2022kdd,larranaga2018industrial}, obtaining sufficiently large human-annotated datasets for applying supervised learning is often prohibitively expensive. Semi-supervised learning can solve this problem by utilizing unlabeled data, which is more readily available, together with a small amount of human-labeled data. Graph-based techniques, where the similarity of examples is encoded using a graph, are popular and effective for learning using unlabeled data \citep{zhu2009introduction}. Several heuristic approaches for learning given the graph are known, but the choice of a good graph is strongly dependent on the problem domain. How to create the graph has largely been `more of an art than science' \citep{zhu2005semi}, although recent work proposes how to provably learn the best graph for a given problem domain from the data \citep{balcan2021data}. A key limitation of the proposed techniques is their computational efficiency, as the proposed algorithms take $\Tilde{O}(n^4)$ time, which makes them impractical to run on real datasets. In this work we propose new and more practical approaches that exploit graph {\it sparsity} and employ {\it approximate optimization} to obtain more powerful graph learning techniques with formal guarantees for their effectiveness, and improved efficiency guarantees.

%{\it Related work.} 
Past work on improving the efficiency of graph-based semi-supervised learning has focused largely on selecting a subset of `important' unlabeled examples. One may use a greedy algorithm \citep{delalleau2005efficient} or a $k$-means based heuristic \citep{wang2016scalable}, run the graph-based algorithm only on the selected subset of examples and use some local interpolation for remaining nodes. In this work we provide more principled approaches that come with formal near-optimality guarantees, and demonstrate the trade-off between accuracy and efficiency. We focus on the data-driven setting, first studied by \citet{balcan2021data} for this problem, where one repeatedly solves multiple semi-supervised learning problems from the same problem domain, and hopes to learn a common graph that works well over the domain. 

We give tools for analysis of regret of online learning algorithms in data-driven algorithm design, applicable beyond semi-supervised graph learning, and  useful in any problem where the loss functions can be  easily approximated.
We also study sample efficiency, i.e.\ the number of problem samples needed to learn a good parameter, when the problems come from a distribution over semi-supervised learning problems.
Our work extends prior theoretical results \citep{balcan2021data} on sample efficiency to additional graph families that capture sparsity, obtaining improved sample complexity for sparse graphs. We further propose algorithms which improve over the running time of previously proposed approaches for learning the graph. We employ the conjugate gradient method to compute fast, approximate inverses and optimize over new multi-parameter graph families that include sparse graphs which can be more efficiently optimized.
Empirically, we observe that our proposed approaches are computationally efficient, while retaining the effectiveness guarantees of learning the best graph for the given problem distribution.\looseness-1

In more detail, we learn the graph `bandwidth' hyperparameter for the commonly used Gaussian kernel, optimizing over a continuous parameter domain. This approach is more powerful than a grid search, which only computes results at some finite set of hyperparameter values. We extend the recent line of work on data-driven algorithm design (Section \ref{sec:rw}) to approximate online feedback, and achieve provably near-optimal hyperparameter selection.\looseness-1


\subsection{Main Contributions}\label{sec:contributions}

\begin{itemize}[leftmargin=*,topsep=4pt,partopsep=1ex,parsep=1ex]\itemsep=-4pt
    \item (Section \ref{sec:addag}) We provide a general analysis for online data-driven algorithm design with approximate loss functions, quantifying the accuracy-efficiency trade-off. While prior work on approximate algorithm selection \citep{balcan2020refined} studies bounded $\ell_\infty$-norm approximations in the distributional setting, we generalize along two axes -- we study a more general approximation class necessary to analyse our semi-supervised learning algorithm, and our results apply to online learning, even in the presence of (the more realistic) partial feedback.%\looseness-1
    \item (Section \ref{sec:sparse}) For graph-based semi-supervised learning, we show a formal gap in the pseudodimension of learning sparse and dense graphs. Concretely, if each graph node is connected to at most $K$ neighbors, the pseudodimension is $O(K+\log n)$, which implies an asymptotic gap relative to the $\Omega(n)$ bound for learning complete graphs with Gaussian RBF kernels \citep{balcan2021data}. %In particular, Theorem \ref{thm:sigma-pdim} implies that our techniques are also more sample efficient, in addition to being more computational efficient.
    \item (Section \ref{sec:algo}) We propose an efficient algorithm based on approximate Laplacian inverse for approximately computing the hyperparameter intervals where the semi-supervised loss objective is constant. We prove  convergence guarantees for our algorithm% based on loss function
    , which capture a trade-off between the computational efficiency and the accuracy of loss estimation. We instantiate our approach for approximate graph learning for the classic harmonic-objective algorithm of \citet{zhu2003semi}, as well as the computationally efficient algorithm of \citet{delalleau2005efficient}.
    % \item We use our algorithm to compute hyperparameter intervals for the loss function of  as well as , and find specific convergence gaurantees for our algorithm when using these loss functions. 
    \item (Section \ref{sec:expt}) We implement our algorithm\footnote{\href{https://github.com/maxwelljones14/Efficient-SSL}{https://github.com/maxwelljones14/Efficient-SSL}} and provide an extensive empirical study showing improvement over previously proposed approaches on standard datasets. Specifically, we improve the running time by about 1-2 orders of magnitude, while almost retaining (and in some cases slightly increasing) the accuracy.
\end{itemize}

\subsection{Related Work}\label{sec:rw}

% \red{Some of it could go into appendix.}

\textbf{Approximate Laplacian inverse}. The {\it conjugate gradient method} \citep{hestenes1952methods} is an iterative algorithm used to approximately solve a system $Ax = b$ for symmetric, positive definite matrices. Starting with the zero vector as an approximate solution, every iteration computes a gradient used to update this approximation in the direction of the exact solution. The exact solution itself is obtained in $n$ steps, but good approximate solutions can be found much sooner for graphs with low condition number $\kappa$ \citep{AXELSSON1976123,vishnoi2012laplacian}. Furthermore, each iteration computes a finite number of matrix-vector products on $A$, yielding good runtime guarantees. Many variants of the conjugate gradient method exist \citep{hager2006survey}; in this work we use the original version. The conjugate gradient method is a tool in use for calculating fast matrix inverses across machine learning applications, in domains such as deep reinforcement learning \citep{schulman2015trust,rajeswaran2017towards}  and market forecasting \citep{SHEN2015243}. We choose the conjugate gradient method over other iterative techniques to solve $Ax = b$ like Lanczos iteration due to its stability, simplicity, and previous success in other machine learning applications. Further, the conjugate gradient method offers strong theoretical guarantees, leading to fast approximate convergence for our use case. %\red{Add more recent algorithms from spectral graph theory, with nearly linear-time approximate inverse computation.}

\textbf{Semi-supervised learning}. {\it Semi-supervised learning} is a paradigm for learning from labeled and unlabeled data \citep{zhu2009introduction,balcan2010discriminative}. 
A popular approach for semi-supervised learning is to optimize a graph-based objective. %Here the compatibility assumption is that the labels are {\it smooth} over the graph, and as such the performance is highly sensitive to the graph structure and the edge weights. Since labels partition the graph, we seek a (possibly soft) graph cut as the predictor. 
Several methods have been proposed to predict labels {\it given a graph} including $st$-mincuts \citep{blum2001learning}, soft mincuts that optimize a harmonic objective \citep{zhu2003semi}, and label propagation \citep{zhu2002learning}. %Table \ref{table:algos} summarizes the optimization involved in some prominent algorithms. % $\alpha=\infty$ corresponds to forcing labels of labeled examples $L$.
Prior research for efficient semi-supervised learning has also typically assumed that the graph $G$ is given \citep{delalleau2005efficient,wang2016scalable}.
 All algorithms have comparable performance provided the graph $G$ encodes the problem well \citep{zhu2009introduction}. \citet{balcan2021data} introduce a first approach to learn the graph $G$ with formal guarantees, and show that the performance of all the algorithms depends strongly on the graph hyperparameters. In this work, we provide computationally efficient algorithms for learning the graph parameters. While we focus on %learning the graph for 
 the  classical approaches, deep learning based approaches also typically assume a graph is available \citep{kipf2017semi}. %\red{TODO: talk about GNNs and other modern SSL methods and work on efficiency.}
 % However, it is not clear how to create the graph itself on which the extensive literature stands, barring some heuristics (\citet{zhu2005semi,zemel2004proximity}).
%\citet{zemel2004proximity} discuss how to create a robust graph by considering an ensemble of minimum spanning trees for several data perturbations and randomly retaining edges which appear often. The algorithm however uses a parameter $t$ for expected graph density and it is unclear how to set it for any given problem instance, and no theoretical guarantees are provided. 
% \citet{sindhwani2005beyond} construct {\it warped} kernels  aligned with the data geometry, but the performance may vary strongly with warping and it is not clear how to optimize over it. We provide the first techniques that yield provably near-optimal graphs.

\textbf{Data-driven algorithm design}. \citet{gupta2017pac} define a formal learning framework for selecting algorithms from a family of heuristics or setting hyperparameters. It is further developed by \citet{balcan2017learning,balcan2018dispersion} and surveyed in \citet{balcan2020data}. It has been successfully applied to several problems in machine learning like clustering, linear regression and low rank approximation \citep{balcan2017learning,balcan2022provably,bartlett2022generalization} (to list a few) and for giving powerful guarantees like differential privacy, adaptive learning and adversarial robustness \citep{balcan2018dispersion,sharma2020learning,balcan2020power}.  %\citet{balcan2020data} provides a simple introduction to and a comprehensive survey on this rapidly expanding research direction.
\citet{balcan2018dispersion,dick2020semi} introduce general data-driven design techniques under some smoothness assumptions, and \citet{balcan2020refined} study learning with approximate losses. We extend the techniques to broader problem settings (as noted in Section \ref{sec:contributions}, and detailed in Section \ref{sec:addag}), and investigate the structure of graph-based label learning formulation to apply the new techniques. Computational efficiency is an important concern for practical applicability of data-driven design. This includes recent and concurrent work that uses fundamentally different techniques like output-sensitive enumeration from computational geometry \citep{balcan2022faster} and discretization for mechanism design applications \citep{balcan2023learning}. In contrast, we study the effectiveness of approximate loss estimation, as well as graph sparsification, in data-driven graph selection for semi-supervised learning.

% \red{Add related work on semi-supervised learning.}

% \red{NOTE: removed \citet{vishnoi2012laplacian} citation in favor of  \citet{AXELSSON1976123} citation for $\kappa$ number argument since the former is a textbook and the latter is the correct original location I think}

% \red{Add related work on data-driven algorithm design.}



% UAI 2023 papers have to be prepared using \LaTeX.
% To start writing your paper, copy \texttt{uai2023-template.tex} and replace title, authorship, and content with your own.

% The UAI 2023 paper style is based on a custom \textsf{uai2023} class.
% The class file sets the page geometry and visual style.\footnote{%
%     The class uses the packages \textsf{adjustbox}, \textsf{environ}, \textsf{letltxmacro}, \textsf{geometry}, \textsf{footmisc}, \textsf{caption}, \textsf{textcase}, \textsf{titlesec}, \textsf{titling}, \textsf{authblk}, \textsf{enumitem}, \textsf{microtype}, \textsf{lastpage}, and \textsf{kvoptions}.
% }
% The class file also loads basic text fonts.\footnote{%
%     Fonts loaded are \textsf{times} (roman), \textsf{helvet} (sanserif), \textsf{courier} (fixed-width), and \textsf{textcomp} (common symbols).
% }
% \emph{You may not modify the geometry or style in any way, for example, to squeeze out a little bit of extra space.}
% (Also do not use \verb|\vspace| for this.)
% Feel free to use convenience functionality of loaded packages such as \textsf{enumitem}.
% The class enables hyperlinking by loading the \textsf{hyperref} package.

% You are free to load any packages available in \TeX{Live}~2020 that are compatible with the UAI class.\footnote{In case this template or your submission does not compile, always first make sure your \TeX\ installation is up-to-date.}
% (Mik\TeX{} and Mac\TeX{} generally contain the same packages.)
% Do not load conflicting packages—you will get an error message—, as this complicates creating the proceedings.
% Please avoid using obsolete commands, such as \verb|\rm|, and obsolete packages, such as \textsf{epsfig}.\footnote{%
%     See \url{https://ctan.org/pkg/l2tabu}.
% }

% \swap[ ]{in the header of your source file.}{Feel free to include your own macros}

% \section{General Formatting Instructions}
% As a general rule: \emph{follow the template}.



\section{Notation and Formal Setup}

We are given some labeled points $L$ and unlabeled points $U$. One constructs a graph $G$ by placing (possibly weighted) edges $w(u,v)$ between pairs of data points $u,v$ which are `similar', and labels for the unlabeled examples are obtained by optimizing some graph-based score. We have an oracle $O$ which on querying provides us the labeled and unlabeled examples, and we need to pick $G$ from some family $\G$ of graphs. We commit to using some algorithm $A(G,L,U)$ (or $A_{G,L,U}$) which provides labels for examples in $U$, and we should pick a $G$ such that $A(G,L,U)$ results in small error in its predictions on $U$. To summarize more formally,


{\it Problem statement}: Given data space $\cX$, label space $\cY$ and an oracle $O$ which yields a number of labeled examples  $\emptyset\ne L\subset  \cX\times\cY$ and some unlabeled examples $\emptyset\ne U\subset \cX$ such that $|L|+|U|=n$. We are further given a parameterized family of graph construction procedures over parameter space $\cP$, $\G:\cP\rightarrow(\cX\times\cX\rightarrow \R_{\ge 0})$, graph labeling algorithm $A_{G,L,U}$ which takes a graph $G$ with labeled nodes $L$ and unlabeled nodes $U$ and provides labels for all unlabeled examples in $U$, a loss function $l:\cY\times\cY\rightarrow [0,1]$ and a target labeling $\tau:U\rightarrow \cY$. We need to select $\rho\in \cP$ such that the corresponding graph $G(\rho)$ minimizes $\frac{1}{|U|}\sum_{u\in U}l(A_{G(\rho),L,U}(u),\tau(u))$ w.r.t. $\rho$.% with high probability, using a small number of queries to $O$.

% We will consider online and distributional settings of the above problem. In the online setting we make no distributional assumptions about the data and simply seek to minimize the regret, i.e. the loss suffered in an arbitrary online sequence of oracle queries $O$ relative to that endured by the best parameter $\rho^*$ in hindsight. In the distributional setting we will assume that the data and labels supplied by $O$ come from an underlying distribution $\cD$ and we would like to minimize the expected loss suffered on test examples drawn from the distribution with high probability. We will present further details and notations for the respective settings in the subsequent sections.

We will now describe graph families $\G$ and algorithms $A_{G,L,U}$ considered in this work. We restrict our attention to binary classification, i.e. $\cY=\{0,1\}$, and note that all proposed algorithms  naturally extend to multiclass problems, using the standard one-vs-all trick. We assume there is a feature based {\it similarity function} $d:\cX\times\cX\rightarrow\R_{\ge 0}$, a metric which monotonically captures similarity between the examples. In Definition \ref{defn:g} we formally introduce parametric families to build a graph using the similarity function, which capture and interpolate well-known approaches such as $k$-nearest neighbor graphs, $r$-neighborhood graphs and Gaussian RBF kernels.
In this work, we will consider two parametric families of graph construction algorithms defined below. $\bI[\cdot]$ is the indicator function taking values in $\{0,1\}$. Let $N_k(v)$ denote the set of nodes of $G$ which are the $k$-nearest neighbors of node $v$ under the metric $d(\cdot,\cdot)$. Define $k$-mutual neighborhood as the set of edges for which each end-point is a $k$-nearest neighbor of the other, i.e. $N'_k=\{(u,v)\mid u\in N_k(v) \text{ and } v\in N_k(u)\}$ \citep{ozaki2011using}.\looseness-1
%TODO cite...

\begin{definition}\label{defn:g} Sparse graph families.\\
a) Thresholded nearest neighbors, $G(k,r)$, (with $k\in\Z^+,r\in\R^+$): $w(u,v)=\bI[d(u,v)\le r \text{ and } (u,v)\in N'_k]$.\\
b) Gaussian nearest neighbors, $G(k,\sigma)$, (with $k\in[K]$ for $K\in\Z^+$, $\sigma\in\R^+$):  $w(u,v)=e^{-\frac{d(u,v)^2}{\sigma^2}}\bI[(u,v)\in N'_k]$.
%c) Thresholded exponential kernel, $G(\sigma,r)$. Parameterized by $\sigma,r\in\R^+$,  $w(u,v)=e^{-\frac{d(u,v)^2}{\sigma^2}}\bI[d(u,v)\le r]$.
% \begin{enumerate}
%   \item[a)] Threshold graph, $G(k,r)$. Parameterized by a threshold $r$, we set $w(u,v)=\bI[d(u,v)\le r]$.
%     % alt k-nn for different k; but this is probably better in practice
%     \item[b)] Polynomial kernel, $G(\Tilde{\alpha})$. $w(u,v)=(\Tilde{d}(u,v)+\Tilde{\alpha})^d$ for fixed degree $d$, parameterized by $\Tilde{\alpha}$.
%     \item[c)] Gaussian RBF or exponential kernel, $G(\sigma)$. $w(u,v)=e^{-d(u,v)^2/\sigma^2}$, parameterized by $\sigma$.
% \end{enumerate}
\end{definition} 
% \begin{remark}
% Another popular family of graphs used in practice is the $k$ nearest neighbor graphs, where $k\in\{0,1,\dots,n-1\}$, $n$ is the number of nodes in the graph, is the parameter. Even though $k$-NN graphs may result in different graphs the ones considered in the paper, learning how to build an optimal graph over the algorithm family $G(k)$ is much simpler. Online learning of the parameter $k$ in this setting can be recognized as an instance of learning with experts advice for a finite hypothesis class (Section 3.1 of \citet{shalev2011online}), where an upper bound of $O(\sqrt{T\log n})$ is known for the Weighted Majority algorithm. Online-to-batch conversion provides generalization guarantees in the distributional setting (Section 5 of \citet{shalev2011online}). We remark that our algorithm families need more sophisticated analysis due to continuous ranges of the algorithm parameters.
% \end{remark}
The thresholded nearest neighbor graph adds unweighted edges to $G(k,r)$ only when the examples are closer than some $r\in\R_{\ge 0}$, and are mutual $k$-nearest neighbors. The Gaussian (or RBF) kernel is more powerful and allows weighted edges that depend on the metric distance and the bandwidth parameter $\sigma$. %The weighted graph can be made sparse by removing edges where the weight is too small ($G(\sigma,r)$), or by only keeping edges which connect mutual $k$-nearest neighbors ($G(k,\sigma)$). 
We will use $\rho$ to denote a general graph parameter (e.g. $\rho=(k,r)$ for thresholded nearest neighbors) and denote the general parameterized graph family by $G(\rho)$. %When the analysis for polynomial kernels is similar to that of the Gaussian RBF, we will only discuss the latter.
% For example, in \citet{zhu2003semi} predictions can be made by rounding a soft labeling $f(u)$ which minimizes $\sum_{u,v}w(u,v)(f(u)-f(v))^2$. Notice that if we require hard labels, the objective gives the min-cut of the graph separating the labeled examples, so this may be viewed as a soft min-cut. \\\\
Once the graph is constructed using one of the above families, we can assign labels using a suitable algorithm $A_{G,L,U}$. A popular and effective approach is to optimize a quadratic objective $\frac{1}{2}\sum_{u,v}w(u,v)(f_u-f_v)^2=f^T(D-W)f$. Here $f$ may either be discrete, $f_v\in\{0,1\}$, which corresponds to finding a graph mincut separating the oppositely labeled vertices \citep{blum2001learning}, or $f$ may be continuous, i.e.\ $f_v\in[0,1]$, and we can round $f$ to obtain the labels \citep{zhu2003semi}.\looseness-1 %These correspond to algorithms A and B respectively from Table \ref{table:algos}. It is noted that all algorithms have comparable performance provided the graph $G$ encodes the problem well \citet{zhu2009introduction}. %We restrict our attention to these two algorithms for simplicity of presentation, although our algorithms and proofs may be extended to most quadratic objective based algorithm in Table \ref{table:algos}\footnote{Specifically by extending arguments for algorithm B since the optimization is similar. In contrast, Algorithm A is combinatorial and the reasoning diverges somewhat.}.

In the {\it distributional setting}, we are presented with several instances of the graph semi-supervised learning problem assumed to be drawn from an unknown distribution $\cD$ and want to learn the best value of the graph parameter $\rho$. We also assume we get all the labels for the `training' problem instances. A choice of $\rho$ uniquely determines the graph $G(\rho)$ and we use some algorithm $A_{G(\rho),L,U}$ to make predictions (e.g. minimizing the quadratic penalty score above) and suffer loss $l_{A(G(\rho),L,U)}:=\frac{1}{|U|}\sum_{u\in U}l(A_{G(\rho),L,U}(u),\tau(u))$ which we seek to minimize relative to the smallest possible loss by some graph in the hypothesis space, in expectation over the data distribution $\cD$. We also define the family of loss functions $\mathcal{H}_{\rho}=\{l_{A(G(\rho),L,U)}\mid \rho\in\cP \}$. For example, $\mathcal{H}_{k,r}=\{l_{A(G(k,r),L,U)}\mid (k,r)\in\Z^+\times\R^+ \}$. As a shorthand, we will often denote the loss on a fixed problem instance as a function of the graph hyperparameter $\rho$ as simply $l(\rho)$, and refer to it as the {\it dual semi-supervised loss}.

% We will show a divergence in the weighted and unweighted graph learning problems. {We analyze and provide asymptotically tight bounds for the pseudodimension of the set of loss functions (composed with the graph creation algorithm family and the optimization algorithm for predicting labels) prarmeterized by the graph family parameter $\rho$, i.e. $\mathcal{H}_{\rho}=\{l_{A(G(\rho),L,U)}\mid \rho\in\cP \}$.}

Finally we note definitions of some useful learning theoretic complexity measures. First recall the definition of pseudodimension \citep{pollard2012convergence} which generalizes VC dimension to real-valued functions, and is a well-known measure for hypothesis-space complexity in statistical learning theory. %Bounding these quantities implies immediate bounds on learning error using classic learning theoretic results. %In  Section \ref{sec: distrib} we will bound the pseudodimension and Rademacher complexity for the problems of learning unweighted and weighted graphs.
\begin{definition}[Pseudo-dimension] Let $\cH$ be a set of real valued functions from input space $\cX$. We say that
$C = (x_1, \dots, x_m)\in \cX^m$ is pseudo-shattered by $\cH$ if there exists a vector
$r = (r_1, \dots, r_m)\in\R^m$ (called a ``witness'') such that for all
$b= (b_1, \dots, b_m)\in \{\pm 1\}^m $ there exists $h_b\in \cH$ such that $\text{sign}(h_b(x_i)-r_i)=b_i$ for all $i\in\{1,\dots,m\}$. The pseudo-dimension of $\cH$ is the cardinality of the largest set
pseudo-shattered by $\cH$.
\end{definition}

% \begin{defn}%TODO: [0,1]---> [0,H]
% Rademacher complexity \citep{bartlett2002rademacher}. Let $\F=\{f_\rho:\X\rightarrow [0,1], \rho\in\C\subset\R^d\}$ be a parameterized family of functions, and sample $\s=\{x_i,\dots,x_T\}\subseteq\X$. The empirical Rademacher complexity of $\F$ with respect to $\s$ is defined as $\hat{R}(\F,\s)=\E_{\mathbf{\sigma}}\left[\sup_{f\in\F}\frac{1}{T}\sum_{i=1}^T\sigma_if(x_i)\right]$, where $\sigma_i\sim U(\{-1,1\})$ are Rademacher variables.
% \end{defn}

We will also need the definition of {\it dispersion} \citep{balcan2018dispersion} which, informally speaking, captures how amenable a non-Lipschitz function is to online learning. As noted in \citet{balcan2018dispersion,sharma2020learning}, dispersion is necessary and sufficient for learning piecewise Lipschitz functions online.


\begin{definition}[\label{def:dis}Dispersion]
The sequence of random loss functions $l_1, \dots,l_T$ is $\beta$-{\it dispersed} for the Lipschitz constant $L$ if, for all $T$ and for all $\epsilon\ge T^{-\beta}$, we have that, in expectation, at most
$\Tilde{O}(\epsilon T)$ functions (here $\tilde{O}$ suppresses dependence on quantities besides $\epsilon,T$ and $\beta$, as well as logarithmic terms)
are not $L$-Lipschitz for any pair of points at distance $\epsilon$ in the domain $\C$. That is, for all $T$ and  $\epsilon\ge T^{-\beta}$, $\E\!\!\left[\!
\max\!\!\!\!_{\substack{\rho,\rho'\in\C\\||\rho-\rho'||_2\le\epsilon}}\!\big\lvert
\{ t\!\in\![T] \mid l_t(\rho)\!-\!l_t(\rho')>L||\rho\!-\!\rho'||_2\} \big\rvert \!\right]
\le  \Tilde{O}(\epsilon T)$.
% \begin{align*}
%     \E\left[
% \max_{\substack{\rho,\rho'\in\C\\||\rho-\rho'||_2\le\epsilon}}\big\lvert
% \{ t\in[T] \mid l_t(\rho)-l_t(\rho')>L||\rho-\rho'||_2\} \big\rvert \right] 
% \le  \Tilde{O}(\epsilon T).
% \end{align*}
\end{definition}



\section{Approximate %and composite 
Data-driven Algorithm Design} \label{sec:addag}


\begin{figure}
% \tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt        
\centering
% \begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
% %uncomment if require: \path (0,2679); %set diagram left start at 0, and has height of 2679

% %Shape: Rectangle [id:dp5376731512542594]
% \draw  [draw opacity=0][fill={rgb, 255:red, 249; green, 180; blue, 180 }  ,fill opacity=0.31 ] (459.04,2398.24) -- (470.71,2398.24) -- (470.71,2590.8) -- (459.04,2590.8) -- cycle ;
% %Shape: Rectangle [id:dp31984730022593344]
% \draw  [draw opacity=0][fill={rgb, 255:red, 167; green, 194; blue, 227 }  ,fill opacity=0.61 ][dash pattern={on 0.84pt off 2.51pt}] (471.04,2398.24) -- (539.23,2398.24) -- (539.23,2433.24) -- (471.04,2433.24) -- cycle ;
% %Shape: Rectangle [id:dp7731368775451168]
% \draw  [draw opacity=0][fill={rgb, 255:red, 167; green, 194; blue, 227 }  ,fill opacity=0.61 ][dash pattern={on 0.84pt off 2.51pt}] (390.5,2398.24) -- (459.04,2398.24) -- (459.04,2433.24) -- (390.5,2433.24) -- cycle ;
% %Shape: Rectangle [id:dp6667181380599851]
% \draw  [draw opacity=0][fill={rgb, 255:red, 167; green, 194; blue, 227 }  ,fill opacity=0.61 ][dash pattern={on 0.84pt off 2.51pt}] (76.33,2437.33) -- (223.67,2437.33) -- (223.67,2472.33) -- (76.33,2472.33) -- cycle ;
% %Shape: Rectangle [id:dp5446598438169576]
% \draw  [draw opacity=0][fill={rgb, 255:red, 167; green, 194; blue, 227 }  ,fill opacity=0.61 ][dash pattern={on 0.84pt off 2.51pt}] (239.04,2352.24) -- (387.01,2352.24) -- (387.01,2387.24) -- (239.04,2387.24) -- cycle ;
% %Shape: Rectangle [id:dp6365430514880266]
% \draw  [draw opacity=0][fill={rgb, 255:red, 249; green, 180; blue, 180 }  ,fill opacity=0.31 ] (386.38,2352.24) -- (390.5,2352.24) -- (390.5,2588.8) -- (386.38,2588.8) -- cycle ;
% %Shape: Rectangle [id:dp7397249109259643]
% \draw  [draw opacity=0][fill={rgb, 255:red, 249; green, 180; blue, 180 }  ,fill opacity=0.31 ] (223.88,2352.24) -- (239.04,2352.24) -- (239.04,2590.3) -- (223.88,2590.3) -- cycle ;
% %Shape: Axis 2D [id:dp7096049370932567]
% \draw  (67.01,2529.79) -- (584.01,2529.79)(67.01,2300.24) -- (67.01,2529.79) -- cycle (577.01,2524.79) -- (584.01,2529.79) -- (577.01,2534.79) (62.01,2307.24) -- (67.01,2300.24) -- (72.01,2307.24)  ;
% %Straight Lines [id:da10768647039315482]
% \draw    (77,2453.68) -- (228,2452.69) ;
% \draw [shift={(229.01,2452.69)}, rotate = 359.63] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]      (0, 0) circle [x radius= 2.01, y radius= 2.01]   ;
% \draw [shift={(77,2453.68)}, rotate = 359.63] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][fill={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]      (0, 0) circle [x radius= 2.01, y radius= 2.01]   ;
% %Straight Lines [id:da09691701252159679]
% \draw [color={rgb, 255:red, 0; green, 0; blue, 0 }  ,draw opacity=1 ]   (235,2370.68) -- (392.25,2370.68) ;
% \draw [shift={(393.26,2370.68)}, rotate = 0] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ,draw opacity=1 ][line width=0.75]      (0, 0) circle [x radius= 2.01, y radius= 2.01]   ;
% \draw [shift={(235,2370.68)}, rotate = 0] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ,draw opacity=1 ][fill={rgb, 255:red, 0; green, 0; blue, 0 }  ,fill opacity=1 ][line width=0.75]      (0, 0) circle [x radius= 2.01, y radius= 2.01]   ;
% %Straight Lines [id:da41659886256601797]
% \draw    (393.6,2416.69) -- (542,2416.69) ;
% \draw [shift={(543.01,2416.69)}, rotate = 0] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]      (0, 0) circle [x radius= 2.01, y radius= 2.01]   ;
% \draw [shift={(393.6,2416.69)}, rotate = 0] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][fill={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]      (0, 0) circle [x radius= 2.01, y radius= 2.01]   ;
% %Shape: Wave [id:dp6861819205798203]
% \draw  [line width=1.5]  (74.01,2438.8) .. controls (76.55,2439.74) and (78.8,2446.31) .. (81.14,2453.18) .. controls (83.66,2460.61) and (86.08,2467.69) .. (88.89,2467.69) .. controls (91.69,2467.69) and (94.11,2460.61) .. (96.64,2453.18) .. controls (99.16,2445.76) and (101.58,2438.68) .. (104.39,2438.68) .. controls (107.19,2438.68) and (109.61,2445.76) .. (112.14,2453.18) .. controls (114.66,2460.61) and (117.08,2467.69) .. (119.89,2467.69) .. controls (122.69,2467.69) and (125.11,2460.61) .. (127.64,2453.18) .. controls (130.16,2445.76) and (132.58,2438.68) .. (135.39,2438.68) .. controls (138.19,2438.68) and (140.61,2445.76) .. (143.14,2453.18) .. controls (145.66,2460.61) and (148.08,2467.69) .. (150.89,2467.69) .. controls (153.69,2467.69) and (156.11,2460.61) .. (158.64,2453.18) .. controls (161.16,2445.76) and (163.58,2438.68) .. (166.39,2438.68) .. controls (169.19,2438.68) and (171.61,2445.76) .. (174.14,2453.18) .. controls (176.66,2460.61) and (179.08,2467.69) .. (181.89,2467.69) .. controls (184.69,2467.69) and (187.11,2460.61) .. (189.64,2453.18) .. controls (192.16,2445.76) and (194.58,2438.68) .. (197.39,2438.68) .. controls (200.19,2438.68) and (202.61,2445.76) .. (205.14,2453.18) .. controls (207.66,2460.61) and (210.08,2467.69) .. (212.89,2467.69) .. controls (215.69,2467.69) and (218.11,2460.61) .. (220.64,2453.18) .. controls (221.77,2449.86) and (222.88,2446.61) .. (224.01,2444.03) ;
% %Straight Lines [id:da40237113944407055]
% \draw [line width=1.5]    (246.01,2361.69) -- (224.01,2444.69) ;
% %Shape: Wave [id:dp8229409419034717]
% \draw  [color={rgb, 255:red, 0; green, 0; blue, 0 }  ,draw opacity=1 ][line width=1.5]  (246.01,2361.66) .. controls (247.49,2358.13) and (249.01,2355.68) .. (250.66,2355.68) .. controls (253.46,2355.68) and (255.88,2362.76) .. (258.41,2370.18) .. controls (260.93,2377.61) and (263.35,2384.69) .. (266.16,2384.69) .. controls (268.96,2384.69) and (271.38,2377.61) .. (273.91,2370.18) .. controls (276.43,2362.76) and (278.85,2355.68) .. (281.66,2355.68) .. controls (284.46,2355.68) and (286.88,2362.76) .. (289.41,2370.18) .. controls (291.93,2377.61) and (294.35,2384.69) .. (297.16,2384.69) .. controls (299.96,2384.69) and (302.38,2377.61) .. (304.91,2370.18) .. controls (307.43,2362.76) and (309.85,2355.68) .. (312.66,2355.68) .. controls (315.46,2355.68) and (317.88,2362.76) .. (320.41,2370.18) .. controls (322.93,2377.61) and (325.35,2384.69) .. (328.16,2384.69) .. controls (330.96,2384.69) and (333.38,2377.61) .. (335.91,2370.18) .. controls (338.43,2362.76) and (340.85,2355.68) .. (343.66,2355.68) .. controls (346.46,2355.68) and (348.88,2362.76) .. (351.41,2370.18) .. controls (353.93,2377.61) and (356.35,2384.69) .. (359.16,2384.69) .. controls (361.96,2384.69) and (364.38,2377.61) .. (366.91,2370.18) .. controls (369.43,2362.76) and (371.85,2355.68) .. (374.66,2355.68) .. controls (377.46,2355.68) and (379.88,2362.76) .. (382.41,2370.18) .. controls (383.92,2374.63) and (385.39,2378.95) .. (386.93,2381.7) ;
% %Straight Lines [id:da4324707387669846]
% \draw [line width=1.5]    (386.6,2381.3) -- (400.93,2428.3) ;
% %Shape: Wave [id:dp8483414616667126]
% \draw  [line width=1.5]  (401.01,2427.36) .. controls (402.1,2429.41) and (403.22,2430.69) .. (404.42,2430.69) .. controls (407.22,2430.69) and (409.64,2423.61) .. (412.17,2416.18) .. controls (414.69,2408.76) and (417.11,2401.68) .. (419.92,2401.68) .. controls (422.72,2401.68) and (425.14,2408.76) .. (427.67,2416.18) .. controls (430.19,2423.61) and (432.61,2430.69) .. (435.42,2430.69) .. controls (438.22,2430.69) and (440.64,2423.61) .. (443.17,2416.18) .. controls (445.69,2408.76) and (448.11,2401.68) .. (450.92,2401.68) .. controls (453.72,2401.68) and (456.14,2408.76) .. (458.67,2416.18) .. controls (458.78,2416.52) and (458.89,2416.85) .. (459.01,2417.18) ;
% %Shape: Wave [id:dp5227589453570192]
% \draw  [line width=1.5]  (472.01,2417) .. controls (472.21,2416.4) and (472.42,2415.79) .. (472.63,2415.18) .. controls (475.15,2407.76) and (477.57,2400.68) .. (480.38,2400.68) .. controls (483.18,2400.68) and (485.6,2407.76) .. (488.13,2415.18) .. controls (490.65,2422.61) and (493.07,2429.69) .. (495.88,2429.69) .. controls (498.68,2429.69) and (501.1,2422.61) .. (503.63,2415.18) .. controls (506.15,2407.76) and (508.57,2400.68) .. (511.38,2400.68) .. controls (514.18,2400.68) and (516.6,2407.76) .. (519.13,2415.18) .. controls (521.65,2422.61) and (524.07,2429.69) .. (526.88,2429.69) .. controls (529.68,2429.69) and (532.1,2422.61) .. (534.63,2415.18) .. controls (536,2411.15) and (537.34,2407.22) .. (538.73,2404.47) ;
% %Shape: Wave [id:dp02294763625655083]
% \draw  [line width=1.5]  (458.01,2412.79) .. controls (459.3,2414.61) and (460.43,2439.68) .. (461.61,2465.96) .. controls (462.83,2493.26) and (464,2519.24) .. (465.36,2519.24) .. controls (466.71,2519.24) and (467.88,2493.26) .. (469.11,2465.96) .. controls (470.07,2444.52) and (471,2423.89) .. (472.01,2416.03) ;
% %Straight Lines [id:da29592436930278]
% \draw [line width=1.5]  [dash pattern={on 1.69pt off 2.76pt}]  (539.23,2398.24) -- (576.84,2398.24) ;
% %Straight Lines [id:da9208062392621394]
% \draw [line width=1.5]  [dash pattern={on 1.69pt off 2.76pt}]  (544.01,2416.69) -- (575.34,2417.2) ;
% %Straight Lines [id:da08164886136853022]
% \draw  [dash pattern={on 0.84pt off 2.51pt}]  (459.04,2398.24) -- (459.04,2333.47) ;
% %Straight Lines [id:da7706943602543304]
% \draw  [dash pattern={on 0.84pt off 2.51pt}]  (471.04,2398.24) -- (471.04,2332.81) ;
% %Straight Lines [id:da12737017272289086]
% \draw    (584.67,2377.47) -- (584.67,2398.14) ;
% \draw [shift={(584.67,2398.14)}, rotate = 270] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (0,5.59) -- (0,-5.59)(10.93,-4.9) .. controls (6.95,-2.3) and (3.31,-0.67) .. (0,0) .. controls (3.31,0.67) and (6.95,2.3) .. (10.93,4.9)   ;
% %Straight Lines [id:da7394930259057153]
% \draw    (585.75,2435.81) -- (584.67,2417.14) ;
% \draw [shift={(584.67,2417.14)}, rotate = 86.69] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (0,5.59) -- (0,-5.59)(10.93,-4.9) .. controls (6.95,-2.3) and (3.31,-0.67) .. (0,0) .. controls (3.31,0.67) and (6.95,2.3) .. (10.93,4.9)   ;
% %Straight Lines [id:da10667775327896911]
% \draw    (440.41,2324.14) -- (457.33,2324.14) ;
% \draw [shift={(457.33,2324.14)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (0,5.59) -- (0,-5.59)(10.93,-4.9) .. controls (6.95,-2.3) and (3.31,-0.67) .. (0,0) .. controls (3.31,0.67) and (6.95,2.3) .. (10.93,4.9)   ;
% %Straight Lines [id:da12592400209364607]
% \draw    (496.3,2324.14) -- (471.67,2324.14) ;
% \draw [shift={(471.67,2324.14)}, rotate = 360] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (0,5.59) -- (0,-5.59)(10.93,-4.9) .. controls (6.95,-2.3) and (3.31,-0.67) .. (0,0) .. controls (3.31,0.67) and (6.95,2.3) .. (10.93,4.9)   ;
% %Straight Lines [id:da5581274443249484]
% \draw    (77,2584.57) -- (224,2584.57) ;
% \draw [shift={(224,2584.57)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (0,5.59) -- (0,-5.59)   ;
% \draw [shift={(77,2584.57)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (0,5.59) -- (0,-5.59)   ;
% %Straight Lines [id:da019262930872751216]
% \draw    (240,2584.57) -- (386,2584.57) ;
% \draw [shift={(386,2584.57)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (0,5.59) -- (0,-5.59)   ;
% \draw [shift={(240,2584.57)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (0,5.59) -- (0,-5.59)   ;
% %Straight Lines [id:da7219845539458014]
% \draw    (391.08,2584.57) -- (459.5,2584.57) ;
% \draw [shift={(459.5,2584.57)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (0,5.59) -- (0,-5.59)   ;
% \draw [shift={(391.08,2584.57)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (0,5.59) -- (0,-5.59)   ;
% %Straight Lines [id:da017762059002189856]
% \draw    (471,2585.05) -- (536.24,2585.05) ;
% \draw [shift={(536.24,2585.05)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (0,5.59) -- (0,-5.59)   ;
% \draw [shift={(471,2585.05)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (0,5.59) -- (0,-5.59)   ;
% %Straight Lines [id:da9617091303495224]
% \draw    (77,2616.57) -- (230,2616.38) ;
% \draw [shift={(230,2616.38)}, rotate = 179.93] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (0,5.59) -- (0,0)   ;
% \draw [shift={(77,2616.57)}, rotate = 179.93] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (0,5.59) -- (0,-5.59)   ;
% %Straight Lines [id:da3600040071261601]
% \draw    (230,2616.38) -- (391.08,2616.57) ;
% % \draw    (391.08,2611.38) -- (391.08,2616.57) ;
% % \draw [shift={(391.08,2616.57)}, rotate = 180.07] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (0,5.59) -- (0,-0.09)   ;
% % \draw [shift={(230,2616.38)}, rotate = 180.07] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (0,5.59) -- (0,-0.09)   ;
% %Straight Lines [id:da8499502481675647]
% \draw    (391.08,2616.57) -- (536.2,2616.57) ;
% \draw [shift={(536.2,2616.57)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (0,5.59) -- (0,-5.59)   ;
% \draw [shift={(391.08,2616.57)}, rotate = 180] [color={rgb, 255:red, 0; green, 0; blue, 0 }  ][line width=0.75]    (0,5.59) -- (0,0)   ;
% %Shape: Brace [id:dp5605576433350821]
% \draw   (533,2576.5) .. controls (533,2571.83) and (530.67,2569.5) .. (526,2569.5) -- (473.48,2569.5) .. controls (466.81,2569.5) and (463.48,2567.17) .. (463.48,2562.5) .. controls (463.48,2567.17) and (460.15,2569.5) .. (453.48,2569.5)(456.48,2569.5) -- (397,2569.5) .. controls (392.33,2569.5) and (390,2571.83) .. (390,2576.5) ;

% % Text Node
% \draw (608.87,2406.3) node   [align=left] {\begin{minipage}[lt]{27.79pt}\setlength\topsep{0pt}
% $\displaystyle \leq \gamma $
% \end{minipage}};
% % Text Node
% \draw (471.65,2309.88) node   [align=left] {\begin{minipage}[lt]{27.66pt}\setlength\topsep{0pt}
% $\displaystyle \leq \epsilon $
% \end{minipage}};
% % Text Node
% \draw (258,2623.24) node [anchor=north west][inner sep=0.75pt]   [align=left] {A partition of domain $\cP$};
% % Text Node
% \draw (142,2549) node   [align=left] {\begin{minipage}[lt]{21.76pt}\setlength\topsep{0pt}
% $\displaystyle \hat{A}$\textsubscript{1}
% \end{minipage}};
% % Text Node
% \draw (325.67,2549) node   [align=left] {\begin{minipage}[lt]{21.76pt}\setlength\topsep{0pt}
% $\displaystyle \hat{A}$\textsubscript{2}
% \end{minipage}};
% % Text Node
% \draw (468.67,2549) node   [align=left] {\begin{minipage}[lt]{21.76pt}\setlength\topsep{0pt}
% $\displaystyle \hat{A}$\textsubscript{3}
% \end{minipage}};
% % Text Node
% \draw (148,2602.05) node   [align=left] {\begin{minipage}[lt]{21.76pt}\setlength\topsep{0pt}
% $\displaystyle \tilde{A}$\textsubscript{1}
% \end{minipage}};
% % Text Node
% \draw (330,2602.05) node   [align=left] {\begin{minipage}[lt]{21.76pt}\setlength\topsep{0pt}
% $\displaystyle \tilde{A}$\textsubscript{2}
% \end{minipage}};
% % Text Node
% \draw (471.21,2602.33) node   [align=left] {\begin{minipage}[lt]{21.76pt}\setlength\topsep{0pt}
% $\displaystyle \tilde{A}$\textsubscript{3}
% \end{minipage}};
% % Text Node
% \draw (46,2307.9) node [anchor=north west][inner sep=0.75pt]    {$l_{t}$};


% \end{tikzpicture}
\includegraphics[scale=0.4]{images/approx-feedback.png}
\caption{A depiction of $(\epsilon,\gamma)$-approximate feedback (Definition \ref{def:fb}) for a one dimensional loss function. Here, the true loss $l_t$ is given by the solid curve, and approximate loss $\Tilde{l}_t$ is piecewise constant.\label{fig:approx-feedback}}

\end{figure}

\citet{balcan2021data} show that the dual loss $l(\rho)$ is a piecewise constant function of the graph hyperparameter $\rho$, for any fixed problem instance. Suppose the problem instances arrive {\it online} in rounds $t=1,\dots,T$, and the learner receives some feedback about her predicted parameter $\rho_t$. A standard performance metric for the online learner is her expected regret, $R_T=\E\left[\sum_{t=1}^Tl_t(\rho_t) -\min_{\rho\in\cP}\sum_{t=1}^Tl_t(\rho)\right]$.  Since the {\it full information} setting where all the labels are revealed is not very practical (it assumes all labels of previous problem instances are available), \citet{balcan2021data} also consider the more realistic semi-bandit feedback setting of \citet{dick2020semi}, where only the loss corresponding to hyperparameter $\rho_t$
selected by the online learner in round $t$ is revealed, along with the end points of the piece $A_t$ containing $\rho_t$ where $l_t$ is constant. We consider a generalization of this setting where only an approximation to the loss value at $\rho_t$ is revealed, along with an approximation to the piece $A_t$.\looseness-1

Although our formulation below is motivated by considerations for graph parameter tuning for semi-supervised learning, we provide very general definitions and results that apply to approximate online data-driven parameter selection more generally \citep{balcan2020data}.%, for online learning with partial feedback.\looseness-1

\begin{definition}%[$(\epsilon,\gamma)$-approximate semi-bandit feedback] 
An online optimization problem with loss functions $l_1,l_2,\dots$ is said to have \textbf{$(\epsilon,\gamma)$-approximate semi-bandit feedback} with system size $M$ if for each time $t=1,2,\dots$, there is a partition $\Tilde{A}_t^{(1)},\dots,\Tilde{A}_t^{(M)}$ of the parameter space $\cP\subset\R^d$, called an approximate feedback system, such that if the learner plays point $\rho_t \in \Tilde{A}_t^{(i)}$, she observes the approximate feedback set  $\Tilde{A}_t^{(i)}$%such that $\Big\lvert\text{Vol}\left(A_t^{(i)}\setminus\Tilde{A}_t^{(i)} \bigcup \Tilde{A}_t^{(i)}\setminus{A}_t^{(i)}\right)\Big\lvert\le \epsilon$
, and observes approximate loss $\Tilde{l}_t(\rho)$ for all $\rho\in\Tilde{A}_t^{(i)}$ such that $\sup_{\rho\in \hat{A}_t^{(i)}}|\Tilde{l}_t(\rho)-l_t(\rho)|\le \gamma$, for some (unknown) $\hat{A}_t^{(i)}\subseteq \Tilde{A}_t^{(i)}$ with $\Big\lvert\vol\left(\Tilde{A}_t^{(i)}\setminus\hat{A}_t^{(i)}\right)\Big\rvert \le \epsilon$. Here $\vol(A)$ denotes the $d$-dimensional volume of set $A$. We let $\Tilde{A}_t(\rho)$ denote the approximate feedback set that contains $\rho\in\cP$.\looseness-1\label{def:fb}
\end{definition}



For example, let the parameter space $\cP$ be one-dimensional, and in round $t$ the learner plays point $\rho_t\in\cP$. Now suppose the approximate loss functions are also piecewise constant with pieces $\Tilde{A}_t^{(1)},\dots,\Tilde{A}_t^{(M)}$ that partition $\cP$, and she receives information about the constant piece $\Tilde{A}_t(\rho_t)$ containing the played point by receiving the end points of interval $\Tilde{A}_t$ %approximating the piece with $|\Tilde{A}_t|\ge |A_t(\rho_t)|-\epsilon$ 
and approximate loss value $\Tilde{l}_t$ for the observed piece $\Tilde{A}_t$ with $|\Tilde{l}_t-l_t|\le \gamma$ for most of the interval $\Tilde{A}_t$, except possibly finitely many small intervals with total length $\epsilon$, where $l_t$ is the true loss function. This satisfies the definition of $(\epsilon,\gamma)$-approximate semi-bandit feedback. See Figure \ref{fig:approx-feedback} for an illustration. This simple example captures the semi-supervised loss $l_{A(G(\rho),L,U)}$ (where in fact the true loss function is also piecewise constant \citep{balcan2021data}), but our analysis in this section applies to more general {\it piecewise-Lipschitz} loss functions, and for high dimensional Euclidean action space. This approximate feedback model generalizes the ``exact'' semibandit feedback model of \citet{dick2020semi} (which in turn generalizes the standard `full information' setting that corresponds to $M=1$) and is useful for cases where computing the exact feedback set or loss function is infeasible or computationally expensive. Our model also generalizes the approximate loss functions of \citet{balcan2020refined} where positive results (data-dependent generalization guarantees) are shown for $(0,\gamma)$-approximate full-information ($M=1$) feedback in the distributional setting. This extension is crucial for applying our techniques of efficient graph learning by computing approximate loss values for the learned graph.


\begin{algorithm}
\caption{$\textsc{Approximate Continuous Exp3-Set}(\lambda)$}
\label{algorithm: semibandit}
\begin{algorithmic}[1]
\STATE {\bfseries Input:} step size $\lambda\in[0,1]$.
\STATE {Initialize $w_1(\rho)=1$ for all $\rho\in\cP$}.
\FOR{$t=1,\dots,T$}
\STATE{Sample $\rho_t$ according to $p_t(\rho)=\frac{w_t(\rho)}{\int_{\cP}w_t(\rho)d\rho}$.}

\STATE{Play $\rho_t$ and suffer loss $l_t(\rho_t)$.}
\STATE{Observe $(\epsilon,\gamma)$-approximate feedback $\Tilde{l}_t(\rho)$ over set $\Tilde{A}_t$ with $\rho_t\in \Tilde{A}_t$}
\STATE{Update $w_{t+1}(\rho)=w_t(\rho)\exp(-\lambda \hat{l}_t(\rho))$, where $\hat{l}_t(\rho)=\frac{\I\{\rho\in\Tilde{A}_t\}}{\int_{\Tilde{A}_t}p_t(\rho)d\rho}\Tilde{l}_t(\rho)$.}
\ENDFOR
\end{algorithmic}
\end{algorithm}
% Two main results (informal):
\iffalse
Informal summary
\begin{itemize}
    \item Suppose we have piecewise constant loss functions at each round. The  $\ell_\infty$ distance of the observed loss to the true loss function is bounded by $\epsilon$ (i.e. $\sup_\rho|\Tilde{u_j}(\rho)-u_j(\rho)|$), for an ``$\epsilon$-approximation'' of the feedback set $F$ (i.e. receives closed, compact $\tilde{F}$ such that $|\text{Vol}(\tilde{F})-\text{Vol}(F)|\le \epsilon$) for the played point. If $\epsilon=T^{-\beta'}$, we can show the regret for the approximate exponential forecaster is $\tilde{O}(T^{1-\min\{\beta,\beta'\}})$. This result also allows us to implement efficient approximate ERMs for the distributional setting (Section 7 of \citet{balcan2018dispersion}).
    % \item Composition of 2 parametric algorithms. If algorithm $A$ is a composition of A1 and A2 ... ; write as piecewise structured functions, generalize G from real valued to arbitrary sets.
\end{itemize}


\red{Can this be made adaptive? Doubling trick to avoid knowledge of $T$?}
\fi

We give a general online learning algorithm in the presence of approximate semi-bandit feedback, and we show that our algorithm achieves sub-linear regret bounds. In particular, our results indicate how the approximation in the loss function impacts the regret of our algorithm and provide a way to quantify the accuracy-efficiency trade-off (better loss approximation can improve regret in Theorem \ref{thm:approx-feedback}, but at the cost of efficiency in Theorems \ref{thm:harmbound}, \ref{thm:delbound}).

\begin{theorem}\label{thm:approx-feedback}
Suppose $l_1,\dots,l_T:\cP\rightarrow[0,1]$ is a sequence of $\beta$-dispersed loss functions, and the domain  $\cP\subset\R^d$ is contained in a ball of radius $R$. The Approximate Continuous Exp3-Set algorithm (Algorithm \ref{algorithm: semibandit}) achieves expected regret $\tilde{O}(\sqrt{dMT\log(RT)}+T^{1-\min\{\beta,\beta'\}})$ with access to $(\epsilon,\gamma)$-approximate semi-bandit feedback with system size $M$, provided $\gamma \le T^{-\beta'},\epsilon\le {\vol(\cB(T^{-\beta}))} T^{-\beta'}$, where $\cB(r)$ is a $d$-ball of radius $r$.
\end{theorem}

\begin{proofoutline}
    We adapt the $\textsc{Continuous-Exp3-SET}$ analysis of \citet{alon2017nonstochastic,dick2020semi}. %, and uses similar arguments to the robust meta-learning guarantees in \citet{balcan2021learning} where the learner observes a perturbed version of the true loss function although in the full information setting. 
Define weights $w_t(\rho)$ over the parameter space $\cP$ as $w_{1}(\rho)=1$ and $w_{t+1}(\rho)=w_{t}(\rho)\exp(-\lambda\hat{l}_t(\rho))$ and normalized weights $W_t=\int_{\cP}w_t(\rho)d\rho$. Note that $p_t(\rho)=\frac{w_t(\rho)}{W_t}$. We give upper and lower bounds on  the quantity $\E[\log W_{T+1}/W_{1}]$, i.e. the expected value of the log-ratio of normalized weights, and bound the slackness induced in these bounds due to $(\epsilon,\gamma)$-approximate feedback. Our analysis shows that, provided the error terms $\epsilon,\gamma$ are sub-constant in $T$ as stated, we achieve sublinear expected regret. \red{forward reference the implication on runtime complexity; potentially add more detail to sketch}
\end{proofoutline}

In Theorem \ref{thm:approx-feedback}, $\beta'$ measures the net impact of approximate feedback on the regret of Algorithm \ref{algorithm: semibandit}. In particular, it shows that approximation can affect regret when ($\gamma,\epsilon$ are such that) $\beta'<\beta$ and $\beta'<\frac{1}{2}$. The bound in Theorem \ref{thm:approx-feedback} is good for sufficiently small $\gamma,\epsilon$. However, very small $\gamma,\epsilon$ can come at the expense of speed. In more detail, our results in Section 5 discuss how approximate feedback can be algorithmically implemented and used to obtain faster runtime (runtime bounds are weaker for smaller $\epsilon$). Together, the results quantify an accuracy-efficiency trade-off, and indicate how to set the approximation parameters to improve the efficiency (of graph hyperparameter tuning) without sacrificing the accuracy.\looseness-1


\red{add couple more remarks, e.g. compare with DP approach of BSV18..}

% \red{TODO: replace proof by sketch and move proof to appendix}





% TODO: {\it Comparison to \citet{balcan2020refined}.} 
%TODO: Distributional under dispersion.




\section{Learning Sparse Graph Families}\label{sec:sparse}

Restricting (weighted) edges to the $k$-nearest neighbors to obtain a sparse graph is a well-known optimization for computational efficiency in semi-supervised learning \citep{delalleau2005efficient,wang2016scalable}. Here we will show that it also formally reduces the learning theoretic complexity, for the problem of graph hyperparameter tuning. Proofs from this section appear in Appendix A.%\ref{app:proof of approx feedback}.

We can upper bound the pseudodimension of the class of loss functions for sparse graph families, where only $k$ nearest neighbors are connected, for tunable parameter $k\le K$. This upper bound improves on the $O(n)$ bound from \citet{balcan2021data} since $K\le n$, and involves a more careful argument to bound the number of possible label patterns.\looseness-1


\begin{theorem}\label{thm:sigma-pdim}
The pseudo-dimension of $\mathcal{H}_{k,\sigma}$ is $O(K+\log n)$ when the labeling algorithm $A$ is the mincut approach of \citet{blum2001learning}.
\end{theorem}

\red{TODO: possibly replace proof by sketch and move proof to appendix}




The above bound gives a better sample complexity than for dense graphs, for which the pseudo-dimension is known to be $\Theta(n)$ \citep{balcan2021data}. We can also give upper bounds on the pseudo-dimension for $\mathcal{H}_{k,r}$, the $k$-nearest neighbor graph that adds edges only within an $r$-neighborhood, which implies the existence of sample-efficient and computationally efficient algorithms for learning the best graph parameter $\rho=(k,r)$ using standard results.


\begin{figure*}
\centering
\subfloat[Gradient Descent Only]{\includegraphics[width = 1.9in]{images/Convergence_GD.png}} 
\subfloat[Newton's Method Only]{\includegraphics[width = 1.9in]{images/Convergence_Newtons.png}}
\subfloat[Our Method]{\includegraphics[width = 1.9in]{images/Convergence_both.png}} 
\caption{An instance of a node $u$ for graph $G$ on a subset of the MNIST dataset, where finding local minima of $g_u(\sigma)=(f_u(\sigma)-\frac{1}{2})^2$ is challenging for both Gradient descent and Newton steps.}
\label{fig: gd+ns}
\end{figure*}

\begin{theorem}\label{thm:threshold}
The pseudo-dimension of $\mathcal{H}_{k,r}$ is $O(\log n)$ for any labeling algorithm $A$.
\end{theorem}

\red{TODO: possibly replace proof by sketch and move proof to appendix; replace O() by Theta()?}


Note that the lower bounds from \citet{balcan2021data} imply that the above bound is asymptotically tight. %Essentially this means that thr pseudo-dimension for learning $k$ and $r$ is within a constant factor of the pseudo-dimension of learning just the threshold $r$.
Our bounds in this section imply upper bounds on the number of problem instances needed for learning the best parameter values for the respective graph families \citep{balcan2020data} in the distributional setting. More precisely, we can bound the sample complexity of $(\epsilon,\delta)$-uniformly learning (Appendix A.1).\looseness-1

\begin{theorem}[\cite{anthony1999neural}] \label{thm:pdim-generalization}
 Suppose $\cH$ is a class of  functions $\cX\rightarrow[0,1]$ having pseudo-dimension $\textsc{PDim}(\cH)$. For every $\epsilon>0,\delta\in (0,1)$, the sample complexity of $(\epsilon,\delta)$-uniformly learning the class $\cH$ is $O\left(\frac{1}{\epsilon^2}\left(\textsc{PDim}(\cH)\ln\frac{1}{\epsilon}+\ln\frac{1}{\delta}\right)\right)$.
\end{theorem}

%We can also give online learning guarantees by extending corresponding results from \citet{balcan2021data}, which make mild assumptions on data smoothness.

\iffalse
We can also bound the pseudo-dimension of $H_{k,\sigma}$ for $k\in [K]$. %This implies sample complexity polynomial in $n$, the number of examples.

{\color{red}TODO(extension to harmonic objective): The above result also holds for the harmonic objective, but with a slightly more involved proof.}

{\color{red}Potential addition -- extension of lower bound in [BS21] to possibly give a matching lower bound here.}

Finally we note that the results of \citet{balcan2021data} imply that the pseudo-dimension of $H_{\sigma, r}$ is $\Omega(n)$. However, we can show dispersion in this case under mild assumptions on the smoothness of the data distribution.

\begin{restatable}{theorem}{thmdisp}\label{thm:dispersion}
Let $l_1,\dots, l_T:\R^2\rightarrow\R$ denote an independent sequence of losses as a function of parameter $\rho=(\sigma,r)$, when the graph is created using a thresholded exponential kernel $w(u,v)=e^{-\frac{d(u,v)^2}{\sigma^2}}\bI[d(u,v)\le r]$ and labeled by optimizing the quadratic objective $\sum_{u,v}w(u,v)(f(u)-f(v))^2$, where $d(\cdot,\cdot)$ is the distance metric. If $d(u,v)$ follows a $\kappa$-bounded distribution with a closed and bounded support, the sequence is $\frac{1}{2}$-dispersed, and the regret of Algorithm \ref{alg:ddssl}{\color{red}{TODO}: Add 2D version of the algorithm in [BS21]} may be upper bounded by $\Tilde{O}(\sqrt{T})$.
\end{restatable}

{\color{red}{TODO}: Write proof.}
\fi







\section{Scalability with Approximation Guarantees}\label{sec:algo}

We will now present and analyze an algorithm (Algorithm~\ref{algorithm: semi harmonic}) for computing approximate semi-bandit feedback for the dual semi-supervised loss $l(\sigma)$ over $\sigma\in[\sigma_{\min},\sigma_{\max}]$ (we assume the number of nearest neighbors $k$ is a fixed constant in the following), where $\sigma$ is the Gaussian bandwidth parameter (Def.~\ref{defn:g}). Our algorithm is a scalable version of Algorithm 4 of \citet{balcan2021data}. % mention local minima needs to be computed.
Our proposed approach involves two main modifications noted below. %We also provide theoretical guarantees in the presence of these modifications.



\begin{itemize}[leftmargin=*,topsep=4pt,partopsep=1ex,parsep=1ex]\itemsep=-4pt
    \item Our Algorithm \ref{algorithm: semi harmonic} uses approximate soft labels $f(\sigma)_\epsilon$ and gradients $\frac{\partial f}{\partial \sigma}_\epsilon$. %, and can be applied to any labeling algorithm where...
    We use the {\it conjugate gradient} method to compute these approximations, and provide implementations for the harmonic objective minimization approach of \citet{zhu2003semi}, as well as the efficient algorithm of \citet{delalleau2005efficient} with time complexity bounds (Algorithm \ref{algorithm: harmonic approx} and \hyperref[summary: delalleau approx]{\textsc{DBLR05Approx}} resp.\ below). %Algorithm \ref{algorithm: harmonic approx} simply replaces the matrix inverse computations in computing the soft label function and its derivative with respect to $\sigma$. Algorithm \ref{algorithm: delalleau approx}  applies this approach to the scalable algorithm of \citet{delalleau2005efficient}.
    % \item We use the conjugate gradient method to approximately compute matrix inverses when finding soft labels, and to compute approximate gradients respectively.  %TODO: empirical convergence? 
    \item We use the approximate gradients to locate points where $f(\sigma^*)=\frac{1}{2}$, corresponding to $\sigma$ values where the predicted label flips. We use these points to find $(\epsilon, \epsilon)$-approximate feedback sets. We propose using the smaller of the Newton step and the gradient descent step for better convergence to these points (line \ref{algSemiHarmonic-linemin} in Algorithm~\ref{algorithm: semi harmonic}; \citet{balcan2021data} use only Newton's method). We motivate this step by giving an example (from a real dataset) where the gradients are both too small and too large near the minima (Figure~\ref{fig: gd+ns}). This makes convergence challenging for both gradient descent and Newton's method, but the combination is effective even in this setting. We also give convergence guarantees and runtime bounds for Algorithm~\ref{algorithm: semi harmonic} in the presence of approximate gradients (Theorems \ref{thm:harmbound}, \ref{thm:delbound}).
\end{itemize}

We first describe how to instantiate the sub-routine $\cA$ to compute approximate soft labels in Algorithm \ref{algorithm: semi harmonic} (full details in Appendix B for the interested reader).

% \subsection{Guarantees for Approximation Algorithms \ref{algorithm: harmonic approx} and \ref{algorithm: delalleau approx}}

%TODO: cite icml15 paper.
% \begin{theorem}\label{thm:cg}
% All CG steps of Algorithms \ref{algorithm: harmonic approx} and \ref{algorithm: delalleau approx} need at most $O(e^{\tilde{O}(\sigma^2_{\max})}\log\frac{1}{\epsilon})$ iterations of the conjugate gradient method to give an $\epsilon$-approximate solution to the linear system on graph $G(\sigma)$ using the RBF kernel if $\sigma\le\sigma_{\max}$ for some fixed $\sigma_{\max}$.
% \end{theorem}

% \begin{proofoutline}
% First, by \citet{AXELSSON1976123}, we have that $T$ iterations of the CG method results in an $\epsilon$ approximation for $x$ when solving $Ax = b$ given $A \in \mathbb{R}^{N \times N}$ and $b \in \mathbb{R}^{N}$, where $$T \in O\left(\sqrt{\kappa (A)} \log (1 / \epsilon)\right).$$ Then, we note that it is shown by \citet{diederichs2019improved} that $$\kappa_G \leq e^{(\sigma^2 \pi^2) / 2}\left(10 + \frac{5\sigma}{\sqrt{\pi}}\right)$$ 
% for Gaussian kernal graphs, so  plugging this back into our initial estimate, we get 
% $$T \in O\left(e^{(\sigma^2 \pi^2) / 4}\sqrt{10 + \frac{5\sigma}{\sqrt{\pi}}}\log (1 / \epsilon )\right) \in O\left(e^{\tilde{O}(\sigma_{\max}^2)}\log (1 / \epsilon)\right)$$
% iterations of the main CG method to achieve an $\epsilon$ approximation when we assume $\sigma$ is bounded by some constant $\sigma_{\max}$.

% % Note: removed citations to \citet{vishnoi2012laplacian}\citet{shewchuk1994introduction} for condition number and instead use \citet{AXELSSON1976123}
% \end{proofoutline}

\phantomsection\label{summary: harmonic approx}
Algorithm~\ref{algorithm: harmonic approx} computes the soft label that optimizes the harmonic function objective \citep{zhu2003semi}, together with its gradient, for a given value of the graph parameter $\sigma$ and a fixed unlabeled node $u$. This is accomplished by running the conjugate gradient method for a given number of iterations to solve the linear systems corresponding to the harmonic function objective and its gradient.

\begin{algorithm}[h]
\caption{$\textsc{HarmonicApproximation}(G,f_L,u, \sigma,\epsilon)$}
\label{algorithm: harmonic approx}
\begin{algorithmic}[1]
\STATE {\bfseries Input:} Graph $G$ with labeled nodes $f_L$, unlabeled node $u$, query parameter $\sigma$, error tolerance $\epsilon$.
\STATE {\bfseries Output:} approximate soft label $f_{u, \epsilon}$ and approximate gradient $\frac{\partial f_u}{\partial \sigma}_\epsilon$.
\STATE{Let $\text{CG}(A, b, t)$ represent running the conjugate gradient method for $t$ iterations to solve  $Ax = b$.}
\STATE{Let $t_\epsilon$ indicate the number of iterations sufficient for $\epsilon$-approximation of $f_u(\sigma)\frac{\partial f_u}{\partial \sigma}$ (Theorem B.2, appendix).}
\STATE {Let $f_{U, \epsilon}(\sigma)=\text{CG}\left((I - P_{UU}), P_{UL}f_L, t_\epsilon\right)$, where $D_{ij}:=\bI[i=j]\sum_{k}W_{ik}, P=D^{-1}W$}.
\STATE{Let $\frac{\partial f}{\partial \sigma}_{\epsilon} \!=  \text{CG}\!\left((I-P_{UU}),\left(\frac{\partial P_{UU}}{\partial \sigma}\!f_{U,\epsilon}+\frac{\partial P_{UL}}{\partial \sigma}\!f_L\right)\!, t_\epsilon\!\right)$}, where
\begin{align*}
    \frac{\partial P_{ij}}{\partial \sigma}&=\frac{\frac{\partial w(i,j)}{\partial \sigma}-P_{ij}\sum_{k\in L+U}\frac{\partial w(i,k)}{\partial \sigma}}{\sum_{k\in L+U}w(i,k)},\\
    \frac{\partial w(i,j)}{\partial \sigma}&=\frac{2w(i,j)d(i,j)^2}{\sigma^3}.
\end{align*}
\RETURN{$f_{u, \epsilon}(\sigma), \frac{\partial f_u}{\partial \sigma}_\epsilon$.}
\end{algorithmic}
\end{algorithm}
% $\textsc{ZGL03Approx}(G,f_L,u, \sigma,\epsilon)$: This algorithm computes the soft label that optimizes the harmonic function objective \citep{zhu2003semi} and gradient for a given value of graph $G$ with parameter $\sigma$ for a fixed unlabeled node $u$ by running the conjugate gradient for some number of iterations. The full algorithm is presented as Algorithm 1 in Appendix B.1. The algorithm finds an $\epsilon$ approximation of $f_{u}(\sigma)\frac{\partial f_u}{\partial \sigma}$
% using $O\left(\sqrt{\kappa(I - P_{UU})}\log\left(\frac{n}{\epsilon \lambda_{\min}(I - P_{UU})}\right)\right)$ conjugate gradient iterations, where $\kappa(\cdot)$ denotes the condition number, and $P_{UU}$ is the "grounded" Laplacian of matrix $P$. Formal statement and proof is in the appendix (Theorem B.2). 

\phantomsection\label{summary: delalleau approx}
(Informal) $\textsc{DBLR05Approx}(G,f_L, i, \sigma, \epsilon)[\tilde{U},\lambda]$: This algorithm computes the soft label and gradient corresponding to the efficient algorithm of \citet{delalleau2005efficient} for a graph $G$ with parameter $\sigma$ for a fixed unlabeled node $i \in U$. Unlike Algorithm~\ref{algorithm: harmonic approx}, a matrix inverse is approximated via iterations of the CG method for the Laplacian of a small subset of unlabeled `training' nodes $\tilde{U} \subset U$ along with a set of labeled nodes $L$. The labels of $i \in U \setminus \tilde{U}$ (`testing' nodes) are determined by summing the labels of each $x \in \tilde{U} \cup L$, weighted by $W_{ij}(x, i)$. The full algorithm is presented as Algorithm 1 in Section B.3 of the appendix. The algorithm finds an $\epsilon$-approximation of $\tilde{f}_{u}(\sigma)\cdot\frac{\partial \tilde{f}_u}{\partial \sigma}$
using $O\left(\sqrt{\kappa(A)}\log\left(\frac{\lambda (|L_{\text{Labels}}| + |\tilde{U}_{\text{Labels}}|)}{\epsilon \sigma_{\min}\lambda_{\min}(A)}\right)\right)$ conjugate gradient iterations, where $\kappa$ is the condition number, $\tilde{U} \subset U$ is a small subset, and $\lambda$ is a hyperparameter. The formal statement and proof are in the appendix (Theorem B.3).\looseness-1 


\begin{algorithm}[h]
\caption{$\textsc{ApproxFeedbackSet}(G,f_L,\sigma_0,\epsilon, \eta, \cA)$}
\label{algorithm: semi harmonic}
\begin{algorithmic}[1] 
\STATE {\bfseries Input:} Graph $G$ with unlabeled nodes $U$, labels $f_L$, query parameter $\sigma_0$, error tolerance $\epsilon$, learning rate $\eta$, algorithm $\cA$ to estimate soft labels and derivatives at any $\sigma$ (e.g.  Algorithm \ref{algorithm: harmonic approx}).
\STATE {\bfseries Output:} Estimates for the piecewise constant interval containing $\sigma_0$, and the function value at $\sigma_0$.
\STATE{Initialize $\sigma_l=\sigma_h=\sigma_0$.}
\FORALL{$u\in U$}
\STATE{Initialize $n=0, \lambda_0 = 1, y_0=\sigma_0$.}
\WHILE{\label{algSemiHarmonic-linewhile} $|\sigma_{n+1}-\sigma_{n}|\ge \epsilon $}
\STATE{Compute $f_{u, \epsilon}(\sigma), \frac{\partial f_u}{\partial \sigma}_\epsilon$ as  $\cA(G,f_L,u,\sigma_n,\epsilon)$%, at $\sigma=\sigma_n$. %TODO with xxx iterations...
}
\STATE{Set $g_u(\sigma_n)=(f_{u, \epsilon}(\sigma_n)-\frac{1}{2})^2$, $g_u'(\sigma_n) = 2\left(f_{u, \epsilon}(\sigma_n)-\frac{1}{2}\right)\left(\frac{\partial f_u}{\partial \sigma}_{\epsilon}\right)$ .}
\STATE{$\xi_\text{GD} \leftarrow \eta  g_u'(\sigma_n)$; $\xi_\text{Newton} \leftarrow 2  \frac{g_u(\sigma_n)}{g_u'(\sigma_n)} $.}
\STATE{\label{algSemiHarmonic-linemin} $y_{n+1}=\sigma_n-\min\{\xi_\text{GD}, \xi_\text{Newton}\}.$}
\IF{ $\xi_\text{GD}<\xi_\text{Newton}$}
% \STATE{$y_{n+1}=\sigma_n-\text{Nesterov}$}
\STATE{\label{algSemiHarmonic-linenestupdate} $\lambda_{n + 1} = \frac{1 + \sqrt{1 + 4 \lambda^2_n}}{2}, \gamma_n = \frac{1 - \lambda_{n}}{\lambda_{n + 1}}, \sigma_{n + 1} = (1 - \gamma_n) y_{n + 1} + \gamma_ny_n$}
\ELSE
% \STATE{$y_{n+1}=\sigma_n-\text{Newton}$ \red{should we have an accelerated version of Newton's as well?}}
\STATE\label{algSemiHarmonic-linenewtupdate} {$\sigma_{n + 1} = y_{n + 1}$}
\ENDIF
\STATE{$n\leftarrow n+1$}
\ENDWHILE
% \STATE{$\sigma_u=\sigma_{n+1}$}
% \IF{$\sigma_u < \sigma_0$}
\STATE{\label{algSemiHarmonic-lineupdateint} $\sigma_l = \min \{\sigma_l, \sigma_{n+1}\}
, \sigma_h = \max\{\sigma_h, \sigma_{n+1}\}$.}
% \ENDIF
\ENDFOR
\RETURN{$[\sigma_l,\sigma_h]$, $f_\epsilon(\sigma_0)$.}
\end{algorithmic}
\end{algorithm}

% \subsection{Guarantees for Algorithm \ref{algorithm: semi harmonic}}

Our main result is the following guarantee on the performance of Algorithm \ref{algorithm: semi harmonic}, which captures the approximation-efficiency trade-off for the algorithm. Compared to the $\Tilde{O}(n^4)$ running time of the approach in \citet{balcan2021data}, our algorithm runs in time $\Tilde{O}(n^2)$ for sparse kNN graphs (i.e. $k$-nearest neighbors with small constant $k$). To achieve this speedup, we replace an $O(n^3)$ matrix inverse for a given unlabeled point with a fixed number of Conjugate Gradient iterations taking time $O(|E_G|)$, where $|E_G|$ is the number of edges for graph $G$ corresponding to the matrix being inverted. Combined with our general algorithm for approximate data-driven algorithm design (Theorem \ref{thm:approx-feedback}), we obtain $\Tilde{O}(\sqrt{T})$ expected regret for online graph parameter tuning with approximate semi-bandit feedback, provided we run Algorithm \ref{algorithm: semi harmonic} with $\epsilon\le\frac{1}{\sqrt{T}}$. For our proof, we will assume that the soft label function $f_u(\sigma)$ is convex and smooth (i.e. derivative w.r.t. $\sigma$ is Lipschitz continuous) for estimating the convergence rates. In Section \ref{sec:expt}, we observe that our algorithm works well in practice even when these assumptions on $f$ are not satisfied, and it would be interesting to extend our analysis to weaker assumptions on the soft label.

\begin{figure*}[t]
\centering
\subfloat[MNIST]{\includegraphics[width = 1.8in]{images/intervals/adjusted_MNIST_300_10_labels_seed_1_CG_20_kNN_6.png}}
\subfloat[Fashion-MNIST]{\includegraphics[width = 1.8in]{images/intervals/adjusted_FashionMNIST_300_10_labels_seed_4_CG_20_kNN_6.png}}
\subfloat[USPS]{\includegraphics[width = 1.8in]{images/intervals/adjusted_USPS_300_10_labels_seed_6_CG_20_kNN_6.png}}
\caption{Loss intervals calculated with approximate soft-labels via Algorithm \ref{algorithm: harmonic approx}, kNN = 6, $|U| = 300$. Blue line corresponds to true loss, black intervals are estimated constant loss intervals.}
\label{fig:mainint}
\end{figure*}

\begin{theorem}\label{thm:harmbound}
    Using Algorithm \ref{algorithm: harmonic approx} for computing $\epsilon$-approximate soft labels and gradients for the harmonic objective of \citet{zhu2003semi}, if  $f_u(\sigma)$ is convex and smooth, Algorithm \ref{algorithm: semi harmonic} computes $(\epsilon,\epsilon)$-approximate semi-bandit feedback for the semi-supervised loss $l(\sigma)$ %outputs the interval $[\sigma_{\min, \epsilon}, \sigma_{\max, \epsilon}]$ containing $\sigma$ (of the piecewise constant loss function $l(\sigma)$) up to $\epsilon$ accuracy (i.e., $|\sigma_{\min, \epsilon} - \sigma_{\min}| < \epsilon, |\sigma_{\max, \epsilon} - \sigma_{\max}| < \epsilon$) 
    in time $O\!\left(\!|E_G|n\sqrt{\kappa(\cL_{UU})}\log\!\left(\!\frac{n\Delta}{\epsilon \lambda_{\min}(\cL_{UU})}\!\right)\log\log\!\frac{1}{\epsilon}\!\right)$, where $|E_G|$ is the number of edges in graph $G$, $\cL_{UU}=I - P_{UU}$ is the normalized grounded graph Laplacian (with labeled nodes grounded), $\Delta=\sigma_{\max} - \sigma_{\min}$ is the size of the parameter range and $\kappa(M)=\frac{\lambda_{\max}(M)}{\lambda_{\min}(M)}$ denotes the condition number of matrix $M$.
\end{theorem}
% \red{TODO: assumes f(sigma) is convex; can we get a weaker-than-quadratic convergence for non-convex f?}
\begin{proofoutline}
As noted in \citet{balcan2021data}, the loss $l(\sigma)$ is discontinuous at $\sigma^*$ only if $f_{u}(\sigma^*) = \frac{1}{2}$. Algorithm~\ref{algorithm: semi harmonic} finds these critical points by finding roots/zeros of $\left(f_u(\sigma) - \frac{1}{2}\right)^2$. We show (Theorem C.1 in the appendix) that if $f$ is convex and smooth, Nesterov's accelerated descent \citep{nesterov1983method} quadratically converges to within $\epsilon$ of such roots, given $O(\frac{\epsilon}{\Delta})$-approximations of $f \frac{\partial f}{\partial \sigma}$ and $\left|\frac{\partial f}{\partial \sigma}\right| < \frac{1}{\epsilon \lambda_{\min}(G_A)}$. Newton's method converges quadratically to within $\epsilon$, given $\epsilon$-approximations of $f \frac{\partial f}{\partial \sigma}$ (Theorem C.2 in the appendix). We use Algorithm~\ref{algorithm: harmonic approx} to find suitable $\epsilon$-approximations of $f \frac{\partial f}{\partial \sigma}$ using $O\left(\sqrt{\kappa(\cL_{UU})}\log\left(\frac{n\Delta}{\epsilon \lambda_{\min}(\cL_{UU})}\right)\right)$ conjugate gradient iterations (Theorem B.2). We argue that if the derivative $\frac{\partial f}{\partial \sigma}$ is large (i.e.\ the condition on $\frac{\partial f}{\partial \sigma}$ for Theorem C.1 does not hold), then the Newton step will be less than $\epsilon$. Since the algorithm uses the smaller of the Newton and Nesterov updates, Algorithm~\ref{algorithm: semi harmonic} will terminate for any given $u \in U$. By quadratic convergence, we need $O(\log \log \frac{1}{\epsilon})$ iterations in Algorithm~\ref{algorithm: semi harmonic} for each of the $O(n)$ unlabeled elements. Finally, noting that the Conjugate Gradient method takes $O(|E_G|)$ time per iteration, we obtain the claimed bound on runtime.\looseness-1 %can  bound the final runtime complexity by the number of edges in the graph $G$, or number of nonzero entries in the matrix being inverted. 
This yields the desired bound.% of $O\!\left(\!|E_G|n\sqrt{\kappa(\cL_{UU})}\log\!\left(\!\frac{n\Delta}{\epsilon \lambda_{\min}(\cL_{UU})}\!\right)\log\log\!\frac{1}{\epsilon}\!\right)$.
% We present a proof similar to Theorem 14 in \citet{balcan2021data}.  First, we find an $\epsilon$-approximation of $f \frac{\partial f}{\partial \sigma}$ in $O\left(\sqrt{\kappa(\cL_{UU})}\log\left(\frac{n}{\epsilon' \lambda_{\min}(\cL_{UU})}\right)\right)$ CG steps using \hyperref[summary: harmonic approx]{\textsc{ZGL03Approx}}. Next, we argue that both Newton's method and Nesterov's accelerated gradient with a bounded derivative converge quadraticallly via Theorem's C.1 and C.2 in the appendix. 
% Using \hyperref[summary: harmonic approx]{\textsc{ZGL03Approx}}, we fi
% and Theorems C.1 and C.2 from the appendix to provide an overall time bound on the interval finding in Algorithm \ref{algorithm: semi harmonic} using $\epsilon$-approximations of the harmonic objective.
% We argue that Nesterov's accelerated gradient \citep{nesterov1983method} will converge given a bounded derivative, and if the derivative is very large, then the Newton's step will provide an upper bound of $\epsilon$ in the update step on either line \ref{algSemiHarmonic-linenestupdate} or \ref{algSemiHarmonic-linenewtupdate} of Algorithm \ref{algorithm: semi harmonic}, causing line \ref{algSemiHarmonic-linewhile} to return false and the algorithm to terminate. %Further, we define it in terms of the number of edges $E_G|$ to highlight time savings for kNN graphs. \\\\
\end{proofoutline}

The above analysis can be adapted to obtain the following guarantee for tuning $\sigma$ in the efficient algorithm of \citet{delalleau2005efficient}. While the above result guarantees a running time of $\Tilde{O}(n^2)$ for kNN graphs, learning the graph can be done even more efficiently for the scalable approach of \citet{delalleau2005efficient}. Their algorithm minimizes a proxy for the harmonic objective given by

$\frac{1}{2}\sum_{u,v\in\Tilde{U}}w(u,v)(f(u)-f(v))^2+\lambda\sum_{w\in L}(f(w)-y_w)^2,$

\noindent where $\Tilde{U}\subset U$ and $\lambda$ are hyperparameters. In particular, one chooses a small set $\Tilde{U}$ with $|\Tilde{U}|\ll n$ and efficiently extrapolates the harmonic labels on $\Tilde{U}$ to the rest of $U$ using a Parzen windows based extrapolation. As before, the success of this more efficient approach also depends on the choice of the graph $G$ used. Our Algorithm \ref{algorithm: semi harmonic} obtains good theoretical guarantees in this case as well, with appropriate choice of algorithm $\cA$ (namely \hyperref[summary: delalleau approx]{\textsc{DBLR05Approx}}).

\red{possibly replace by informal statement, change all the D to E for the experiment section appendix references, regenenrate the images used}



% Similar to theorem 5.1, but applies ot \hyperref new thing. Runs in time ____

\begin{theorem}[Informal]\label{thm:delbound}
Given an algorithm for computing $\epsilon$-approximate soft labels and gradients for the efficient semi-supervised learning algorithm of \citet{delalleau2005efficient} (\hyperref[summary: delalleau approx]{\textsc{DBLR05Approx}}),
% with hyperparameters $\Tilde{U}\subset U$ and $\lambda$,  if the soft label function $f_u(\sigma)$ is convex and smooth, 
Algorithm \ref{algorithm: semi harmonic} computes $(\epsilon,\epsilon)$-approximate semi-bandit feedback for the semi-supervised loss $l(\sigma)$ %outputs the interval $[\sigma_{\min, \epsilon}, \sigma_{\max, \epsilon}]$ containing $\sigma$ (of the piecewise constant loss function $l(\sigma)$) up to $\epsilon$ accuracy (i.e., $|\sigma_{\min, \epsilon} - \sigma_{\min}| < \epsilon, |\sigma_{\max, \epsilon} - \sigma_{\max}| < \epsilon$) 
    in time $O\!\!\left(\!|E_{G_{\tilde{U}}}|n\sqrt{\kappa(\cL_{A})}\!\log\!\!\left(\!\frac{\lambda(|L_{\text{Labels}}| + |\tilde{U}_{\text{Labels}}|)\Delta}{\epsilon \sigma_{\min}\lambda_{\min}(\cL_{A})}\!\right)\!\log\log\!\frac{1}{\epsilon}\!\right)$, where all values are defined as in \hyperref[summary: delalleau approx]{\textsc{DBLR05Approx}}.
    % $|E_{G_{\tilde{U}}}|$ is the number of edges in graph $G_{\tilde{U}}$, $\cL_{A}$ is the normalized grounded graph Laplacian, and $\tilde{U} \subseteq U$, where $U$ is the set of unlabeled examples and $|\Tilde{U}| \ll |U|$
    %TODO: add dellaleau algo in appendix to define A
    % , $\kappa(M)=\frac{\lambda_{\max}(M)}{\lambda_{\min}(M)}$ denotes the condition number of matrix $M$, $L_{\text{Labels}}$ and $\tilde{U}_{\text{Labels}}$ are sets of labels in \hyperref[summary: delalleau approx]{\textsc{DBLR05Approx}}.
    % For the delalleau function approximation (Algorithm \ref{algorithm: delalleau approx}), Algorithm \ref{algorithm: semi harmonic} outputs the interval $[\sigma_{\min, \epsilon}, \sigma_{\max, \epsilon}]$ containing $\sigma$ up to $\epsilon$ accuracy $(|\sigma_{\min, \epsilon} - \sigma_{\min}| < \epsilon, ...)$ in time \\$O(|E_G|\sqrt{\kappa(A)}\left(\log\left(\frac{\lambda(|L_{\text{Labels}}| + |\tilde{U}_{\text{Labels}}|)}{\epsilon \sigma_{\min}\lambda_{\min}(A)(\sigma_{\max} - \sigma_{\min})}}\right)\right))\log\log(\frac{1}{\epsilon}))$, where $|E_G|$ is the number of edges in graph $G$, where $L_{\text{Labels}}$ and $\tilde{U}_{\text{Labels}}$ are sets of labels as described in Algorithm \ref{algorithm: delalleau approx}.
\end{theorem}

\begin{proofoutline}
    The proof follows in the same manner as Theorem \ref{thm:harmbound}, except we now use \hyperref[summary: delalleau approx]{\textsc{DBLR05Approx}} to bound  the number of iterations of the CG method. 
\end{proofoutline}

We empirically observe (Figure 5, Appendix E) that sparsity (using only kNN edges, and nodes in $\tilde{U}$) results in well-conditioned matrices (bounded $\sqrt{\kappa(A)}$) in the considered parameter range $[\sigma_{\min},\sigma_{\max}]$.
%are reasonable within a bounded parameter domain, and we use these values to motivate choices for $\sigma_{\min}$ and $\sigma_{\max}$ Section \ref{sec:expt}.  
\begin{remark}
    In this work we have focused on efficient graph learning for the harmonic objective approach of \citet{zhu2003semi} and the efficient algorithm of \citet{delalleau2005efficient}. Developing approaches that work for other efficient algorithms from the literature \citep{sinha2009semi,wang2016scalable} constitutes interesting future work.
\end{remark}


\section{Experiments}\label{sec:expt}

% In this section we evaluate computational speedups as a result of using the conjugate gradient method and implement Algorithm \ref{algorithm: semi harmonic} to compute pieces of the loss function under different labeling techniques. The optimal parameters can differ greatly across subsets and dataset used (\citet{balcan2021data}, Figure 1 in the appendix). We motivate use of the conjugate gradient method on grounded graph Laplacians for fast matrix inversion by showing that the CG method can find soft labels for unlabeled data with similar accuracy to full inversion (Tables 1, 2, and 3 in the appendix) while saving significant time across different datasts and matrix sizes. We motivate design choices in Algorithm \ref{algorithm: semi harmonic} by examining $(\epsilon,\epsilon)$-approximate semi-bandit feedback for the semi-supervised loss $l(\sigma)$ produced by the algorithm. Finally, we show that Algorithm \ref{algorithm: semi harmonic} is effective in practice at finding these pieces across settings, subset choices, and labeling techniques (Figure \ref{fig:mainint}; Figures 2, 3, 4 in the appendix), and provides large speedups over performing matrix inverses (Table \ref{table:regularIntervals}). \red{what processor it was run on}


% \noindent \textit{Setup}: We consider the task of semi-supervised binary classification (classes 0 and 1) on image datasets. As in \citet{delalleau2005efficient}, we pre-process data instances via Principal Component Analysis, keeping the first 45 principal components. We measure {\it distance} between any pairs of images by $L_2$ distance between principal components, and set weights via Gaussian Kernel parameterized by $\sigma$. %We predict labels by rounding fractional labels $f$. 
% When testing computational speedup using the CG method, we draw random subsets of the full dataset at varying sizes of $n$, with labeled set size $L = \frac{n}{10}$. When computing approximate matrix inverses, we use $t = 20$ conjugate gradient iterations.

%  \noindent\textit{Datasets}:
% We use three established benchmark image datasets -- MNIST, Fashion-MNIST, and USPS . Both the MNIST dataset (handwritten digits, \cite{726791}) and the FashionMNIST dataset (mock fashion items, \cite{xiao2017fashion}) consist of 28 by 28 grayscale images with 10 classes, and 6000 images per class.  The US Postal Service (USPS) dataset \citep{291440} has 7291 handwritten digits downscaled to 16 by 16 grayscale images.%, and is used as a benchmark in \citet{delalleau2005efficient} as well.
% % {'MNIST_': 1.2334856918619421, 'MNIST_/kNN': 3.3331210520435715, 'delalleau_MNIST_/delalleau': 30.148780419414205, 'FashionMNIST_': 0.8878777916287328, 'FashionMNIST_/kNN': 7.985769378126057, 'delalleau_FashionMNIST_/delalleau': 19.820095020866738, 'USPS_': 0.6618597312962814, 'USPS_/kNN': 2.2255248626073203, 'delalleau_USPS_/delalleau': 36.78950537338319}
% % {'MNIST_intervals/full_inv/kNN': 10.10865132705025, 'MNIST_intervals/full_inv/nokNN': 3.220042748949421, 'FashionMNIST_intervals/full_inv/kNN': 12.485808468828298, 'FashionMNIST_intervals/full_inv/nokNN': 3.5550738290035526, 'USPS_intervals/full_inv/kNN': 12.525875799796161, 'USPS_intervals/full_inv/nokNN': 3.474689617491605}

% % The CIFAR-10 dataset \citet{7298594} is made up of 3-color 32 by 32 images with classes such as plane and boat, however the feature maps used for $L_2$ distance were of size 128 by 8 by 8. \\\\

In this section we evaluate computational speedups as a result of using the conjugate gradient method and implement Algorithm \ref{algorithm: semi harmonic} to compute pieces of the loss function under different labeling techniques. 


\noindent \textit{Setup}: We consider the task of semi-supervised binary classification (classes 0 and 1) on image datasets. As in \citet{delalleau2005efficient}, we pre-process data instances via Principal Component Analysis, keeping the first 45 principal components. We measure {\it distance} between any pairs of images by $L_2$ distance between principal components, and set weights via Gaussian Kernel parameterized by $\sigma$. %We predict labels by rounding fractional labels $f$. 
When testing computational speedup using the CG method, we draw random subsets of the full dataset at varying sizes of $n$, with labeled set size $|L| = \frac{n}{10}$. When computing approximate matrix inverses, we use $t = 20$ conjugate gradient iterations.

 \noindent\textit{Datasets}:
We use three established benchmark image datasets -- MNIST, Fashion-MNIST, and USPS. Both the MNIST dataset (handwritten digits, \citealp{726791}) and the Fashion-MNIST dataset (mock fashion items, \citealp{xiao2017fashion}) consist of 28 by 28 grayscale images with 10 classes, and 6000 images per class. The US Postal Service (USPS) dataset \citep{291440} has 7291 handwritten digits downscaled to 16 by 16 grayscale images. For MNIST and USPS, binary classification between classes 0 and 1 corresponds to classifying between handwritten 0s and 1s. For Fashion-MNIST, it corresponds to classifying between classes T\_shirt and Trouser.\looseness-1


\subsection{Efficient Feedback Set Computation  (Algorithm \ref{algorithm: semi harmonic})}\label{sec:FeedbackSet}



\begin{table*}[h]
\centering
\caption{Time (in seconds) per Interval (TpI) and Number of Intervals ($M$) using both the Conjugate Gradient method (CG, $t=20$ iterations) and Matrix Inverse (MI) on full graphs or kNN graphs ($k = 6$) using Algorithm \ref{algorithm: semi harmonic}. Approximate soft labels are computed using either Algorithm \ref{algorithm: harmonic approx} or \hyperref[summary: delalleau approx]{\textsc{DBLR05Approx}}.}

\begin{tabular}{ |c|c||c|c|c|c||c|c|c|c||c|c| } 
\hline
\multicolumn{1}{|c|}{\multirow{3}{*}{Dataset}} & \multicolumn{1}{c||}{\multirow{3}{*}{Size}} & \multicolumn{4}{c||}{Algorithm \ref{algorithm: harmonic approx}} & \multicolumn{4}{c||}{Algorithm \ref{algorithm: harmonic approx} (kNN)} &  \multicolumn{2}{c|}{\hyperref[summary: delalleau approx]{\textsc{DBLR05Approx}} (kNN)} \\ \cline{3-12}


\multicolumn{1}{|c|}{} & \multicolumn{1}{c||}{} & \multicolumn{2}{c|}{TpI} & \multicolumn{2}{c||}{$M$} & \multicolumn{2}{c|}{TpI} & \multicolumn{2}{c||}{$M$} & \multicolumn{1}{c|}{TpI} & \multicolumn{1}{c|}{$M$} \\  \cline{3-12}

\multicolumn{1}{|c|}{} & \multicolumn{1}{c||}{} & \multicolumn{1}{c|}{CG} & \multicolumn{1}{c|}{MI} & \multicolumn{1}{c|}{CG} & \multicolumn{1}{c||}{MI} & \multicolumn{1}{c|}{CG} & \multicolumn{1}{c|}{MI} & \multicolumn{1}{c|}{CG} & \multicolumn{1}{c||}{MI} & \multicolumn{2}{c|}{CG} \\ \hline

\multirow{4}{4em}{MNIST}
& 100 &  \textbf{1.50} & 3.22  & 38.9 & 26.0 &  6.33 & 10.11 & 17.36 & 5.7 &  2.22 & 11.5\\
& 300  &  15.87 & 346.46 & 22.1 &  26.8 &  19.98 & 2405.70 & 22.1 & 7.9 &  \textbf{5.98} & 26.2\\
& 500  &  57.90 & 818.11  & 26.6 & 23.6 & 20.99  & 6791.79 & 26.6 & 7.9 & \textbf{7.60} & 36.1\\
& 12615  &  - & -  & - & - & -  & - & - & - & 46.36 & -\\
\hline
\multirow{4}{4em}{Fashion-MNIST} 
& 100  &  \textbf{1.79} & 3.56 & 39.0 & 23.9 & 11.79 & 12.49 & 18.55 & 8.3 &  2.91 & 12.7\\
& 300   &  11.73 & 268.20 & 45.6 & 38.9 & 21.08 & 1447.56 & 35.9 & 21.2 &  \textbf{7.24} & 33.6\\
& 500  &  39.98 & 766.73 & 50.9 & 37.6 & 35.91 & 6311.03 & 38.7 & 29.0 &  \textbf{9.21} & 44.6\\
& 11950  &  - & -  & - & - & -  & - & - & - & 32.3 & -\\
\hline
\multirow{4}{4em}{USPS} 
& 100  & \textbf{1.58} & 3.47 & 25.4 & 18.6 &  6.91 & 12.53 & 4.7 & {1.3} &  2.12 & 6.67\\
& 300  & 16.63 & 238.16 & 30.1 & 18.8 & 29.86 & 68.70 & 5.6 & {1.2} & \textbf{6.34} & 16.14 \\
& 500 & 57.50 & 755.94 & 39.1 & 17.8 & 63.18 & 31.54 & 6.1 & {1.0} &  \textbf{12.37} & 20.4\\
& 2149  &  - & -  & - & - & -  & - & - & - & 27.28 & -\\
\hline

\end{tabular}   
\label{table:regularIntervals}
\end{table*}

% \begin{table}[h]
% \centering
% \caption{Time (s) per Interval (TpI) and Number of Intervals (NI) using both the CG method (CG) and Full Inverse (Full) on kNN = 6 graphs using Algorithm \ref{algorithm: semi harmonic}. Labels computed with \hyperref[summary: harmonic approx]{\textsc{ZGL03Approx}}.}
% \begin{tabular}{ |c|c|c|c|c|c| } 
% \hline
% \multicolumn{1}{|c|}{\multirow{2}{*}{Dataset}} & \multicolumn{1}{c|}{\multirow{2}{*}{Size}} & \multicolumn{2}{c|}{TpI}                            & \multicolumn{2}{c|}{NI}                             \\ \cline{3-6} 
% \multicolumn{1}{|c|}{}                         & \multicolumn{1}{c|}{}                      & \multicolumn{1}{c|}{CG} & \multicolumn{1}{c|}{Full} & \multicolumn{1}{c|}{CG} & \multicolumn{1}{c|}{Full} \\ \hline
% \multirow{2}{4em}{MNIST}
% & 100  &  6.33 & 10.11 & 17.36 & 5.7\\
% & 300  &  19.98 & 2405.70 & 22.1 & 7.9\\
% & 500  & 20.99  & 6791.79 & 26.6 & 7.9 \\
% \hline
% \multirow{2}{4em}{Fashion-MNIST} 
% & 100  & 11.79 & 12.49 & 18.55 & 8.3\\
% & 300  &  21.08 & 1447.56 & 35.9 & 21.2\\
% & 500  & 35.91 & 6311.03 & 38.7 & 29.0 \\
% \hline
% \multirow{2}{4em}{USPS} 
% & 100  &  6.91 & 12.53 & 4.7 & 1.3\\
% & 300  & 29.86 & 68.70 & 5.6 & 1.2\\
% & 500  & 63.18 & 31.54 & 6.1 & 1.0\\
% \hline
% \end{tabular}   

% \label{table:kNNintervals}
% \end{table}

% \begin{table}[h]
% \centering
% \caption{Times for interval finding on kNN=6 graphs using Algorithm \ref{algorithm: semi harmonic}, computing labels with \hyperref[summary: delalleau approx]{\textsc{DBLR05Approx}}\\We let $|L| = 10, |\Tilde{U}| = 50$, and extend to $[\text{size} - 50]$ new points}
% \label{table:delIntervals}
% \begin{tabular}{ |c|c|c|c|c| } 
% \hline
% Dataset & Size & TpI & NI \\
% \hline
% \multirow{2}{4em}{MNIST}
% & 100  &  2.22 & 11.5\\
% & 300  &  5.98 & 26.2\\
% & 500  & 7.60 & 36.1\\
% \hline
% \multirow{2}{4em}{Fashion-MNIST} 
% & 100  &  2.91 & 12.7\\
% & 300  &  7.24 & 33.6\\
% & 500  &  9.21 & 44.6\\
% \hline
% \multirow{2}{4em}{USPS} 
% & 100  &  2.12 & 6.67\\
% & 300  &  6.34 & 16.14 \\
% & 500  &  12.37 & 20.4\\
% \hline
% \end{tabular}   
% \end{table}


% {'MNIST_': (1.2334856918619421, 5.956598508553343), 'MNIST_/kNN': (3.3331210520435715, 7.927338957171614), 'delalleau_MNIST_/delalleau': (30.148780419414205, 337.9461302440087), 'FashionMNIST_': (0.8878777916287328, 9.926759373714857), 'FashionMNIST_/kNN': (7.985769378126057, 37.691171257386785), 'delalleau_FashionMNIST_/delalleau': (19.820095020866738, 0.0), 'USPS_': (0.6618597312962814, 1.502348145871924), 'USPS_/kNN': (2.2255248626073203, 1.5598759695925264), 'delalleau_USPS_/delalleau': (36.78950537338319, 205.67373313570658)}
% {'MNIST_intervals/full_inv/kNN': (10.10865132705025, 3.10507952319194), 'MNIST_intervals/full_inv/nokNN': (3.220042748949421, 16.265651681859286), 'FashionMNIST_intervals/full_inv/kNN': (12.485808468828298, 8.299127092061404), 'FashionMNIST_intervals/full_inv/nokNN': (3.5550738290035526, 20.31007472044839), 'USPS_intervals/full_inv/kNN': (12.525875799796161, 2.839328078668921), 'USPS_intervals/full_inv/nokNN': (3.474689617491605, 19.471467742961227)}


% \noindent \textit{Setup:} 
We consider the problem of finding approximate intervals of the piecewise constant loss $l(\sigma)$ using Algorithm \ref{algorithm: harmonic approx} with the number of unlabeled points $n \in \{100, 300, 500\}$. By finding a set of these piecewise constant components, we are able to search the continuous parameter space exhaustively, with an optimal hyperparameter being any parameter in the loss interval with lowest loss. We do this for both the complete graph, as well as kNN with $k=6$, setting number of labeled examples $|L|=10$.
% NEWLY REMOVED
% The optimal parameters can differ greatly across subsets and dataset used (\citet{balcan2021data}, Figure 1 in the appendix). We motivate use of the conjugate gradient method on grounded graph Laplacians for fast matrix inversion by showing that the CG method can find soft labels for unlabeled data with similar accuracy to full inversion (Tables 1, 2, and 3 in the appendix) while saving significant time across different datasts and matrix sizes. We motivate design choices in Algorithm \ref{algorithm: semi harmonic} by examining $(\epsilon,\epsilon)$-approximate semi-bandit feedback for the semi-supervised loss $l(\sigma)$ produced by the algorithm. Finally, we show that Algorithm \ref{algorithm: semi harmonic} is effective in practice at finding these pieces across settings, subset choices, and labeling techniques (Figure \ref{fig:mainint}; Figures 2, 3, 4 in the appendix), and provides large speedups over performing matrix inverses (Table \ref{table:regularIntervals}).
%The first is to use \hyperref[summary: harmonic approx]{\textsc{ZGL03Approx}} to find soft labels for unlabeled data. The second is to use \hyperref[summary: harmonic approx]{\textsc{ZGL03Approx}} to find soft labels for unlabeled data while considering a kNN grounded Laplacian with $k = 6$ instead of the full grounded Laplacian for calculation. For both, we consider 10 labeled examples and $n$ unlabeled examples. 
We do the same with \hyperref[summary: delalleau approx]{\textsc{DBLR05Approx}} as algorithm $\cA$, with (uniformly random) subset $\Tilde{U}$ size 50 and hyperparameter $\lambda=1.4$.

We motivate design choices in Algorithm \ref{algorithm: semi harmonic} by examining $(\epsilon,\epsilon)$-approximate semi-bandit feedback for the semi-supervised loss $l(\sigma)$ produced by the algorithm.
% Finally, we use \hyperref[summary: delalleau approx]{\textsc{DBLR05Approx}} with 10 labeled points, subset $\Tilde{U}$ size 50, and the other $n - 50$ extra points labeled via the weighted distance function of \hyperref[summary: delalleau approx]{\textsc{DBLR05Approx}}. We use kNN graphs as in \citet{delalleau2005efficient}, \citet{zhu2003semi} with $k = 6$ for matrix $A$ in \hyperref[summary: delalleau approx]{\textsc{DBLR05Approx}}. 
For full implementation details, see Appendix E.1.

\noindent \textit{Results: } Figure \ref{fig:mainint} (as well as Figs.\ 2, 3, and 4 in the Appendix) indicates that the CG method can be used to find accurate piecewise intervals on real loss functions. We see over $10\times$ speedup for using the CG method as opposed to using matrix inversion for computing soft labels via Algorithm \ref{algorithm: harmonic approx} (Table \ref{table:regularIntervals}). This is due to the speedup in inversion time between a full matrix inverse and the CG method. For specific times for these two inversion techniques, see Tables 1--4 in the Appendix. When using \hyperref[summary: delalleau approx]{\textsc{DBLR05Approx}} to compute labels, we see a speedup of $100\times$ with slower asymptotic increase over time in comparison to the full inverse using Algorithm \ref{algorithm: harmonic approx}. Here the speedup is due to the fact that the CG method is only run for size 50 subsets of $U$, with $O(|\Tilde{U}|)$ algebraic steps afterwards to find soft labels/gradients for a given unlabeled point in $U$. %After considering the fact that the full inverse loss function produces slightly fewer intervals on our sigma range, the CG method is still orders of magnitude faster. 
We find that the calculation of feedback sets in kNN graphs takes longer to find a single interval on smaller data subsets, but the runtime asymptotically grows more slowly than for complete graphs. This is likely due to longer piecewise constant intervals for kNN graphs, leading to a larger number of matrix inverse calculations performed per interval (consistent with $M$ values in Table \ref{table:regularIntervals}). %This is evidenced by the smaller amount of intervals in sigma range $[1,7]$ for size 500 kNN graphs as opposed to size 500 full graphs in Table \ref{table:regularIntervals}. \\\\
%\noindent The matrix inverse (MI) loss function has slightly fewer intervals than the CG method loss function, indicating that there may be more sub-optimal intervals for the CG method, even while the optimal intervals of both are similar in accuracy. 
We also find that the CG method is more robust to higher condition number than the full inverse for low $\sigma$ values, indicating that the CG method is not only faster, but also can be more stable than full matrix inversion for ill-conditioned grounded Laplacians.\looseness-1 %[Seems too specific, could possibly be moved to appendix later] For USPS dataset, kNN graphs using full inverse, in the majority of cases we only have to find one interval. Via a code optimization (Appendix D.2), the algorithm can terminate early without finding an interval, which is what causes the outlier times.

\paragraph{CG Method Details.}%\label{sec:cgmethscal}
% \begin{figure}
% \centering
% \subfloat[MNIST]{\includegraphics[width = 2in]{images/Times_And_Accuracys/MNIST_times_kNN.png}} 
% \subfloat[Fashion-MNIST]{\includegraphics[width = 2in]{images/Times_And_Accuracys/FashionMNIST_times_kNN.png}}
% \subfloat[USPS]{\includegraphics[width = 2in]{images/Times_And_Accuracys/USPS_times_kNN.png}} \\
% \caption{Time Per Matrix Inverse at different sizes, kNN = 6}
% \label{fig:times}
% \end{figure}
% \begin{figure}
% \centering
% \subfloat[MNIST]{\includegraphics[width = 2in]{images/Times_And_Accuracys/MNIST_accuracys_kNN.png}}
% \subfloat[Fashion-MNIST]{\includegraphics[width = 2in]{images/Times_And_Accuracys/FashionMNIST_accuracys_kNN.png}} 
% \subfloat[USPS]{\includegraphics[width = 2in]{images/Times_And_Accuracys/USPS_accuracys_kNN.png}}
% \caption{Optimal Accuracy Values Across different sizes, kNN = 6}
% \label{fig:accuracys}
% \end{figure}
    We run $t$ iterations of the CG method to approximate  grounded Laplacian inverses for finding soft labels in Algorithm \ref{algorithm: harmonic approx}. We consider $t \in \{5,10, 20\}, \sigma' \in [1,7]$ when finding optimal values in terms of unlabeled classification accuracy. %, with $n=10$ subsets for each dataset.
    We use SciPy \citep{2020SciPy-NMeth} for performing the conjugate gradient method as well as storing nearest-neighbor matrices sparsely for time speedup. For all three datasets, we  find parameters to closely match/surpass the performance of the  harmonic solution with matrix inverse (i.e. prior work) for optimal $\sigma$ in both the complete graph and kNN setting with an order of magnitude or more speedup (Tables 1 and 2 in the appendix). We find that there is little time difference between $t = 5$, $10$, and $20$ conjugate gradient iterations. %, likely due to the fact that the majority of the time cost comes from initialization of the cg function in SciPy and not the actual iterative process. 
    We also find slight speedup %when calculating the CG method 
    for kNN graphs.
% \begin{figure}
% \subfloat[Algorithm \ref{algorithm: harmonic approx} \\ $|U| = 100$]{\includegraphics[width = 2in]{images/full_accuracys/normal.png}}
% \subfloat[Algorithm \ref{algorithm: harmonic approx} (kNN) \\ $|U| = 100$]{\includegraphics[width = 2in]{images/full_accuracys/kNN.png}}
% \subfloat[Algorithm \ref{algorithm: delalleau approx} (kNN) \\ $|\Tilde{U}| = 100, |U| = 1000$]{\includegraphics[width = 2in]{images/full_accuracys/delalleau.png}}
% \caption{Accuracy values across sigma/method using CG Method with 20 iterations}
% \label{fig:intervals}
% \end{figure}



\section{Conclusion}

We provide a general analysis for approximate data-driven parameter tuning in the presence of approximate feedback. We show how this approximate feedback may be efficiently implemented for learning the graph in semi-supervised learning using the conjugate gradient method, specifically when learning the `bandwidth' parameter in popularly used Gaussian RBF kernels. We further show the significance of using sparse nearest neighborhood graphs for semi-supervised learning -- formally they need provably fewer samples for learning compared to using dense or complete graphs, and moreover, in practice, they lead to better conditioned matrices for which our approach converges faster. 

We quantify the efficiency versus accuracy trade-off for our approach, and empirically demonstrate its usefulness in more efficiently learning the graph for classic harmonic objective based algorithms \citep{zhu2003semi,delalleau2005efficient}. We believe this is an important step in making the data-driven approach practical for semi-supervised learning, and would potentially be useful for making data-driven algorithm design more useful for other problems. Interesting future directions include extending this approach to learning the graph for modern graph neural network based methods (which still assume a given graph) for semi-supervised learning, and applications to other parameter tuning problems where exact feedback may be computationally expensive to obtain.\looseness-1


\section{Acknowledgement}

We thank Nina Balcan for suggesting the problem and for useful discussions. We also thank David Tang and Advait Nene for useful discussions.




\iffalse

\subsection{Authorship}
Reviewing is double-blind.
However, you can already fill in your author names and affiliations in the \verb|\author| block in the preamble following the example of the template because the class will remove it as long as the option \textsf{accepted} is not passed to the class.
Nevertheless, make sure any other information in the paper does not disclose your identity, for example URLs to supplementary material.

\subsection{Sectioning}
Three numbered sectioning commands are provided: \verb|\section|, \verb|\subsection|, and \verb|\subsubsection|.
Please respect their order, so do not put a \verb|\subsubsection| directly beneath a \verb|\section|.
One unnumbered sectioning command is provided, \verb|\paragraph|.
It can be used directly below any numbered section level.
Do not use any other sectioning commands.

\subsubsection{Typing the Section Titles}
{\color{red}The \verb|\section| and \verb|\subsection| titles are uppercased by the class.
Please type them in title case.
(This is used in the PDF bookmarks.)
Please also write the \verb|\subsubsection| titles in title case.}

\paragraph{What is title case?}
\href{https://en.wikipedia.org/wiki/Title_case}{Wikipedia} explains:
\begin{quote}
    Title case or headline case is a style of capitalization used for rendering the titles of published works or works of art in English.
    When using title case, all words are capitalized except for ‘minor’ words (typically articles, short prepositions, and some conjunctions) unless they are the first or last word of the title.
\end{quote}

\subsection{References, Citations, Footnotes}\label{sec:etc}
\subsubsection{Cross-Referencing}
Always use \verb|\label| and \verb|\ref|—or a command with a similar effect—when cross-referencing.
For example, this subsection is Section~\ref{sec:etc}.

\subsubsection{Citations}
{\color{red}Citations should include the author's last name and year.
They should be part of the sentence.
An example parenthetical citation: “Good introductions to the topic are available \citep{latexcompanion}.”
An example textual citation: “\citet{einstein} discusses electrodynamics of moving bodies.”
Do not use a parenthetical citation where a textual one is appropriate.
An example of what \emph{not} to do: “\citep{einstein} discusses electrodynamics of moving bodies.”

We strongly advise to use reference list software such as Bib\TeX{} and a citation package such as \textsf{natbib}.
The reference style you use should be compatible with the author-year citations.
Both the citation style and reference style used should be consistent.}

For the original submission, take care not to reveal the authors' identity through the manner in which one's own previous work is cited.
For example, writing
“I discussed electrodynamics of moving bodies before \citep{einstein}.” would be inappropriate, as it reveals the author's identity.
Instead, write “\citet{einstein} discussed electrodynamics of moving bodies.”

\subsubsection{Footnotes}
You can include footnotes in your text.\footnote{
    Use footnotes sparingly, as they can be distracting, having readers skip back and forth between the main text and the foot of the page.
}
The footnote mark should follow the fragment to which it refers, so a footnote\footnote{
    A footnote is material put at the foot of a page.
}
for a word has a footnote mark attached to that word and a footnote for a phrase or sentence has a footnote mark attached to the closing punctuation.









\section{Math}\label{sec:math}
The class file does not load any math support package like \textsf{amsmath}\footnote{%
  See the \textsf{amsmath} documentation at \url{https://ctan.org/pkg/amsmath} for further details.
}.
We advise using the \textsf{mathtools}\footnote{%
  See the \textsf{mathtools} documentation at \url{https://ctan.org/pkg/mathtools} for further details.
}
package, which extends \textsf{amsmath} with fixes and even more useful commands.
Feel free to load other support packages for symbols, theorems, etc.

Use the \textsf{amsmath} environments for displayed equations.
So, specifically, use the \texttt{equation} environment instead of \verb|$$...$$| and the \texttt{align} environment instead of \texttt{eqnarray}.\footnote{For reasons why you should not use the obsolete \texttt{eqnarray} environment, see Lars Madsen, \textit{Avoid eqnarray!} TUGboat 33(1):21--25, 2012.}
An \texttt{equation}:
\begin{equation}\label{eq:example}
  0 = 1 - 1.
\end{equation}
Two \texttt{align}'ed equations:
\begin{align*} % no numbers with starred version
  1 + 2 &= 3,\\
  1 - 2 &= -1.
\end{align*}
Equations can also be put inline, of course.
For example, Equation~\eqref{eq:example}: \(0=1+1\). % $0=1+1$ also works
(Notice that both inline and displayed math are part of the sentence, so punctuation should be added to displayed math.)

The \textsf{amsmath} and \textsf{mathtools} packages provide a lot of nice functionality, such as many common math operators, e.g., \(\sin\) and \(\max\), and also commands for defining new ones.


\section{Floats}\label{sec:floats}
Floats, such as figures, tables and algorithms, are moving objects and are supposed to float to the nearest convenient location.
Please do not force them to go in the middle of a paragraph.
They must respect the column width.

Two-column floats are possible.
They appear at the top of the next page, so strategic placement may be necessary.
For an example, see Figure~\ref{fig:tikz}.
They may not enter the margins.
\begin{figure*}
    \centering
    \begin{tikzpicture}[xscale=1.5]
        \coordinate (origin);
        \draw[->] (origin) -- +(1cm,0) node[below] {$x$};
        \draw[->] (origin) -- +(0,1cm) node[left] {$y$};
        \fill[gray] (45:1cm) circle[radius=.2cm];
    \end{tikzpicture}
    \caption{A Nice Filled Ellipse with a Pair of Coordinate Axes.}\label{fig:tikz}
\end{figure*}

All material in floats should be legible and of good quality.
So avoid very small or large text and pixelated or fuzzy lines.

\subsection{Figures}\label{sec:figures}
Figures should go in the \texttt{figure} environment and be centered therein.
The caption should go below the figure.
Use \verb|\includegraphics| for external graphics files but omit the file extension.
Supported formats are \textsf{pdf} (preferred for vector drawings and diagrams), \textsf{png} (preferred for screenshots), and \textsf{jpeg} (preferred for photographs).
Do not use \verb|\epsfig| or \verb|\psfig|.
If you want to scale the image, it is better to use a fraction of the line width rather than an explicit length.
For example, see Figure~\ref{fig:pitt}.

\begin{figure}[!htb]
  \centering
  \includegraphics[width=0.7\linewidth]{pitt}
  \caption{A View of a Nice City.}\label{fig:pitt}
\end{figure}

Do not use \verb|\graphicspath|.
If the images are contained in a subdirectory, specify this when you include the image, for example \verb|\includegraphics{figures/mypic}|.

\subsection{Tables}\label{sec:tables}
Tables should go in the \texttt{table} environment and be centered therein.
The caption should go above the table and be in title caps.
For an example, see Table~\ref{tab:data}.
\begin{table}
    \centering
    \caption{An Interesting Table.}\label{tab:data}
    \begin{tabular}{rl}
      \toprule % from booktabs package
      \bfseries Dataset & \bfseries Result\\
      \midrule % from booktabs package
      Data1 & 0.12345\\
      Data2 & 0.67890\\
      Data3 & 0.54321\\
      Data4 & 0.09876\\
      \bottomrule % from booktabs package
    \end{tabular}
\end{table}

\subsection{Algorithms}\label{sec:algorithms}
You can load your favorite algorithm package, such as \textsf{algorithm2e}\footnote{See the \textsf{algorithm2e} documentation at \url{https://ctan.org/pkg/algorithm2e}.}.
Use the environment defined in the package to create a centered float with an algorithm inside.

\section{Back Matter}
There are some final, special sections that come at the back of the paper, in the following order:
\begin{itemize}
  \item Author Contributions (optional)
  \item Acknowledgements (optional)
  \item References
\end{itemize}
They all use an unnumbered \verb|\subsubsection|.

For the first two special environments are provided.
(These sections are automatically removed for the anonymous submission version of your paper.)
The third is the ‘References’ section.
(See below.)

(This ‘Back Matter’ section itself should not be included in your paper.)


\begin{contributions} % will be removed in pdf for initial submission 
					  % (without ‘accepted’ option in \documentclass)
                      % so you can already fill it to test with the
                      % ‘accepted’ class option
    Briefly list author contributions. 
    This is a nice way of making clear who did what and to give proper credit.
    This section is optional.

    H.~Q.~Bovik conceived the idea and wrote the paper.
    Coauthor One created the code.
    Coauthor Two created the figures.
\end{contributions}

\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
    Briefly acknowledge people and organizations here.
    \red{Acknowledge Nina; cf. email}

    \emph{All} acknowledgements go in this section.
\end{acknowledgements}

\fi

% References
\bibliography{sharma_554}
\end{document}
