\documentclass[accepted,onecolumn]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

\usepackage{hyperref}       % hyperlinks
%\usepackage{xr}
%\externaldocument{suehiro_uai2022}
\usepackage{nameref} 
\usepackage{zref-xr}
\zxrsetup{toltxlabel} 
\zexternaldocument*{suehiro_124}

% Print equation numbers with an "A-" prefix (A-1, A-2, ...).
% NOTE(review): presumably to keep supplementary equation numbers distinct
% from those of the main paper -- confirm.
\renewcommand{\theequation}{A-\arabic{equation}}

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
\mathtoolsset{showonlyrefs=true}
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

\usepackage{amssymb}
%% The amsthm package provides extended theorem environments
%% \usepackage{amsthm}

%% The lineno packages adds line numbers. Start line numbering with
%% \begin{linenumbers}, end it with \end{linenumbers}. Or switch it on
%% for the whole article with \linenumbers.
%% \usepackage{lineno}

\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors

%\usepackage{jmlr2e}
\usepackage{times}
\usepackage{lscape}
%\usepackage[utf8]{inputenc} % allow utf-8 input
%\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
%\usepackage[pdftex]{hyperref}       % hyperlinks
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{caption}
\usepackage{enumitem}
%%%% mine %%%%%
\usepackage{amsfonts}
\usepackage{amsmath}
%\usepackage{tabularx}
\usepackage{braket}
\usepackage{boxedminipage}
\usepackage{epsf}
\usepackage{bm}
\usepackage{amsthm}
\usepackage{xspace}
\usepackage{wrapfig}
\usepackage{algorithm,algpseudocode}
\usepackage{multirow}
%\usepackage[numbers]{natbib}
%%%% natbib %%%%
%\newcommand{\citet}[1]
%{\citeauthor{#1}~\shortcite{#1}}
%\newcommand{\citep}{\cite}
%\newcommand{\citealp}[1]
%{\citeauthor{#1}~\citeyear{#1}}
%%%%%%%%%%%
% \Inputs{<text>}: prints a bold "Inputs:" statement, then <text> on a
% following \Statex line inside a top-aligned, ragged-right \parbox of width
% .8\linewidth, indented by \algorithmicindent.
\algnewcommand{\Inputs}[1]{%
  \State \textbf{Inputs:}
  \Statex \hspace*{\algorithmicindent}\parbox[t]{.8\linewidth}{\raggedright #1}
}
% \Initialize{<text>}: same layout as \Inputs, with the heading "Initialize:".
\algnewcommand{\Initialize}[1]{%
  \State \textbf{Initialize:}
  \Statex \hspace*{\algorithmicindent}\parbox[t]{.8\linewidth}{\raggedright #1}
}
% \indot<...>: delimited-argument macro; typesets the text between "<" and ">"
% inside \langle ... \rangle.
\def\indot<#1>{\langle #1 \rangle}

% Definitions of handy macros can go here

% local.sty: local settings


% Theorem-like environments. `prop`, `coro` and `rem` share the counter of
% `theo`; `defi`, `lemm`, `fact` and `ex` each have their own counter.
\newtheorem{defi}{Definition}
\newtheorem{theo}{Theorem}
\newtheorem{prop}[theo]{Proposition}
\newtheorem{coro}[theo]{Corollary}
\newtheorem{rem}[theo]{Remark}
\newtheorem{lemm}{Lemma}
%\newtheorem{claim}{Claim}
\newtheorem{fact}{Fact}
\newtheorem{ex}{Example}
%\newcommand{\square}{\rlap{$\sqcup$}$\sqcap$}
%\def\beginproof{\noindent {\bf Proof.~}}
%\def\endproof{~~\square\bigskip}
% \remark: starts a new, unindented paragraph with a bold run-in "Remark."
% heading (unnumbered; independent of the `rem` environment above).
% Uses \normalfont\bfseries instead of the obsolete \bf switch, which only
% exists when the document class provides it.
\def\remark{\par\noindent\hangindent0pt{\normalfont\bfseries Remark.}~}

% Proofs
%\def\remark{\par\noindent\hangindent0pt{\bf Remark.~}}
%\def\beginsome#1{\paragraphskip\noindent{\bf #1~}}
%\def\endsome{\paragraphskip}
%\def\beginproof{\par\noindent{\bf Proof.~}}
%\def\beginproofarg#1{\par\noindent{\bf #1.~}}
%\def\square{\rlap{$\sqcup$}$\sqcap$}
%\def\endproof{~~\square\paragraphskip}
%\def\endproofarg#1{~~\square~#1\paragraphskip}

% others
% \OMIT{<text>}: swallows its argument and prints nothing.
\def\OMIT#1{}
% \newwd{<term>}: emphasizes <term>; \emph also inserts the italic correction
% that the bare {\em ...} form omitted.
\def\newwd#1{\emph{#1}}



% local.mac
% \mnote{<text>}: puts <text> in a margin paragraph.
\newcommand{\mnote}[1]{\marginpar{#1}}
% \mynote{<text>}: prints <text> in bold. Uses \normalfont\bfseries instead of
% the obsolete \bf switch, which only exists when the class provides it.
\newcommand{\mynote}[1]{{\normalfont\bfseries #1}}


%
% symbol.tex
%


% Counter used to number OP environments; printed in arabic digits.
\newcounter{nombre}
\renewcommand{\thenombre}{\arabic{nombre}}
\setcounter{nombre}{0}
% OP environment: \begin{OP}[<title>] ... \end{OP}
% Steps the counter (so \label works), starts a new paragraph after a
% \bigskip, halves \abovedisplayskip inside the environment, and prints the
% sans-serif heading "OP <n> : <title>". The optional title defaults to empty.
% \normalfont\sffamily replaces the obsolete \sf switch.
\newenvironment{OP}[1][]{\refstepcounter{nombre}\par\bigskip \abovedisplayskip=0.5\abovedisplayskip \noindent{\normalfont\sffamily OP \thenombre : #1}}{\par}

% \bphi: bold phi. \mathbf has no effect on lowercase Greek letters, so the
% amsmath \boldsymbol form is used, as for \bmu, \bsigma, etc.
\newcommand{\bphi}{\boldsymbol{\phi}}
% --- Bold symbols ---------------------------------------------------------
% Latin letters use upright bold (\mathbf); Greek letters use \boldsymbol.
%\newcommand{\bx}{\boldsymbol{x}}
\newcommand{\bx}{{\mathbf x}}
\newcommand{\by}{{\mathbf y}}
\newcommand{\bzero}{{\mathbf 0}}
%\newcommand{\bw}{\boldsymbol{w}}
\newcommand{\bmu}{\boldsymbol{\mu}}
%\newcommand{\bmu}{{\mathbf \mu}}
\newcommand{\bsigma}{\boldsymbol{\sigma}}
\newcommand{\bomega}{{\boldsymbol{\omega}}}
\newcommand{\blambda}{\boldsymbol{\lambda}}
% Bold upright capitals K, V and X.
\newcommand{\kernel}{\boldsymbol{\mathrm{K}}}
%\newcommand{\kernel}{\mathrm{K}}
\newcommand{\Vmat}{\boldsymbol{\mathrm{V}}}
\newcommand{\Xmat}{\boldsymbol{\mathrm{X}}}
\newcommand{\bw}{{\mathbf w}}
\newcommand{\bW}{{\mathbf W}}
\newcommand{\bd}{{\mathbf d}}
\newcommand{\bk}{{\mathbf{k}}}
\newcommand{\bv}{{\mathbf v}}
\newcommand{\bu}{{\mathbf u}}
\newcommand{\bz}{{\mathbf z}}
% \allins and \multiallins expand to the same symbol P_S.
\newcommand{\allins}{P_S}
\newcommand{\hatallins}{\hat{P}_S}
\newcommand{\multiallins}{P_S}
% Sans-serif "OP" label.
\newcommand{\op}{\textsf{OP}}
%\newcommand{\bz}{\boldsymbol{z}}
%\newcommand{\bs}{\boldsymbol{s}}
\newcommand{\bs}{{\mathbf s}}
%\newcommand{\bt}{\boldsymbol{t}}
\newcommand{\btau}{{\boldsymbol{\tau}}}
\newcommand{\bt}{{\mathbf t}}
\newcommand{\balpha}{\boldsymbol{\alpha}}
\newcommand{\bbeta}{\boldsymbol{\beta}}
\newcommand{\bgamma}{\boldsymbol{\gamma}}
\newcommand{\bxi}{\boldsymbol{\xi}}
\newcommand{\bzeta}{\boldsymbol{\zeta}}
% Upright text labels for use in math.
\newcommand{\sbsq}{\mathrm{sub}}
\newcommand{\edge}{\mathrm{edge}}
\newcommand{\Ksub}{K_{\mathrm{sub}}}
% Text shortcuts; \xspace supplies the following space when needed.
\newcommand{\Ourmethod}[0]{our method\xspace}
% NOTE(review): \Ourshape expands to text that already ends in a period
% before \xspace -- confirm this is intended.
\newcommand{\Ourshape}[0]{our shape.\xspace}
\newcommand{\hsigma}{\widehat\sigma}
% \convhull expands to \mathcal{H}, the same symbol as \Hyp below.
\newcommand{\convhull}{\mathcal{H}}
% --- Upright multi-letter labels -------------------------------------------
\newcommand{\err}{\mathrm{err}}
\newcommand{\RW}{\mathrm{RW}}
\newcommand{\RCS}{\mathrm{RCS}}
\newcommand{\RV}{\mathrm{RV}}
\newcommand{\SRS}{\mathrm{SRS}}
\newcommand{\RSG}{\mathrm{RSG}}
\newcommand{\sign}{\mathrm{sign}}
% --- Spaces and number sets ------------------------------------------------
\newcommand{\dom}{\mathcal{X}} % domain of interest
\newcommand{\domp}{\mathcal{X}^{pos}} 
\newcommand{\domn}{\mathcal{X}^{neg}} %
\newcommand{\range}{\mathcal{Y}} % range
\newcommand{\Natural}{\mathbb{N}} % 
\newcommand{\Real}{\mathbb{R}} % Euclidean space
\newcommand{\Hilbert}{\mathbb{H}} % Hilbert space
\newcommand{\Prob}{\mathbb{P}}
% Upright truth-value labels.
\newcommand{\F}{\mathrm{False}}
\newcommand{\T}{\mathrm{True}}
% --- Calligraphic letters --------------------------------------------------
\newcommand{\calL}{\mathcal{L}}
\newcommand{\calS}{\mathcal{S}}
\newcommand{\calB}{\mathcal{B}}
\newcommand{\calF}{\mathcal{F}}
\newcommand{\calG}{\mathcal{G}}
\newcommand{\calT}{\mathcal{T}}
\newcommand{\calY}{\mathcal{Y}}
\newcommand{\Hyp}{\mathcal{H}}
\newcommand{\calW}{\mathcal{W}}
% --- Upright procedure/oracle names ----------------------------------------
\newcommand{\EX}{\mathrm{EX}}
\newcommand{\filtEX}{\mathrm{FiltEX}}
\newcommand{\HSelect}{\mathrm{HSelect}}
\newcommand{\WL}{\mathrm{WL}}
%\newcommand{\breg}{D}
% --- \vec* family: bold italic symbols --------------------------------------
% Two mechanisms are mixed below: \boldsymbol{...} (and \bm for \veclambda)
% and the older \mbox{\boldmath $...$} form.
\newcommand{\vecx}{\boldsymbol{x}}
\newcommand{\vecy}{\mbox{\boldmath $y$}}
\newcommand{\vecw}{\boldsymbol{w}}
\newcommand{\vecz}{\boldsymbol{z}}
\newcommand{\vecg}{\mbox{\boldmath $g$}}
\newcommand{\veca}{\mbox{\boldmath $a$}}
\newcommand{\vecd}{\boldsymbol{d}}
\newcommand{\vecell}{\mbox{\boldmath $\ell$}}
\newcommand{\vecsigma}{\boldsymbol{\sigma}}
\newcommand{\vecpi}{\boldsymbol{\pi}}
%\newcommand{\vecv}{\mbox{\boldmath $v$}}
\newcommand{\vecxi}{\boldsymbol{\xi}}
\newcommand{\vece}{\mbox{\boldmath $e$}}
\newcommand{\vecB}{\mbox{\boldmath $B$}}
\newcommand{\vecD}{\mbox{\boldmath $D$}}
\newcommand{\vecI}{\mbox{\boldmath $I$}}
%\newcommand{\vecpi}{\mbox{\boldmath $\pi$}}
% Upright "tr" label.
\newcommand{\tr}{\mathrm{tr}}
\newcommand{\vecG}{\mbox{\boldmath $G$}}
\newcommand{\vecF}{\mbox{\boldmath $F$}}
% Tilde and hat variants (bold and plain).
\newcommand{\tvecu}{\tilde{\mbox{\boldmath $u$}}}
\newcommand{\tvecw}{\tilde{\mbox{\boldmath $w$}}}
\newcommand{\tvecx}{\tilde{\mbox{\boldmath $x$}}}
\newcommand{\tw}{\tilde{w}}
\newcommand{\tx}{\tilde{x}}
\newcommand{\haty}{\hat{y}}
\newcommand{\hata}{\hat{a}}
\newcommand{\vecf}{\mbox{\boldmath $f$}}
\newcommand{\vectheta}{\mbox{\boldmath $\theta$}}
\newcommand{\vecalpha}{\boldsymbol{\alpha}}
\newcommand{\vecbeta}{\mbox{\boldmath $\beta$}}
% Wide-tilde / wide-hat variants of alpha and beta (bold versions build on
% \vecalpha and \vecbeta defined above).
\newcommand{\vectildealpha}{\widetilde{\vecalpha}}
\newcommand{\vectildebeta}{\widetilde{\vecbeta}}
\newcommand{\tildealpha}{\widetilde{\alpha}}
\newcommand{\tildebeta}{\widetilde{\beta}}
\newcommand{\vechatalpha}{\widehat{\vecalpha}}
\newcommand{\vechatbeta}{\widehat{\vecbeta}}
\newcommand{\hatalpha}{\widehat{\alpha}}
\newcommand{\hatbeta}{\widehat{\beta}}
\newcommand{\vectau}{\mbox{\boldmath $\tau$}}
\newcommand{\veclambda}{\bm{\lambda}}
\newcommand{\vecu}{\mbox{\boldmath $u$}}
\newcommand{\vecv}{\mbox{\boldmath $v$}}
\newcommand{\vecp}{\boldsymbol{p}}
\newcommand{\vecq}{\mbox{\boldmath $q$}}
\newcommand{\vecr}{\boldsymbol{r}}
\newcommand{\vecc}{\boldsymbol{c}}
% Upright "fp" / "fn" labels.
\newcommand{\fp}{\mathrm{fp}}
\newcommand{\fn}{\mathrm{fn}}
% Text shortcuts. Both end in a hard non-breaking space (~), so a space is
% printed even when punctuation follows.
% NOTE(review): consider \xspace here, as \Ourmethod does -- confirm usage.
\newcommand{\ouralg}{{Our algorithm}~}
\newcommand{\Ouralg}{PUMMA~}%{Modified ROMMA~}

% --- Miscellaneous symbols ---------------------------------------------------
\newcommand{\bn}{\Delta_2} % binary entropy
% \psimp expands to \mathcal{P}.
\newcommand{\psimp}{\mathcal{P}} %
\newcommand{\hatgamma}{\hat{\gamma}}
%\newcommand{\myexample}{\langle x,f(x) \rangle}
% \indctr{<event>}: typesets I(<event>).
\newcommand{\indctr}[1]{I(#1)}
\newcommand{\CLASS}{\mathcal{C}}
\newcommand{\VC}{\mathrm{VC}}

\newcommand{\reg}{\mathcal{R}}
\newcommand{\breg}{D}

% Upright names; note the subscript-like "_t" in \filtex sits inside \mathrm.
\newcommand{\filtex}{\mathrm{GenD_t}}
\newcommand{\gensamp}{\mathrm{GenSample}}

% \argmax / \argmin: upright operators whose subscripts are always set
% underneath (\limits is forced, also in inline math). \mathrm replaces the
% obsolete \rm switch, and a thin space (\,) separates "arg" from "max"/"min".
\newcommand{\argmax}{\mathop{\mathrm{arg\,max}}\limits}
\newcommand{\argmin}{\mathop{\mathrm{arg\,min}}\limits}
%\newcommand{\Expo}{\mathop{\rm  E}\limits}
% \Expo: blackboard-bold E as a large operator with limits underneath.
\newcommand{\Expo}{\mathop{\mathbb{E}}\limits}
%\newcommand{\Expo}{\mathbb{E}}

% --- Small shortcuts ---------------------------------------------------------
\newcommand{\half}{\frac{1}{2}}
\newcommand{\eps}{\varepsilon}

% Hatted quantities; the "[+]" / "[-]" suffixes are printed literally.
\newcommand{\hp}{\hat{p}}
\newcommand{\hmup}{\hat{\mu}[+]}
\newcommand{\hmun}{\hat{\mu}[-]}
\newcommand{\hgp}{\hat{\gamma}[+]}
\newcommand{\hgn}{\hat{\gamma}[-]}
\newcommand{\gain}{\Delta}
\newcommand{\hgain}{\hat{\Delta}}
\newcommand{\vecdelta}{\boldsymbol{\delta}}

% \tildeO uses the amsmath accent \Tilde.
\newcommand{\tildeO}{\Tilde{O}}
\newcommand{\permset}{S}
\newcommand{\base}{\boldsymbol{B}}
% Calligraphic letters (\calC duplicates \CLASS, \calP duplicates \psimp,
% \calX duplicates \dom, \calH duplicates \Hyp and \convhull).
\newcommand{\calC}{\mathcal{C}}
\newcommand{\calP}{\mathcal{P}}
\newcommand{\calX}{\mathcal{X}}
\newcommand{\calZ}{\mathcal{Z}}
\newcommand{\calV}{\mathcal{V}}
\newcommand{\calH}{\mathcal{H}}
\newcommand{\calE}{\mathcal{E}}
\newcommand{\calD}{\mathcal{D}}
% Fraktur R and G.
\newcommand{\Rdm}{\mathfrak{R}}
\newcommand{\GC}{\mathfrak{G}}
% "conv" operator and two fixed applications of it.
\newcommand{\hullC}{\mathrm{conv}(\calC)}
\newcommand{\hullH}{\mathrm{conv}(H)}
\newcommand{\conv}{\mathrm{conv}}

% --- Hypothesis classes tagged with upright superscripts MC / LC / TR / MI ---
\newcommand{\calHMC}{\mathcal{H}^{\mathrm{MC}}}
\newcommand{\calHLC}{\mathcal{H}^{\mathrm{LC}}}
\newcommand{\calHTR}{\mathcal{H}^{\mathrm{TR}}}
\newcommand{\calHMI}{\mathcal{H}^{\mathrm{MI}}}
% Wide-hat versions of the classes above, plus plain \widehat H and G.
\newcommand{\calhatHMC}{\widehat{\mathcal{H}}^{\mathrm{MC}}}
\newcommand{\calhatHLC}{\widehat{\mathcal{H}}^{\mathrm{LC}}}
\newcommand{\calhatHTR}{\widehat{\mathcal{H}}^{\mathrm{TR}}}
\newcommand{\calhatHMI}{\widehat{\mathcal{H}}^{\mathrm{MI}}}
\newcommand{\calhatH}{\widehat{\mathcal{H}}}
\newcommand{\calhatG}{\widehat{\mathcal{G}}}

% Loss symbols. \ellMI expands to the same symbol as \MLMI below.
\newcommand{\ellMI}{\ell^{\mathrm{MI}}}
\newcommand{\ellb}{\ell_{\mathrm{b}}}
\newcommand{\ellsb}{\ell_{\mathrm{sb}}}

\newcommand{\SMI}{S_{\mathrm{MI}}}

% Risk symbols R with upright tags; the \emR* macros are the \widehat{R}
% counterparts.
\newcommand{\Rmin}{R_{\mathrm{min}}}
\newcommand{\Rmax}{R_{\mathrm{max}}}
\newcommand{\RMC}{R^{\mathrm{MC}}}
\newcommand{\RLC}{R^{\mathrm{LC}}}
\newcommand{\RTR}{R^{\mathrm{TR}}}
\newcommand{\emRmin}{\widehat{R}_{\mathrm{min}}}
\newcommand{\emRmax}{\widehat{R}_{\mathrm{max}}}
\newcommand{\emRMC}{\widehat{R}^{\mathrm{MC}}}
\newcommand{\emRLC}{\widehat{R}^{\mathrm{LC}}}
\newcommand{\emRTR}{\widehat{R}^{\mathrm{TR}}}
\newcommand{\emR}{\widehat{R}}
\newcommand{\MLMI}{\ell^{\mathrm{MI}}}
\newcommand{\MLTR}{\ell^{\mathrm{TR}}}
\newcommand{\MLMC}{\ell^{\mathrm{MC}}}
\newcommand{\RMI}{R^{\mathrm{MI}}}
\newcommand{\emRMI}{\widehat{R}^{\mathrm{MI}}}
\newcommand{\ROMI}{R^{\mathrm{OMI}}}
\newcommand{\emROMI}{\widehat{R}^{\mathrm{OMI}}}
% Functions carrying a parenthesized superscript v (or v_i).
\newcommand{\fv}{f^{(v)}}
\newcommand{\Fv}{F^{(v)}}
\newcommand{\Fvi}{F^{(v_i)}}

% Counts n with upright subscripts P / N / L / C.
\newcommand{\nP}{{n_{\mathrm{P}}}}
\newcommand{\nN}{{n_{\mathrm{N}}}}
\newcommand{\nL}{{n_{\mathrm{L}}}}
\newcommand{\nC}{{n_{\mathrm{C}}}}
%
%
%
% \ceil{<expr>}: ceiling brackets that grow with <expr>.
\def\ceil#1{%
\left\lceil #1 \right\rceil}

% \defeq: an equals sign with an upright "def" stacked on top.
\def\defeq{%
\stackrel{\mathrm{def}}{=}}

% \floor{<expr>}: floor brackets; now auto-sized with \left/\right so that it
% behaves like \ceil for tall arguments such as fractions.
\def\floor#1{%
\left\lfloor #1 \right\rfloor}

% \myhang: starts an unindented paragraph with a 20pt hanging indent and a
% 20pt skip on its first line.
\def\myhang{%
    \par\noindent\hangindent20pt\hskip20pt}
% \nitem{<label>}: like \myhang with 40pt, with <label> set flush right to the
% left of the text via \llap.
\def\nitem#1{%
    \par\noindent\hangindent40pt
    \hskip40pt\llap{#1~}}


% \E: bold italic E.
\newcommand{\E}{\boldsymbol{E}}
%\newcommand{\note}{}

% \pdiff: \Phi with upright subscript "diff", applied to \multiallins (P_S).
\newcommand{\pdiff}{\Phi_{\mathrm{diff}}(\multiallins)}
\newcommand{\calI}{\mathcal{I}}

% \GMIL: prints "MIL"; \xspace supplies the following space when needed.
\newcommand{\GMIL}{MIL\xspace}






%\newcommand{\dataset}{{\cal D}}
%\newcommand{\fracpartial}[2]{\frac{\partial #1}{\partial  #2}}

% Heading arguments are {volume}{year}{pages}{submitted}{published}{author-full-names}

%\jmlrheading{1}{2000}{1-48}{4/00}{10/00}{Marina Meil\u{a} and Michael I. Jordan}

% Short headings should be running head and authors last names

%\ShortHeadings{Learning with Mixtures of Trees}{Meil\u{a} and Jordan}
%\firstpageno{1}





%\title{Reduction Scheme for Empirical Risk Minimization and Its Applications to Multiple-Instance Learning}
\title{Simplified and Unified Analysis of Various Learning Problems \\by Reduction to Multiple-Instance Learning \\(Supplementary materials)}

\author[1, 2]{Daiki Suehiro}

\affil[1]{Kyushu University, Department of Advanced Information Technology, %Department and Organization
            744 Motooka, 
            Fukuoka, Japan}
\affil[2]{RIKEN, Center for Advanced Intelligence Project, %Department and Organization
            Nihonbashi 1-chome Mitsui Building, 15th floor, 1-4-1 Nihonbashi, Chuo-ku, 
            Tokyo, Japan}

\author[3]{Eiji Takimoto}
\affil[3]{Kyushu University, Department of Informatics,%Department and Organization
            744 Motooka, 
            Fukuoka,
            Japan}

\begin{document}
\maketitle
\appendix
%\onecolumn

%% extension of sabato's theorem



\section{Proof of Theorem~\ref{theo:sabato}}
\begin{proof}
The theorem is based on Theorem 20 of~\citet{Sabato:2012:MLA}.
Using the fact that $\psi_p$ is $1$-Lipschitz for all $p$,
together with the bound on $\Rdm_S$ that is shown in the proof of Theorem 20 of~\citet{Sabato:2012:MLA},
we can obtain the target theorem.
\end{proof}


%% convex if y=-1

\section{Proof of Proposition~\ref{prop:poly}}
\begin{proof}
First we have that $\hat{f}=f_2 \circ g$ is a convex function of $w'$ because
$f_2$ is a nondecreasing convex function and $\langle w', z \rangle$ is a convex function of $w'$
(see, e.g., Eq.~(3.11) in~\cite{boyd-vandenberghe:book04}).
Subsequently, we show that $\Psi_p \circ \hat{f}$ is a convex function.
Without loss of generality, we can consider $\Psi_p$ as a function $\Real^m \to \Real$ where $m$ is the size of the set $x'$. $\Psi_p$ is a nondecreasing function in each argument and $\hat{f}$ is convex and thus $\Psi_p \circ \hat{f}$ is convex.
Finally, because $-\Psi_p(\{f_2(\langle w', z \rangle) \mid z \in x' \})$ is concave and
$f_1$ is nonincreasing convex, $f_1(-\Psi_p(\{f_2(\langle w', z \rangle) \mid z \in x' \}))$ is convex~\citep{boyd-vandenberghe:book04}.
\end{proof}

\section{Proof of Proposition~\ref{prop:DC}}
%% DC
\begin{proof}
Because $f_1(c)$ is a homogeneous function of degree $1$ for $c \in [-1,1]$, we have
$f_1(-\Psi_p(\{f_2(\langle w', z \rangle) \mid z \in x' \})) = -f_1(\Psi_p(\{f_2(\langle w', z \rangle) \mid z \in x' \}))$.
As we proved in Proof of Proposition~\ref{prop:poly}, $f_1(-\Psi_p(\{f_2(\langle w', z \rangle) \mid z \in x' \}))$ is convex. Moreover, we have $f_1(\Psi_p(\{f_2(\langle w', z \rangle) \mid z \in x' \})) = -f_1(-\Psi_p(\{f_2(\langle w', z \rangle) \mid z \in x' \}))$ 
and thus $f_1(\Psi_p(\{f_2(\langle w', z \rangle) \mid z \in x' \}))$ is concave.
Therefore, we have that $f_1(\Psi_p(\{f_2(\langle w', z \rangle) \mid z \in x' \})) + f_1(-\Psi_p(\{f_2(\langle w', z \rangle) \mid z \in x' \}))$ is a DC function.
\end{proof}


%% DC algorithm
\section{DC algorithm for the reduced MIL problem}
The algorithm is shown in Algorithm~\ref{alg:DCA}. The subproblem~\eqref{align:subprob} is a convex programming problem
that can be solved in polynomial time.
\label{sec:appendix_DCA}
%In practice, DCA often converges global optima~\cite{LeThi2018}.
\begin{algorithm}[h!]
\caption{\GMIL optimization via DC Algorithm}
\label{alg:DCA}
\begin{algorithmic}[0]
\Inputs{$S'$, $\lambda$}
 \Initialize{${w'}_0 \in \mathbb{R}^{d'}$}
 \For{$t=1,\dots,$ (until convergence)}
\State Compute the subgradient:
\begin{align}
\label{align:subgrad}
    s_t \in \nabla_{w'}\left(\sum_{i:y_i=-1} f_1
    \left( \Psi_p \left(\left\{f_2 \left(\langle w', z \rangle \right) \mid z \in x'_i \right\} \right)\right)\right) 
\end{align}
\State at $w'_{t-1}$.
\State Solve the following subproblem:
\begin{align}\label{align:subprob}
w_t' \leftarrow \arg\min_{w': \|w'\| \leq C_1}
~\lambda \|w'\|^2 
+ \sum_{i:y_i=+1} f_1 
    \left( \Psi_p \left(\left\{f_2 \left(\langle w', z \rangle \right) \mid z \in x'_i \right\} \right)\right) - s_t^{\top} w'
\end{align}
%w_t' = \mathrm{ProjectionL2ball}(w_t', C_1)
\EndFor
\State \Return $w'_t$
\end{algorithmic}
\end{algorithm}

% $\mathrm{ProjectionL2ball}(w, C)$ is a function which project $w$ onto 
% $C$-ball with Euclidean norm. It has an analytical solution~\citep{boyd-vandenberghe:book04}.
%The solution $w_t'$ is a local optima of~\eqref{align:erm_optprob}.


%% MCL
%\section{Proof of~Theorem~\ref{theo:mcl_reduction}}
%\label{sec:mcl_reduction_proof}
% \begin{proof}
% For any $(x,y)$, we define
% \begin{align}
%   \label{align:dk_z}
% \eta_{(x,y)} = (\bzero, \ldots, \bzero, \underbrace{x}_{y\mathrm{-th~block}}, \bzero, \ldots, \bzero),
% \end{align}
% where $\bzero$ is a vector, the elements of which are all $0$.
% On the \GMIL-reduction framework, 
% suppose that
% $p=\infty$; $f_1(c)=\Gamma(2cC_1C_2)$, $f_2(c)=c/2C_1C_2$ (shifting function to $[-1,+1]$); $\alpha(x,y) = (x'_{(x,y)}, y')$ where $x'_{(x,y)}=\{\eta_{(x,j)} - \eta_{(x,y)} \mid \forall j \in \calY \backslash y\}$; $y'=-1$; for any $z \in \Real^{kd}$,
% $\calG=\{g: z \mapsto \langle (w'_1, \ldots, w'_k), z \rangle \mid 
% w'_j \in \Real^d, \forall j\in [k], \|W'\| \leq C_1 \}$ where 
% $W'=(w'_1, \ldots, w'_k)$ and
% $\|W'\| = \sqrt{\sum_{j=1}^k\|w'_j\|^2}$; $\beta(h'): x \mapsto \arg\max_{j \in [k]} \langle (w'_1, \ldots w'_k), x \rangle$.
% Then, for any $(x,y)$ and $h \in \calH$,
% \begin{align}
% \ell'(x', y', h')
% =&
% f_1\left(
% y' \Psi_p\left( 
% \{
% f_2 \left(
% g(z) \mid z \in x'_{(x,y)}
% \}
% \right)
% \right)
% \right)
% \\
% =&
% \Gamma\left(
% - \Psi_\infty\left( 
% \{
% g(z) \mid z \in x'_{(x,y)}
% \}
% \right)
% \right)
% \\
% =& \Gamma\left(
% -\left(\max_{j \in \calY\backslash y}
% \left(\langle w_j, x \rangle - \langle w_y, x \rangle\right) \right)
% \right)\\
% = &
% \ell(x, y, h) 
% \end{align}
% \end{proof}

%% lemma for comple. learning
\section{Proof of~Lemma~\ref{lemm:comp_gen}}
\begin{proof}
  Based on the assumption of $\calD'$, the expected risk $\RLC_{\calD'}(h)$ is represented using $\calD$, $k$, and $\theta$
  as follows:
  \begin{align}
    \label{align:rlcd}
    \RLC_{\calD'}(h) = \Expo_{(x, y)\sim \calD}
     \left[\theta I\left((y \neq h(x))  \right) + 
      (1- \theta)\sum_{\bar{y}\neq y}\frac{1}{k-1}
      I\left(\bar{y}=h(x)  \right)
      \right].
  \end{align}
  Let  $\rho_1=I \left(y \neq h(x) \right)$ in $\RMC_{\calD}(h)$ and
  let $\rho_2=\theta I\left((y \neq h(x))  \right) +  (1- \theta)\sum_{\bar{y}\neq y}\frac{1}{k-1}  I\left((\bar{y}=h(x))  \right)$ in $\RLC_{\calD'}(h)$.
  For any $h \in \calH$ and a fixed $(x, y)$, we consider the following two cases.
  (i) If $h(x) = y$: $\rho_1=0$ and $\rho_2=0$, and thus there is no gap. %$\rho=0$.
  (ii) If $h(x) \neq y$: the first term of $\rho_2$
  is $\theta$ and the second term is equal to $(1-\theta)/(k-1)$,
  %$\rho \leq 1$
  because there exists a unique $\hat{y} \neq y$ that satisfies $\hat{y} = h(x)$.
  Therefore, $\rho_2$ is equal to $\theta + \frac{1-\theta}{k-1} = \frac{\theta(k-2)+1}{k-1}$.
  In this case, $\rho_1 = 1$.
  Hence, in both cases $\rho_1 = \frac{k-1}{\theta(k-2)+1}\rho_2$, and taking the expectation over $(x,y) \sim \calD$ yields
  $\frac{k-1}{\theta(k -2)+1}\RLC_{\calD'}(h) =  \RMC_{\calD}(h)$.
\end{proof}

%% 
\section{Proof of~Theorem~\ref{theo:cll_reduction}}
\begin{proof}
We use $\eta_{(x,y)}$ defined in~(\ref{align:dk_z}).
On the \GMIL-reduction scheme, suppose that 
$p=\infty$; $f_1(c)=\Gamma(2cC_1C_2)$; $f_2(c)=c/(2C_1C_2)$ (shifting function to $[-1,+1]$); $\alpha(x,(\gamma,y)) = (x'_{(x,y)}, y')$ 
where $x'_{(x,y)}=\{\eta_{(x,j)} - \eta_{(x,y)} \mid \forall j \in \calY \backslash y\}$; $y'=I(\gamma=\T)$; for any $z \in \Real^{kd}$,
$\calG=\{g: z \mapsto \langle (w'_1, \ldots, w'_k), z \rangle \mid 
w'_j \in \Real^d, \forall j\in [k], \|W'\| \leq C_1 \}$ where $W'=(w'_1, \ldots, w'_k)$ and $\|W'\| = \sqrt{\sum_{j=1}^k\|w'_j\|^2}$; $\beta(h'): x \mapsto \arg\max_{j \in [k]} \langle w'_j , x \rangle$.
Then, for any $(x,y)$ and $h \in \calH$,
\begin{align}
\ell'(x', y', h')
=&
f_1\left(
y' \Psi_p\left( 
\left\{
f_2 \left(
g(z)\right) \mid z \in x'_{(x,y)}
\right\}
\right)
\right)
\\
=&
\Gamma\left(
I(\gamma=\T) \times \Psi_\infty\left( 
\{
g(z) \mid z \in x'_{(x,y)}
\}
\right)
\right)
\\
=& \Gamma\left(
I(\gamma=\T) \times \left(\max_{j \in \calY\backslash y}
\left(\langle w_j, x \rangle - \langle w_y, x \rangle\right) \right)
\right)\\
=& 
\ell(x, (\gamma, y), h).
\end{align}
\end{proof}




\section{Multi-task learning problem}
\label{sec:multi-task}
In multi-task learning, the learner finds a common rule in
multiple tasks,
which correctly predicts the outputs of the instances.
For example, in the multi-classification-task problem, there may be three different binary classification tasks for image data: cat or dog, car or train, and apple or tomato.
\par
\paragraph{Problem setting}
Let $\calX \subseteq \Real^d$ be an input space and $\calY = \{-1,1\}$ be an output space.
We assume that the learner has $T$ different tasks with different data
distributions.
The learner receives $T$ sets of samples $S=(S_1, \ldots, S_T)$ where
$S_t = ((x_1^t, y_1^t), \ldots, (x_n^t, y_n^t))$ is drawn i.i.d.\ according to an unknown distribution $\calD_t$.
Here, $x^t$ and $y^t$ denote an instance and its label, respectively.
Let $\calH=\{h: (x^t) \mapsto \sign(\langle w_t, x^t \rangle) \mid w_t \in \Real^d \}$ be a hypothesis class. %, where $W = (w_1, \ldots, w_T)$. 
Let $\ell: ((x^1,\ldots, x^T), (y^1, \ldots, y^T), h) \mapsto \frac{1}{T} \sum_{t=1}^T\Gamma(-y^t \langle w_t, x^t \rangle)$ where $\Gamma: \Real \rightarrow [0,1]$ is a convex, nondecreasing and $b$-Lipschitz function.
The generalization risk and empirical risk are formulated as:
\begin{align}
  &\Expo_t[R_{\calD_t}(h)] = \frac{1}{T}\sum_{t=1}^T \Expo_{(x^t,y^t)\sim \calD_t} 
  \left[\Gamma(-y^t \langle w_t, x^t \rangle)\right], \\
  &\emR_S(h) =  \frac{1}{T}\sum_{t=1}^T \frac{1}{n}\sum_{i=1}^{n} \Gamma(-y^t_i \langle w_t, x^t_i \rangle)
  =\frac{1}{n}\sum_{i=1}^{n} \ell \left((x^1_i,\ldots, x^T_i), (y^1_i, \ldots, y^T_i), h \right).
\end{align}
\paragraph{Reduction to \GMIL}
\begin{theo}
\label{theo:mtl_reduction}
Multi-task learning is \GMIL-reducible.
\end{theo}
% Since multi-task learning is based on average risk over the tasks,
% $\alpha$ can be constructed by the similar way as multi-label case.
% That is, $\alpha(x,y)=(x'_{(x,y)}, y')$ where $x'_{(x,y)}=\{(y^1 x,1), \ldots, (y^T x, T)\}$
% and $y=-1$ for all $i \in [n]$.
\begin{proof}
For simplicity, we denote $(x^1,\ldots, x^T)$ by $\bx$ and denote $(y^1, \ldots, y^T)$ by $\by$.
On the \GMIL-reduction scheme, suppose that $p=1$; $f_1: f_1(a) = -a$;
$f_2$ is $\Gamma$; $\alpha(\bx, \by)=(x'_{(\bx, \by)}, y')$ where $x'_{(\bx, \by)}=\{(-y^1 x^1,1), \ldots, (-y^T x^T, T)\}$; $y'=-1$; $\calG=\{g: (z,t) \mapsto \langle w'_t, z \rangle \mid 
\forall t\in [T], w'_t \in \Real^d~\mathrm{and}~ \|W'\| \leq C_1 \}$ where $W' = (w'_1, \ldots, w'_T)$; $\beta(h'): (x^t) \mapsto \sign(\langle w'_t, x^t \rangle)$.
For any $((x^1,\ldots, x^T), (y^1, \ldots, y^T))$ and $h\in \calH$, we have that
\begin{align}
\ell'(x', y', h')
=&
f_1\left(
y' \Psi_p\left( 
\left\{
f_2 \left(
g(z)\right) \mid z \in x'_{(\bx, \by)}
\right\}
\right)
\right)
\\
=&
 \frac{1}{|x'_{(\bx, \by)}|}\sum_{(x,t) \in x'_{(\bx, \by)}} 
\Gamma \left(-\langle w_t, y^t x^t \rangle 
\right)\\
=&
\ell((x^1,\ldots, x^T), (y^1, \ldots, y^T), h) 
\end{align}
\end{proof}
\paragraph{ERM algorithm}
\begin{coro}
The reduced ERM of the MIL from multi-task learning is a convex programming problem.
\end{coro}
As shown in the proof of Theorem~\ref{theo:mtl_reduction},
$f_1$ is nonincreasing and $y_i'=-1$ for all $i \in [n]$.
Thus, by Proposition~\ref{prop:poly}, if we consider $\Gamma$ that is nondecreasing and convex, the reduced MIL problem is 
a convex programming problem and solved in polynomial time.
\paragraph{Generalization bound}
%The generalization bound for the multi-task learning is given as:
 \begin{coro}
% Let $S'= ((x'_1, y'_1), \ldots, (x'_n, y'_n))$. Let $\calH' = \Psi_\infty(\{f_2(g(z))\mid z \in x'\})$ and let $\calH_\beta = \{\beta(h') \mid h' \in \calH'\}$. Let $\calhatH' = \{f_1(y'\Psi_\infty(\{f_2(g(z))\mid z \in x'\})) \mid g \in \calG \}$. 
% The following bound holds with a high probability of at least $1-\delta$ for all $h_W \in \calH_\beta$:
% \begin{align}
%     R_{\calD}(h_W) \leq \emR_{S'}(h'_W) + 2\Rdm_{S'}(\calhatH')+ 3\sqrt{\frac{\log \frac{2}{\delta}}{2n}}.
% \end{align}
% where
We assume that $\|x^t_i\| \leq C_2$ for any $i \in [n]$ and $t \in [T]$.
In the reduced problem,
the empirical Rademacher complexity of $\calhatH'$ is given as follows:
\begin{align}
\Rdm_{S'}(\calhatH') = O\left(
    \frac{
    \log \left(2n^2 T \right) 
    \left({bC_1C_2}\ln(n) \right)
    }
    {\sqrt{n}}
    \right),
    % \frac{
    % 4+10\log \left(16e a^2 n^2 T \right) 
    % \left(N+ {2a}\ln(16a^2n) \right)
    % }
    % {\sqrt{n}}.
%\quad \mathrm{or} \quad 
\end{align}
where we assume $\|w'\| \leq C_1$. 
%and $\|z\| \leq 2C_2$ for any $z \in x_i', \forall i \in [n]$ in the reduced MIL.
\end{coro}
We can derive the above by the same argument as in the proof of Theorem~\ref{theo:mtl_reduction}.
Using Corollary~\ref{coro:risk_bound_reduced}, we can obtain the generalization
risk bound for the multi-task learning problem.

%% reduction from MLL
% \section{Proof of~Theorem~\ref{theo:mll_reduction}}
% \begin{proof}
% On \GMIL-reduction framework, suppose that $p=1$; $f_1: f_1(a) = -a$ for $a \in \Real$;
% $f_2$ is $\Gamma$; $\alpha(x,y)=(x'_{(x,y)}, y')$ where $x'_{(x,y)}=\{(y^1 x,1), \ldots, (y^k x,k)\}$; $y'=-1$; $\calG=\{g: (z,j) \mapsto \langle w'_j, z \rangle \mid 
% w'_j \in \Real^d, \forall j\in [k], \|W'\| \leq C_1 \}$ where $W' = (w'_1, \ldots, w'_k)$; $W' = (w'_1, \ldots, w'_k)$; $\beta(h'): (x,j) \mapsto \langle w'_j, x \rangle$.
% For any $(x,y)$ and $h\in \calH$, we have that
% \begin{align}
% \ell'(x', y', h')
% =&
% f_1\left(
% y' \Psi_p\left( 
% \left\{
% f_2 \left(
% g(z)\right) \mid z \in x'_{(x,y)}
% \right\}
% \right)
% \right)
% \\
% =&
%  \frac{1}{|x'_{(x,y)}|}\sum_{(y^j x,j) \in x'_{(x,y)}} 
% \Gamma \left(\langle w_j, y^j x \rangle 
% \right)\\
% =&
% \ell(x, y, h) 
% \end{align}
% \end{proof}


% %% reduction from MTL
% \section{Proof of~Theorem~\ref{theo:mtl_reduction}}
% \begin{proof}
% On \GMIL-reduction framework, suppose that $p=1$; $f_1: f_1(a) \mapsto -a$ for $a \in \Real$;
% $f_2$ is $\Gamma$; $\alpha(x,y)=(x'_{(x,y)}, y')$ where $x'_{(x,y)}=\{(y^1 x,1), \ldots, (y^T x, T)\}$; $y'=-1$; $\calG=\{g: (z,j) \mapsto \langle w'_j, z \rangle \mid 
% w'_j \in \Real^d, \forall j\in [T] \|W'\| \leq C_1 \}$ where $W' = (w'_1, \ldots, w'_T)$; $\beta(h'): (x,j) \mapsto \langle w'_j, x \rangle$.
% For any $(x,y)$ and $h\in \calH$, we have that
% \begin{align}
% \ell'(x', y', h')
% =&
% f_1\left(
% y' \Psi_p\left( 
% \left\{
% f_2 \left(
% g(z)\right) \mid z \in x'_{(x,y)}
% \right\}
% \right)
% \right)
% \\
% =&
%  \frac{1}{|x'_{(x,y)}|}\sum_{(x,j) \in x'_{(x,y)}} 
% \Gamma \left(\langle w_j, y^j x \rangle 
% \right)\\
% =&
% \ell(x, y, h) 
% \end{align}
% \end{proof}


%% reduction from MLL with perfectionistic loss
\section{Proof of~Theorem~\ref{theo:mllp_reduction}}
\label{sec:proof_mllp_reduction}
\begin{proof}
On the \GMIL-reduction scheme, suppose that $p=\infty$; $f_1: f_1(a) = -a$ for $a \in \Real$;
$f_2$ is $\Gamma$; $\alpha(x,y)=(x'_{(x,y)}, y')$ where $x'_{(x,y)}=\{(-y^1 x,1), \ldots, (-y^k x,k)\}$; $y'=-1$; $\calG=\{g: (z,j) \mapsto \langle w'_j, z \rangle \mid 
w'_j \in \Real^d, \forall j\in [k], \|W'\| \leq 1 \}$ where $W' = (w'_1, \ldots, w'_k)$; $\beta(h'): (x,j) \mapsto \langle w'_j, x \rangle$.
For any $(x,y)$ and $h\in \calH$, we have that
\begin{align}
\ell'(x', y', h')
=&
f_1\left(
y' \Psi_p\left( 
\left\{
f_2 \left(
g(z)\right) \mid z \in x'_{(x,y)}
\right\}
\right)
\right)
\\
=&
 \max_{(-y^j x,j) \in x'_{(x,y)}} 
\Gamma \left(-\langle w_j, y^j x \rangle 
\right)\\
=&
\ell(x, y, h) 
\end{align}
\end{proof}


%% reduction from top-1 ranking

\section{Proof of~Theorem~\ref{theo:trl_reduction}}
\label{sec:proof_trl_reduction}
\begin{proof}
On the \GMIL-reduction scheme, suppose that $p=\infty$; $f_1(c)=\Gamma(2cC_1C_2)$; $f_2(c)=c/(2C_1C_2)$;
$\alpha(A, x^*) = (x', y')$ where $x'= \{x - x^* \mid x \in A\backslash x^*  \}$; 
$y'=-1$; 
$\calG = \{g: z \mapsto \langle w', z \rangle \mid \|w'\| \leq C_1 \}$;
$\beta(h'): A \mapsto \arg\max_{x \in A} \langle w', x\rangle$.
For any $(A, x^*)$ and $h \in \calH$, the following holds:
\begin{align}
\ell'(x', y', h')
=&
f_1\left(
y' \Psi_p\left( 
\left\{
f_2 \left(
g(z)\right) \mid z \in x'_{(x,y)}
\right\}
\right)
\right)
\\
=&
\Gamma\left(
- \Psi_\infty\left( 
\{
g(z) \mid z \in x'_{(x,y)}
\}
\right)
\right)
\\
=& \Gamma\left(
-\left(\max_{x \in A\backslash x^*}
\left(\langle w, x \rangle - \langle w, x^* \rangle\right) \right)
\right)\\
=& 
\ell(A, x^*, h) 
\end{align}
\end{proof}


%% reduction to top-1 ranking with negative feedback
\section{Top-1 ranking learning with negative feedback}
\label{sec:trl_neg}
As an extension of the Top-1 rank learning problem,
we consider the following scenario.
In practice, some item sets do not include the user-preferred item.
Therefore, we assume that the item sets are partitioned into two types:
the item sets that include the most preferred item
and those that do not include the preferred item.
For the second type of item set, we assume that we can receive 
information on non-preferred items as negative feedback from the user.
\par
More formally, we assume that the target user has a scoring function $s$
and a parameter $\gamma_i \in \{-1, +1\}$,
where $\gamma$ takes $+1$ for an item set that includes the preferred
item and takes $-1$ otherwise.
The learner receives the sequence of the sets of items
and the chosen item with positive or negative information
$S = ((A_1, (x^*_1, \gamma_1)), \ldots, (A_n, (x^*_n, \gamma_n)))$.
$\gamma_i=+1$ indicates that item set $A_i$ includes the preferred item,
and $\gamma_i=-1$ indicates that the item set $A_i$ does not include the preferred item.
For the item set $A_i$ with $\gamma_i=+1$, 
$x_i^* = \arg\max_{x \in A_i} s(x)$.
Conversely, for the item set $A_i$ with $\gamma_i=-1$, 
$x_i^* \in A_i \backslash x'$ where $x'=\arg\max_{x \in A_i} s(x)$,
that is, if $\gamma=-1$,
the user selects an item except for the best-scored item by $s$.
Note that we assume that $\gamma$ is a known parameter 
only in the training phase.
The other settings are the same as those in Sec.~\ref{subsec:trl}.
\par
A reasonable goal of the learner is to predict the best item from a given set of items even in this setting.
Therefore, the learner can recommend the most preferred item if $\gamma=+1$ and
can recommend a preferable item if $\gamma=-1$.
Similar to top-1 ranking learning, we consider a loss function $\ell: (A,(x^*,\gamma), h) \mapsto \Gamma(\gamma (\langle w, x^* \rangle - \max_{x \in A\backslash x^*} \langle w, x \rangle))$ where $\Gamma: \Real \rightarrow [0,1]$ is a convex, nonincreasing and $a$-Lipschitz function.
The generalization risk and empirical risk are formulated as follows:
%\footnotesize
\begin{align}
  &R_\calD(h) = \Expo_{(A, \gamma) \sim \calD}
  \left[
    \ell \left(A, (x^*, \gamma), h \right)
    \right],\\
  &\emR_{S}(h)= \frac{1}{n} \sum_{i=1}^n \ell \left(A_i, (x_i^*, \gamma_i), h \right),
\end{align}
%\normalsize
where $x^* = \arg\max_{x \in A}s(x)$.
\paragraph{Reduction to MIL}
\begin{theo}
\label{theo:trln_reduction}
Top-1 ranking learning with negative feedback is \GMIL-reducible.
\end{theo}
The difference from the top-1 ranking learning is just
$y_i'=-\gamma_i$, and thus we can easily prove it.
\begin{proof}
On the \GMIL-reduction scheme, suppose that $p=\infty$; $f_1(c)=\Gamma(2cC_1C_2)$; $f_2(c)=c/2C_1C_2$;
$\alpha(A, (x^*, \gamma)) = (x', y')$ where $x'= \{x - x^* \mid x \in A\backslash x^*  \}$; 
$y'=-\gamma$; 
$\calG = \{g: z \mapsto \langle w', z \rangle \mid \|w'\| \leq C_1 \}$;
$\beta(h'): A \mapsto \arg\max_{x \in A} \langle w', x\rangle$.
For any $(A, (x^*, \gamma))$ and $h \in \calH$, the following holds:
\begin{align}
\ell'(x', y', h')
=&
f_1\left(
y' \Psi_p\left( 
\{
f_2 \left( g(z) \right) \mid z \in x'
\}
\right)
\right)
\\
=&
\Gamma\left(
-\gamma\, \Psi_\infty\left( 
\{
g(z) \mid z \in x'
\}
\right)
\right)
\\
=& \Gamma\left(
-\gamma\left(\max_{x \in A\backslash x^*}
\left(\langle w, x \rangle - \langle w, x^* \rangle\right) \right)
\right)\\
=& \Gamma\left(
\gamma\left(\langle w, x^* \rangle - \max_{x \in A\backslash x^*} \langle w, x \rangle \right)
\right)\\
=& 
\ell(A, (x^*, \gamma), h).
\end{align}
\end{proof}
% \begin{proof}
% On \GMIL-reduction framework, suppose that $p=\infty$; $f_1$ is 
% $\Gamma$; 
% $\alpha(A, x^*) = (x', y')$ where $x'= \{x - x^* \mid x \in A\backslash x^*  \}$; 
% $y'=-\gamma$; 
% $\calG = \{g_w: z \mapsto \langle w, z \rangle \mid \|w\| \leq 1 \}$;
% $\beta(h'_w) =h_w$.
% For any $(A, x^*)$ and $h_w \in \calH$, the following holds:
% \begin{align}
% \ell'(x', y', h'_W)
% =&
% f_1\left(
% y' \Psi_p\left( 
% \{
% f_2 \left(
% g(z) \mid z \in x'_{(x,y)}
% \}
% \right)
% \right)
% \right)
% \\
% =&
% \Gamma\left(
% \gamma\left( \Psi_\infty\left( 
% \{
% g(z) \mid z \in x'_{(x,y)}
% \}
% \right)
% \right)
% \right)
% \\
% =& \Gamma\left(
% \gamma\left(\max_{j \in A\backslash x^*}
% \left(\langle w, x \rangle - \langle w, x^* \rangle\right) \right)
% \right)\\
% =& 
% \ell(A, x^*, h_w) 
% \end{align}
% \end{proof}
\paragraph{Generalization bound}
%The generalization bound for the Top-1 rank learning is given as:
\begin{coro}
We assume that $\|x\| \leq C_2$ for any $x \in A_i$ and any $i \in [n]$.
In the reduced MIL problem,
the empirical Rademacher complexity of $\calhatH'$ is given as follows:
% Let $S'= ((x'_1, y'_1), \ldots, (x'_n, y'_n))$. Let $\calH' = \Psi_\infty(\{f_2(g(z))\mid z \in x'\})$ and let $\calH_\beta = \{\beta(h') \mid h' \in \calH'\}$. Let $\calhatH' = \{f_1(y'\Psi_\infty(\{f_2(g(z))\mid z \in x'\})) \mid g \in \calG \}$. 
% The following bound holds with a high probability of at least $1-\delta$ for all $h_W \in \calH_\beta$:
% \begin{align}
%     R_{\calD}(h_W) \leq \emR_{S'}(h'_W) + 2\Rdm_{S'}(\calhatH')+ 3\sqrt{\frac{\log \frac{2}{\delta}}{2n}}.
% \end{align}
% where
\begin{align}
\Rdm_{S'}(\calhatH') = O\left(
    \frac{
    \log \left(\hata^2 n^2 (k-1) \right) 
    \left({2\hata}\ln(\hata^2n) \right)
    }
    {\sqrt{n}}
    \right),
    % \frac{
    % 4+10\log \left(16e a^2 n^2 (k-1) \right) 
    % \left(N+ {2a}\ln(16a^2n) \right)
    % }
    % {\sqrt{n}}.
%\quad \mathrm{or} \quad 
\end{align}
where $\hata=2aC_1C_2$ and we assume that $\|w'\| \leq C_1$.
\end{coro}
Using Corollary~\ref{coro:risk_bound_reduced}, we can obtain the generalization
risk bound for the Top-1 ranking learning with negative feedback.

\paragraph{ERM algorithm}
\begin{coro}
The reduced ERM of MIL from top-1 ranking learning with negative feedback
is a DC programming problem.
\end{coro}
In top-1 ranking learning with negative feedback, $y' \in \{-1, 1\}$.
By the proof of Theorem~\ref{theo:trln_reduction} and 
by Proposition~\ref{prop:DC}, if we consider a loss function $\Gamma(c)$ as a nondecreasing and homogeneous function of degree 1 for $c \in [-1,1]$ such as the hinge loss, 
we can solve the problem by the DC algorithm as shown in Algorithm~\ref{alg:DCA}.

% \section{Proof of~Theorem~\ref{theo:trln_reduction}}
% \begin{proof}
% On \GMIL-reduction framework, suppose that $p=\infty$; $f_1(c)=\Gamma(cC_1C_2)$; $f_2(c)=c/C_1C_2$;
% $\alpha(A, x^*) = (x', y')$ where $x'= \{x - x^* \mid x \in A\backslash x^*  \}$; 
% $y'=-\gamma$; 
% $\calG = \{g: z \mapsto \langle w', z \rangle \mid \|w'\| \leq 1 \}$;
% $\beta(h'): A \mapsto \arg\max_{x \in A} \langle w', x\rangle$.
% For any $(A, x^*)$ and $h \in \calH$, the following holds:
% \begin{align}
% \ell'(x', y', h')
% =&
% f_1\left(
% y' \Psi_p\left( 
% \{
% f_2 \left(
% g(z) \mid z \in x'_{(x,y)}
% \}
% \right)
% \right)
% \right)
% \\
% =&
% \Gamma\left(
% \gamma\left( \Psi_\infty\left( 
% \{
% g(z) \mid z \in x'_{(x,y)}
% \}
% \right)
% \right)
% \right)
% \\
% =& \Gamma\left(
% \gamma\left(\max_{j \in A\backslash x^*}
% \left(\langle w, x \rangle - \langle w, x^* \rangle\right) \right)
% \right)\\
% =& 
% \ell(A, x^*, h) 
% \end{align}
% \end{proof}

\section{Proof of Theorem~\ref{theo:representer}}
\begin{proof}
For the optimization problem~\eqref{align:erm_optprob_ker}, we can apply the standard representer theorem (see, e.g., Theorem 6.11 of~\cite{mohri2018foundations}).
We define $\Hilbert_1$ as the subspace 
spanned by $\{\langle z, \cdot \rangle \mid z \in P_{S'}\}$, namely, 
$\Hilbert_1=\{w \in \Hilbert \mid w = \sum_{z\in P_{S'}} \mu_z z, \mu_z \in \Real \}$.
For any $w \in \Hilbert$, we can consider the decomposition $w = w_1 + w_1^\perp$, where $w_1 \in \Hilbert_1$, and $w_1^\perp \in \Hilbert_1^\perp$ is its orthogonal component. 
Because $\Hilbert_1$ is a subspace of $\Hilbert$, 
$\|w\|_{\Hilbert}=\sqrt{\|w_1\|_{\Hilbert}^2 + \|w_1^\perp\|_{\Hilbert}^2} \geq \|w_1\|_{\Hilbert}$.
Moreover, by the definition of $\Hilbert_1$,
$\langle w, z\rangle= \langle w_1, z\rangle$ for any $z \in P_{S'}$.
Thus, $f_1(y_i'\Psi_p(\{f_2(\langle w,z \rangle ) \mid z \in x'_{i} \})) =f_1(y_i'\Psi_p(\{f_2(\langle w_1,z \rangle ) \mid z \in x'_{i} \}))$ and 
$\|w_1\|_\Hilbert \leq \|w\|_\Hilbert$.
This implies that the optimal solution
is contained in $\Hilbert_1$.
\end{proof}

\section{DC algorithm for kernelized extension}
The algorithm is shown in Algorithm~\ref{alg:DCA_ker}.
\label{sec:DC_algorithm_ker}
\begin{algorithm}%[h!]
\caption{\GMIL optimization via DC Algorithm (kernelized)}
\label{alg:DCA_ker}
\begin{algorithmic}[0]
\Inputs{$S'$, $\lambda$}
 \Initialize{${\bmu}_0 \in \mathbb{R}^{|{P_{S'}}|}$}
 \For{$t=1,\dots,$ (until convergence)}
\State Compute the subgradient:
\begin{align}
\label{align:subgrad2}
    s_t \in \nabla_{\bmu}\left(\sum_{i:y_i=-1} f_1
    \left( \Psi_p \left(\left\{f_2 \left(\sum_{v \in P_{S'}}\mu_{v} \langle v, z\rangle \right) \mid z \in x'_i \right\} \right)\right)\right) 
\end{align}
\State at $\bmu_{t-1}$.
\State Solve the following subproblem:
\begin{align}\label{align:subprob_ker}
\nonumber
\bmu_t \leftarrow \arg\min_{{\bmu} \in \mathbb{R}^{|{P_{S'}}|}}
&~\lambda \sum_{v, \hat{v} \in P_{S'}}\mu_{v}\mu_{\hat{v}}\langle v,\hat{v} \rangle\\ 
&+ \sum_{i:y_i=+1} f_1
    \left( \Psi_p \left(\left\{f_2 \left(\sum_{v \in P_{S'}}\mu_{v} \langle v, z\rangle \right) \mid z \in x'_i \right\} \right)\right) \\
    &- s_t^{\top} \bmu
\end{align}
\EndFor \\
\Return{$\bmu_t$}
\end{algorithmic}
\end{algorithm}


\section{Example of the reduction of kernelized learning problems: multi-class learning}
\label{sec:example_ker}
\subsection{Reduction to MIL with kernel}
\begin{theo}
Multi-class learning with kernel is \GMIL-reducible.
\end{theo}
\begin{proof}
For any $(x,y)$, we define
\begin{align}
  \label{align:dk_z_ker}
\eta_{(x,y)} = (0_\Hilbert, \ldots, 0_\Hilbert, \underbrace{\Phi(x)}_{y\mathrm{-th~block}}, 0_\Hilbert, \ldots, 0_\Hilbert) \in \Hilbert^k,
\end{align}
where $0_\Hilbert$ is a point in $\Hilbert$ satisfying $\langle 0_\Hilbert, v \rangle=0$ for any $v \in \Hilbert$.
On the \GMIL-reduction scheme, 
suppose that
$p=\infty$; $f_1(c)=\Gamma(cC_1C_2)$; $f_2(c)=c/C_1C_2$; $\alpha(x,y) = (x'_{(x,y)}, y')$ where $x'_{(x,y)}=\{\eta_{(x,j)} - \eta_{(x,y)} \mid \forall j \in \calY \backslash y\}$; $y'=-1$;
$\calG=\{g: z \mapsto \langle (w'_1, \ldots, w'_k), z \rangle \mid \forall j \in [k], w'_j \in \Hilbert, \|W'\|_{\Hilbert^k} \leq C_1 \}$ where $W' = (w'_1, \ldots, w'_k)$, $\|W'\|_{\Hilbert^k} = \sqrt{\sum_{j=1}^k\|w'_j\|^2_{\Hilbert}}$.
Then, for any $(x,y)$ and $h \in \calH$,
\begin{align}
\ell'(x', y', h')
=&
f_1\left(
y' \Psi_p\left( 
\{
f_2 \left( g(z) \right) \mid z \in x'_{(x,y)}
\}
\right)
\right)
\\
=&
\Gamma\left(
- \Psi_\infty\left( 
\{
g(z) \mid z \in x'_{(x,y)}
\}
\right)
\right)
\\
=& \Gamma\left(
-\left(\max_{j \in \calY\backslash y}
\left(\langle W', \eta_{(x,j)} - \eta_{(x,y)} \rangle\right) \right)
\right)\\
=& \Gamma\left(
-\left(\max_{j \in \calY\backslash y}
\left(\langle w'_j, \Phi(x) \rangle - \langle w'_y, \Phi(x) \rangle\right) \right)
\right)\\
= &
\ell(x, y, h).
\end{align}
\end{proof}

\subsection{Construction of $\beta$}
By Theorem~\ref{theo:representer}, $W'$ is returned by using $\mu$ as
\begin{align}
    W' = \sum_{z \in P_{S'}} \mu_{z} z.
\end{align}
Moreover, $w'_j$ can be represented as:
\begin{align}
    w'_j = \sum_{z[j] \in P_{S',j}} \mu_{z[j]} z[j],
\end{align}
where $P_{S',j} = \{z[j] \mid z \in \bigcup_{i=1}^n x_i'\}$ and 
$z[j]$ is the $j$-th block of $z$. That is, $z[j]$ can be rewritten as $\Phi(\tilde{x}_j)$ for some $\tilde{x}_j$.
Note that, 
because $z$ is based on $\eta_{(x,y)}$ as shown in~\eqref{align:dk_z_ker},
$z[j]$ is in the Hilbert space $\Hilbert$ in the original problem.
Therefore, based on the relationship between $W'=(w'_1, \ldots, w'_k)$ and 
$W = (w_1, \ldots, w_k)$,
the hypothesis $h(x)$ in the original problem is
obtained by:
\begin{align}
    h(x) = &\arg\max_{j \in [k]} \langle w_j, \Phi(x) \rangle \\
    =&\arg\max_{j \in [k]} \langle w'_j, \Phi(x) \rangle \\
    =&\arg\max_{j \in [k]} \sum_{z[j] \in P_{S',j}} 
    \mu_{z[j]} \langle z[j], \Phi(x) \rangle \\
    =&\arg\max_{j \in [k]} \sum_{\tilde{x}_j} 
    \mu_{\tilde{x}_j} K(\tilde{x}_j, x).
\end{align}

\subsection{Reduction of other kernelized learning problems}
We can show that the other learning problems presented in this paper
can be kernelized.
For the other learning problems introduced in this study,
there are two types of domains of $z$: concatenations of Hilbert-space vectors
(complementarily labeled learning problems, multi-label learning, multi-task learning) and differences of Hilbert-space vectors (top-1 ranking learning).
For a difference of Hilbert-space vectors, that is, for $z = \Phi(x_1) - \Phi(x_2)$ and any $\Phi(x)$,
$\langle z, \Phi(x) \rangle$ can be computed as: 
\begin{align}
    &\langle z, \Phi(x) \rangle\\
    =&\langle \Phi(x_1) - \Phi(x_2), \Phi(x) \rangle\\
    =& K(x_1, x) - K(x_2, x),
\end{align}
and thus $h(x)$ is computed by $h'$ in polynomial time.

\section{Comparison to the existing generalization bound for complementarily labeled learning}
\label{sec:ishida-comparison}
\citet{ishida2017learning} stated that, for a linear-hypothesis class,
the following bound holds with a probability of at least $1-\delta$:
%\[
$\RMC_{\calD}(h) \leq \widehat{R}(h) + ak(k-1) \frac{C_1C_2}{\sqrt{n}} + (k-1)\sqrt{\nicefrac{8 \ln (2/\delta)}{n}}$.
%\]
They used the empirical risk $\widehat{R}(h)$ for complementarily labeled instances, which is
different from the risk that we defined \citep[see][for details]{ishida2017learning}.
Because of this difference,
the proposed generalization bound is incomparable to the
existing bound.
However, we can say that if we achieve a small empirical risk
close to zero,
the proposed risk bound
is $k$ times tighter than the existing bound.
% Hypothesis of the multi-class learning problem can be 
% represented as 
% \begin{align}
%     h: x \mapsto \arg\max_{y \in [k]} \sum_{\hat{x} \in S} \lambda_{\hat{x}, y} K(\hat{x}, x).
% \end{align}
% On the other hand,
% by Theorem~\ref{theo:representer}, the hypothesis of ERM of MIL is
% \begin{align}
%   h': x' \mapsto \max_{z \in x'} \sum_{v \in P_{S'}} \mu_v K'(v, z).  
% \end{align}
% \begin{align}
%     \sum_{v \in P_{S'}} \mu_v K'(v, z) = \sum_{j=1}^k\sum_{v \in P_{S'}} \mu_v K'(v, z)
% \end{align}

% $\sum_{v \in P_{S'}} \mu_v \tau(v)$.


\section{Artificial datasets on complementarily labeled learning}
\label{sec:art_data}
We prepared three datasets: artificial1, artificial2, and artificial3.
Each dataset has 1000 training and 1000 test instances.
The number of dimensions $d$ is 50.
The datasets have $k=5$, $10$, and $25$ classes, respectively.
The feature values of each instance are determined by the following rule:
if the instance belongs to class $j$, the $\{\frac{(j-1)d}{k} +1, \ldots, \frac{jd}{k} \}$-th features
have values drawn according to $\mathcal{N}(2, 1)$, and the other features have values drawn according to $\mathcal{N}(0, 1)$.

\bibliography{suehiro_124}

\end{document}
