% Document class: switch between the two lines below for the anonymous
% submission and the accepted (de-anonymised) version.
%\documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised (camera-ready) version
% Recommended, but optional, packages for figures and better typesetting:
\usepackage{microtype}
\usepackage{graphicx}
% NOTE(review): subfigure is deprecated in favour of subcaption; kept as-is
% because any subfigure usage later in the paper is not visible from here.
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables

% hyperref makes hyperlinks in the resulting PDF.
% If your build breaks (sometimes temporarily if a hyperlink spans a page)
% please comment out the following usepackage line.
% NOTE(review): this hint originally referred to \usepackage[nohyperref]{icml2023},
% a leftover from the ICML template; this file uses the uai2023 class instead.
\usepackage{hyperref}


% Attempt to make hyperref and algorithmic work together better:
% gives algorithm floats a plain arabic hyperref anchor name.
\newcommand{\theHalgorithm}{\arabic{algorithm}}

% For theorems and such
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{comment}

% cleveref is loaded after hyperref (required order); with these options
% \cref names are capitalised and never abbreviated.
\usepackage[capitalize,noabbrev]{cleveref}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% "plain" style: theorem is numbered within sections; proposition, lemma and
% corollary share the theorem counter.
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
% "definition" style: definition and assumption also share the theorem counter.
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
% "remark" style: remark shares the theorem counter, whereas dassumption and
% rem each get their own independent, document-wide counter.
% Note that both remark and rem are printed as "Remark".
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}
\newtheorem{dassumption}{Diet Assumption}
\newtheorem{rem}{Remark}
% Todonotes is useful during development; simply uncomment the next line
%    and comment out the line below the next line to turn off comments
%\usepackage[disable,textsize=tiny]{todonotes}
\usepackage[textsize=tiny]{todonotes}

\usepackage{multirow}

%\input{math_commands.tex}

% hyperref, amsmath, amsthm, amssymb, cleveref, booktabs, mathtools and
% graphicx are already loaded at the top of the preamble. Loading a package a
% second time with the same options is a no-op, so the repeated \usepackage
% lines that used to sit in this group have been removed.

\usepackage{url}

% Additional math fonts: \mathds (dsfont), \mathbbm (bbm) and \bm (bm).
\usepackage{dsfont}
\usepackage{wrapfig}
\usepackage{bbm}
\usepackage{bm}

% List environments.
% NOTE(review): paralist, enumerate and enumitem all customise the standard
% list environments; confirm that all three are really needed.
\usepackage{paralist}
\usepackage{enumerate}
\usepackage[inline]{enumitem}

\usepackage{ifthen}
\usepackage{caption}
\usepackage{setspace}

% Algorithm float (algorithm) and pseudocode commands (algorithmic).
\usepackage{algorithm}
\usepackage{algorithmic}


%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    % Typeset the bibliography heading as an unnumbered subsubsection.
    \renewcommand{\bibsection}{\subsubsection*{References}}

% Arrows with a calligraphic D / P stacked on top (convergence notation).
\newcommand{\convD}{\stackrel{\mathcal{D}}{\longrightarrow}}
\newcommand{\convP}{\stackrel{\mathcal{P}}{\longrightarrow}}

% Upright "PA" operator and calligraphic S.
\newcommand{\PA}{\operatorname{PA}}
\newcommand{\scm}{\mathcal{S}}

% Blackboard-bold V and P, and an upright "do" operator.
% \P already exists in LaTeX (the pilcrow sign), hence \renewcommand; the
% pilcrow is no longer reachable through \P after this line.
\newcommand{\VAR}{\mathbb{V}}
\renewcommand{\P}{\mathbb{P}}
\newcommand{\DO}{\operatorname{do}}

% \mat{...} wraps its argument in a pmatrix;
% \ind{A} typesets \mathbbm{1} (from bbm) with subscript A.
\newcommand{\mat}[1]{\begin{pmatrix} #1 \end{pmatrix}}
\newcommand{\ind}[1]{\mathbbm{1}_{#1}} %

% From here on every \epsilon is typeset as \varepsilon.
\renewcommand{\epsilon}{\varepsilon}
% Calligraphic (\c*) and bold upright (\b*) letter shorthands.
\newcommand{\cP}{\mathcal{P}} %
\newcommand{\cQ}{\mathcal{Q}} %
\newcommand{\cX}{\mathcal{X}} %
\newcommand{\cZ}{\mathcal{Z}} %
\newcommand{\bX}{\mathbf{X}} %
\newcommand{\bZ}{\mathbf{Z}} %
% Bold calligraphic math alphabet (OMS encoding, cmsy family, bold series).
\DeclareMathAlphabet\mathbfcal{OMS}{cmsy}{b}{n} %

% NOTE: despite its name, \widebar is just an alias for the narrow \bar accent.
\newcommand{\widebar}[1]{\bar{#1}} %


% Shorthand for \nonumber.
\newcommand{\nn}{\nonumber}
%\def\E{{\rm E}\,}
% Upright "arg", "Cov" and "Supp" followed by a thin space.
% NOTE(review): \def performs no redefinition check, so \def\arg silently
% replaces LaTeX's built-in \arg operator; {\rm ...} is the obsolete font switch.
\def\arg{{\rm arg}\,}
\def\Cov{{\rm Cov}\,}
%\def\N{{\rm N}\,}
\def\Supp{{\rm Supp}\,}
% Bold upright letters for vectors/matrices: \X -> bold X, \x -> bold x, etc.
% (\X and \Z produce the same output as \bX and \bZ.)
\newcommand{\X}{\mathbf{X}}
\newcommand{\Y}{\mathbf{Y}}
\newcommand{\Z}{\mathbf{Z}}
\newcommand{\W}{\mathbf{W}}
\newcommand{\Q}{\mathbf{Q}}
\newcommand{\A}{\mathbf{A}}
\newcommand{\B}{\mathbf{B}}
\newcommand{\D}{\mathbf{D}}
\newcommand{\T}{\mathbf{T}}
\newcommand{\F}{\mathbf{F}}
\newcommand{\M}{\mathbf{M}}
% NOTE(review): \O, \H and \r are standard LaTeX text commands (the letter
% O-slash, the double-acute accent and the ring accent). After the
% \renewcommand lines below they are no longer available in text, so author
% names in the text or bibliography that rely on them would break -- confirm
% that none are used.
\renewcommand{\O}{\mathbf{O}}
\renewcommand{\H}{\mathbf{H}}
\newcommand{\g}{\mathbf{g}}
\renewcommand{\r}{\mathbf{r}}
\newcommand{\e}{\mathbf{e}}
\newcommand{\s}{\mathbf{s}}
\newcommand{\uu}{\mathbf{u}}
\newcommand{\w}{\mathbf{w}}
\newcommand{\x}{\mathbf{x}}
\newcommand{\z}{\mathbf{z}}
% Bold Greek letters via \boldsymbol.
\newcommand{\Beta}{\boldsymbol{\beta}}
\newcommand{\btheta}{\boldsymbol{\theta}}
\newcommand{\bgamma}{\boldsymbol{\gamma}}
\newcommand{\bpi}{\boldsymbol{\pi}}
% Right arrow with a small "p" stacked on top.
\newcommand{\arrowp}{\stackrel{p}{\rightarrow}}
% Bold zero and bold P.
\newcommand{\0}{\mathbf{0}}
\newcommand{\bP}{\mathbf{P}}
% Empty heading for the list of figures.
\renewcommand{\listfigurename}{}


%\newcommand{\tens}[1]{%
%  \mathbin{\mathop{\otimes}\displaylimits_{#1}}%
%}



%\newcommand{\I}{\mathcal{I}} % influence function
\newcommand{\I}{\mathbb{I}} % indicator function
% Double-perpendicular symbol built from two overlapping \bot glyphs.
\newcommand{\indep}{\bot\!\!\!\!\bot}
\newcommand{\N}{\mathcal{N}}
% \numberthis: advance the equation counter and tag the current line with it.
\newcommand\numberthis{\addtocounter{equation}{1}\tag{\theequation}}
%

%

% Blackboard-bold E.
\newcommand{\E}{\mathbb{E}}

% Primary (outer) and auxiliary (inner) losses of the bi-level problem.
\newcommand{\LP}{\mathcal{L}_\text{Prim}}
\newcommand{\LA}{\mathcal{L}_\text{Aux}}

\newcommand{\R}{\mathcal{R}}

\newcommand{\St}{\mathbf{S}}
%\newcommand{\td}[1]{\textcolor{red}{TODO: #1}}
\newcommand{\LT}{\mathcal{L}_2}

% \tens: \otimes as a binary operator. The e{_^} argument spec collects an
% optional following subscript (#1) and superscript (#2); \displaylimits
% places them as limits in display style.
\NewDocumentCommand{\tens}{e{_^}}{%
  \mathbin{\mathop{\otimes}\displaylimits
    \IfValueT{#1}{_{#1}}
    \IfValueT{#2}{^{#2}}
  }%
}

% Starred form: subscripts of \argmin / \argmax are set underneath in display
% style. \Var is an ordinary operator.
\DeclareMathOperator*{\argmin}{argmin}
\DeclareMathOperator*{\argmax}{argmax}
\DeclareMathOperator{\Var}{Var}

% \Eb produces the same symbol as \E (extra brace group only); \En is a calligraphic E.
\newcommand{\Eb}{{\mathbb{E}}}
\newcommand{\En}{{\mathcal{E}}}
% \rom{n}: the number n as an upper-case roman numeral.
\newcommand{\rom}[1]{\uppercase\expandafter{\romannumeral #1\relax}}

\title{Implicit Training of Inference Network Models for Structured Prediction}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% NOTE(review): the mailto target below still looks like the template
% placeholder (<jj@example.edu>, including the angle brackets); replace it
% with the author's real address or drop the \href.
\author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2023 paper}{Shiv Shankar}{}}
% Add affiliations after the authors
\affil[1]{%
   University of Massachusetts, USA}
  
\begin{document}
\maketitle
\begin{abstract}

Most research in deep learning has predominantly focused on the development of new models and training procedures. In contrast, the exploration of training objectives has received considerably less attention, often limited to combinations of standard losses. When dealing with complex structured outputs, the effectiveness of conventional objectives as proxies for the true objective can be questionable. In this study, we propose that existing inference network-based methods for structured prediction, as observed in previous works \citep{tu-18, tu2020improving}, indirectly learn to optimize a dynamic loss objective parameterized by the energy model. Based on this insight, we propose a method that treats the energy network as a trainable loss function and employs an implicit-gradient-based technique to learn the corresponding dynamic objective. We experiment with multiple tasks such as multi-label classification, entity recognition, etc., and find significant performance improvements over baseline approaches. Our results demonstrate that implicitly learning a dynamic loss landscape proves to be an effective approach for enhancing model performance in structured prediction tasks.


%
\end{abstract}


\section{Introduction}
%\vihari{Need not start with the broad introduction of ML. Instead of talking about ``meta-learning people'', there has been some interest in exploring alternate loss objectives.}\\
Deep neural networks have achieved widespread success in a multitude of applications such as translation \citep{vaswani2017attention}, image recognition \citep{he2016identity} and many others. This success has been enabled by the development of backpropagation based algorithms, which provide a simple and effective way to optimize a loss calculated on the training set.
Generally, a large portion of existing work has focused only on the design of models and optimization algorithms. However, with the increased prevalence of meta-learning, researchers are exploring new loss objectives and training algorithms~\citep{wu2018learning,huang2019addressing}.

%\vihari{Not convincing, what is the training objective you have in mind and why is ERM not suitable?}\\
Intuitively, one would like to choose objectives that can dynamically refine the kind of signals they produce for a model to follow, in order to guide the model towards a better solution. Oftentimes standard objectives are pretty effective at this; however, these objectives have generally been explored for simple predictions. When dealing with complex outputs, there is significant scope for improvement by designing better training objectives. A good example is structured prediction \citep{belanger2016structured}, where the output includes multiple variables and it is important to model their mutual dependence. One natural candidate is to use the likelihood under a probabilistic model that captures this dependence. Such models, though, cannot be used to efficiently predict the output and require inference.

An ideal loss function in this case would naturally guide the model towards incorporating the output correlations while allowing a more standard feed-forward or similar predictive model to quickly and efficiently produce the output. Energy-based structured prediction \citep{belanger2016structured} provides a natural framework in which one can explore learned losses by using the energy itself as the training objective. Existing works \citep{tu-18,tu2020improving} have focused on learning prediction networks to directly predict structured outputs, and not on the energy-based objective itself.

\textbf{Contributions} This work explores the thread of learning dynamic objectives for structured prediction. Using the insight of \citet{hazan2010direct}, we connect the existing paradigm of \citet{tu2020improving,lee2022structured} to a surrogate loss learning problem. This allows us to identify a key problem with the approach of \citet{tu2020improving,lee2022structured}, namely that it uses an incorrect gradient for the surrogate objective problem. Building on this idea, we propose to use implicit gradients \citep{krantz2002implicit} for learning an energy-based structured prediction model. We then use ideas from \citet{christianson1992automatic} to compute gradients at scale for the corresponding optimization problem. The experimental results show the effectiveness of our methods against SOTA baselines on three tasks and nine datasets.

%e ideas from \citet{lorraine2020optimizing} 

%to scalably deploy the implicit gradient method to solve the aforementioned problem.

%This brings us to the problem at hand: designing new loss objectives for structured prediction. 


% \vihari{what advantage do dynamic loss have over static loss and why this specific application of structured output? The transition to ``bi-level optimization'', ``implicit gradients'' should be better paced with more explanation with least amount of notation. And then what is the connection with Meta-learning?}\\

% \citet{rajeswaran2019meta} use the implicit-gradient method for meta-learning multiple tasks while \citet{lorraine2020optimizing} consider hyper-parameter tuning. Instead our paper looks at applying implicit gradient methods to learn energy functions (which serve as the training objective) for significantly more complex predictive tasks.

%\vihari{Intro should also contain a brief summary vital previous work and their drawbacks. Why do we expect to do better?}\\



\section{Preliminaries}




\subsection{Learning a Loss Function}
We describe in this section a formulation for learning a dynamic loss function. This loss function, which we call the auxiliary loss, is used to train a \emph{model}. The model trained on this auxiliary loss is then evaluated on a different loss function, called the primary loss. This second loss function is called the primary loss because it is the ``true'' loss of concern to the user. For example, in a standard supervised learning setting, the primary loss could be the performance of the model on a validation set. The goal then is to learn the auxiliary loss in such a way that the learnt model's performance, as measured by the primary loss, is optimized.
If the model is denoted by $f_\theta$ with parameters $\theta$, auxiliary loss by $\LA$, and primary loss by $\LP$; then this problem can be written as:
\begin{align}
\begin{aligned}
    &\min_{\LA} \LP(\argmin_\theta \LA(\theta)) 
\end{aligned}
\end{align}
Variants of the same formulation have been explored for supervised learning \citep{huang2019addressing,wu2018learning} and reward learning \citep{Bechtle19,zheng2018learning}.
The outer problem is technically a problem of optimization over the space of functions. To be able to solve this computationally, the auxiliary loss $\LA$ is often parameterized with some parameters $\phi$; changing the problem to learning $\phi$. 

\begin{align}
\label{eqn:prob_form}
\begin{aligned}
    &\phi^* = \argmin_\phi \LP(\theta^*(\phi), \phi) \\
    &\text{such that} \\
    &\theta^*(\phi) = \argmin_\theta \LA(\theta, \phi)
\end{aligned}
\end{align}



\subsection{Implicit Gradient Method}
The aforementioned problem is a bi-level optimization problem. In such a case, a parameter ($\phi$) that influences $\LA$ can influence the primary objective $\LP$ via the dependence of the inner optimized parameters $\theta^*$ on $\phi$.
The implicit gradient method \citep{krantz2002implicit,dontchev2009implicit} provides a way to compute the gradient of $\LP$ with respect to $\phi$ due to this implicit dependence.

For the problem given in Equation \ref{eqn:prob_form}, under certain regularity conditions, $\LP$ is a differentiable function of $\phi$ and its gradient is given by:
\small
%\begin{small}
\begin{align}
&\frac{\partial \LP}{\partial \phi} = \underbrace{\frac{\partial (\LP(\theta^*(\phi), \phi) )}{\partial \phi}}_{\text{Explicit gradient}} - \notag \\
&\underbrace{\left[ \frac{\partial}{\partial \phi} \frac{\partial}{\partial \theta}(\LA(\theta,\phi)) \right] \left[ \frac{\partial}{\partial \theta} \frac{\partial}{\partial \theta}(\LA(\theta,\phi)) \right]^{-1} \frac{\partial ( \LP(\theta^*(\phi), \phi) )}{\partial \theta}}_{\text{Implicit gradient}}
\label{eqn:grad_thm}
\end{align}
%\end{small}
\normalsize
The existence of the gradient follows from Theorem 2G.9 in \citet{dontchev2009implicit}. The derivation of Equation~\ref{eqn:grad_thm} is presented in the Appendix.
As can be seen from the above equation, the true gradient has two terms: a standard component $\partial_\phi \LP$ and the implicit component due to the dependence of the optimal $\theta$ on $\phi$. We will sometimes abuse terminology and call the latter term the implicit or meta gradient.

\section{Structured Prediction with Dynamic Loss}

In this section we provide a brief overview of structured prediction, before we present a bi-level optimization based method for structured prediction. Unlike standard classification, structured prediction deals with predicting a multivariate structured output such as multi-label outputs, semantic labeling etc.

In this work we consider a structured prediction task as learning a mapping from an input space $\mathcal{X}$ to an exponentially large label space $\mathcal{Y}$. The quality of a predicted output is determined by the score function $ s: \mathcal{Y} \times \mathcal{Y} \rightarrow \mathbb{R}$. The score function is used to compare the gold output $y \in \mathcal{Y}$ with another output $ y' \in \mathcal{Y} $ and can be interpreted as a measure of how good $ y' $ is compared to $y$. Some common scoring functions are BLEU, used for translation~\citep{papineni2002bleu}, Hamming distance for comparing strings, and F1-score for multilabel classification tasks~\citep{kong2011multi}.

While in principle a powerful enough neural network can learn to directly predict the structured output, in practice such networks often do not perform well due to a) ignoring the dependencies between the labels and b) not using or optimizing for the task loss.
%One method to fix both these issues is to provide extra information to the prediction network v

\paragraph{Energy Based Structured Prediction}
Energy-based structured prediction aims to alleviate these challenges by proposing to learn an energy network $E_{\phi}: \mathcal{X} \times \bar{\mathcal{Y}} \rightarrow \mathbb{R}$ which provides the energy for pairs of inputs $x$ and outputs $y$. Here $\bar{\mathcal{Y}}$ refers to a suitable relaxation of $\mathcal{Y} \subseteq \{0, 1\}^L$ to a continuous space $\bar{\mathcal{Y}} \subseteq [0, 1]^L$. The energy network is trained to assign the correct output $y$ lower energy than incorrect outputs. At test time, predictions are recovered for an input $x$ by finding the structure $y$ with the lowest energy \citep{belanger2016structured}. Training energy models this way requires inference during training to produce high-scoring negative samples $\bar{y}$. With such negative samples, a common practice is to use the SSVM loss \citep{ssvm}.


\begin{align}
\begin{aligned}
\min_\phi \underbrace{[s(\bar{y}, y) - E_\phi(x,\bar{y}) +  E_\phi(x,y)]_+}_{I}
\end{aligned}
\label{eqn:prob_orig}
\end{align}

SPEN \citep{belanger2016structured} proposes using gradient-based inference to find $\bar{y}$, which makes training and inference slow.
To make this efficient, \citet{tu-18, tu2020improving} propose using \emph{inference networks} $F_\theta$ and $A_\theta$ to directly predict the output. $F_\theta$ is the cost-augmented inference network that is used only during training, i.e., its output is used as $\bar{y}$ in Equation~\ref{eqn:prob_orig}. As such, $F_\theta$ is trained to maximize the objective in Equation~\ref{eqn:prob_orig} with respect to $\bar{y}$. On the other hand, the goal of $A_\theta$ is to predict the output during testing, and as such $A_\theta$ is trained to minimize the energy.

%This inference problem involves finding an output with low energy but high cost relative to the gold standard. Thus, it is not well- aligned with the test-time inference problem
%The goal of $F$ is to output candidates $\bar{y}$ with low energy that also have a high task loss. These are then used as effective negative samples to update the energy $E_\phi$.  



\subsection{Using Energy as Proxy Loss}
An ideal predictor would be directly minimizing the structured loss $s$. However, due to the nature of many real-life structured losses (like BLEU, F1, IoU), and due to the often discrete nature of the output space, performing such an inference is intractable. 
A natural alternative then is using deep networks to build a proxy or surrogate loss. In fact, the classic cost-sensitive hinge/margin loss used in \citet{ssvm} (i.e., Equation~\ref{eqn:prob_orig}) is a convex surrogate of the true cost \citep{hazan2010direct}. Similarly, the value network method of \citet{gygli2017deep} aims to learn a differentiable energy network which directly predicts the score/task-loss of an output. This suggests that the training of the energy network in energy-based structured prediction methods is an indirect way to learn a surrogate loss function. This can also be seen from the fact that the energy function $E$ appears additively in the training objective for the inference net $A$.


 % which is distinct from the model. 

%learning hyper-parameters \citep{franceschi2018bilevel} or learning policies for parameter update \citep{maclaurin2015gradient, l2l, li2016learning, franceschi2017forward, meier2018online, daniel2016learning}. \citet{rajeswaran2019meta} use implicit-gradient based methods in a MAML setting.
% 



%Graph SPEN (Graber and Schwing, 2019) which achieves the SOTA results for structured prediction (on similar tasks) has not been included in the paper.
% function, and optimize it to guide the inference network.





%While in principle a powerful enough neural network should be able to predict the output; in practice modeling the dependencies between output labels is often important to achieve good performance.
%Next building upon the recent work of \citet{tu2019benchmarking, tu2020improving} we present a bi-level optimization method for structured prediction based on learning energy based models. 





%\citet{hazan2010direct} prove that for linear energy models, the term A in Equation \ref{eqn:prob_orig}



Surrogate loss learning can be formulated as a bi-level optimization with the outer optimization over loss function parameters $\phi$ constantly updating itself to provide better feedback to the prediction model (as discussed in Section 2). Under this view the margin loss based training can be interpreted as the following bi-level objective:

\[
\min_{\phi} \LP(\theta(\phi),\phi) \quad \text{ s.t. } \quad \theta(\phi) = \argmin_\theta \LA(\theta, \phi)
\]
where 
\[ \LA(\theta, \phi) = - s(F_\theta(x), y) + E_\phi(x, F_\theta(x)) + \lambda  E_\phi(x,  A_\theta(x)) \]
and 
\iffalse{
\begin{align*}
\LP(\theta, \phi) = \left[ & s(F_\theta(x), y) - E_\phi(x, F_\theta(x)) + E_\phi(x, y) \right]_+ \\ 
&+ \lambda \left[ -E_\phi(x, A_\theta(x) +  E_\phi(x, y) \right]_+
\end{align*}
}\fi
\begin{align*}
\LP(\theta, \phi) = [ & s(F_\theta(x), y) - E_\phi(x, F_\theta(x)) + E_\phi(x, y) ]_+ \\ 
&+ \lambda [ -E_\phi(x, A_\theta(x)) +  E_\phi(x, y) ]_+
\end{align*}

%They also added the second penalty term in $\LP(\theta, \phi)$. These two changes helps stabilize training by removing the purely adversarial nature of the original formulation of \citep{tu-18}. Our proposal builds off on these insights by further decoupling the losses.


%, partly becase the alternate optimization procedure uses 'biased' gradients of the outer-objective.

The interpretation of the training procedure is that at each step, the inference network is trained to predict an incorrect $\bar{y}$ with low energy and then the energy network is updated to guide the inference network to a newer solution. Next we note that under a well trained $E$ one does not need two different networks $A,F$, and so we combine the two of them in the same network. To train this model, we use gradient descent based optimization, however, instead of backpropagating through the gradient steps, we use the implicit gradient method to obtain the gradients of $\phi$. 



%\begin{rem}
%\label{rem:3}
Under our interpretation, the procedure of \citet{tu2020improving,lee2022structured} uses biased gradients during the update of $\phi$. Specifically, since they only use $\partial_\phi \LP(\theta, \phi)$, their gradient for $\phi$ misses the second term (labeled implicit gradient) in Equation~\ref{eqn:grad_thm}, which captures the influence due to the implicit dependence of $\theta$ on $\phi$. In particular, the mixed derivatives serve the purpose of mapping changes in $\phi$ and $\theta$ into each other. The presence of the inverse Hessian in the missing term provides insight into why the bi-level approach can be superior. Note that the condition number of the Hessian is a useful measure of the hardness of an optimization problem, and an ill-conditioned Hessian would cause the missing term to explode, something which the adversarial training process of \citet{tu2020improving} ignores. We present a more detailed discussion of this in the Appendix.
%\end{rem}



One can observe that in the above optimization for $\theta$, the observed outputs $y$ only appear directly in the score function $s$. If $s$ is not differentiable (which is usually the case), updates to $A_\theta$ rely on the function $E$. However, during the initial steps of training, $E_\phi$ would not yet have learned to score the true output correctly. Thus the model $A_\theta$ will receive poor supervision. To alleviate this issue, we add direct supervision from the output $y$ in $\LA$. In this case we use the output of $A_\theta$ to construct a distribution which is updated via the cross-entropy (CE)/log-likelihood (MLE) loss.

%Further note that in practice the outputs of  the cost-augmented inference model $F$ and basic in $A$ tend to be similar \citep{tu2019benchmarking,tu2020improving}. This is not very surprising given that the objectives for the two networks just differ by the score function $s$. This effect is even stronger when the networks use a shared feature representation which is usually the case. Hence, we use the same function $A$ use the same function for both $F$ and $A$. 




%\begin{rem}
While one can use different parameters $\theta,\theta'$ to parameterize the $F,A$ networks respectively, for a well-trained energy model these networks are not very dissimilar in behaviour. Furthermore, \citet{tu2020improving} also found sharing parameters between $F$ and $A$ helpful. Hence we also treat these as the same network.
Putting these changes (i.e. merging of inference networks and addition of MLE loss) together we get the following auxiliary objective:

\begin{align}
\begin{aligned}
\LA = \mathcal{L}_{MLE}(y, A_\theta(x)) + \lambda E_\phi(x, A_\theta(x)) 
%\sum\limits_{j=1}^{L} \underbrace{ -y^j \log((A_\theta(x))^j) - (1-y^j)\log(1 - (A_\theta(x))^j) }_{MBCE(y, A_\theta(x))} + \lambda E_\phi(x, A_\theta(x)) 
\end{aligned}
\label{eqn:new_la}
\end{align}

\noindent
where $\mathcal{L}_{MLE}$ is a log-likelihood/cross-entropy based loss and $\lambda$ is a hyperparameter.\footnote{If we replace $F$ by $A$ in the $\LA$ objective above, both energy-based terms become the same; next, since $s$ is not directly optimizable, we replace it with $\mathcal{L}_{MLE}$ for supervision.}
%where $MBCE$ is just the multi-label binary cross entropy loss ( defined to be the sum of cross entropy loss for each component of the output ) \footnote{We used MBCE here only for illustration on multi-label classification} and $\lambda$ is a hyperparameter.

\begin{rem}
Unlike standard meta-learning problems, where the outer parameter $\phi$ is used as the initialization point of the model, here we can directly use the learned inference network $A_\theta$ for prediction. However, one can refine the final $\theta$ on a validation set, or attempt to refine the output of network $A_\theta(x)$ via gradient descent on the energy $E_\phi$. We do not use the validation set for further refinement in our experiments. 
\end{rem}


\begin{rem} While ideally the inference networks would be predicting binary vectors, in practice they are used to predict soft examples. The energy function can then be updated either on the real-valued vectors or on hard outputs obtained via sampling, rounding or other methods.
\end{rem}

\subsection{Scalable computation of the implicit-gradient}
An astute reader might note that computing the gradient given in Equation \ref{eqn:grad_thm} directly requires the Hessians $\frac{\partial}{\partial \theta} \frac{\partial}{\partial \theta}(\LA(\theta,\phi))$ and $\frac{\partial}{\partial \phi} \frac{\partial}{\partial \theta}(\LA(\theta,\phi))$. While computing the Hessians can be compute-intensive if the dimensionality of the parameters $\theta , \phi$ is large; computing the inverse Hessian is prohibitively more so.
An alternative method is to differentiate through the optimization procedure, however that severely limits the number of optimization steps one can conduct. Moreover truncated optimization will induce its own biases \citep{vollmer2016exploration}. 

Fortunately, we do not need to compute either of the two matrices. Instead we only need the vector products of these Hessian matrices (HVP) with the gradient $\frac{\partial ( \LP(\theta^*(\phi), \phi) )}{\partial \theta} $. Efficiently doing such operations is a well-researched area with numerous methods \citep{christianson1992automatic, vazquez2011new, song2022modeling}.
The given expression can be decomposed into first computing an inverse-Hessian vector product (iHVP) with the Hessian $\frac{\partial}{\partial \theta} \frac{\partial}{\partial \theta}(\LA(\theta,\phi))$, and then computing an HVP of the result with the cross-Hessian $\frac{\partial}{\partial \phi} \frac{\partial}{\partial \theta}(\LA(\theta,\phi))$.
For the inverse Hessian, we use the von Neumann expansion method suggested in \citet{lorraine2020optimizing}. This allows one to convert an iHVP with a matrix $H$ into a polynomial in HVPs with the same matrix $H$ (details in the Appendix). % \ref{apx:approx}
Once every requisite operation has been turned to HVP,  we can use auto-differentiation on perturbed parameters (i.e. finite step divided difference approximation). 


\subsection{Primary Loss Design}
\label{sec:struct}
An advantage of framing this problem as a bi-level optimization is that, unlike \citet{tu2020improving}, where the objectives being used for training $\phi,\theta$ are by construction adversarial, we can now use different objectives for our primary and auxiliary losses. We implicitly already used this fact when we added the binary cross entropy loss to $\LA$ and wrote a slightly different form for $\LA$ in Equation~\ref{eqn:new_la}. However, we also have the freedom to choose the primary loss $\LP$, which can result in different behaviour for the models. In fact, the structured prediction literature has explored a variety of losses for training energy models. We mention a few of these which we use as $\LP$ in our experiments. Some of these have also been explored for structured prediction by \citet{lee2022structured}. In this section we shall often use $\bar{y}$ to denote an element from $\mathcal{Y}$ which is distinct from the true output $y$.

\textbf{Hinge/SSVM Loss}. Early structured prediction models were often trained with a version of the hinge loss adjusted for the score function \citep{ssvm}. In current parlance it is also known as the margin loss. This is one of the components of the loss used in \citet{tu2020improving}. It is given by the following equation:
\[\mathcal{L}_\text{SSVM} =\left[ s(\bar{y}, y) - E_\phi(x,\bar{y}) + E_\phi(x, y) \right]_+\]


\textbf{DVN Loss}. A natural candidate for learning the energy network is to use it as a differentiable proxy for the score function $s$. This can be done by matching the energy values to the task loss, i.e., $E(x,\bar{y}) \approx s(\bar{y},y)$. \citet{gygli2017deep} proposed to normalize the score function $s$ and use a cross entropy loss to match the energy $E$ with it.


\[ \begin{aligned}
\mathcal{L}_\text{DVN} = & -s(\bar{y}, y) \log( - E_\phi(x,\bar{y})) \\
&- (1 -s(\bar{y}, y))  \log( 1 + E_\phi(x,\bar{y}))
\end{aligned}
\]


\textbf{Contrastive Divergence}. The literature in probabilistic inference has proposed various losses for maximum likelihood estimation of energy models \citep{gutmann2010noise}. A common loss for such training is the contrastive-divergence \citep{hyvarinen2005estimation} based loss, which uses samples to approximate the log-likelihood of the model. We use a similar loss augmented with the score function $s$, as shown below.

\[\mathcal{L}_{CD} =  \log \dfrac{\exp(-E_\phi(x,y))}{\sum\limits_{k=0}^K \exp(-E_\phi(x,\bar{y}_k) + s(\bar{y}_k, y))}\]
 
where $\bar{y}_{1..K}$ refers to $K$ possible negative (non-true output) samples and $\bar{y}_0 = y$.


\textbf{Noise-Contrastive Loss}. We also experiment with a modified version of the $\mathcal{L}_{CD}$ loss above which is inspired by noise contrastive estimation \citep{ma2018noise}. \citet{lee2022structured} have also used this version to train structured energy networks.

\[\mathcal{L}_{NCE} =  \log \dfrac{\exp(-E_\phi(x,y) - \log P(y|x))}{\sum\limits_{k=0}^K \exp(-E_\phi(x,\bar{y}_k) + s(\bar{y}_k, y) - \log P(\bar{y}_k|x))}\]
 
where $\bar{y}_{1..K}$ once again refers to $K$ possible negative (non-true output) samples and $\bar{y}_0 = y$. $P(\bar{y}_k|x)$ is the probability of the value $\bar{y}_k$ as estimated by the predictive inference net under the assumption that its components are independent, i.e., $P(\bar{y}_k|x) = \prod_i P_\theta(\bar{y}^i_k|x)$.

During training, $\bar{y}$ in the aforementioned objectives gets replaced by the prediction of the inference net $A_\theta(x)$. When multiple values are required (such as for $\mathcal{L}_{CD}$), we obtain the samples by interpreting each continuous output $A_\theta(x)_j$ as the parameter of a Bernoulli random variable and drawing samples from the corresponding distribution.

\begin{rem}
Learnt loss functions have been used in literature for the outer objective \citep{Bechtle19}. However, these are also loss objectives used to train the prediction model ($\LA$ in our notation). In this work, predictions are obtained from the inference network $A_\theta$, which is trained by optimizing the energy function $E$. Hence we call $E$ dynamic loss in the latter sense. 
\end{rem}





Now we are in a position to state our exact proposal to train structured prediction models. Our proposed method is summarized in Algorithm \ref{alg:struct_train}. The network $E$ is trained by an energy-learning-based objective to learn a landscape that incorporates signal from the task loss and reflects the dependencies among output variables. An energy optimum is indicative of a good prediction satisfying high similarity with the true output while respecting statistical dependence between labels. The inference net gets trained to predict an optimum of the energy $E$. The algorithm updates the inference network in the direction of reducing energy, and the energy serves as a surrogate loss.

\begin{algorithm}
\caption{Implicit Gradient for structured prediction}\label{alg:struct_train}
\begin{algorithmic}[h]
\REQUIRE Energy Network $E_\phi$, Inference Network $A_\theta$,\\
Regularization $\lambda$,  Training Data $\mathcal{D} = \{x_i,y_i\}$, \\ Inner/Outer Iterations $T_\text{inner},T_\text{outer}$\\
%\hline \\ %rulefill\\
\STATE Sample $\theta_0,\phi_0$ randomly
\FOR{$ t \in T_\text{outer}$ iterations}
   \STATE Obtain sample $x,y$ from $\mathcal{D}$
   \STATE $\theta_p \leftarrow \theta_t$
   \FOR{$p \in T_\text{inner}$ iterations }
   \STATE Compute $\LA (\theta_p, \phi_t)$
   \STATE $\theta_p \leftarrow \theta_p - \eta \nabla \LA (\theta_p, \phi_t)$
   \ENDFOR 
   \STATE $\theta_t \leftarrow \theta_p$
   \STATE Compute $\LP (\theta_t, \phi_t)$
   \STATE Compute $ g = \frac{d}{d \phi} \LP (\theta_t, \phi_t)$ via Equation \ref{eqn:grad_thm}
   \STATE $\phi_{t+1} \leftarrow \phi_{t} - \eta g$
   \STATE $\theta_{t+1} \leftarrow \theta_{t}$
\ENDFOR
   
\STATE Return resulting model $A_\theta$
\end{algorithmic}
\end{algorithm}


\subsection{Connection to Dynamic Losses}
A key distinction between our proposed method, employing the $\mathcal{L}_{SSVM}$ loss, and the InfNet+ approach suggested by \citet{tu2020improving} lies in the utilization of the implicit gradients/bi-level optimization method instead of alternative independent optimization. It is worth noting that when training via independent optimization, the parameters $\phi$ are updated on $\partial_\phi \LP$. However, Equation \ref{eqn:grad_thm} reveals that $\partial_\phi \LP$ represents only one component of the true gradient, as the true gradient incorporates an additional term (Line 1 of Equation \ref{eqn:grad_thm}). This additional term captures the indirect influence of $\theta$ on $\LP$, as $\theta$ is also dependent on $\phi$.

As such, standard energy-based training methods use biased gradients, which can lead to improper solutions to the underlying problem. However, by utilizing the correct gradient, the energy optimization ($\phi$) becomes explicitly aware of and receives feedback from the inner optimization process of the inference network. To emphasize the distinction this causes from the standard training procedure of energy-based models (EBMs) described by \citet{tu2020improving}, consider a scenario where the energy parameters $\phi$ have fully optimized $\LP$ (e.g., through margin loss).
Suppose that at the current parameters $\theta$ of the inference net $A$, the energy of the output $E(x, A(x))$ has a singular hessian $\partial^2_\theta E_\phi(x, A_\theta(x))$. 
Since the condition number of the Hessian often serves as a reliable indicator of the difficulty of convex optimization (e.g., gradient descent-based optimization becomes slow in the presence of a flat surface), it is reasonable to infer that finding the optimal $y$ is challenging within the current energy landscape.

Under the standard EBM training, as the updates solely depend on $\partial_\phi \LP$, the energy network remains oblivious to the difficulty associated with predicting $y$. Consequently, when $\LP$ approaches optimality, the energy network ceases to receive updates. Conversely, in our framework, the update to the energy function is not solely determined by $\partial_\phi \LP$. It is important to note that the first term in Equation \ref{eqn:grad_thm} relies on the inverse Hessian of the loss with respect to $\theta$. Therefore, if the Hessian of the inference network exhibits ill-conditioning, a significant gradient for the energy network emerges. This enables the energy function to discover a landscape that is adapted to the behavior of the inference network.

In this sense, our energy function behaves akin to a learned dynamic loss \citep{wu2018learning, Bechtle19}, actively adjusting itself to the prediction network's behavior.

\section{Experiments}

%We evaluate our method using classic structured prediction tasks of multi-label classification (MLC) and sequence-labelling ( POS tagging and NER). 

\paragraph{Multi-Label Classification}
We use the following multi-label classification datasets for testing our model: bibtex \citep{katakis2008multilabel}, delicious \citep{tsoumakas2008effective}, eurlexev \citep{mencia2008efficient}.
The performance metric is F1 score , which is also the score function used for training our models. The max-likelihood loss $\mathcal{L}_{\text{MLE}}$ in this case is given by the multi-label binary cross entropy (MBCE). We use the output of $A$ as a vector of Bernoulli variables, and MBCE is then just the sum of logistic losses over the individual components of $y$.
\begin{align*}
\mathcal{L}_{\text{MLE}} = \sum\limits_{j=1}^{L} -y^j \log((A_\theta(x))^j) - (1-y^j)\log(1 - (A_\theta(x))^j)
\end{align*}

%literature \citep{tsoumakas2009mining,kong2011multi,sarawagi2008accurate, belanger2016structured}.   

For fair comparison with earlier works on these datasets, we used the energy network design of \citet{belanger2016structured}.
The corresponding energy function $E_\phi$ is parameterized as:
$$
E_{\phi} = y^T Wb(x) + v^T\sigma(My)
$$
where the parameters $\phi$ comprise $\{ W, v, M, b \}$. Network $b$ is defined by a multilayer perceptron. A similar multilayer perceptron forms the basis of the inference network $A_\theta$.


\begin{table}[]
\centering
\small
\begin{tabular}{l|l||cccc}
\hline
\multicolumn{2}{c||}{Method} & \multicolumn{4}{c}{Dataset}             \\
\hline
     &  & BibTex & Delicious & Eurlexev & Bookmark\\ 
\hline
\hline
\multirow{5}{*}{Slow} & SPEN   & 43.12  & 26.56     & 41.75  & 34.4 \\ 
 & NCE   & 20.12  & 16.97     & 19.50   & - \\ 
 & DVN    & 42.73  & 29.71     & 31.90  & 37.1 \\ 
 & ALEN &  46.4   & - & - &  38.3 \\
 & GSPEN & \textbf{48.6} & - & - & \textbf{40.7} \\
\hline
\multirow{6}{*}{Fast} & MBCE     & 42.47  & 30.12     &
43.25 & 33.8 \\ 
& iALEN &  42.8   & - & - &  37.2 \\
& $\mathcal{L}_\text{SSVM}$  & 44.55  & 30.34     & 42.50  & 37.9\\ % 
& $\mathcal{L}_\text{DVN}$    & 44.94  & 28.87     & 42.35 & 38.1   \\ %
& $\mathcal{L}_\text{CD}$     & 45.76 & 34.50    & 42.9  & \textbf{38.5}$\dagger$ \\
& $\mathcal{L}_\text{NCE}$     & \textbf{46.21}$\dagger$  & \textbf{35.12}     & \textbf{43.49}  & \textbf{38.5}$\dagger$ \\ 

\hline

\end{tabular}
\caption{Performance of our approach with different objectives (SSVM,CD,NCE) compared to standard multi-label classification (MBCE) and energy based models (SPEN,DVN,NCE). Our implicit gradient trained model significantly outperforms the other approaches.
%* denotes we report results from literature and not our own replication 
$\dagger$ denotes statistically significant results. \label{tab:struct_expt_small}}
\end{table}




We experiment with SPEN \citep{End-to-EndSPEN}, DVN \citep{gygli2017deep}, and an energy model trained by NCE loss \citep{gutmann2010noise,ma2018noise}. As a baseline we also present the results of an MLP trained by standard multi-label binary cross entropy, and ALEN, iALEN \citep{pan2020adversarial} and GSPEN \citep{graber2019graph}. For our proposed implicit training method, we experiment with different choices of the outer (energy-learning) objective $\LP$ as described in the section: ``Primary Loss Design''. Our results are presented in Table \ref{tab:struct_expt_small}.

From the experiments, it is clear that our implicit training approach is superior to most current approaches of using energy based models for structured prediction. Our implicit gradient method gives a boost of up to \emph{5 F1 points} depending on the primary loss objective and the dataset. Furthermore, we also note that ($\mathcal{L}_{SSVM}$, SPEN) and ($\mathcal{L}_{DVN}$, DVN) use the same loss and energy function, and the difference in results is attributable to our proposed implicit training of the inference network.
Next, we also note that the only model that outperforms our proposed method is GraphSPEN/GSPEN, which lacks scalability. For example the running-time of GSPEN on Bib (which is our smallest dataset) is more than 6 times our approach. This is due to the need of computationally hard constrained inference in GSPEN and makes it infeasible on the larger datasets that we experiment with in the next section. Finally we see that the noise contrastive objective outperforms the other methods, and so we focus on this objective in our other experiments.
%\ssh{Why does GraphSPEN perform comparably or better than ours?}

%{\color{red} Why does GraphSPEN perform comparably or better than ours?}


\paragraph{Large Scale Multi-label Modelling}.  To demonstrate that our approach is more scalable and general, we apply our approach on two large text based datasets RCV1 \citep{lewis2004rcv1} and AAPD \citep{yang2018sgm}. Existing models on these datasets instead rely on standard max likelihood training. The dependence between labels is usually modeled by novel architectures \citep{zeng2021modeling}, transforming the problem into sequence prediction (SGM) \citep{yang2018sgm} or by adding regularization terms to improve representation (LACO) \citep{zhang2021enhancing}. There are no available energy based baselines on these tasks, partly because of intractability of inference required for energy based structured prediction. We use models from the aforementioned works as baselines, and use a similar architecture to the smaller MLC task for our energy model, except that our feature networks use pretrained BERT models. We also compare to the state of the art LACO  model that uses BERT to learn label embeddings \citep{zhang2021enhancing}, the seq2seq approach of \citet{nam2017maximizing} and \citet{tsai2020order} which is a RNN based auto-regressive decoder. Our results are presented in Table \ref{tab:struct_expt_large}. It is clear that using energy based method significantly outperforms BERT based models and edges out ahead of other methods which explicitly focus on modeling label dependence. 


\begin{table}[]
\centering
\begin{tabular}{|l||cc|cc|}
\hline
%Method & \multicolumn{4}{c|}{Dataset}             \\
%\hline
Method  & \multicolumn{2}{c|}{RCV} & \multicolumn{2}{c|}{AAPD} \\ 
\hline
  & Mi-F1 & Ma-F1 & Mi-F1 & Ma-F1 \\ 
\hline
\hline
SGM    &  86.9 & - & 70.2 & - \\
BERT-CE  & 87.1 & 66.7 & 74.1 & 57.2 \\
OCD & - & - &  72.1 & 58.5 \\
Seq2Seq & 87.9 & 66.0 & 69.0 & 54.1 \\
SeqTag & 87.7 & 68.7 & 73.1 & 58.5 \\
LACO   & 88.2 & \textbf{69.1} & 74.7  & 59.1 \\ 
\hline
$\mathcal{L}_\text{NCE}$     & \textbf{88.5}$\dagger$  & 68.9 & \textbf{75.6}$\dagger$ & \textbf{59.8}$\dagger$ \\
\hline
\end{tabular}
\caption{Performance of our model on large scale multi-label classification against existing models (SGM, OCD, Seq2Seq, BERT-CE, LACO). Our implicit gradient trained model significantly outperforms or matches other approaches. $\dagger$ denotes statistically significant scores. \label{tab:struct_expt_large}}
\end{table}

\paragraph{Named Entity Recognition}. For our experiments we work with the commonly used CoNLL 2003 English dataset \citep{conll20003}. Similar to previous work \citep{Ratinov:2009}, we consider 17 NER labels, and evaluate the results based on the F1 score. 
Following \citet{tu-18}, we design the energy network $E_\phi$ and the inference network $A_\theta$ based on Glove based word embeddings \citep{pennington2014glove}. The text embeddings are then provided to bi-LSTMs to form the features $b(x)$ for the energy function. If we denote by $b(x,t)$ the bi-LSTM output at step $t$, then the energy is :
\begin{align}
\label{eqn:energy-sequence-labeling}
E_{\phi}(x, y) &= \sum_{t=1}^T  \sum_{j=1}^L y_{t,j}U_j^\top b(x,t)  + \sum_{t=1}^T y_{t-1}^\top W y_{t}
\end{align}

The parameters $\phi$ consist of the matrix $W$ and the per-label parameters $U_j$, along with the LSTM parameters. Similarly, $A_\theta(x)$ can be written as a linear MLP over $b(x)$.

We run our models with two different input feature sets. For the NER version, the input consists of only words and their Glove embeddings. NER+ configuration also provides POS tags and chunk information. 
As baselines we use SPEN \citep{End-to-EndSPEN}, InfNet\citep{tu-18}, InfNet+\citep{tu2020improving} and a cross entropy trained BILSTM baseline. Our results in Table \ref{tab:struct_expt_ner} show that implicit models outperform other existing models. Note in particular that our model with SSVM loss is very similar to the InfNet+\citep{tu2020improving} (with the same losses etc.). The difference between these is a) the final layer in the inference networks $F,A$ are not shared in Infnet+ but are in ours and b) the training procedure is different due to using implicit gradients. If both these models are trained correctly then their final performance should be consistent which seems to be the case. Finally similar to the previous experiments, we see improved performance with contrastive losses.

\begin{table}
\centering
%\begin{minipage}[b]{0.48\textwidth}\centering
\begin{tabular}{|l||c|c|}
\hline
Models  & NER  & NER+  \\
\hline\hline
BILSTM  & 84.9 & 89.1  \\
SPEN    & 85.1 & 88.6  \\
InfNet  & 85.2 & 89.3  \\
InfNet+ & 85.3 & 89.7  \\
\hline
$\mathcal{L}_\text{SSVM}$    & 85.4  & 89.6  \\
$\mathcal{L}_\text{NCE}$     & \textbf{85.7} & \textbf{90.3}$\dagger$  \\
\hline
\end{tabular}
\caption{Test results for NER and NER+  for different energy based models. SSVM and NCE refers to our implicit gradient models. $\dagger$ indicates statistical significance 
\label{tab:struct_expt_ner}}
%\end{minipage}
%\hfill
\end{table}

\paragraph{Citation Field Extraction}.  We also run our model on the citation-field extraction task \citep{seymore1999learning}. This is an information extraction task where the goal is to segment a citation text into its constituents such as Author, Title, etc. We use the extended Cora citation dataset \citep{seymore1999learning} used in \citet{rooshenas19search}. The citation texts have a max length of 118 tokens, which can be labeled with one of 13 tags.

%Details and results about this experiment can be found in Appendix \ref{apx:more_expts}.
% We conduct experiments on a citation-field extraction task. This is an information extraction task where the goal is to segment a citation text into its constituents as Author, Title, Journal, Date etc. We use the extended Cora citation dataset \citet{seymore1999learning} used in \citet{rooshenas19search}. The citation texts have a max length of 118 tokens, which can be labeled with one of 13 tags. 

We explore this task in the indirectly semi-supervised structured prediction of \citet{rooshenas19search}. In this setting, we have a few labeled points, and are also given rules based rewards for the unlabeled samples. However, the citation reward loss is based on domain knowledge and is noisy. For the task loss, we use token-level accuracy to supplement the reward function. Similarly, the model performance is measured on token-level accuracy. We run this task with 1000 unlabeled and 5, 10, and 50 labeled data points. We compare against GE,  RSPEN and SGSPEN \citep{rooshenas19search}, DVN \citep{gygli2017deep} and our method. Our results are presented in Table~\ref{apx:tab:struct_cite}. 

\begin{table}[!ht]
\centering
\begin{tabular}{|l||c|c|c|c|c|}
\hline
   & GE   & RSPEN & DVN  & SGSPEN & Ours($\mathcal{L}_{NCE}$) \\
\hline
\hline
5  & 54.7 & 55.0 & 57.4 & 53.0 & \textbf{58.9} \\
10 & 57.9 & 65.0 & 60.9 & 62.4 & \textbf{67.8} \\
50 & 68.0 & 81.5 & 79.4 & 82.6 & \textbf{82.9} \\   
\hline
\end{tabular}
\caption{Comparing performance of our approach on the semi-supervised setting for the citation-field extraction task. Our implicit gradient trained model significantly outperforms the other approaches. \label{apx:tab:struct_cite}}
\end{table}



\paragraph{Ablations} 
The key difference between existing methods like \citet{lee2022structured,tu2020improving} and ours is that in our approach the energy function is updated via the ``true'' gradient (Equation \ref{eqn:grad_thm}) while these approaches use alternate optimization and hence use only the explicit gradient term of Equation \ref{eqn:grad_thm}. To demonstrate that these works are less effective due to using biased gradients, we compare our approach to such models in Table \ref{tab:struct_expt_comp}. For this experiment we focus on the smaller multi-label classification task and experiment with all four objectives discussed earlier. The results show our method to be consistently superior, likely because the implicit gradient term provides explicit information to the energy network $E_\phi$ not only via the output samples of the inference net $A_\theta$, but also via the Hessian with respect to the inference-network parameters $\theta$. %We provide a greater discussion about how the implicit gradient term is important in Appendix D.
\begin{table}[]
\centering
\small
\begin{tabular}{|l|l||ccc|}
\hline
Method & Objective & \multicolumn{3}{c|}{Dataset}             \\
\hline
     &  & BibTex & Delicious & Eurlexev \\ 
\hline
\hline
\multirow{4}{*}{Non-Implicit} & $\mathcal{L}_\text{SSVM}$  & 43.15  & 28.91     & 42.10  \\ % 
& $\mathcal{L}_\text{DVN}$    & 42.59  & 30.17     & 42.15 \\ %
& $\mathcal{L}_\text{CD}$     & 43.3  & 31.09     & 42.79  \\ 
& $\mathcal{L}_\text{NCE}$     & 43.2  & 33.08     & 42.19  \\

\hline
\multirow{4}{*}{Ours} & $\mathcal{L}_\text{SSVM}$  & 44.55  & 30.34     & 42.50  \\ % 
& $\mathcal{L}_\text{DVN}$    & 44.94  & 28.87     & 42.35  \\ %
& $\mathcal{L}_\text{CD}$     & 45.76  & 34.50     & 42.92  \\
& $\mathcal{L}_\text{NCE}$     & 46.21  & 35.12     & 43.49  \\ 
\hline
\end{tabular}
\caption{Ablation study of our method using implicit gradients to tune the loss against the SEAL method. Our proposal consistently outperforms it, as the implicit gradient tunes the energy network towards a loss surface more amenable for the inference net. \label{tab:struct_expt_comp}}
\end{table}

%Part-of-Speech (POS) Tagging and 

\paragraph{Time Comparisons}
In Table \ref{tab:time}, we provide the training time and inference time comparison of our method against other methods like SPEN and DVN on multi-label classification datasets. As can be seen, the inference time of our proposed method is much better than that of the gradient descent based methods SPEN and DVN. Moreover, the SOTA GSPEN method of \citet{graber2019graph} takes more than 13s ($>$ 6 times our approach) for one pass over the bib dataset, highlighting its inefficiency, which makes using it infeasible on larger tasks.

\begin{table}[ht!]
\centering
\begin{tabular}{|l||c|c||c|c|}
\hline
          & \multicolumn{2}{|c||}{Training Time} & \multicolumn{2}{c|}{Inference Time} \\
\hline
           & Bib   &  Eurlexev   & Bib   &  Eurlexev   \\
\hline
\hline
SPEN       & 28.2  & 134.5 & 3.8       & 24.5     \\
DVN        & 32.1  & 128.7  & 3.8      & 24.6    \\
\hline
%$\mathcal{L}_{SSVM}$ & 31.2 & 43.3  & 1.8      & 12.1     \\
Ours   & 27.7 & 45.6  & 1.8    & 12.1    \\
\hline
\end{tabular}
\caption{Training and inference time (sec/epoch) comparison of our approach against SPEN and DVN. Since the number of parameter update steps for our approach is different per epoch than other models, we have normalized training time/epoch by the number of parameter updates. \label{tab:time}}
\end{table}


\paragraph{Additional Experiments}
We also conduct experiments with role labeling, POS tagging and image segmentation. These results are available in the supplementary material.

\section{Related Work}
\paragraph{Implicit Gradients} Implicit gradients are a powerful technique with a wide range of applications. Recently they have been used for applications like few-shot learning \citep{rajeswaran2019meta, lee2019meta} and building differentiable optimization layers in neural-networks \citep{amos2017optnet,agrawal2019differentiable}. These techniques also arise naturally in other problems related to differentiating through optimizers \citep{vlastelica2019differentiation}, such as general hyper-parameter optimization \citep{lorraine2020optimizing}. For a more detailed review of implicit gradients we refer the readers to \citet{dontchev2009implicit, krantz2002implicit}. Implicit gradient methods have been used for energy based learning of MRFs \citep{tappen2007learning, samuel2009learning}. These works have been further extended to use finite-difference methods. While our work is similar in that it uses implicit gradients for learning energy based models, we focus on structured prediction tasks. %instead of Gaussian models\citep{samuel2009learning}.


\paragraph{Structured Prediction}
\label{sec:rel_sp} In recent years energy based models have become prominent in the field \citep{belanger2016structured, rooshenas19search, tu2019benchmarking}. These models essentially relax the output space to a continuous version on which an energy function is learnt for scoring the outputs. Structured prediction energy networks \citep{End-to-EndSPEN, rooshenas19search} pair up such energy based models with gradient-based inference for prediction. The training methods for these models have generally relied on generalized version of structural SVM learning \citep{ssvm}, with repeated cost augmented inference being done to adapt the energy models landscape. Due to the difficulty of prediction and instability in training such models \citet{tu-18} propose an approach called InfNet which directly performs the inference step instead of using gradient descent or other optimization procedures. Our work directly builds upon recent research on energy based structured prediction \citep{tu2020improving,lee2022structured}. The most important difference between these works and ours is the bi-level optimization formulation and use of implicit gradients. To the best of our knowledge no work in structured prediction literature uses implicit gradient based methods. Secondly, most works either use cost-augmented inference during training \citep{rooshenas19search, belanger2016structured} or use the inference network and energy network in an adversarial game \citep{End-to-EndSPEN, tu-18}. The former increases inference time significantly while the latter uses incorrect gradients. ALEN \citep{pan2020adversarial} propose augmenting the deep energy model of a SPEN with adversarial loss.
To handle structural constraints and have direct control over correlations between output variables, \citet{graber2019graph} incorporate classical inference into SPENs.

An important difference between our method and these methods is that we ``meta-learn'' the energy function as a trainable objective, and our approach can be applied to adjust training of these models as well. Moreover, models like GraphSPEN, which incorporate constrained inference, are not scalable. Our approach side-steps this issue by using an Inference Network \citep{tu-18} approach. Finally, ideas from energy based learning have been used in translation \citep{tu-etal-2020-engine,bhattacharyya2020energy,edunov2017classical} and text generation \citep{deng2020residual}.

%\paragraph{Surrogate Loss Learning}
\paragraph{Learning Dynamic and Surrogate Losses} 
Surrogate loss learning was formulated as a multi-level optimization by \citet{colson2007overview}. Our work uses the insight of \citet{hazan2010direct}, to interpret learning a structured energy model as a surrogate loss learning problem and uses the bi-level optimization framework to solve the corresponding task. Modern works such as that of \citet{wu2018learning, huang2019addressing,Bechtle19} have attempted to learn dynamic losses for standard classification and regression tasks. Other works such as \citet{sung2017learning,epg18} have also proposed learning a reward function for optimization. While the goal of these works and ours is similar in that we try to 'learn' an objective loss for increasing a model performance, there are multiple key differences between them. First, these works do not look at the implicit gradient. Instead they rely on ‘unrolling’ one/few-step gradient updates in the inner optimization and then backpropagate through those updates. This leads to improper characterization of the model/optimizee parameters induced by the learned loss. Secondly, in the supervised learning based applications the model tries to boost a validation set performance, while in our case we are optimizing the prediction on the training examples via the task loss function available in the structured prediction setting.% A final difference is that of the input to the meta objective, as these works focus on stage of training such as training step, learning rates etc. while we are dealing with samples from the training set. 

%use implicit-gradient based methods in a MAML setting.

%to train neural networks.  The idea of learning reward or loss functions has also been used in reinforcement learning (RL) problems such as implicit rewards \citep{singh2010intrinsically}, inverse reinforcement learning \citep{ng2000algorithms} and reward shaping \citep{zou2019reward}. 

\paragraph{Meta Learning}
Our method has some algorithmic similarities with learning to learn methods~\citep{Schmidhuber:87long}. This is due to the general nature of bi-level objectives which has been adapted for learning hyper-parameters \citep{franceschi2018bilevel}, learning policies for parameter update \citep{maclaurin2015gradient, franceschi2017forward, meier2018online} and meta-learning \citep{rajeswaran2019meta}. The key idea in meta-learning is to make the model ``aware'' of the ``learning process''~\citep{Schmidhuber:87long, ThrunP98}.
However meta-learning is commonly used for learning model parameters $\theta$ that can be easily adapted to new tasks~\cite{mendonca2019guided, gupta2018meta}, multi-task transfer learning \citep{metz2018learning}; while we aim to learn a loss function.

\section{Conclusion}
\textbf{Summary}
The primary goal of our work is to learn dynamic losses for model optimization using implicit gradients, in a setting with complex outputs such as in structured prediction. This work uses a bi-level optimization framework for structured prediction that uses a dynamic loss. Then we use implicit gradients to optimize an energy-based model in our proposed framework. We also explore possible designs of these dynamic objectives. Our experiments show our approach outperforms or achieves similar results to existing approaches. Our method tends to be more stable than existing approaches based on inference networks and gradient-based inference. 
%While our primary focus is on structured prediction, we also show how these methods can be used in reinforcement learning to learn intrinsic rewards. The primary advantage of our method for reward learning is that we can avoid backpropagating through multiple iterations of gradient descent often used in current methods.

\textbf{Limitations and Social Impact} Our contributions are mostly restricted to inference network based structured prediction; and our experiments are mostly textual datasets. Structured prediction has also been explored in domains like generative modelling, but our experiments are of little insight into those areas. Moreover, even though our approach trains better than other energy based methods, they are still more sensitive to hyperparameters than standard autoregressive models. We do not foresee any negative societal impact from this work.


\begin{acknowledgements} % will be removed in pdf for initial submission,
We thank Yash Chandak and Vihari Piratla for discussions and feedback on the paper.
    %\emph{All} acknowledgements go in this section.
\end{acknowledgements}
%Moreover for applications like translation etc. energy based models have had limited successes, and our approach does not trivially apply to such tasks.

%\textbf{Future Research} Our work also opens avenues for implicit methods for learning from partial and indirect supervision. Another related direction is whether one can also learn a score function for partial structures. Finally, there is the possibility of extending this method to other settings with non-differentiable losses such as reinforcement learning and for constructing surrogate losses.
%the biases of various policy-gradient approximations used in reinforcement learning.
%that meta-learn model-parameters $\theta$ directly. Similar to earlier works on learning loss functions \citep{sung2017learning,epg18,wu2018learning,huang2019addressing}, we aim at learning loss function parameters $\phi$ .

%and is generally used for multi-task transfer learning \citep{metz2018learning}, few-shot learning \citep{maml} and adaptive learning \citep{mendonca2019guided}. 

%In basic meta-learning approaches, the parameters $\phi$ of an \emph{optimizee} are updated according to a \emph{fixed} loss function to get new model parameters $\theta_\text{new}$. The 









%$h$ that transform parameters updates with respect to known loss or reward functions, or learning loss/reward function representations $\phi$~\cite{sung2017learning, epg18, zou2019reward}.




% \clearpage
%\section{Conclusion}

%\newpage

%%BIBLIO

%\bibliographystyle{unsrtnat}
\bibliography{mybib}

%\include{appendix1}
\end{document}

%\section{Appendix}
%\include{appenfix1}


