
\documentclass{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% Shared preamble definitions live in header1.tex. Use \input rather than
% \include here: \include is intended for chapter-level files in the document
% body (it issues \clearpage and writes a separate .aux file), whereas \input
% simply reads the file in place, which is what preamble code needs.
\input{header1}
\usepackage{subfigure}
\usepackage{bibentry}
% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 
\externaldocument{shankar_127}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\theoremstyle{plain}
%\newtheorem{rem}{Remark}
% Todonotes is useful during development; simply uncomment the next line
%    and comment out the line below the next line to turn off comments
%\usepackage[disable,textsize=tiny]{todonotes}
\usepackage{multirow}
% The \icmltitle you define below is probably too long as a header.
% Therefore, a short form for the running title is supplied here:
%\icmltitlerunning{Submission and Formatting Instructions for ICML 2023}


%\input{math_commands.tex}

\usepackage{hyperref}
\usepackage{url}

\usepackage{amsmath,amsthm,amssymb}
\usepackage{dsfont}

% Template example macro: \swap[sep]{a}{b} typesets "b sep a"; the optional
% separator argument defaults to "-".
\newcommand{\swap}[3][-]{#3#1#2} % just an example


% --- Notation shorthands ---------------------------------------------------
% Blackboard-bold E.
\newcommand{\E}{\mathbb{E}}

% Primary (outer) and auxiliary (inner) objectives of the bi-level problem.
% The text subscripts are wrapped in braces so that each is a proper
% sub-formula, instead of relying on how \text happens to expand after `_'.
\newcommand{\LP}{\mathcal{L}_{\text{Prim}}}
\newcommand{\LA}{\mathcal{L}_{\text{Aux}}}

% Calligraphic R.
\newcommand{\R}{\mathcal{R}}

% Bold S.
\newcommand{\St}{\mathbf{S}}
%\newcommand{\td}[1]{\textcolor{red}{TODO: #1}}
% Calligraphic L with subscript 2.
\newcommand{\LT}{\mathcal{L}_2}

% \tens: the \otimes symbol set as a binary operator that accepts optional
% embellishments. The argument spec e{_^} captures a following _{...} as #1
% and a following ^{...} as #2 (either may be absent); each one that is
% present is re-attached to the \otimes operator, with \displaylimits
% controlling where the scripts are placed.
\NewDocumentCommand{\tens}{e{_^}}{%
  \mathbin{\mathop{\otimes}\displaylimits
    \IfValueT{#1}{_{#1}}
    \IfValueT{#2}{^{#2}}
  }%
}

% Upright "Var" math operator.
\DeclareMathOperator{\Var}{Var}

% Blackboard-bold E and calligraphic E, each wrapped in an extra brace group
% so that the symbol expands to a single sub-formula.
\newcommand{\Eb}{{\mathbb{E}}}
\newcommand{\En}{{\mathcal{E}}}

\title{Implicit Training of Inference Network Models for Structured Prediction
(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2023 paper}{Shiv Shankar}{}}
% Add affiliations after the authors
\affil[1]{%
   University of Massachusetts, USA}
  
\begin{document}

\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle


\section {Implicit Gradient}
\label{apx:meta}


Consider the following bi-level objective
\begin{align}
    &\phi^* = \argmin_\phi \LP(\theta^*(\phi), \phi) \quad \text{s.t.} \quad
    \theta^*(\phi) = \argmin_\theta \LA(\theta, \phi)
\end{align}

Here we have made explicit the dependence of the optimal $\theta$ on $\phi$ that arises from the inner optimization process.
One approach to finding the optimal $\phi^*$ is to compute the derivative of $\LP(\theta^*(\phi), \phi)$ with respect to $\phi$ and use gradient-descent-based optimization.
The corresponding total derivative is given by
\begin{align}
 \frac{d}{d \phi} \LP(\theta^*(\phi), \phi) =  \underbrace{   \frac{\partial }{\partial \theta} ( \LP(\theta^*(\phi), \phi) )|_{\theta^*}   }_{\rom{1}}    \circ \underbrace{  \frac{\partial}{\partial \phi} \theta^*   }_{\rom{2}} + \frac{\partial}{\partial \phi} \LP(\theta^*(\phi), \phi)
 \label{eqn:11}
\end{align}

Since the inner optimization has finished, we have direct access to $\theta^*(\phi)$, and the first term $\rom{1}$ in the previous equation can be computed directly. The second term is more challenging to compute.

The implicit gradient method computes this term by differentiating the optimality criterion of the inner optimization. The optimality criterion states that the gradient of the inner loss at the optimum $\theta^*(\phi)$ is zero, i.e.,

\begin{align}
&\nabla_\theta \LA(\theta, \phi) = 0 \Rightarrow \frac{\partial}{\partial \theta} (\LA(\theta, \phi)) = 0
\end{align}

By differentiating this with respect to $\phi$ one gets:
\begin{align}
&\frac{d}{d \phi} \frac{\partial}{\partial \theta} \LA(\theta,\phi) = 0 \Rightarrow \frac{\partial}{\partial \phi} \frac{\partial}{\partial \theta}(\LA(\theta,\phi)) + \frac{\partial}{\partial \theta} \frac{\partial}{\partial \theta}(\LA(\theta,\phi))\frac{\partial}{\partial \phi} \theta   = 0\\
&\Rightarrow \frac{\partial \theta}{\partial \phi} = - \left[ \frac{\partial}{\partial \phi} \frac{\partial}{\partial \theta}(\LA(\theta,\phi)) \right] \left[ \frac{\partial}{\partial \theta} \frac{\partial}{\partial \theta}(\LA(\theta,\phi)) \right]^{-1}
\end{align}

Putting this back in Equation \ref{eqn:11} we get

\begin{align}
\label{eqn:implireg1}
 \frac{d}{d \phi} \LP(\theta^*(\phi), \phi) =  - \left[ \frac{\partial}{\partial \phi} \frac{\partial}{\partial \theta}(\LA(\theta,\phi)) \right] \left[ \frac{\partial}{\partial \theta} \frac{\partial}{\partial \theta}(\LA(\theta,\phi)) \right]^{-1} \frac{\partial ( \LP(\theta^*(\phi), \phi) )}{\partial \theta}
     + 
 \frac{\partial (\LP(\theta^*(\phi), \phi) )}{\partial \phi} 
\end{align}

\begin{rem}
Notice that Equation~\ref{eqn:implireg1} requires the inverse of a second-order derivative, i.e., a Hessian, which can be expensive to compute. In practice, approximations via conjugate gradient, the Sherman--Morrison identity, a diagonalized Hessian, or the von Neumann expansion can be used. For our structured prediction experiments we used the von Neumann approximation.
\end{rem}

\subsection{Approximation for inverse Hessian}
\label{apx:approx}
\paragraph{von Neumann Approximation}
When we apply the von Neumann series for inverse operators to the matrix $I-H$ we get:
\[
H^{-1} = (I - (I - H))^{-1} = \sum \limits_{i=0}^{\infty} (I - H)^i \approx \sum \limits_{i=0}^{K} (I - H)^i .
\]
This series is convergent for matrices $H$ with singular values less than 2. An approximation is obtained by truncating the series. While it is invalid for general matrices, this approximation has been shown to be useful in the context of gradient-based methods \citep{lorraine2020optimizing}. To do so, one preconditions the matrix $H$ by dividing it by a suitably large constant.

Note that the first-order approximation is linear in $H$ and, together with automatic differentiation, allows easy and efficient multiplication with any vector via the Hessian-vector product (HVP) method \citep{christianson1992automatic}. To compute the HVP with a vector $v$, one simply perturbs the parameters by $\epsilon v$ (for some small $\epsilon$) and recomputes the gradient. The difference between the two gradients, scaled by $1/\epsilon$, approximates the HVP. Furthermore, the same trick can be used when multiplying with the cross-Hessian $\partial_\theta \partial_\phi$. Next, the terms in the von Neumann series can be obtained iteratively by applying the HVP to the output of the previous iteration. This allows us to compute the series approximation to as many orders as desired. For further details refer to \citet{christianson1992automatic}.

\section{Dataset Details}

\begin{table}
\centering
\begin{minipage}[b]{0.48\textwidth}\centering
\begin{tabular}{|l|lll|l|}
\hline
Dataset   & Train & Valid & Test & Label \\
\hline
Bibtex    & 4407  & 1491  & 1497 & 159   \\
Delicious & 9690  & 3207  & 3194 & 983   \\
Eurlexev  & 11557 & 3876  & 3881 & 3993 \\
\hline
\end{tabular}
\caption{Summary statistics for multi-label classification datasets }
\end{minipage}
\hfill
\begin{minipage}[b]{0.48\textwidth}\centering
\begin{tabular}{|l|l|l|ll|}
\hline
Dataset   & Size & Label & Avg Length & Avg Labels \\
\hline
AAPD    & 55840  & 54  & 163.4& 2.4 \\
RCV & 804414   & 103  &123.9 & 3.2\\
\hline
\end{tabular}
\caption{Summary statistics for large classification datasets}
\end{minipage}
\caption{Summary statistics \label{tab:apx_sum}}
\end{table}

For the larger textual datasets, we follow \citet{yang2018sgm} to preprocess the data. We restricted the vocabulary to 50000 words, and any texts longer than 500 words were discarded. For the smaller MLC datasets we used the standard splits. The details for both are presented in Table~\ref{tab:apx_sum}.

For part-of-speech (POS) tagging, we follow \citet{tu2020improving} and use the annotated dataset from \citet{owoputi-etal-2013-improved}. The dataset has 25 output tags. We also conduct experiments with small-scale image segmentation on the Weizmann horses dataset \citep{borenstein2002class}. This is a classic dataset for structured prediction evaluation. It contains 328 images of horses and their manually labelled segmentation masks. For this task we follow the protocol detailed in \citet{lu2020structured}.

\section{Analysis of Learnt Energies}

\subsection{Multi-label Classification}
The role of the global energy function $v^T\sigma(My)$ in $E_\phi$ is to model interactions between labels. The gradient of $E$ with respect to the label $y$ has contributions from the values of other labels, and optimization of $E$ should correspondingly increase or decrease the likelihood of a label given the current probabilities of the other labels. To test this hypothesis we compare the Hessian of the learned global energy with respect to the output $y$ against the label co-occurrence statistics. For frequently co-occurring labels $y_i, y_j$, increasing $y_i$ should positively impact the gradient of $E$ with respect to $y_j$. Similarly, for pairs which co-occur less frequently the Hessian should give negative values. In Figure~\ref{fig:apx:bib_heat_map} we plot the average Hessian of the energy function over the instances, as well as the co-occurrence matrix of the labels. Note that the diagonals have been removed. We see a general correspondence between the co-occurrence matrix and the Hessian, though there are values which do not correspond.

% \begin{figure}[htb]
%     \centering
%     \begin{subfigure}[b]{0.48\textwidth}
%     \includegraphics[width=0.46\textwidth]{figs/bib.png}
%     \caption{Energy Hessian}
%     \label{fig:apx:bib}
%     \end{subfigure}
%     \begin{subfigure}[b]{0.46\textwidth}
%      \centering
%      \includegraphics[width=0.44\textwidth]{figs/co_occ.png}
%      \caption{Co-occurence Matrix \label{fig:apx:coc}}
%      \end{subfigure}
%      \caption{Learnt energy and Label Cooccurence on bibtex \label{fig:apx:bib_heat_map}}
% \end{figure}


\begin{figure}[htb]
    \centering
    \subfigure[Energy Hessian \label{fig:apx:bib}]{
    \includegraphics[width=0.46\textwidth]{figs/bib.png}
    }
    \subfigure[Co-occurrence Matrix \label{fig:apx:coc}]{
     \includegraphics[width=0.44\textwidth]{figs/co_occ.png}
     }
     \caption{Learnt energy and label co-occurrence on Bibtex \label{fig:apx:bib_heat_map}}
\end{figure}

\subsection{POS Tagging}

In Figure \ref{fig:apx:postag_heat_map} we plot average cross-Hessian of the energy function $E$ wrt $y_t,y_{t+1}$ in the POS tagging experiment. That is we are plotting $H = \dfrac{\partial^2 E}{\partial y_t \partial y_{t+1}}$.  Ideally the model should learn to put weight on tag pairs which follow each other i.e. if the tag A frequently appears after tag B, the term in $H_{\text{BA}}$ should be higher.  
\begin{figure}
    \centering
    \includegraphics[width=0.7\textwidth]{figs/postag_map1.png}
    \caption{Heatmap representation of learnt pair correlation energies for POS tagging}
    \label{fig:apx:postag_heat_map}
\end{figure}
Our experiments suggest this to be the case. As an example, tag 0 corresponds to nouns, tag 5 to verbs, and tag 6 to adjectives. We can see from the map that the matrix downweights noun-adjective and adjective-verb pairings, while upweighting the adjective-noun and noun-verb pairs.

\section{Additional Details}
\label{apx:more_expts}
\iffalse{
\paragraph{Time Comparisons}
In table \ref{apx:tab:time} we provide the training time and inference time comparison of our method against other methods like SPEN and DVN on multi-label classification datasets.

\begin{table}[]
\centering
\begin{tabular}{|l||c|c|c||c|c|c|}
\hline
          & \multicolumn{3}{|c||}{Training Time} & \multicolumn{3}{c|}{Inference Time} \\
\hline
           & Bib   & Delicious & Eurlexev   & Bib   & Delicious & Eurlexev   \\
\hline
\hline
SPEN       & 28.2  & 37.8      & 134.5 & 3.8  & 5.8     & 24.5     \\
DVN        & 32.1  & 55.3      & 128.7  & 3.8  & 5.6      & 24.6    \\
\hline
Our($\mathcal{L}_{SSVM}$) & 31.2 & 29.6      & 43.3  & 1.8 & 1.4      & 12.1     \\
Our($\mathcal{L}_{CD}$)   & 27.7 & 39.7     & 140.6  & 1.8 & 1.4     & 12.1    \\
\hline
\end{tabular}
\caption{Training and inference time (sec/epoch) comparison of our approach against SPEN and DVN. Since the number of parameter update steps for our approach is different per epoch than other models, we have normalized training time/epoch by the number of parameter updates. \label{apx:tab:time}}
\end{table}
}\fi

\iffalse{
\paragraph{Why Implicit Gradient and connection to Dynamic Losses}
%The crucial difference between our method with $\mathcal{L}_{SSVM}$ loss and the InfNet+ proposal of \citet{tu2020improving} is the use of implicit gradient method  rather than alternative independent optimization. Note that training via independent optimization updates $\phi$ with only $\partial_\phi \LP$. From Equation \ref{eqn:grad_thm}, we can see that $\partial_\phi \LP$ is only one component of the true gradient, which includes an additional term (Line 1 of Equation \ref{eqn:grad_thm}). This additional term captures the indirect effect on $\LP$ due to $\theta$ being dependent on $\phi$ as well.  
%If one uses the correct gradient, the optimization for energy ($\phi$) becomes explicitly aware and receives feedback from the inner optimization of the inference network. 
%To highlight a difference form the standard EBM training procedure of \citet{tu2020improving}, consider a situation where the energy $E$ parameters have fully optimized $\LP$ ( for example margin loss). However, at the current parameters $\theta$ of the inference net $A$, the energy of the output $E(x, A(x))$ has a singular hessian $\partial^2_\theta E_\phi(x, A_\theta(x))$. 
%Since the condition number of the Hessian is generally a good proxy for hardness of a convex optimization (e.g. gradient descent-based optimization will be slow due to the flatness of surface); it is reasonable to suggest that for the current energy landscape, finding the optimal $y$ is difficult. 
%Under standard EBM training, as the updates only depend on $\partial_\phi \LP$, the energy network is agnostic to the difficulty of predicting $y$. As such when $\LP$ is close to optimized, the energy network will not be updated. On the other hand, with our framework the update to energy does not depend only on $\partial_\phi \LP$. Note that the first term in Equation \ref{eqn:grad_thm}, depends on inverse hessian of the loss wrt $\theta$. As such if the Hessian of the inference network is ill-conditioned, we would see a a large gradient for the energy network. This allows the energy function to find a landscape that is adapted to the inference network.
%In this sense, the our energy function behaved like a learned dynamic loss \citep{wu2018learning, Bechtle19} which is actively adapting itself to the prediction networks behaviour.



A key distinction between our proposed method, employing the $\mathcal{L}_{SSVM}$ loss, and the InfNet+ approach suggested by \citet{tu2020improving} lies in the utilization of the implicit gradients/bi-level optimization method instead of alternative independent optimization. It is worth noting that when training via independent optimization, the parameters $\phi$ are updated using only $\partial_\phi \LP$. However, Equation \ref{eqn:grad_thm} reveals that $\partial_\phi \LP$ represents only one component of the true gradient, which incorporates an additional term (Line 1 of Equation \ref{eqn:grad_thm}). This additional term captures the indirect influence of $\phi$ on $\LP$ through $\theta$, which itself depends on $\phi$.

As such, standard energy-based training methods use biased gradients, which can lead to improper solutions to the underlying problem. However, by utilizing the correct gradient, the energy optimization ($\phi$) becomes explicitly aware of and receives feedback from the inner optimization process of the inference network. To emphasize the distinction this causes from the standard training procedure of energy-based models (EBMs) described by \citet{tu2020improving}, consider a scenario where the energy parameters $E$ have fully optimized $\LP$ (e.g., through margin loss).
Suppose that at the current parameters $\theta$ of the inference net $A$, the energy of the output $E(x, A(x))$ has a singular hessian $\partial^2_\theta E_\phi(x, A_\theta(x))$. 
Since the condition number of the Hessian often serves as a reliable indicator of the difficulty of convex optimization (e.g., gradient descent-based optimization becomes slow in the presence of a flat surface), it is reasonable to infer that finding the optimal $y$ is challenging within the current energy landscape.

Under the standard EBM training, as the updates solely depend on $\partial_\phi \LP$, the energy network remains oblivious to the difficulty associated with predicting $y$. Consequently, when $\LP$ approaches optimality, the energy network ceases to receive updates. Conversely, in our framework, the update to the energy function is not solely determined by $\partial_\phi \LP$. It is important to note that the first term in Equation \ref{eqn:grad_thm} relies on the inverse Hessian of the loss with respect to $\theta$. Therefore, if the Hessian of the inference network exhibits ill-conditioning, a significant gradient for the energy network emerges. This enables the energy function to discover a landscape that is adapted to the behavior of the inference network.

In this sense, our energy function behaves akin to a learned dynamic loss \citep{wu2018learning, Bechtle19}, actively adjusting itself to the prediction network's behavior.
}\fi
%The where a non-MLE based loss function is learnt to maximize(and similar to the learned loss method used in Wu et al. [2018], Huang et al. [2019], and Bechtle et al. [2019] 

\paragraph{Addition of task-loss in contrastive divergence}
The standard contrastive divergence \citep{hyvarinen2005estimation} depends only on the energy of positive and negative samples. In this case, however, we have added the task-loss $s$ to the negative samples, as it more strongly penalizes the energy of model outputs with high task loss. If we consider only one negative sample, then $\mathcal{L}_{CD}$ reduces to:
\[
 -\log \dfrac{1}{1 + \exp\bigl(E_\phi(x,y) - E_\phi(x,\bar{y}_1) + s(\bar{y}_1, y)\bigr)} = \text{softplus}\bigl( s(\bar{y}_1, y) + E_\phi(x,y) - E_\phi(x,\bar{y}_1)\bigr)
\]

which is essentially the same as the margin loss $\mathcal{L}_{SSVM}$ with the hinge replaced by a softer function. Furthermore, similar to the NCE~\citep{gutmann2010noise} and CD~\citep{hyvarinen2005estimation} losses, by using multiple samples it can provide greater information for the energy function, which is not possible for the SSVM loss.

%Our knowledge-based reward function is equivalent to Rooshenas et al. (2018), which takes input citation text and predicated tags and evaluates the consistency of the prediction with about 50 given rules describing the human domain-knowledge about citation text
%We used the Cora citation dataset (Sey- more et al., 1999), which includes 100 labeled ex-amples as the test set and another 100 labeled ex-amples for the validation set. Our training data consists of 300 training examples from the Cora citation data set

\section{Additional Experiments}

Following \citet{tu2019benchmarking} we use our proposed method for a POS-tagging task as well. We use the Twitter-POS data from \citet{owoputi-etal-2013-improved}. The energy model $E_\phi$ is similar to the one used for NER tasks. We compare against a Bi-LSTM and CRF baseline, SPEN \citep{End-to-EndSPEN} and InfNet \citep{tu2019benchmarking} based model. Our results are presented in Table \ref{tab:struct_expt_pos1}. 

%\begin{minipage}[b]{0.48\textwidth}\centering
\begin{table}[hbt]
\centering
\begin{tabular}{|l||l|}
\hline
Models  & Accuracy  \\
\hline\hline
BILSTM  & 88.7  \\
SPEN   & 88.6 \\
CRF    & 89.3  \\
InfNet  & 89.7  \\
\hline
$\mathcal{L}_\text{NCE}$     & \textbf{90.1} $\dagger$  \\
\hline
\end{tabular}
\caption{Test results for POS tagging with different energy-based models. $\dagger$ indicates statistical significance. \label{tab:struct_expt_pos1}}
\end{table}


\begin{table}[hbt]
\centering
\begin{tabular}{l|l}
\hline
Model & Mean IoU (\%)\\
\hline
\hline
DVN   & 83.9    \\
cGLOW & 81.2    \\
ALEN  & 85.7    \\
\hline
$\mathcal{L}_\text{NCE}$   & \textbf{89.4} $\dagger$  \\
\hline
\end{tabular}
\caption{Mean IoU results for segmentation with different models on the Weizmann Horses dataset. The input image size is $32 \times 32$ pixels. $\dagger$ indicates statistical significance. \label{tab:weisman_horse}}
\end{table}

We also conduct experiments with binary image segmentation on the Weizmann horses dataset. Following \citet{pan2020adversarial} and \citet{lu2020structured}, we resize the images and masks to $32 \times 32$ pixels. For the inference network we used a convolutional model with the same design as in cGLOW \citep{lu2020structured}. As baselines we compare with DVN \citep{gygli2017deep}, ALEN \citep{pan2020adversarial}, and cGLOW \citep{lu2020structured}. The task loss in this case is the IoU (intersection over union) metric. Our results are reported in Table~\ref{tab:weisman_horse}.
It is clear that our proposed method achieves the highest
IoU among the comparison methods, with an improvement of close to 4\% over ALEN, which itself is well ahead of the other models.

\nobibliography{mybib}
\end{document}
