<!DOCTYPE html>
<html lang="en-us">

  <head>
  <link href="http://gmpg.org/xfn/11" rel="profile">
  <meta http-equiv="content-type" content="text/html; charset=utf-8">

  <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1">

  <title>
    
      A Deeper Look at Zero-Cost Proxies for Lightweight NAS &middot; The ICLR Blog Track
    
  </title>

  
  <link rel="canonical" href="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/2021/12/01/zero-cost-proxies/">
  

  <link rel="stylesheet" href="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/css/poole.css">
  <link rel="stylesheet" href="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/css/syntax.css">
  <link rel="stylesheet" href="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/css/lanyon.css">
  <link rel="stylesheet" href="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/css/custom.css">
  <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=PT+Serif:400,400italic,700%7CPT+Sans:400">

  <link rel="apple-touch-icon-precomposed" sizes="144x144" href="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/apple-touch-icon-precomposed.png">
  <link rel="shortcut icon" href="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/favicon.ico">

  <link rel="alternate" type="application/rss+xml" title="RSS" href="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/atom.xml">

  

  <script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML" type="text/javascript" ></script>
 <!-- <script type="text/x-mathjax-config"> MathJax.Hub.Config({ TeX: { equationNumbers: { autoNumber: "AMS" } } }); </script> -->
  <script type="text/x-mathjax-config">
      MathJax.Hub.Config({
        tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ],
         processEscapes: false
        }
      });
</script>
</head>


  <body>

    <!-- Target for toggling the sidebar `.sidebar-checkbox` is for regular
     styles, `#sidebar-checkbox` for behavior. -->
<input type="checkbox" class="sidebar-checkbox" id="sidebar-checkbox">
<!-- <input type="checkbox" class="sidebar-checkbox" id="sidebar-checkbox" > -->

<!-- Toggleable sidebar -->
<div class="sidebar" id="sidebar">
  <div class="sidebar-item">
    <p>For short-term, peer-sourced tests of time, generalizations, specializations, reproductions, etc.!</p>
  </div>

  <nav class="sidebar-nav">

    

    
    
      
        
          <a class="sidebar-nav-item" href="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/">ICLR 2022 Blog Track</a>
        
      
    
      
        
      
    
      
        
          <a class="sidebar-nav-item" href="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/about/">About</a>
        
      
    
      
    
      
        
      
    
      
        
          <a class="sidebar-nav-item" href="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/submitting/">Submitting</a>
        
      
    
      
        
          <a class="sidebar-nav-item" href="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/tags/">Tags</a>
        
      
    

    <a class="sidebar-nav-item" href="https://github.com/iclr-blog-track/iclr-blog-track.github.io">GitHub project</a>
    <span class="sidebar-nav-item">Currently vICLR Spring 2021</span>
  </nav>

  <div class="sidebar-item">
    <p>
      &copy; 2022. All rights reserved.
    </p>
  </div>
</div>


    <!-- Wrap is the content to shift when toggling the sidebar. We wrap the
         content to avoid any CSS collisions with our real content. -->
    <div class="wrap">
      <div class="masthead">
        <div class="container">
          <h3 class="masthead-title">
            <a href="/" title="Home">The ICLR Blog Track</a>
            <small></small>
          </h3>
        </div>
      </div>

      <div class="container content">
        <div class="post">
  <h1 id="iclr-post-title" class="post-title">A Deeper Look at Zero-Cost Proxies for Lightweight NAS</h1>
  <span class="post-date">01 Dec 2021 | 
    <a class="content-tag" href="/tags/#deep-learning"> deep learning </a>
  
    <a class="content-tag" href="/tags/#neural-architecture-search"> neural architecture search </a>
  
    <a class="content-tag" href="/tags/#zero-cost-proxies"> zero-cost proxies </a>
  </span>

  <span id="iclr-post-authors" class="post-date">Anonymous</span>
  <p><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/MainFigure.png" alt="Main figure of the blog post" style="float: left; margin-right: 10px;" /></p>

<p>Imagine you have a brand new dataset, and you are trying to find a neural network that achieves high validation accuracy on this dataset. You choose a neural network, but after 3 hours of training, you find that the validation accuracy is only 85%. After more choices of neural networks — and many GPU-hours — you finally find one that has an accuracy of 93%. Is there an even better neural network? And can this whole process become faster?</p>

<p>These questions are central to the main challenges of neural architecture search (NAS), an area of research which seeks to automate the discovery of the highest-performing neural networks (with respect to a chosen combination of accuracy, latency, etc.).
NAS has been used recently to achieve state-of-the-art performance in a variety of tasks
(<a href="https://arxiv.org/abs/1611.01578">Zoph and Le 2016</a>, <a href="https://arxiv.org/abs/1806.09055">Liu et al. 2018</a>, <a href="https://arxiv.org/abs/1802.01548">Real et al. 2018</a>, <a href="https://arxiv.org/abs/1808.05377">Elsken et al. 2018</a>),
but NAS techniques often take <a href="https://arxiv.org/abs/1906.02243">notoriously many GPU-hours to train</a>.
In 2020, one of the most novel papers in the field was released: <a href="https://arxiv.org/abs/2006.04647">Neural Architecture Search without Training</a> by Mellor et al. 
This paper promised to give an estimate of the validation accuracy for a given architecture <em>in just five seconds</em>. It spurred a flurry of follow-up papers
(<a href="https://arxiv.org/abs/2011.06006">Park et al. 2020</a>; 
<a href="https://arxiv.org/abs/2102.11535">Chen et al. 2021</a>; 
<a href="https://arxiv.org/abs/2108.01899">Li et al. 2021</a>; 
<a href="https://arxiv.org/abs/2102.01063">Lin et al. 2021</a>; 
<a href="https://arxiv.org/abs/2102.08099">Lopes et al. 2021</a>),
notably including the ICLR 2021 paper <a href="https://arxiv.org/abs/2101.08134">Zero-cost proxies for Lightweight NAS</a> by Abdelfattah et al., which is the focal point of our blog post. Abdelfattah et al. assembled a variety of zero-cost proxies (ZC proxies) inspired by the pruning-at-initialization literature and demonstrated their effectiveness. Furthermore, given its experiments on NAS-Bench-101, -201, -ASR, and -NLP, their paper can be considered the gold standard in this sub-area as of ICLR 2021. Their release of <a href="https://github.com/SamsungLabs/zero-cost-nas">thorough and reproducible code</a> for their evaluations was critical to our blog and demonstrates the importance of such public repositories for furthering NAS research.</p>

<p><strong>But, do zero-cost proxies really work?</strong> 
Has this line of work truly come up with meaningful indicators of architecture performance, or do simple baselines such as “number of parameters” perform just as well, as claimed in two <a href="https://arxiv.org/abs/2008.03064">recent</a> <a href="https://openreview.net/forum?id=hP-SILoczR">papers</a>?</p>

<p>In this blog post, we take a deeper look into ZC proxies for NAS. We survey prior work on ZC proxies and then run new experiments using the recent <a href="https://arxiv.org/abs/2110.05668">NAS-Bench-360</a> 
and <a href="https://arxiv.org/abs/2105.11871">TransNAS-Bench-101</a> benchmarks, which give a much more diverse set of datasets and tasks than all prior work.
We make the following conclusions:</p>
<ol>
  <li>Across a wide range of tasks, there is no single ZC proxy which performs significantly better than the others.</li>
  <li>ZC proxies still require further research since “flops” and “params” are consistently competitive baselines, and data-agnostic ZC proxies tend to perform very inconsistently.</li>
  <li>ZC proxies show great promise when used in conjunction with other methods such as <a href="https://arxiv.org/abs/2106.06799">one-shot training</a> or <a href="https://arxiv.org/abs/2104.01177">model-based prediction</a>, to improve the performance of these existing NAS techniques at very little additional cost.</li>
</ol>

<p>Overall, we provide a landscape overview of this promising area, highlight strengths and weaknesses, and shed light on future research in this direction. 
The rest of this blog post is organized as follows:</p>
<ul>
  <li><a href="#background-and-related-work">Background and Related Work</a></li>
  <li><a href="#evaluation-of-zc-proxies-on-diverse-tasks">Evaluation of ZC Proxies on Diverse Tasks</a></li>
  <li><a href="#cases-for-and-against-zero-cost-proxies">Cases For and Against Zero-Cost Proxies</a></li>
  <li><a href="#conclusions-and-future-directions">Conclusions and Future Directions</a></li>
</ul>

<h2 id="background-and-related-work">Background and Related Work</h2>

<p>To make sure we’re all on the same page, we start with a bit of background on NAS, ZC proxies, and the search spaces we consider later.</p>

<h3 id="neural-architecture-search-nas">Neural architecture search (NAS)</h3>

<p>Given a dataset and a large set of neural architectures (the search space), the goal of NAS is to efficiently find the architecture with the highest validation accuracy (or a predetermined combination of accuracy and latency, size, etc.) on the dataset. For a survey of the different techniques used for NAS, see <a href="https://arxiv.org/abs/1808.05377">Elsken et al. (2018)</a> and <a href="https://arxiv.org/abs/1905.01392">Wistuba et al. (2019)</a>. Many NAS methods make use of techniques to predict the final performance of a neural network before it is fully 
trained 
(<a href="https://ml.informatik.uni-freiburg.de/wp-content/uploads/papers/15-IJCAI-Extrapolation_of_Learning_Curves.pdf">Domhan et al. 2015</a>;
<a href="https://arxiv.org/abs/1912.00848">Wen et al. 2019</a>;
<a href="https://arxiv.org/abs/2008.03064">Ning et al. 2021</a>;
<a href="https://arxiv.org/abs/2006.04492">Ru et al. 2021</a>).</p>

<h3 id="zero-cost-zc-proxies">Zero-cost (ZC) proxies</h3>

<p>The starting point of this blog is the ICLR 2021 paper <a href="https://arxiv.org/abs/2101.08134">Zero-cost proxies for lightweight NAS</a> by Abdelfattah et al., which assembles a variety of ZC proxies inspired by the pruning-at-initialization literature and demonstrates their effectiveness, both separately and via ensembling, on standard NAS benchmarks. Before analyzing Abdelfattah et al., we give a brief overview of the extant methods in the field.
There are many ways one can categorize ZC proxies;
for example, one can start with the original application of the measures used, grouping those such as <a href="https://arxiv.org/abs/1810.02340">snip</a>, <a href="https://arxiv.org/abs/2002.07376">grasp</a>, and <a href="https://arxiv.org/abs/2006.05467">synflow</a> that were originally proposed for parameter pruning—i.e. “guessing lottery tickets”—together to separate them from those introduced directly for NAS such as <a href="https://arxiv.org/abs/2006.04647">NASWOT</a> (referred to as “jacob_cov” by Abdelfattah et al.).
Another reasonable taxonomy is the justification behind these methods, e.g. whether they are arbitrary heuristics such as <a href="https://arxiv.org/abs/2101.08134">grad_norm</a> (taking the gradient norm of a randomly initialized architecture after one minibatch), somewhat higher-order and better-justified measures such as <a href="https://arxiv.org/abs/1906.04113">Blockswap</a> (referred to as “fisher” by Abdelfattah et al.), or the range of “theory-inspired” measures such as grasp, <a href="https://arxiv.org/abs/2102.11535">TE-NAS</a>, and <a href="https://arxiv.org/abs/2011.06006">NNGP-NAS</a> that use recent attempts to understand deep nets via limiting-case kernel approximations to construct ZC proxies.</p>

<p>While we do examine methods across all these categories, 
an experimental focus of this blog post is answering the following question: 
<em>do these ZC proxies work consistently across a diverse set of datasets and tasks, when the search space is fixed?</em>
As a result, the most relevant taxonomy of these methods is the extent to which they actually use the training data of any given dataset to make predictions.
Here we have two distinct groups of ZC proxies:</p>

<ol>
  <li><strong>Oblivious.</strong> We define this group of ZC proxies to be those that entirely ignore the dataset, or use it only to set dimensions.
Despite the fact that they are by definition incapable of customizing architectures to the data distribution at hand, multiple oblivious ZC proxies such as synflow and <a href="https://arxiv.org/abs/2108.01899">GenNAS</a> have been published in top conferences in the past year.
Their use on their own requires assuming the existence of near-universal architectures that work for any task;
of course, they may also be used in conjunction with other NAS methods to universally bias a search space before fine-tuning to a specific task.
Either approach requires at least a decent correlation between oblivious scores and task performance on any dataset.
We also consider the natural oblivious baseline — params — which simply counts the number of weights in the network.</li>
  <li><strong>Data-dependent.</strong> This category is defined to be those ZC proxies that use the data to compute the scores but do not make gradient updates to the network weights.
This covers the majority of ZC proxies, including theory-inspired ones (grasp, TE-NAS, NNGP-NAS) and various heuristic measures (grad_norm, snip, jacob_cov, fisher).
While in principle these measures can often use the entire dataset — e.g. grad_norm could sum the gradient across all datapoints and TE-NAS could compute statistics of the full kernel matrix — in practice usually only one minibatch is used.
Thus, while these measures do not entirely ignore the data, they nevertheless do not make full use of it to score deep nets, making it necessary to check whether their usefulness is indeed maintained when using a different type of data.
Here we also have a natural baseline: the number of flops it takes to pass the input through the network.</li>
</ol>

<p>Most other NAS methods have an explicit training component — either using full or partial network training or updating a set of shared-weights in a network.
This makes them non-ZC, as cost is commonly understood to mean training time, although in practice ZC proxies can also have a small but non-trivial computational overhead.</p>

<h3 id="nas-bench-search-spaces-datasets-and-tasks">NAS-Bench search spaces, datasets, and tasks</h3>

<p>Now we describe the search spaces, datasets, and tasks that we use for experiments in the next section.</p>

<ol>
  <li><strong>NATS-Bench.</strong> <a href="https://arxiv.org/abs/2009.00437">NATS-Bench</a> is a popular cell-based search space for research. It has two different search spaces: (1) Topological Search Space (NATS-Bench-TSS) consisting of 6466 non-isomorphic architectures (15625 total) trained on CIFAR-10, CIFAR-100, and ImageNet16-120. This is identical to its earlier version named <a href="https://arxiv.org/abs/2001.00326">NAS-Bench-201</a>. (2) Size Search Space (NATS-Bench-SSS) which includes 32768 architectures which differ amongst each other in the size of each stage in the cell-based skeleton. We use the NATS-Bench-TSS search space in our experiments.</li>
  <li><strong>DARTS.</strong> The <a href="https://arxiv.org/abs/1806.09055">DARTS</a> search space is the most widely-used search space in NAS research. It consists of $10^{18}$ architectures. The search space contains two kinds of cells, each with seven nodes: “normal” and “reduction”. The edges between each node can take one of eight operations and each node can take in inputs from two other nodes.</li>
  <li><strong>TransNAS-Bench-101.</strong> <a href="https://arxiv.org/abs/2105.11871">TransNAS-Bench-101</a> is a tabular NAS benchmark which consists of two separate search spaces: a micro search space of size 4096, and a macro search space of size 3256. All architectures in each search space are evaluated on seven different computer vision-based tasks from the <a href="http://taskonomy.stanford.edu/">Taskonomy</a> dataset. The tasks include object classification, scene classification, unscrambling the image (jigsaw), and image upscaling (autoencoder). We use the micro-level search space, which is similar to NAS-Bench-201 but with 4 choices of operations per edge instead of 5.</li>
</ol>

<table>
  <tbody>
    <tr>
      <td>$\quad\quad\text{Raw Image}\quad\quad$</td>
      <td>$\text{Object Classification}$</td>
      <td>$\text{Scene Classification}\;$</td>
      <td>$\quad\text{Jigsaw Puzzle}\quad$</td>
      <td>$\quad\text{Autoencoding}\quad$</td>
    </tr>
    <tr>
      <td><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/original.png" alt="Raw Image" /></td>
      <td><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/object_classification.png" alt="Object Classification" /></td>
      <td><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/scene_classification.png" alt="Scene Classification" /></td>
      <td><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/jigsaw.png" alt="Jigsaw" /></td>
      <td><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/autoencoder.png" alt="Autoencoder" /></td>
    </tr>
  </tbody>
</table>

<p><em>Figure 1: Diverse tasks from <a href="https://arxiv.org/abs/2105.11871">TransNAS-Bench-101</a> based on <a href="http://taskonomy.stanford.edu/">Taskonomy</a> used in our experiments.</em></p>

<p><strong>New datasets</strong></p>

<p>The NATS-Bench and DARTS <em>search spaces</em> have already been used in prior work to evaluate ZC proxies, but we introduce several novel <em>datasets</em> to ZC proxy evaluation by making use of <a href="https://arxiv.org/abs/2110.05668">NAS-Bench-360</a> and <a href="https://arxiv.org/abs/2106.04010">recent synthetic data</a>.</p>

<ol>
  <li><strong>Spherical CIFAR-100.</strong>
Natural planar images from CIFAR-100 are projected onto a hemisphere with random rotations to create spherical signals, each 60x60 pixels across RGB channels. Spherical images are highly relevant to application areas such as <a href="https://arxiv.org/abs/1801.10130">omnidirectional vision and weather modeling</a>.</li>
  <li><strong>NinaPro DB5.</strong>
The <a href="https://ieeexplore.ieee.org/document/6290287">NinaPro dataset</a> consists of electromyography (EMG) wave signals to be classified into 18 categories. The signals are converted into single-channel 2D matrices, on which CNNs have achieved state-of-the-art performance.</li>
  <li><strong>Darcy Flow.</strong>
Darcy Flow refers to a family of partial differential equations (PDE), and the task is to learn the mapping from input conditions to their solutions, which acts as an efficient replacement of traditional solvers. This dataset requires the network to output a 2D grid of the same size as the inputs; the specific setting in NAS-Bench-360 is the lowest-resolution problem considered by <a href="https://arxiv.org/abs/2010.08895">Li et al. (2021)</a>.</li>
  <li><strong>Synthetic CIFAR-10.</strong>
This dataset is a drop-in replacement for CIFAR-10 with the same image resolution of 32x32, 3 channels, 50000 training and 10000 test images. It was designed by <a href="https://arxiv.org/abs/2106.04010">Dey et al. (2021)</a> to test the performance of ZC proxies. The images are sampled from a random Gaussian distribution, and their class membership labels are determined by passing the images through 10 randomly initialized neural networks and picking the label to be the ID of the neural network that had the maximum output response to the image. We include this dataset in our repertoire to study whether the content of data itself has an effect on the performance of zero-cost proxies.</li>
</ol>

<table>
  <tbody>
    <tr>
      <td>Spherical CIFAR-100</td>
      <td>NinaPro DB5</td>
      <td>Darcy Flow</td>
      <td>Synthetic CIFAR-10</td>
    </tr>
    <tr>
      <td><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/spherical.png" alt="Spherical CIFAR-100" /></td>
      <td><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/ninapro.png" alt="NinaPro DB5" /></td>
      <td><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/darcyflow.png" alt="Darcy Flow" /></td>
      <td><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/synthetic.png" alt="Synthetic CIFAR-10" /></td>
    </tr>
  </tbody>
</table>

<p><em>Figure 2: Diverse datasets from <a href="https://arxiv.org/abs/2110.05668">NAS-Bench-360</a> and <a href="https://arxiv.org/abs/2106.04010">synthetic data</a> used in our experiments.</em></p>

<h2 id="evaluation-of-zc-proxies-on-diverse-tasks">Evaluation of ZC Proxies on Diverse Tasks</h2>

<p>We evaluate ZC proxies on (1) the NATS-Bench and DARTS search spaces using NAS-Bench-360 and synthetic datasets, and (2) the TransNAS-Bench-101 search space with four vision-related tasks, both of which include a diverse set of tasks, are novel to ZC proxy research, and stand in contrast to the <em>de rigueur</em> over-reliance on image classification in current NAS literature. Furthermore, by evaluating the same search space over different tasks, we can evaluate whether data- and task-oblivious ZC proxies like synflow suggest ranks which are universal in nature.</p>

<p>For the NATS-Bench and DARTS experiments, we use <a href="https://github.com/microsoft/archai">Archai</a>, while for the TransNAS-Bench-101 experiments, we use <a href="https://github.com/automl/NASLib">NASLib</a>. Both NAS frameworks have permissive open-source licenses, and we release fully reproducible source code and datasets with this blog post (upon acceptance). We randomly draw a fixed set of architectures from each search space (1000 for NATS-Bench and DARTS, 100 for TransNAS-Bench-101), and evaluate each ZC proxy on each architecture. Then, for each ZC proxy, we compute Spearman’s rank correlation between the proxy and the validation accuracy when fully trained.</p>

<p>On NATS-Bench TSS, we run experiments on three different datasets: Spherical CIFAR-100, CIFAR-100, and Synthetic CIFAR-10. Since Spherical CIFAR-100 and Synthetic CIFAR-10 are not part of the original NATS-Bench tabular benchmark, we also trained each of the 1000 architectures on these datasets following the protocol laid out in the <a href="https://arxiv.org/abs/2009.00437">NATS-Bench paper</a> (SGD optimizer, 200 epochs, cosine decay learning rate from 0.1 to 0, 0.9 momentum, 256 batch size).
On the DARTS search space, we run this experiment on five different datasets: Spherical CIFAR-100, CIFAR-100, Synthetic CIFAR-10, Darcy Flow, and NinaPro. These datasets span image classification, EMG wave signal classification, and PDE solving. We trained the set of 1000 randomly sampled architectures on all five datasets, closely following the <a href="https://arxiv.org/abs/2008.09777">NAS-Bench-301</a> protocol (100 epochs, batch size 96, number of cells 8, momentum 0.9, cosine learning rate from 0.025 to 0.001).
On the TransNAS-Bench-101 search space, we evaluate on four different tasks from the <a href="http://taskonomy.stanford.edu/">Taskonomy</a> dataset: object classification, scene classification, jigsaw (image unscrambling), and autoencoder (image upscaling). We obtained the final performance of each architecture on each task from the <a href="https://github.com/kmdanielduan/TransNASBench">TransNAS-Bench-101 API</a>.</p>

<p><em>Table 1: Spearman’s rank correlation between each ZC proxy and ground-truth evaluations for NATS-Bench Topological Search Space (TSS).</em></p>

<table>
  <thead>
    <tr>
      <th style="text-align: center">Proxies</th>
      <th style="text-align: center">NATS-Bench TSS <br /> Spherical CIFAR-100</th>
      <th style="text-align: center">NATS-Bench TSS <br /> CIFAR-100</th>
      <th style="text-align: center">NATS-Bench TSS <br /> Synthetic CIFAR-10</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td style="text-align: center">fisher</td>
      <td style="text-align: center">0.0745</td>
      <td style="text-align: center">0.6104</td>
      <td style="text-align: center">-0.4281</td>
    </tr>
    <tr>
      <td style="text-align: center">grad_norm</td>
      <td style="text-align: center">0.0056</td>
      <td style="text-align: center">0.6779</td>
      <td style="text-align: center">-0.3140</td>
    </tr>
    <tr>
      <td style="text-align: center">grasp</td>
      <td style="text-align: center">0.0327</td>
      <td style="text-align: center">0.6319</td>
      <td style="text-align: center">-0.2568</td>
    </tr>
    <tr>
      <td style="text-align: center">jacob_cov</td>
      <td style="text-align: center">-0.2512</td>
      <td style="text-align: center">0.7194</td>
      <td style="text-align: center"><strong>0.1988</strong></td>
    </tr>
    <tr>
      <td style="text-align: center">snip</td>
      <td style="text-align: center">0.0075</td>
      <td style="text-align: center">0.6789</td>
      <td style="text-align: center">-0.3194</td>
    </tr>
    <tr>
      <td style="text-align: center">synflow</td>
      <td style="text-align: center"><strong>0.1758</strong></td>
      <td style="text-align: center"><strong>0.7938</strong></td>
      <td style="text-align: center">-0.0004</td>
    </tr>
    <tr>
      <td style="text-align: center">flops</td>
      <td style="text-align: center">0.0239</td>
      <td style="text-align: center">0.7142</td>
      <td style="text-align: center">-0.0701</td>
    </tr>
    <tr>
      <td style="text-align: center">params</td>
      <td style="text-align: center">-0.0017</td>
      <td style="text-align: center">0.7346</td>
      <td style="text-align: center">-0.0426</td>
    </tr>
  </tbody>
</table>

<p><em>Table 2: Spearman’s rank correlation between each ZC proxy and ground-truth evaluations for DARTS.</em></p>

<table>
  <thead>
    <tr>
      <th style="text-align: center">Proxies</th>
      <th style="text-align: center">DARTS <br /> Spherical CIFAR-100</th>
      <th style="text-align: center">DARTS <br /> CIFAR-100</th>
      <th style="text-align: center">DARTS <br /> NinaPro</th>
      <th style="text-align: center">DARTS <br /> Synthetic CIFAR-10</th>
      <th style="text-align: center">DARTS <br /> Darcy Flow</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td style="text-align: center">fisher</td>
      <td style="text-align: center"><strong>0.4986</strong></td>
      <td style="text-align: center">-0.0161</td>
      <td style="text-align: center">-0.1181</td>
      <td style="text-align: center">-0.1685</td>
      <td style="text-align: center">0.1540</td>
    </tr>
    <tr>
      <td style="text-align: center">grad_norm</td>
      <td style="text-align: center">0.2450</td>
      <td style="text-align: center">0.2669</td>
      <td style="text-align: center">-0.1436</td>
      <td style="text-align: center">0.0105</td>
      <td style="text-align: center"><strong>0.1788</strong></td>
    </tr>
    <tr>
      <td style="text-align: center">grasp</td>
      <td style="text-align: center">-0.4754</td>
      <td style="text-align: center">0.2301</td>
      <td style="text-align: center"><strong>0.0107</strong></td>
      <td style="text-align: center">0.0523</td>
      <td style="text-align: center">-0.0970</td>
    </tr>
    <tr>
      <td style="text-align: center">jacob_cov</td>
      <td style="text-align: center">0.3538</td>
      <td style="text-align: center">-0.1337</td>
      <td style="text-align: center">0.0277</td>
      <td style="text-align: center">-0.1235</td>
      <td style="text-align: center">-0.1232</td>
    </tr>
    <tr>
      <td style="text-align: center">snip</td>
      <td style="text-align: center">0.2675</td>
      <td style="text-align: center">0.2303</td>
      <td style="text-align: center">-0.1458</td>
      <td style="text-align: center">0.0234</td>
      <td style="text-align: center">0.1419</td>
    </tr>
    <tr>
      <td style="text-align: center">synflow</td>
      <td style="text-align: center">-0.0560</td>
      <td style="text-align: center">0.3935</td>
      <td style="text-align: center">-0.1729</td>
      <td style="text-align: center">0.0552</td>
      <td style="text-align: center">-0.3978</td>
    </tr>
    <tr>
      <td style="text-align: center">flops</td>
      <td style="text-align: center">-0.2074</td>
      <td style="text-align: center">0.5625</td>
      <td style="text-align: center">-0.1085</td>
      <td style="text-align: center">0.2635</td>
      <td style="text-align: center">-0.1971</td>
    </tr>
    <tr>
      <td style="text-align: center">params</td>
      <td style="text-align: center">-0.2389</td>
      <td style="text-align: center"><strong>0.5630</strong></td>
      <td style="text-align: center">-0.0888</td>
      <td style="text-align: center"><strong>0.2644</strong></td>
      <td style="text-align: center">-0.2275</td>
    </tr>
  </tbody>
</table>

<p><em>Table 3: Spearman’s rank correlation between each ZC proxy and ground-truth evaluations for TransNAS-Bench-101.</em></p>

<table>
  <thead>
    <tr>
      <th style="text-align: center">Proxies</th>
      <th style="text-align: center">TransNAS-Bench-101 <br /> Jigsaw</th>
      <th style="text-align: center">TransNAS-Bench-101 <br /> Object Classification</th>
      <th style="text-align: center">TransNAS-Bench-101 <br /> Scene Classification</th>
      <th style="text-align: center">TransNAS-Bench-101 <br /> Autoencoder</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td style="text-align: center">fisher</td>
      <td style="text-align: center">0.4361</td>
      <td style="text-align: center"><strong>0.7998</strong></td>
      <td style="text-align: center">0.7522</td>
      <td style="text-align: center"><strong>0.5611</strong></td>
    </tr>
    <tr>
      <td style="text-align: center">grad_norm</td>
      <td style="text-align: center">0.4933</td>
      <td style="text-align: center">0.7286</td>
      <td style="text-align: center">0.6756</td>
      <td style="text-align: center">0.4380</td>
    </tr>
    <tr>
      <td style="text-align: center">grasp</td>
      <td style="text-align: center">0.5085</td>
      <td style="text-align: center">0.6233</td>
      <td style="text-align: center">0.5034</td>
      <td style="text-align: center">0.4646</td>
    </tr>
    <tr>
      <td style="text-align: center">jacob_cov</td>
      <td style="text-align: center">0.3733</td>
      <td style="text-align: center">0.3969</td>
      <td style="text-align: center">0.6964</td>
      <td style="text-align: center">-0.1569</td>
    </tr>
    <tr>
      <td style="text-align: center">snip</td>
      <td style="text-align: center"><strong>0.5367</strong></td>
      <td style="text-align: center">0.7582</td>
      <td style="text-align: center">0.7162</td>
      <td style="text-align: center">0.3671</td>
    </tr>
    <tr>
      <td style="text-align: center">synflow</td>
      <td style="text-align: center">0.4853</td>
      <td style="text-align: center">0.6331</td>
      <td style="text-align: center"><strong>0.7582</strong></td>
      <td style="text-align: center">-0.0850</td>
    </tr>
    <tr>
      <td style="text-align: center">flops</td>
      <td style="text-align: center">0.5161</td>
      <td style="text-align: center">0.5686</td>
      <td style="text-align: center">0.7360</td>
      <td style="text-align: center">0.0650</td>
    </tr>
    <tr>
      <td style="text-align: center">params</td>
      <td style="text-align: center">0.5068</td>
      <td style="text-align: center">0.5614</td>
      <td style="text-align: center">0.7181</td>
      <td style="text-align: center">0.0517</td>
    </tr>
  </tbody>
</table>

<p>Across Tables 1-3, we see a diverse set of ZC proxies achieving the best performance for each task. 
On NATS-Bench TSS, synflow and jacob_cov perform the best.
Notably, on the synthetic dataset, all ZC proxies are negatively correlated with accuracy, except for jacob_cov.
On DARTS, all of the ZC proxies are relatively balanced. Fisher, grad_norm, grasp, and params each perform the best for at least one dataset.
Finally, on TransNAS-Bench-101, we also see a mix of performances. Snip and fisher perform particularly well. 
Across all search spaces, we see that the data-oblivious ZC proxies, synflow and params, tend to perform very well on some tasks, and very poorly on others. 
This is because for each search space, they give a universal architecture ranking, which cannot generalize across tasks.</p>

<h3 id="correlations-among-zc-proxies">Correlations among ZC proxies</h3>

<p>In Tables 1-3, another trend we see is that the rank correlations of some ZC proxies are correlated, such as flops and params. 
This motivates the next set of figures: we compute the full correlations between all pairs of ZC proxies (as well as “gt” — ground truth performance).
Figures 3-5 give the full correlations on each of the 12 datasets, while Figure 6 gives the average correlations across each search space.</p>

<table>
  <thead>
    <tr>
      <th style="text-align: center">NATS-Bench TSS <br /> CIFAR-100</th>
      <th style="text-align: center">NATS-Bench TSS <br /> Spherical CIFAR-100</th>
      <th style="text-align: center">NATS-Bench TSS <br />  Synthetic CIFAR-10</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td style="text-align: center"><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/all_pairs_zc_spe_natsbench_cifar100.png" alt="Zero-cost NATS-Bench" /></td>
      <td style="text-align: center"><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/all_pairs_zc_spe_natsbench_scifar100.png" alt="Zero-cost NATS-Bench" /></td>
      <td style="text-align: center"><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/all_pairs_zc_spe_natsbench_synthetic_cifar10.png" alt="Zero-cost NATS-Bench" /></td>
    </tr>
  </tbody>
</table>

<p><em>Figure 3: Spearman’s rank correlation among all pairs of ZC proxies on NATS-Bench Topological Search Space (TSS).</em></p>

<table>
  <thead>
    <tr>
      <th style="text-align: center">DARTS <br /> CIFAR-100</th>
      <th style="text-align: center">DARTS <br /> Darcy Flow</th>
      <th style="text-align: center">DARTS <br /> Ninapro</th>
      <th style="text-align: center">DARTS <br /> Spherical CIFAR-100</th>
      <th style="text-align: center">DARTS <br /> Synthetic CIFAR-10</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td style="text-align: center"><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/all_pairs_zc_spe_darts_cifar100.png" alt="Zero-cost DARTS" /></td>
      <td style="text-align: center"><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/all_pairs_zc_spe_darts_darcyflow.png" alt="Zero-cost DARTS" /></td>
      <td style="text-align: center"><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/all_pairs_zc_spe_darts_ninapro.png" alt="Zero-cost DARTS" /></td>
      <td style="text-align: center"><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/all_pairs_zc_spe_darts_scifar100.png" alt="Zero-cost DARTS" /></td>
      <td style="text-align: center"><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/all_pairs_zc_spe_darts_synthetic_cifar10.png" alt="Zero-cost DARTS" /></td>
    </tr>
  </tbody>
</table>

<p><em>Figure 4: Spearman’s rank correlation among all pairs of ZC proxies on DARTS.</em></p>

<table>
  <thead>
    <tr>
      <th style="text-align: center">TransNAS-Bench-101 <br /> Jigsaw</th>
      <th style="text-align: center">TransNAS-Bench-101 <br /> Object Classification</th>
      <th style="text-align: center">TransNAS-Bench-101 <br /> Scene Classification</th>
      <th style="text-align: center">TransNAS-Bench-101 <br /> Autoencoder</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td style="text-align: center"><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/jigsaw_all_pairs_zc_spe.png" alt="Zero-cost TransNAS-Bench-101" /></td>
      <td style="text-align: center"><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/class_object_all_pairs_zc_spe.png" alt="Zero-cost TransNAS-Bench-101" /></td>
      <td style="text-align: center"><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/class_scene_all_pairs_zc_spe.png" alt="Zero-cost TransNAS-Bench-101" /></td>
      <td style="text-align: center"><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/autoencoder_all_pairs_zc_spe.png" alt="Zero-cost TransNAS-Bench-101" /></td>
    </tr>
  </tbody>
</table>

<p><em>Figure 5: Spearman’s rank correlation among all pairs of ZC proxies on TransNAS-Bench-101.</em></p>

<table>
  <thead>
    <tr>
      <th style="text-align: center">NATS-Bench TSS</th>
      <th style="text-align: center">DARTS</th>
      <th style="text-align: center">TransNAS-Bench-101</th>
      <th style="text-align: center">Overall</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td style="text-align: center"><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/avg_all_pairs_zc_spe_natsbench.png" alt="Zero-cost NATS-Bench TSS" /></td>
      <td style="text-align: center"><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/avg_all_pairs_zc_spe_darts.png" alt="Zero-cost DARTS" /></td>
      <td style="text-align: center"><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/avg_all_pairs_zc_spe.png" alt="Zero-cost TransNAS-Bench-101" /></td>
      <td style="text-align: center"><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/avg_all_pairs_zc_spe_all.png" alt="Zero-cost Overall" /></td>
    </tr>
  </tbody>
</table>

<p><em>Figure 6: Spearman’s rank correlation among all pairs of ZC proxies averaged within each search space and across all search spaces.</em></p>

<p>In Figures 3-6, we see similar trends across all search spaces and tasks. 
Flops and params are consistently highly correlated with one another. Furthermore, synflow is also often highly correlated with flops and params. This is consistent with recent work by <a href="https://arxiv.org/abs/2008.03064">Ning et al. (2021)</a>, who provided a proof that synflow’s value increases with the number of parameters in the architecture. Therefore, synflow somewhat acts as a soft parameter counter.
We also see that grad_norm, snip, and fisher are highly correlated with one another, and occasionally with grasp as well.</p>

<h3 id="relative-performance-of-zc-proxies">Relative performance of ZC Proxies</h3>

<p>In this section, we better answer the question, <em>which ZC proxies perform the best on average across each search space?</em></p>

<p>First, we compute the average ranking of each of the ZC proxies on each search space, and over all search spaces.
For example, if a ZC proxy was 2nd, 4th, and 6th across three datasets for a search space, then its average ranking would be 4.0.
If one of the ZC proxies consistently performed well relative to the other ZC proxies across all datasets, its average ranking would be close to 1 or 2. In Table 4, with the exception of NATS-Bench TSS, we see that this is not the case. The top performers for NATS-Bench TSS, DARTS, and TransNAS-Bench-101 were 
synflow at 1.33, params at 3.8, and fisher at 2.75, respectively. Furthermore, over all search spaces, <em>all of the ZC proxies have an average ranking between 4.0 and 5.5.</em> This suggests that the ZC proxies are extremely close in performance on average across all tasks.</p>

<p><em>Table 4: Average ranking of each of the ZC proxies on each search space, and over all search spaces.</em></p>

<table>
  <thead>
    <tr>
      <th style="text-align: center"> </th>
      <th style="text-align: center">fisher</th>
      <th style="text-align: center">grad_norm</th>
      <th style="text-align: center">grasp</th>
      <th style="text-align: center">jacob_cov</th>
      <th style="text-align: center">snip</th>
      <th style="text-align: center">synflow</th>
      <th style="text-align: center">flops</th>
      <th style="text-align: center">params</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td style="text-align: center">NATS-Bench TSS</td>
      <td style="text-align: center">6.0</td>
      <td style="text-align: center">6.0</td>
      <td style="text-align: center">5.0</td>
      <td style="text-align: center">4.0</td>
      <td style="text-align: center">5.67</td>
      <td style="text-align: center"><strong>1.33</strong></td>
      <td style="text-align: center">4.0</td>
      <td style="text-align: center">4.0</td>
    </tr>
    <tr>
      <td style="text-align: center">DARTS</td>
      <td style="text-align: center">4.6</td>
      <td style="text-align: center">4.2</td>
      <td style="text-align: center">4.6</td>
      <td style="text-align: center">4.8</td>
      <td style="text-align: center">4.6</td>
      <td style="text-align: center">5.4</td>
      <td style="text-align: center">4.0</td>
      <td style="text-align: center"><strong>3.8</strong></td>
    </tr>
    <tr>
      <td style="text-align: center">TransNAS-Bench-101</td>
      <td style="text-align: center"><strong>2.75</strong></td>
      <td style="text-align: center">4.5</td>
      <td style="text-align: center">4.5</td>
      <td style="text-align: center">7.5</td>
      <td style="text-align: center">3.0</td>
      <td style="text-align: center">4.5</td>
      <td style="text-align: center">4.0</td>
      <td style="text-align: center">5.25</td>
    </tr>
    <tr>
      <td style="text-align: center">Overall</td>
      <td style="text-align: center">4.33</td>
      <td style="text-align: center">4.75</td>
      <td style="text-align: center">4.67</td>
      <td style="text-align: center">5.5</td>
      <td style="text-align: center">4.33</td>
      <td style="text-align: center">4.08</td>
      <td style="text-align: center"><strong>4.0</strong></td>
      <td style="text-align: center">4.33</td>
    </tr>
  </tbody>
</table>

<p>To visualize this phenomenon in a different way, we compute the <em>relative performance profiles</em> (<a href="https://arxiv.org/pdf/cs/0102001.pdf">Dolan et al. 2001</a>) of the ZC proxies on each search space, and over all search spaces.
Within the Spearman value range from -1 (worst possible) to 1 (best possible), across each task, we compute how suboptimal each ZC proxy is compared to the best ZC proxy on that task, in terms of their “error” (distance of Spearman from 1). For example, on DARTS Spherical CIFAR-100, the best ZC proxy is fisher with a Spearman value of 0.4986 (error of 0.5014), and the Spearman value of snip is 0.2675 (error of 0.7325), therefore, snip is 1.46-suboptimal on DARTS Spherical CIFAR-100 (relative to the 8 ZC proxies we tested).</p>

<p>For each ZC proxy, we compute the fraction of tasks on which it is at most $\tau$-suboptimal, for all $\tau$. By plotting this fraction on the y-axis against $\tau$ on the x-axis, we can visualize the relative performance of the ZC proxies. 
If one ZC proxy were to achieve the best performance across all tasks, then its performance profile would be a horizontal line at $y=1$ (since its fraction of tasks which are 1-suboptimal is 1). In general, a ZC proxy with a line above all other lines indicates strong performance relative to the other ZC proxies. However, in Figures 7 and 8, we see that many of the plotted lines are highly overlapping, showing that the ZC proxies are relatively balanced in performance.</p>

<table>
  <thead>
    <tr>
      <th style="text-align: center">NATS-Bench TSS</th>
      <th style="text-align: center">DARTS</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td style="text-align: center"><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/pp_nats.png" alt="Zero-cost PP" /></td>
      <td style="text-align: center"><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/pp_darts.png" alt="Zero-cost PP" /></td>
    </tr>
  </tbody>
</table>

<p><em>Figure 7:  Relative performance profiles on NATS-Bench TSS and DARTS.</em></p>

<table>
  <thead>
    <tr>
      <th style="text-align: center">TransNAS-Bench-101</th>
      <th style="text-align: center">Avg. over all search spaces and tasks</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td style="text-align: center"><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/pp_trans.png" alt="Zero-cost PP" /></td>
      <td style="text-align: center"><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/pp_all_tasks.png" alt="Zero-cost PP" /></td>
    </tr>
  </tbody>
</table>

<p><em>Figure 8: Relative performance profiles on TransNAS-Bench-101, and averaged over all search spaces and tasks.</em></p>

<h3 id="performance-of-zc-proxies-as-training-progresses">Performance of ZC proxies as training progresses</h3>

<p>Finally, we evaluate the stability of zero-cost proxies as training progresses. We expand on the initial results reported
by <a href="https://arxiv.org/abs/2106.04010">Dey et al. (2021)</a> to evaluate each ZC proxy’s Spearman 
rank correlation after every epoch of training on the NATS-Bench TSS search space with two different 
datasets: CIFAR-10 and Spherical CIFAR-100.</p>

<table>
  <thead>
    <tr>
      <th style="text-align: center">Spearman’s rank correlation vs. training epochs <br /> NATS-Bench TSS CIFAR-10</th>
      <th style="text-align: center">Spearman’s rank correlation vs. training epochs <br /> NATS-Bench TSS Spherical CIFAR-100</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td style="text-align: center"><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/zerocost_epochs_c10.gif" alt="Zero-cost Epochs" /></td>
      <td style="text-align: center"><img src="https://iclr.iro.umontreal.ca/d03efef6-6d49-4921-929c-c230895265fc_1642231246/public/images/2021-12-01-zero-cost-proxies/zerocost_epochs_scifar100.gif" alt="Zero-cost Epochs" /></td>
    </tr>
  </tbody>
</table>

<p><em>Figure 9: Spearman’s rank correlation vs. training epochs on NATS-Bench TSS CIFAR-10 and Spherical CIFAR-100.</em></p>

<p>On CIFAR-10, we find that measures like snip and grad_norm gradually degrade in rank 
correlation as the network trains. jacob_cov and grasp at initialization have rank correlations
of 0.69 and 0.63 respectively, but after even one epoch of training, they drastically 
degrade to −0.02 and −0.41. Synflow stays relatively stable. 
This is not surprising given that most of the ZC proxies have 
been directly applied from pruning-at-initialization literature. 
Similarly, on Spherical CIFAR-100, most ZC proxies degrade in performance, except jacob_cov and synflow.</p>

<p>From both datasets, we observe a counterintuitive property: 
<em>ZC proxies are unable to take advantage of strictly increasing
information</em>. 
After all, as a network trains for more epochs, it only gets closer to its final fully-trained state.
It would not be unreasonable to expect future ZC proxies to be able to capitalize on more information, and so we encourage designers of future ZC proxies to keep this desirable property in mind.
However, we note that this property can be obtained artificially by combining the ZC proxy with a technique that already has this property.
For example, there is preliminary work on combining <a href="https://arxiv.org/abs/2104.01177">ZC proxies with learning curve extrapolation</a>.</p>

<!-- Example collapsible text -->
<!-- <details>
  <summary>Details of every heatmap</summary>

    Example text.

</details> -->

<h2 id="cases-for-and-against-zero-cost-proxies">Cases For and Against Zero-Cost Proxies</h2>

<p>Based on the experiments from the previous section, and based on recent literature, we aggregate the main strengths and weaknesses of ZC proxies.</p>

<h3 id="strengths">Strengths</h3>
<ol>
  <li><strong>Speed.</strong>
Although the previous section shows that ZC proxies do not always achieve a strong correlation with model performance, their speed sets ZC proxies apart from all other types of performance prediction techniques. All ZC proxies are computed with at most a forward and backward pass from a single minibatch of data. The wall-clock time depends on the size of the network and the type of data, but it typically takes five seconds or less on a GPU or CPU.
We encourage the community to think of ZC proxies as cheap “weak learners”, which may be combined with other ZC proxies, or other techniques, to achieve strong performance.
ZC proxies are especially useful when improving other, slower techniques at little extra cost, and/or when used as features in a prediction model, which can then choose whether or not to ignore the signal from each individual ZC proxy on a task by task basis. In the next two points, we give specific examples of this.</li>
  <li><strong>Usage with model-based prediction.</strong>
There is initial work in using ZC proxies as features in model-based prediction for NAS.
Model-based prediction is a common subroutine used to guide NAS algorithms, especially in combination with Bayesian optimization (e.g. <a href="https://arxiv.org/abs/1802.07191">NASBOT</a>, <a href="https://arxiv.org/abs/1910.11858">BANANAS</a>, <a href="https://arxiv.org/abs/2006.07556">NASBOWL</a>). At various points in time during an algorithm, when there is already a set of architectures fully evaluated, a meta-model can be trained using the architecture topology as features, and the validation accuracies as labels. This model can then be used to predict the validation accuracy of new architectures that have not yet been evaluated. <a href="https://arxiv.org/abs/2104.01177">White et al. (2021)</a> showed that adding jacob_cov as an additional feature can improve performance of this model by up to 20%. 
<a href="https://arxiv.org/abs/2110.10423">Shen et al. (2021)</a> further added ZC proxies to Bayesian optimization, showing 3-5x speedups over previous state-of-the-art methods.
Additional improvements may be possible, for example, if several zero-cost proxies could be used as additional features instead of just jacob_cov. ZC proxies are particularly well-suited to be used as features of a model that predicts architecture performance, because this mitigates two of their downsides: the model can learn to use only the ZC proxies that are most correlated with the current task, and information from multiple ZC proxies can be leveraged by the model, including flops and params.</li>
  <li><strong>Usage with one-shot methods.</strong>
There is also initial work by <a href="https://arxiv.org/abs/2106.06799">Xiang et al. (2021)</a> in combining zero-cost proxies with one-shot methods.
Specifically, this work builds off of the popular recent work on <a href="https://arxiv.org/abs/2108.04392">perturbation-based operation selection for differentiable NAS</a>.
<a href="https://arxiv.org/abs/2106.06799">Xiang et al. (2021)</a> use ZC proxies to score operation perturbations to make decisions during the one-shot procedure. This leads to a new NAS method, Zero-Cost-PT, that can achieve up to 40x speedups compared to prior methods. Again, ZC proxies are particularly well-suited for this task, since many perturbations are encountered throughout each run of a one-shot algorithm, which must be quickly scored.</li>
  <li><strong>Untapped potential.</strong>
Preliminary research shows that the best performance from ZC proxies is achieved not when they are used individually, but when they are used in combination. For example, Abdelfattah et al. showed that “vote”, the majority vote among jacob_cov, synflow, and snip, achieved top performance in their settings. 
<a href="https://openreview.net/forum?id=hP-SILoczR">Recent work</a> has also shown that combining jacob_cov, snip, synflow, and <a href="https://arxiv.org/abs/2102.01063">zen</a>, as well as combining each ZC proxy with flops and params, leads to even better performance. Fleshing out this direction is a promising avenue for future work. Furthermore, understanding <em>why</em> certain zero-cost proxies are effective has been relatively under-studied as of now. Tackling this problem could be the key to better combining the strengths of each ZC proxy, and devising newer, better ZC proxies. Overall, ZC proxies have not yet achieved their full potential.</li>
</ol>

<h3 id="weaknesses">Weaknesses</h3>
<ol>
  <li><strong>Unreliable performance.</strong>
In the previous section, our experiments across a diverse set of datasets and tasks showed that while ZC proxies perform well on some datasets and tasks, they do not perform well on other datasets and tasks (e.g. Gaussian data, PDE-solving, EMG signals) even when keeping the search space constant. For some tasks, the majority of ZC proxies have a <em>negative</em> correlation with model performance, meaning that ZC proxies would perform <em>worse</em> than randomly picking neural networks. 
In Table 4, we even found that flops, a simple baseline, was the ZC proxy with the best average rank over all 12 tasks we studied.
Therefore, more work must be done to create ZC proxies that consistently outperform flops and params. There is already <a href="https://openreview.net/forum?id=hP-SILoczR">initial work</a> in this direction, simply by combining ZC proxies with flops and params.</li>
  <li><strong>Unhelpful biases.</strong> 
The goal of a ZC proxy is to correlate strongly with target error metrics.
However, ZC proxies have been found to have other strong preferences that may bias the search process.
For example, synflow has been shown both experimentally and theoretically to prefer large models by <a href="https://openreview.net/forum?id=Esd7tGH3Spl&amp;noteId=snLJ80NTB6L">Ning et al. (2021)</a>. 
Furthermore, <a href="https://openreview.net/forum?id=hP-SILoczR">Chen et al. (2021)</a> experimentally show that snip has a preference for wide channels, while grasp has a preference for narrow architectures.</li>
  <li><strong>Amdahl’s law.</strong> 
In early ZC proxy research, one of the main selling points was the creation of new NAS algorithms that output an architecture in minutes. However, finding an architecture is only part of the machine learning pipeline, with discovered architectures still needing to be trained to be useful.
As a result, in practical settings, ZC proxies run into an issue akin to the one described by <a href="https://en.wikipedia.org/wiki/Amdahl's_law">Amdahl’s law</a> from parallel computing: they are optimizing only an already-fast component of the pipeline and so the overall achievable speedup is actually quite small.
For example, on the DARTS space, <a href="https://arxiv.org/abs/2102.11535">TE-NAS</a> reports that it takes 0.05 GPU-days to achieve a CIFAR-10 accuracy close to that of PDARTS, which takes 0.3 GPU-days; this is a six-fold improvement in search time.
This is overwhelmed by the training time of a DARTS architecture, which takes roughly 1.2 GPU-days, and thus the improvement for the full pipeline is only 1.2x.
In fact, the best possible theoretical improvement according to Amdahl’s law, in the case where the ZC proxy is truly zero-cost, is only 1.25x.
However, this weakness does not apply to applications in which ZC proxies are used to improve the performance of other techniques such as model-based prediction or one-shot models, described in the “Strengths” section.</li>
  <li><strong>Correlation decay.</strong> 
Many ZC proxies, both data-oblivious and data-dependent, explicitly use model weights to predict an architecture’s performance.
Although this has not been a target of past ZC proxy papers, ideally the predictive power of ZC proxies would <em>increase</em> as one trains the architecture, allowing them to be combined with early-stopping methods.
However, in our experiments, we showed that in fact the performance of many proxies <em>decreases</em> with the number of training iterations.
On the other hand, ZC proxies could be used in tandem with other techniques that <em>do</em> have this property, such as learning curve extrapolation. There is some <a href="https://arxiv.org/abs/2104.01177">preliminary work</a> in this vein.</li>
</ol>

<h2 id="conclusions-and-future-directions">Conclusions and Future Directions</h2>

<p>In this blog post, we took a deeper look at zero-cost proxies for NAS. We ran new experiments using the recent <a href="https://arxiv.org/abs/2110.05668">NAS-Bench-360</a> and <a href="https://arxiv.org/abs/2105.11871">TransNAS-Bench-101</a> benchmarks to probe the effectiveness of zero-cost proxies on more diverse datasets than had previously been tested in existing literature.
Our main findings were the following:</p>

<ol>
  <li>ZC proxies have differing performance profiles across tasks, and across diverse tasks, there is no single ZC proxy which performs significantly better than the others.</li>
  <li>ZC proxies still require further research since flops and params are very competitive baselines.</li>
  <li>Data-agnostic ZC proxies such as synflow have inconsistent performance across different tasks.</li>
</ol>

<p>In general, ZC proxies are best thought of as cheap “weak learners” which can quickly improve the performance of other techniques.
Based on prior work and on our experimental observations, we find the following avenues for future work particularly promising:</p>

<ol>
  <li>Integrating zero-cost methods into one-shot and model-based methods.</li>
  <li>Better ways of combining ZC proxies with each other and with flops and params.</li>
  <li>Understanding <em>why</em> zero-cost proxies work well in certain situations, which can lead to the development of even better ZC proxies.</li>
</ol>

<p>Overall, while there are currently issues with inconsistency, zero-cost proxies are a promising, novel direction that is sure to play a key role in future NAS techniques.</p>

</div>

<div id="bibtex-container" class="related">
  For attribution in academic contexts, please cite this work as
  <pre id="bibtex-academic-attribution">

  </pre>

  BibTeX citation
  <pre id="bibtex-box">

  </pre>
</div>
<script>
  // Builds the two "cite this post" boxes (#bibtex-academic-attribution and #bibtex-box)
  // from the text of the #iclr-post-authors and #iclr-post-title elements.
  // The author text is parsed as ';'-separated authors, each a ','-separated record;
  // only the first two fields of each record are used (named last/first name by the
  // variable names below — NOTE(review): confirm the field order against the page template).
  let authorsSpan = document.getElementById("iclr-post-authors");
  // Fall back to an empty string so a page without the metadata element does not throw.
  let authorsText = authorsSpan ? authorsSpan.textContent : "";
  // Drop blank segments so a trailing ';' does not produce a phantom author.
  let lnameFnameInstitution = authorsText.split(";").filter(lfi => lfi.trim() !== "");
  let lfiList = lnameFnameInstitution.map(lfi => lfi.split(",").map(item => item.trim()));
  // BibTeX author list: "Field0, Field1 and Field0, Field1 ...". A record with no
  // second field is emitted as-is instead of printing the string "undefined".
  let bibtexLFI = lfiList.map(lfi => lfi[1] ? lfi[0] + ", " + lfi[1] : lfi[0]).join(" and ");
  // Short attribution: one author, "A & B" for two, "A, et al." for three or more.
  let academicLFI = lfiList.map(lfi => lfi[0]);
  {
    if(academicLFI.length > 2) academicLFI = academicLFI[0] + ", et al.";
    else if(academicLFI.length == 2) academicLFI = academicLFI[0] + " & " + academicLFI[1];
    else academicLFI = academicLFI[0] || "";
  }

  let titleSpan = document.getElementById("iclr-post-title");
  let titleText = titleSpan ? titleSpan.textContent.trim() : "";
  // Citation key: second field of the first author + "2022" + first three title words,
  // with ALL whitespace and punctuation stripped, lower-cased. (A string pattern in
  // replace() removes only the first match, so a global regex is required here.)
  let bibtexTitleShorthand = (((lfiList[0] && lfiList[0][1]) || "")+
    "2022"+
    titleText.split(/\s+/).slice(0, 3).join("")
  ).replace(/\s+/g, "").replace(/[\p{P}$+<=>^`|~]/gu, '').toLowerCase().trim();

  // The citation key must be followed by a comma only: a closing brace right after
  // the key would terminate the entry before any of its fields.
  let bibtexTemplate = `
@inproceedings{${bibtexTitleShorthand},
  author = {${bibtexLFI}},
  title = {${titleText}},
  booktitle = {ICLR Blog Track},
  year = {2022},
  note = {${window.location.href}},
  url  = {${window.location.href}}
}
  `.trim();
  document.getElementById("bibtex-box").innerText = bibtexTemplate;

  let academicTemplate = `
${academicLFI}, "${titleText}", ICLR Blog Track, 2022.
`.trim();
  document.getElementById("bibtex-academic-attribution").innerText = academicTemplate;

</script>


<div class="related">
  <h2>Related posts</h2>
  <ul class="related-posts">
    
      <li>
        <h3>
          <a href="/2021/09/01/sample-submission/">
            Sample Submission
          </a>
          <small>01 Sep 2021 | 
    <a class="content-tag" href="/tags/#deep-learning"> deep learning </a>
  
    <a class="content-tag" href="/tags/#neural-architecture-search"> neural architecture search </a>
  
    <a class="content-tag" href="/tags/#zero-cost-proxies"> zero-cost proxies </a>
  </small>
        </h3>
      </li>
    
      <li>
        <h3>
          <a href="/2020/04/02/example-content/">
            Example content (Basic Markdown)
          </a>
          <small>02 Apr 2020 | 
    <a class="content-tag" href="/tags/#deep-learning"> deep learning </a>
  
    <a class="content-tag" href="/tags/#neural-architecture-search"> neural architecture search </a>
  
    <a class="content-tag" href="/tags/#zero-cost-proxies"> zero-cost proxies </a>
  </small>
        </h3>
      </li>
    
  </ul>
</div>


<script src="https://utteranc.es/client.js"
        repo="iclr-blog-track/iclr-blog-track.github.io"
        issue-term="pathname"
        label="utterance"
        theme="boxy-light"
        crossorigin="anonymous"
        >
</script>


      </div>
    </div>

    <label for="sidebar-checkbox" class="sidebar-toggle"></label>

    <script src='/public/js/script.js'></script>
  </body>
</html>
