<!DOCTYPE html>
<html lang="en-us">

  <head>
  <link href="http://gmpg.org/xfn/11" rel="profile">
  <meta http-equiv="content-type" content="text/html; charset=utf-8">

  <meta name="viewport" content="width=device-width, initial-scale=1.0">

  <title>
    
      Classification or Generation? Understanding Paradigm Shift for Knowledge-Intensive Tasks &middot; The ICLR Blog Track
    
  </title>

  
  <link rel="canonical" href="https://iclr.iro.umontreal.ca/65963df0-4662-4d40-a088-49c012f57cda_1642248002/2021/12/01/Classification-or-Generation-Understanding-Paradigm-Shift-for-Knowledge-Intensive-Tasks/">
  

  <link rel="stylesheet" href="https://iclr.iro.umontreal.ca/65963df0-4662-4d40-a088-49c012f57cda_1642248002/public/css/poole.css">
  <link rel="stylesheet" href="https://iclr.iro.umontreal.ca/65963df0-4662-4d40-a088-49c012f57cda_1642248002/public/css/syntax.css">
  <link rel="stylesheet" href="https://iclr.iro.umontreal.ca/65963df0-4662-4d40-a088-49c012f57cda_1642248002/public/css/lanyon.css">
  <link rel="stylesheet" href="https://iclr.iro.umontreal.ca/65963df0-4662-4d40-a088-49c012f57cda_1642248002/public/css/custom.css">
  <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=PT+Serif:400,400italic,700%7CPT+Sans:400">

  <link rel="apple-touch-icon-precomposed" sizes="144x144" href="https://iclr.iro.umontreal.ca/65963df0-4662-4d40-a088-49c012f57cda_1642248002/public/apple-touch-icon-precomposed.png">
  <link rel="shortcut icon" href="https://iclr.iro.umontreal.ca/65963df0-4662-4d40-a088-49c012f57cda_1642248002/public/favicon.ico">

  <link rel="alternate" type="application/rss+xml" title="RSS" href="https://iclr.iro.umontreal.ca/65963df0-4662-4d40-a088-49c012f57cda_1642248002/atom.xml">

  

  <!-- MathJax v2 configuration. The text/x-mathjax-config block is placed
       before the loader because MathJax.js is the script that processes such
       blocks; the documentation requires them to precede it. It enables
       $...$ and \(...\) as inline math delimiters. -->
  <script type="text/x-mathjax-config">
      MathJax.Hub.Config({
        tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ],
         processEscapes: false
        }
      });
  </script>
 <!-- <script type="text/x-mathjax-config"> MathJax.Hub.Config({ TeX: { equationNumbers: { autoNumber: "AMS" } } }); </script> -->
  <script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML" type="text/javascript" ></script>
</head>


  <body>

    <!-- Target for toggling the sidebar `.sidebar-checkbox` is for regular
     styles, `#sidebar-checkbox` for behavior. -->
<input type="checkbox" class="sidebar-checkbox" id="sidebar-checkbox">
<!-- <input type="checkbox" class="sidebar-checkbox" id="sidebar-checkbox" > -->

<!-- Toggleable sidebar -->
<div class="sidebar" id="sidebar">
  <div class="sidebar-item">
    <p>For short-term, peer-sourced tests of time, generalizations, specializations, reproductions, etc.!</p>
  </div>

  <nav class="sidebar-nav">

    

    
    
      
        
          <a class="sidebar-nav-item" href="https://iclr.iro.umontreal.ca/65963df0-4662-4d40-a088-49c012f57cda_1642248002/">ICLR 2022 Blog Track</a>
        
      
    
      
        
      
    
      
        
          <a class="sidebar-nav-item" href="https://iclr.iro.umontreal.ca/65963df0-4662-4d40-a088-49c012f57cda_1642248002/about/">About</a>
        
      
    
      
    
      
        
      
    
      
        
          <a class="sidebar-nav-item" href="https://iclr.iro.umontreal.ca/65963df0-4662-4d40-a088-49c012f57cda_1642248002/submitting/">Submitting</a>
        
      
    
      
        
          <a class="sidebar-nav-item" href="https://iclr.iro.umontreal.ca/65963df0-4662-4d40-a088-49c012f57cda_1642248002/tags/">Tags</a>
        
      
    

    <a class="sidebar-nav-item" href="https://github.com/iclr-blog-track/iclr-blog-track.github.io">GitHub project</a>
    <span class="sidebar-nav-item">Currently vICLR Spring 2021</span>
  </nav>

  <div class="sidebar-item">
    <p>
      &copy; 2022. All rights reserved.
    </p>
  </div>
</div>


    <!-- Wrap is the content to shift when toggling the sidebar. We wrap the
         content to avoid any CSS collisions with our real content. -->
    <div class="wrap">
      <div class="masthead">
        <div class="container">
          <h3 class="masthead-title">
            <a href="/" title="Home">The ICLR Blog Track</a>
            <small></small>
          </h3>
        </div>
      </div>

      <div class="container content">
        <div class="post">
  <h1 id="iclr-post-title" class="post-title">Classification or Generation? Understanding Paradigm Shift for Knowledge-Intensive Tasks</h1>
  <span class="post-date">01 Dec 2021 | 
    <a class="content-tag" href="/tags/#natural-language-processing"> Natural Language Processing </a>
  
    <a class="content-tag" href="/tags/#language-modeling"> Language Modeling </a>
  
    <a class="content-tag" href="/tags/#entity-retrieval"> Entity Retrieval </a>
  </span>

  <span id="iclr-post-authors" class="post-date">Anonymous</span>
  <ul>
  <li><a href="#1-abstract">1. Abstract</a></li>
  <li><a href="#2-introduction-to-entity-retrieval">2. Introduction to Entity Retrieval</a>
    <ul>
      <li><a href="#21-problem-definition">2.1 Problem Definition</a></li>
    </ul>
  </li>
  <li><a href="#3-reformulation-of-the-problem">3. Reformulation of the Problem</a></li>
  <li><a href="#4-methodology">4. Methodology</a>
    <ul>
      <li><a href="#41-prefix-tree">4.1 Prefix Tree</a></li>
      <li><a href="#42-autoregressive-end-to-end-entity-linking">4.2 Autoregressive End-To-End Entity Linking</a></li>
    </ul>
  </li>
  <li><a href="#5-experiments-and-analyses">5. Experiments and Analyses</a></li>
  <li><a href="#6-classification-generation-and-prompt-based-learning">6. Classification, Generation, and Prompt-based Learning</a></li>
  <li><a href="#7-conclusion">7. Conclusion</a></li>
  <li><a href="#references">References</a></li>
</ul>

<hr />

<h2 id="1-abstract">1. Abstract</h2>
<p>Knowledge-intensive tasks such as entity retrieval are challenging for even cutting edge NLP models since they require models to apply knowledge about the world. Previous studies typically treat this task as classification. Recently, a new paradigm has emerged, which reformats knowledge-intensive tasks as natural language generation. This post summarizes the paradigm shift and reviews the new generative methodology for the ICLR community, providing philosophical questions and new directions.</p>

<h2 id="2-introduction-to-entity-retrieval">2. Introduction to Entity Retrieval</h2>
<p>Search engines have become part of our daily lives. We use Google (Bing, Yandex, Baidu, etc.) as the main gateway to information on the Web. With a specific type of content in mind, we may search directly on a particular site or service, e.g., on Facebook or LinkedIn for people, organizations, and events; on Amazon or eBay for products; or YouTube or Spotify for music. Accustomed to a search box somewhere near the top of the screen, we have also increased our expectations of the quality and speed of the responses to our searches.</p>

<p><strong>Information retrieval</strong> (IR), on the top level of abstraction, is about matching <em>information needs</em> with <em>information objects</em>. When a user puts a <em>query</em>, i.e., an expression varying from some keywords (e.g., <em>Apple</em>) to a natural language question (e.g., <em>who is the CEO of Apple company</em>), the search engine responds with a ranked list of information objects, traditionally related documents.</p>

<p>With the support of the enormous development of large-scale structured knowledge bases, we have witnessed the transition from “documents” to “answers”, as search engines directly return related entities or facts instead of merely “ten blue links”. The knowledge bases organize information around specific things and objects referred to as entities. The need to make search engines respond to queries with related entities brings us to the field of entity retrieval (ER), which is also the main problem tackled by the paper presented here: <a href="https://arxiv.org/abs/2010.00904">“<strong><em>Autoregressive Entity Retrieval</em></strong>”</a><a href="#Refer-1"><sup>1</sup></a> by Nicola De Cao, Gautier Izacard, Sebastian Riedel, and Fabio Petroni.</p>

<h3 id="21-problem-definition">2.1 Problem Definition</h3>
<p>Formally, entities are uniquely identifiable objects or things (such as persons, organizations, and places), characterized by their types, attributes, and relationships to other entities. In an entity retrieval task, we have a collection of entities $\mathcal{E}$ (e.g., Wikipedia articles) where each entity is an entry in a Knowledge Base (KB) such as Wikipedia. Given a textual input source $x$ (e.g., a question), a model has to return the most relevant entities from $\mathcal{E}$ concerning $x$. We assume that each $e \in \mathcal{E}$ is uniquely assigned to a textual representation (i.e., its name): a sequence of tokens $y$ (e.g., Wikipedia pages are identified by their titles).</p>

<p>Concretely, the following tasks are involved in this paper:</p>
<ul>
  <li><strong>Entity Disambiguation</strong> (ED), where an input $x$ is annotated with a mention and a system has to either select its corresponding entity from $\mathcal{E}$ or predict that there is no corresponding entry in the KB (see <a href="#Figure-1">Figure 1</a> for an example).</li>
  <li><strong>End-To-End Entity Linking</strong> (EL). This task is to jointly detect entity mentions $m$ from an input $x$ and link those mentions to their respective KB entities $e \in \mathcal{E}$.</li>
  <li><strong>Page-level Document Retrieval</strong> (DR). The input $x$ is intended as a query and $\mathcal{E}$ as a collection of documents identified by their unique titles (e.g., Wikipedia articles).</li>
</ul>

<div id="Figure-1"></div>
<p><img src="https://iclr.iro.umontreal.ca/65963df0-4662-4d40-a088-49c012f57cda_1642248002/public/images/2021-12-01-Classification or Generation-Understanding Paradigm Shift for Knowledge-Intensive Tasks/Figure1.png" alt="Figure1-entity-retrieval" /></p>

<!-- <div id="Section-2"></div> -->

<h2 id="3-reformulation-of-the-problem">3. Reformulation of the Problem</h2>
<p>In previous research, entity retrieval has been modeled as a multi-class classification problem where each entity is assigned with a unique atomic label. A typical retrieval system consists of these parts:</p>
<ol>
  <li>An encoder model that converts input queries to hidden representations;</li>
  <li>A retrieval model that captures context and entity affinity, usually with vector dot products.</li>
</ol>

<p>The output of the retrieval model is sorted, and top-k similar candidates are chosen as matches. This process has several obvious drawbacks:</p>
<ul>
  <li>Training the system requires constructing negative samples where the mismatched entities and query pairs are fed into the model, and the choice of negative pairs has a strong influence on the final performance;</li>
  <li>When provided with large sets of entities, the storage of their dense representations requires a large memory footprint;</li>
  <li>The process of vector dot product might fail in modeling the fine-grained interactions between the context and the entities.</li>
</ul>

<p>Now, let’s return to the basics: by classifying or ranking the output of interactions between the queries and the entities, <em>what</em> are we supposed to achieve with this system? In a page-level Document Retrieval problem, we expect the model to output the most relevant documents (or sentences) in the KB given queries containing certain entity mentions; in an Entity Disambiguation problem, we want the model to output the mentioned entities in the given queries.</p>

<p>In other words, we can reformulate the retrieval problems as a generation task where the system gets an input sentence and outputs another - that’s exactly what a Seq2Seq model does!</p>

<p>But hold on, as we may get some unexpected answers from the model which do not appear in the given KB, we need to add some constraints. To ensure the outputs strictly follow the KB’s content, we may build and apply a <strong><em>trie</em></strong>, i.e., a prefix tree, to constrain the decoding process since the generation is performed from left to right. We will talk about the details in later sections. Now, with this Seq2Seq alternative, it is surprising to find that the problems mentioned above are alleviated:</p>
<ul>
  <li>In a Seq2Seq task, we don’t have to worry about the construction of negative samples as all the other sentences already serve as negative samples to a certain extent;</li>
  <li>The memory overhead of a Seq2Seq model relies mainly on the size of beam search and the average length of output sequence, much smaller than that of storing all entities’ representations;</li>
  <li>The Seq2Seq model, together with the prefix constraints, captures interactions at the token level, which is intuitively better than the dot product between representation vectors.</li>
</ul>

<h2 id="4-methodology">4. Methodology</h2>
<p>So far, we have covered the main idea behind the paradigm proposed in this paper, <strong>“GENRE”</strong> (for <em>Generative ENtity REtrieval</em>); here are some more details.</p>

<p>Concretely, the paper leverages a transformer-based architecture pre-trained with a language model objective (i.e., the BART model) and fine-tuned to generate entity names. GENRE ranks each entity $e \in \mathcal{E}$ by calculating a score with an autoregressive formulation:</p>

\[\operatorname{score}(e \mid x)=p_{\theta}(y \mid x)=\prod_{i=1}^{N} p_{\theta}\left(y_{i} \mid y_{&lt;i}, x\right),\]

<p>where $y$ is the sequence of $N$ tokens in the identifier of $e$, and $\theta$ denotes the parameters of the model.</p>

<h3 id="41-prefix-tree">4.1 Prefix Tree</h3>
<p>Now let’s take a closer look at the trie constraints applied to the decoding part.</p>
<blockquote>
  <p>In computer science, a trie, also called digital tree or prefix tree, is a type of search tree, a tree data structure used for locating specific keys from within a set. These keys are most often strings, with links between nodes defined not by the entire key, but by individual characters. In order to access a key (to recover its value, change it, or remove it), the trie is traversed depth-first, following the links between nodes, which represent each character in the key…</p>

  <p>All the children of a node have a common prefix of the string associated with that parent node, and the root is associated with the empty string. – Wikipedia<a href="#Refer-2"><sup>2</sup></a></p>
</blockquote>

<p>In the prefix tree we mentioned here, each node is associated with a token instead of an individual character. For example, given the following phrases:</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>English language
English literature
France
</code></pre></div></div>
<p>we can build a prefix tree as shown in <a href="#Figure-2">Figure 2</a>:</p>

<div id="Figure-2" align="center">
<object width="666" height="600" align="middle" type="application/pdf" data="https://iclr.iro.umontreal.ca/65963df0-4662-4d40-a088-49c012f57cda_1642248002/public/images/2021-12-01-Classification or Generation-Understanding Paradigm Shift for Knowledge-Intensive Tasks/trie.pdf"></object>
</div>

<p>The sentences are aggregated with the same prefix tokens, and each complete path (i.e., a path that begins with a <code class="language-plaintext highlighter-rouge">BOS</code> node and ends with an <code class="language-plaintext highlighter-rouge">EOS</code> node) represents a sentence. We can perform a sentence search efficiently by comparing an input sequence of tokens with the associated tokens in different nodes.</p>

<p>In the decoding process, with the tokens already output, we can set the probability of tokens that don’t appear in the children nodes of the current node to zero and make the model choose possible tokens till we meet an <code class="language-plaintext highlighter-rouge">EOS</code> node. In this way, we make sure the model only outputs “legal” sentences that appeared in our KB. The trie reduces the search space of beam search while performing sentence inference.</p>

<p>Another advantage of a trie is its low memory overhead (e.g., constraining on Wikipedia titles using the BART tokenizer produces a trie with ∼6M leaves, ∼17M internal nodes that occupied ∼600MB of disk space), since it is a compressed representation of a series of documents and can be pre-computed and stored in memory.</p>

<h3 id="42-autoregressive-end-to-end-entity-linking">4.2 Autoregressive End-To-End Entity Linking</h3>
<p>When the autoregressive framework is extended to address the end-to-end Entity Linking (EL) problem, a markup annotation is used where span boundaries are flagged with special tokens and accompanied by their corresponding entity identifiers. As an example, given an input sentence:</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>In 1503, Leonardo began painting the Mona Lisa.
</code></pre></div></div>
<p>where the mention “Leonardo” refers to the entity “Leonardo da Vinci”, and the mention “Mona Lisa” refers to the entity “Mona Lisa” in the knowledge base, its corresponding output will be:</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>In 1503, [Leonardo](Leonardo da Vinci) began painting the [Mona Lisa](Mona Lisa).
</code></pre></div></div>
<p>Since the annotated output space is exponentially large, it becomes intractable to pre-compute a trie for decoding, and the search probability is computed dynamically instead. In such a dynamic decoding strategy, there are three different conditions at each generating step:</p>
<ol>
  <li>Outside any mention or entity link, where the decoder can either start a new mention with a special token (i.e., <code class="language-plaintext highlighter-rouge">[</code>) or continue by copying the next input token;</li>
  <li>Inside an entity mention, where the decoder can either continue with the next input token or end this mention with a special token (i.e., <code class="language-plaintext highlighter-rouge">]</code>);</li>
  <li>Inside an entity link, where the decoder follows an entity trie discussed above to generate valid entity identifiers.</li>
</ol>

<p>The model is constrained differently under these circumstances, as shown in <a href="#Figure-3">Figure 3</a>.</p>

<div id="Figure-3"></div>

<p><img src="https://iclr.iro.umontreal.ca/65963df0-4662-4d40-a088-49c012f57cda_1642248002/public/images/2021-12-01-Classification or Generation-Understanding Paradigm Shift for Knowledge-Intensive Tasks/Figure3.png" alt="Figure3-dynamical-constraints" /></p>

<h2 id="5-experiments-and-analyses">5. Experiments and Analyses</h2>
<p>Extensive evaluations on more than 20 datasets across three tasks (Entity Disambiguation, end-to-end Entity Linking (EL), and page-level Document Retrieval) report the effectiveness of the GENRE paradigm.</p>

<p>Overall, GENRE achieves very competitive results in all three settings, being the best-performing system on average across all of them, especially on the page-level retrieval tasks of the KILT benchmark (<a href="#Table-1">Table 1</a>):</p>

<div id="Table-1">Table 1: R-Precision for page-level retrieval on KILT test data. Bold indicates the best model and underline indicates the second best.</div>

<table>
  <thead>
    <tr>
      <th> </th>
      <th>Fact Check.</th>
      <th colspan="3">Entity Disambiguation</th>
      <th colspan="2">Slot Filling</th>
      <th colspan="4">Open Domain QA</th>
      <th>Dial.</th>
      <th> </th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td><strong>Model</strong></td>
      <td><strong>FEV</strong></td>
      <td><strong>AY2</strong></td>
      <td><strong>WnWi</strong></td>
      <td><strong>WnCw</strong></td>
      <td><strong>T-REx</strong></td>
      <td><strong>zsRE</strong></td>
      <td><strong>NQ</strong></td>
      <td><strong>HoPo</strong></td>
      <td><strong>TQA</strong></td>
      <td><strong>ELI5</strong></td>
      <td><strong>WoW</strong></td>
      <td><strong>Avg.</strong></td>
    </tr>
    <tr>
      <td>DPR + BERT</td>
      <td><ins>72.9</ins></td>
      <td>-</td>
      <td>-</td>
      <td>-</td>
      <td>-</td>
      <td>40.1</td>
      <td><strong>60.7</strong></td>
      <td>25.0</td>
      <td>43.4</td>
      <td>-</td>
      <td>-</td>
      <td>-</td>
    </tr>
    <tr>
      <td>DPR</td>
      <td>55.3</td>
      <td>1.8</td>
      <td>0.3</td>
      <td>0.5</td>
      <td>13.3</td>
      <td>28.9</td>
      <td>54.3</td>
      <td>25.0</td>
      <td>44.5</td>
      <td>10.7</td>
      <td>25.5</td>
      <td>23.6</td>
    </tr>
    <tr>
      <td>tf-idf</td>
      <td>50.9</td>
      <td>3.7</td>
      <td>0.24</td>
      <td>2.1</td>
      <td>44.7</td>
      <td>60.8</td>
      <td>28.1</td>
      <td>34.1</td>
      <td>46.4</td>
      <td><ins>13.7</ins></td>
      <td>49.0</td>
      <td>30.5</td>
    </tr>
    <tr>
      <td>DPR + BART</td>
      <td>55.3</td>
      <td>75.5</td>
      <td>45.2</td>
      <td>46.9</td>
      <td>13.3</td>
      <td>28.9</td>
      <td>54.3</td>
      <td>25.0</td>
      <td>44.4</td>
      <td>10.7</td>
      <td>25.4</td>
      <td>38.6</td>
    </tr>
    <tr>
      <td>RAG</td>
      <td>61.9</td>
      <td>72.6</td>
      <td>48.1</td>
      <td>47.6</td>
      <td>28.7</td>
      <td>53.7</td>
      <td>59.5</td>
      <td>30.6</td>
      <td>48.7</td>
      <td>11.0</td>
      <td><ins>57.8</ins></td>
      <td>47.3</td>
    </tr>
    <tr>
      <td>BLINK + flair</td>
      <td>63.7</td>
      <td><ins>81.5</ins></td>
      <td><ins>80.2</ins></td>
      <td><ins>68.8</ins></td>
      <td><ins>59.6</ins></td>
      <td><ins>78.8</ins></td>
      <td>24.5</td>
      <td><ins>46.1</ins></td>
      <td><ins>65.6</ins></td>
      <td>9.3</td>
      <td>38.2</td>
      <td><ins>56.0</ins></td>
    </tr>
    <tr>
      <td><strong>GENRE</strong></td>
      <td><strong>83.6</strong></td>
      <td><strong>89.9</strong></td>
      <td><strong>87.4</strong></td>
      <td><strong>71.2</strong></td>
      <td><strong>79.4</strong></td>
      <td><strong>95.8</strong></td>
      <td><ins>60.3</ins></td>
      <td><strong>51.3</strong></td>
      <td><strong>69.2</strong></td>
      <td><strong>15.8</strong></td>
      <td><strong>62.9</strong></td>
      <td><strong>69.7</strong></td>
    </tr>
  </tbody>
</table>

<p>Besides outperforming other SotA models, GENRE also significantly reduces memory overhead, occupying 14 times less memory than BLINK and 34 times less memory than DPR. As the entity names are stored in the prefix tree in advance, the GENRE model also has an advantage under the cold-start setting, where only the names of entities are available in the KBs.</p>

<h2 id="6-classification-generation-and-prompt-based-learning">6. Classification, Generation, and Prompt-based Learning</h2>
<p>To push forward the success of this paradigm shift and apply autoregressive generative models to other classification problems, we need to find out the intrinsic reasons behind the superiority of generative models over classification models.</p>

<!-- Let's get back to the basics. Generation is technically a hierarchical classification procedure: at each generating step, the decoder performs token classification, reducing the search space. Additionally, the previously generated tokens in the autoregressive schema provide implicit guidance the following output. -->

<p>For knowledge-intensive tasks like entity retrieval, the names of entities containing rich semantic information are often ignored in previous single-level classification methods. However, in this autoregressive schema, interactions between entities and contexts are captured and help gain improvements.</p>

<p>This paradigm is similar to the popular trend of the <strong>prompt-based learning</strong> paradigm in recent times. Inspired by the remarkable few-shot performance of the GPT-3<a href="#Refer-3"><sup>3</sup></a> model, which leverages natural-language prompts and a few task demonstrations as input context, researchers modify the input using a template (called “prompt”) with some unfilled slots and transform the traditional categorical classification into a token classification. The prompt-based classification schema enjoys an overwhelming advantage over the traditional classification in few-shot and even zero-shot settings. The following <a href="#Figure-4">Figure 4</a> depicts MLM training, standard fine-tuning and the LM-BFF<a href="#Refer-4"><sup>4</sup></a> prompt-tuning.</p>

<div id="Figure-4"></div>

<p><img src="https://iclr.iro.umontreal.ca/65963df0-4662-4d40-a088-49c012f57cda_1642248002/public/images/2021-12-01-Classification or Generation-Understanding Paradigm Shift for Knowledge-Intensive Tasks/Figure4.png" alt="Figure4-prompt-tuning" /></p>

<!-- For example, in a binary sentiment classification problem, we feed the original sentence "I love this movie." and a prompt "Overall, it was a [MASK] movie.", and the model has to choose a suitable token from `{"great", "terrible"}` to replace the `[MASK]` slot instead of calculating and ranking categorical probabilities of positive/negative classes.  -->

<p>The success of prompt-based learning can be attributed to the consistency between the masked-language-model pre-training objective and the slot-filling objective in fine-tuning. LMs capture the direct semantic interaction between the prompt tokens and the predicted label tokens and utilize it to make decisions.</p>

<p>From a prompt-based perspective, we can also reformulate the autoregressive generation into a series of consecutive prompt-based classifications, where the previously generated tokens can be viewed as the prompt context. Note that the generative model outputs from left to right, hence only leverages one-way information.</p>

<p>Furthermore, the output of the generative model can be guided by incorporating extra information, like keywords, domain tags, or any variety of other pieces of information used to control the generated text<a href="#Refer-5"><sup>5</sup></a>. These extra prompts help better utilize the task information and may provide valuable direction for future work on prompt learning and controlled text generation.</p>

<h2 id="7-conclusion">7. Conclusion</h2>
<p>This post discussed a new paradigm that autoregressively generates entities with prefix constraints. The plain and simple approach surprisingly shatters some existing benchmarks, with a lower memory footprint and without search or reranking. We compared this schema with categorical classification and analyzed the intrinsic reasons for its advantages. Finally, we discussed the relationship between autoregressive generation and prompt-based learning and provided the community with new directions.</p>

<h2 id="references">References</h2>
<div id="Refer-1"></div>

<ul>
  <li>[1] De Cao, N., Izacard, G., Riedel, S., &amp; Petroni, F. (2020). <a href="https://arxiv.org/abs/2010.00904">Autoregressive entity retrieval</a>. arXiv preprint arXiv:2010.00904.</li>
</ul>

<div id="Refer-2"></div>

<ul>
  <li>[2] Wikipedia contributors. (2022, January 8). <a href="https://en.wikipedia.org/w/index.php?title=Trie&amp;oldid=1064464503">Trie. In Wikipedia, The Free Encyclopedia.</a> Retrieved 03:30, January 14, 2022, from https://en.wikipedia.org/w/index.php?title=Trie&amp;oldid=1064464503</li>
</ul>

<div id="Refer-3"></div>

<ul>
  <li>[3] Brown, T. B., Mann, B., Ryder, N., Subbiah, M., Kaplan, J., Dhariwal, P., … &amp; Amodei, D. (2020). <a href="https://arxiv.org/abs/2005.14165">Language models are few-shot learners.</a> arXiv preprint arXiv:2005.14165.</li>
</ul>

<div id="Refer-4"></div>

<ul>
  <li>[4] Gao, T., Fisch, A., &amp; Chen, D. (2020). <a href="https://arxiv.org/abs/2012.15723">Making pre-trained language models better few-shot learners.</a> arXiv preprint arXiv:2012.15723.</li>
</ul>

<div id="Refer-5"></div>

<ul>
  <li>[5] Liu, P., Yuan, W., Fu, J., Jiang, Z., Hayashi, H., &amp; Neubig, G. (2021). <a href="https://arxiv.org/abs/2107.13586">Pre-train, prompt, and predict: A systematic survey of prompting methods in natural language processing.</a> arXiv preprint arXiv:2107.13586.</li>
</ul>

</div>

<div id="bibtex-container" class="related">
  For attribution in academic contexts, please cite this work as
  <pre id="bibtex-academic-attribution">

  </pre>

  BibTeX citation
  <pre id="bibtex-box">

  </pre>
</div>
<script>
  // Fills the two citation boxes (#bibtex-academic-attribution and #bibtex-box)
  // from the rendered post title (#iclr-post-title) and author line
  // (#iclr-post-authors).
  //
  // The author line is split on ";" into authors and each author on "," into
  // trimmed fields. Fields [0] and [1] are joined as "field0, field1" for the
  // BibTeX author entry; field [0] alone is used for the short attribution.
  // NOTE(review): the variable names suggest the fields are last name, first
  // name, institution; confirm against the post front-matter convention.
  let authorsSpan = document.getElementById("iclr-post-authors");
  let authorsText = authorsSpan.textContent;
  let lnameFnameInstitution = authorsText.split(";");
  let lfiList = lnameFnameInstitution
    .map(lfi => lfi.split(",").map(item => item.trim()))
    // Skip entries with no content, e.g. the one produced by a trailing ";".
    .filter(lfi => lfi.some(item => item !== ""));
  // Keep one (possibly empty) author so the lookups below never hit undefined.
  if (lfiList.length === 0) lfiList = [[authorsText.trim()]];

  // Single-field authors such as "Anonymous" have no second field; join only
  // the fields that exist instead of emitting "Anonymous, undefined".
  let bibtexLFI = lfiList
    .map(lfi => lfi.slice(0, 2).filter(Boolean).join(", "))
    .join(" and ");
  let academicLFI = lfiList.map(lfi => lfi[0]);
  {
    // Collapse the list into a display string: "A, et al." for three or more
    // authors, "A & B" for two, and the single name otherwise.
    if(academicLFI.length > 2) academicLFI = academicLFI[0] + ", et al.";
    else if(academicLFI.length === 2) academicLFI = academicLFI[0] + " & " + academicLFI[1];
    else academicLFI = academicLFI[0];
  }

  let titleSpan = document.getElementById("iclr-post-title");
  let titleText = titleSpan.textContent.trim();
  // Citation key: first author's second field + year + first three title
  // words, lowercased, with all whitespace and punctuation/symbol characters
  // removed. Falls back to the first field when there is no second one.
  let bibtexTitleShorthand = ((lfiList[0][1] || lfiList[0][0]) +
    "2022" +
    titleText.split(/\s+/).slice(0, 3).join("")
  ).replace(/\s+/g, "").replace(/[\p{P}$+<=>^`|~]/gu, '').toLowerCase();

  // The entry's closing brace is the final line of the template; the key is
  // followed by a comma only, so the fields stay inside the entry.
  let bibtexTemplate = `
@inproceedings{${bibtexTitleShorthand},
  author = {${bibtexLFI}},
  title = {${titleText}},
  booktitle = {ICLR Blog Track},
  year = {2022},
  note = {${window.location.href}},
  url  = {${window.location.href}}
}
  `.trim();
  document.getElementById("bibtex-box").innerText = bibtexTemplate;

  let academicTemplate = `
${academicLFI}, "${titleText}", ICLR Blog Track, 2022.
`.trim();
  document.getElementById("bibtex-academic-attribution").innerText = academicTemplate;

</script>


<div class="related">
  <h2>Related posts</h2>
  <ul class="related-posts">
    
      <li>
        <h3>
          <a href="/2021/09/01/sample-submission/">
            Sample Submission
          </a>
            <small>01 Sep 2021 | 
    <a class="content-tag" href="/tags/#natural-language-processing"> Natural Language Processing </a>
  
    <a class="content-tag" href="/tags/#language-modeling"> Language Modeling </a>
  
    <a class="content-tag" href="/tags/#entity-retrieval"> Entity Retrieval </a>
  </small>
        </h3>
      </li>
    
      <li>
        <h3>
          <a href="/2020/04/02/example-content/">
            Example content (Basic Markdown)
          </a>
            <small>02 Apr 2020 | 
    <a class="content-tag" href="/tags/#natural-language-processing"> Natural Language Processing </a>
  
    <a class="content-tag" href="/tags/#language-modeling"> Language Modeling </a>
  
    <a class="content-tag" href="/tags/#entity-retrieval"> Entity Retrieval </a>
  </small>
        </h3>
      </li>
    
  </ul>
</div>


<script src="https://utteranc.es/client.js"
        repo="iclr-blog-track/iclr-blog-track.github.io"
        issue-term="pathname"
        label="utterance"
        theme="boxy-light"
        crossorigin="anonymous"
        >
</script>


      </div>
    </div>

    <label for="sidebar-checkbox" class="sidebar-toggle"></label>

    <script src='/public/js/script.js'></script>
  </body>
</html>
