<!DOCTYPE html>
<html lang="en-us">

  <head>
  <link href="http://gmpg.org/xfn/11" rel="profile">
  <meta http-equiv="content-type" content="text/html; charset=utf-8">

  <!-- No maximum-scale: capping the zoom level prevents users from enlarging the text. -->
  <meta name="viewport" content="width=device-width, initial-scale=1.0">

  <title>
    
      Deep dive into CoCon - A Self Supervised approach for Controlled Text Generation &middot; The ICLR Blog Track
    
  </title>

  
  <link rel="canonical" href="https://iclr.iro.umontreal.ca/9401c203-7224-4cb4-ba2d-32fe3f206bb2_1642233642/2021/12/01/Deep-dive-into-CoCon-text-generation/">
  

  <link rel="stylesheet" href="https://iclr.iro.umontreal.ca/9401c203-7224-4cb4-ba2d-32fe3f206bb2_1642233642/public/css/poole.css">
  <link rel="stylesheet" href="https://iclr.iro.umontreal.ca/9401c203-7224-4cb4-ba2d-32fe3f206bb2_1642233642/public/css/syntax.css">
  <link rel="stylesheet" href="https://iclr.iro.umontreal.ca/9401c203-7224-4cb4-ba2d-32fe3f206bb2_1642233642/public/css/lanyon.css">
  <link rel="stylesheet" href="https://iclr.iro.umontreal.ca/9401c203-7224-4cb4-ba2d-32fe3f206bb2_1642233642/public/css/custom.css">
  <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=PT+Serif:400,400italic,700%7CPT+Sans:400">

  <link rel="apple-touch-icon-precomposed" sizes="144x144" href="https://iclr.iro.umontreal.ca/9401c203-7224-4cb4-ba2d-32fe3f206bb2_1642233642/public/apple-touch-icon-precomposed.png">
  <link rel="shortcut icon" href="https://iclr.iro.umontreal.ca/9401c203-7224-4cb4-ba2d-32fe3f206bb2_1642233642/public/favicon.ico">

  <link rel="alternate" type="application/rss+xml" title="RSS" href="https://iclr.iro.umontreal.ca/9401c203-7224-4cb4-ba2d-32fe3f206bb2_1642233642/atom.xml">

  

  <!-- MathJax v2 picks up text/x-mathjax-config blocks when MathJax.js starts,
       so the configuration has to appear before the loader script below. -->
  <script type="text/x-mathjax-config">
      MathJax.Hub.Config({
        tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ],
         processEscapes: false
        }
      });
  </script>
 <!-- <script type="text/x-mathjax-config"> MathJax.Hub.Config({ TeX: { equationNumbers: { autoNumber: "AMS" } } }); </script> -->
  <script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
</head>


  <body>

    <!-- Target for toggling the sidebar `.sidebar-checkbox` is for regular
     styles, `#sidebar-checkbox` for behavior. -->
<input type="checkbox" class="sidebar-checkbox" id="sidebar-checkbox">
<!-- <input type="checkbox" class="sidebar-checkbox" id="sidebar-checkbox" > -->

<!-- Toggleable sidebar -->
<div class="sidebar" id="sidebar">
  <div class="sidebar-item">
    <p>For short-term, peer-sourced tests of time, generalizations, specializations, reproductions, etc.!</p>
  </div>

  <nav class="sidebar-nav">

    

    
    
      
        
          <a class="sidebar-nav-item" href="https://iclr.iro.umontreal.ca/9401c203-7224-4cb4-ba2d-32fe3f206bb2_1642233642/">ICLR 2022 Blog Track</a>
        
      
    
      
        
      
    
      
        
          <a class="sidebar-nav-item" href="https://iclr.iro.umontreal.ca/9401c203-7224-4cb4-ba2d-32fe3f206bb2_1642233642/about/">About</a>
        
      
    
      
    
      
        
      
    
      
        
          <a class="sidebar-nav-item" href="https://iclr.iro.umontreal.ca/9401c203-7224-4cb4-ba2d-32fe3f206bb2_1642233642/submitting/">Submitting</a>
        
      
    
      
        
          <a class="sidebar-nav-item" href="https://iclr.iro.umontreal.ca/9401c203-7224-4cb4-ba2d-32fe3f206bb2_1642233642/tags/">Tags</a>
        
      
    

    <a class="sidebar-nav-item" href="https://github.com/iclr-blog-track/iclr-blog-track.github.io">GitHub project</a>
    <span class="sidebar-nav-item">Currently vICLR Spring 2021</span>
  </nav>

  <div class="sidebar-item">
    <p>
      &copy; 2022. All rights reserved.
    </p>
  </div>
</div>


    <!-- Wrap is the content to shift when toggling the sidebar. We wrap the
         content to avoid any CSS collisions with our real content. -->
    <div class="wrap">
      <div class="masthead">
        <div class="container">
          <h3 class="masthead-title">
            <a href="/" title="Home">The ICLR Blog Track</a>
            <small></small>
          </h3>
        </div>
      </div>

      <div class="container content">
        <div class="post">
  <h1 id="iclr-post-title" class="post-title">Deep dive into CoCon - A Self Supervised approach for Controlled Text Generation</h1>
  <span class="post-date">01 Dec 2021 | 
    <a class="content-tag" href="/tags/#natural-language-processing"> Natural Language Processing </a>
  
    <a class="content-tag" href="/tags/#transformers"> Transformers </a>
  
    <a class="content-tag" href="/tags/#conditonal-language-modeling"> Conditonal Language Modeling </a>
  </span>

  <span id="iclr-post-authors" class="post-date">Anonymous</span>
  <ul class="table-of-content" id="markdown-toc">
  <li><a href="#abstract" id="markdown-toc-abstract">Abstract</a></li>
  <li><a href="#related-concepts" id="markdown-toc-related-concepts">Related concepts</a></li>
  <li><a href="#method-of-solving" id="markdown-toc-method-of-solving">Method of solving</a>    <ul>
      <li><a href="#problem-representation" id="markdown-toc-problem-representation">Problem representation</a></li>
      <li><a href="#model-architecture" id="markdown-toc-model-architecture">Model Architecture</a></li>
      <li><a href="#cocon-block-internal-operation" id="markdown-toc-cocon-block-internal-operation">CoCon block internal operation</a></li>
      <li><a href="#multiple-content-inputs" id="markdown-toc-multiple-content-inputs">Multiple content inputs</a></li>
      <li><a href="#content-conditioning" id="markdown-toc-content-conditioning">Content Conditioning</a></li>
      <li><a href="#model-training" id="markdown-toc-model-training">Model training</a></li>
    </ul>
  </li>
  <li><a href="#results" id="markdown-toc-results">Results</a>    <ul>
      <li><a href="#cocon-setup" id="markdown-toc-cocon-setup">CoCon Setup</a></li>
      <li><a href="#content-similarity" id="markdown-toc-content-similarity">Content Similarity.</a></li>
      <li><a href="#topic-relevance" id="markdown-toc-topic-relevance">Topic relevance.</a></li>
      <li><a href="#sentiment-control" id="markdown-toc-sentiment-control">Sentiment control</a></li>
      <li><a href="#extended-experiments" id="markdown-toc-extended-experiments">Extended experiments</a></li>
    </ul>
  </li>
  <li><a href="#conclusion" id="markdown-toc-conclusion">Conclusion</a></li>
  <li><a href="#references" id="markdown-toc-references">References</a></li>
</ul>

<h2 id="abstract">Abstract</h2>
<p>Transformer-based language models ([1] Vaswani et al., 2017) have spurred transfer learning in NLP and have improved the performance of several NLP tasks. The preliminary step involves pretraining a language model on a large amount of text from the web. Research on steering a pretrained language model to enable fine-grained control over the content and sentiment of its output is still under active exploration and has great potential in various applications such as story generation, search engines, etc. This blog post discusses a paper <a href="https://openreview.net/forum?id=VD_ozqvBy4W">(OpenReview link)</a> that proposes a content conditioner which, when trained auto-regressively alongside a large pretrained language model (like GPT-2), provides the capability to control text at a fine-grained level.</p>

<h2 id="related-concepts">Related concepts</h2>
<p>Some of the concepts which are required for understanding the paper involve language models, self-attention, expected value, and GAN loss.</p>

<h2 id="method-of-solving">Method of solving</h2>
<h3 id="problem-representation">Problem representation</h3>
<p>In text generation, given a prompt text \(x_{:t-1}\) of length \(t-1\), where \(x_i\) represents the token at \(i^{th}\) position and \(x_{:t-1}\) = {\(x_1 ... , x_{t-1}\)}, the probability distribution of the text that follows, \(x_t,...x_l\) of length \(l-t+1\) can be modeled auto-regressively:</p>

\[\begin{align}
p(x_t ... , x_l | x_1,...,x_{t-1}) &amp;= \prod_{i = t}^{l} p(x_i | x_1,..., x_{i-1})
\end{align}\]

<p>Controlled text generation, or text generation conditioned on a desired attribute, can be modeled by conditioning the previous density on the desired attribute, here denoted by c:</p>

\[\begin{align}
p(x_t ... , x_l | \mathbf{c}, x_1,...,x_{t-1}) &amp;= \prod_{i = t}^{l} p(x_i | \mathbf{c}, x_1,..., x_{i-1})
\end{align}\]

<p>where, <strong>\(c\)</strong> can be an attribute or a text sequence (content text) or a list of text sequences (list of content text).</p>

<h3 id="model-architecture">Model Architecture</h3>
<p>The research paper uses the GPT-2 medium architecture ([2] Radford et al., 2019) for controlled text generation. The figures below show the key changes made to a vanilla GPT-2 model so that it can use the CoCon block for conditional text generation:</p>

<p style="width: 100%;" class="center"><img src="/public/images/2021-12-01-Deep-dive-into-CoCon-text-generation/GPT-2.png" alt="GPT-2 without CoCon block*" /></p>
<p class="image-caption"><em>Figure 1. GPT-2 medium architecture without CoCon block with separated \(LM_{\alpha}\) and \(LM_{\beta}\)</em></p>

<p style="width: 100%;" class="center"><img src="/public/images/2021-12-01-Deep-dive-into-CoCon-text-generation/CoCon-1.png" alt="CoCon block*" /></p>
<p class="image-caption"><em>Figure 2. CoCon block sandwiched between \(LM_{\alpha}\) and \(LM_{\beta}\) of the GPT-2</em></p>

<p>here,</p>

<ul>
  <li><strong>\(LM_{\alpha}\)</strong> - The transformer block before CoCon - This acts as a feature extractor from the input embeddings and outputs an intermediate representation.</li>
</ul>

\[h_{:t-1} = LM_{\alpha}(x_{:t-1})\]

<ul>
  <li><strong>\(LM_{\beta}\)</strong>  -  This block takes in the intermediate representation at the breakpoint and outputs the logits \(o_t\), which can be used for generating the next token \(x_t\).</li>
</ul>

\[o_{t} = LM_{\beta}(h_{:t-1})\]

<p>As per figure 2, CoCon block gets two intermediate representation from \(LM_{\alpha}\) and then generates a new representation \(h_{t-1}^{'}\). \(LM_{\alpha}\) acts as a mid-breakpoint to control the next token logits after \(x_{:t-1}\).</p>

\[h^{'}_{t-1} = CoCon(h^{(c)}_{:l_c}, h_{:t-1}) \tag{1}\]

<p>The representation \(h^{'}_{t-1}\) is concatenated with the token representations prior to position \((t-1)\) and fed to \(LM_{\beta}\) to get the logits \(\widetilde{o}_t\). Applying the softmax operation to the logits \(\widetilde{o}_t\) gives the distribution from which the word token \(\widetilde{x}_t\) is obtained.</p>

\[\widetilde{o}_t = LM_{\beta}([h_{:t-2}, h^{'}_{t-1}]) \tag{2}\]

\[p_{\theta, \psi}(\widetilde{x}_t|c, x_{:t-1}) = Softmax(\widetilde{o}_t) \tag{3}\]

<p>Here, \(\theta\) represents the CoCon block parameters and \(\psi\) represents the LM parameters. The operations (1), (2) and (3) are then repeated to generate all the tokens \(x_i\) (where \(i &gt; t\)) and subsequently the sentence.</p>

<h3 id="cocon-block-internal-operation">CoCon block internal operation</h3>

<p>Steps for generating \(h^{'}_{t-1}\) from CoCon block:</p>

<ol>
  <li>
    <p>Generate Query \((Q)\), Key \((K)\) and Value \((V)\) vectors for representation \(h_{t-1}\).</p>

\[Q, K, V \in \mathbb{R}^{(t-1) \times d}\]
  </li>
  <li>
    <p>Generate Key \((K^c)\) and Value \((V^c)\) vectors for representation \(h_{l_c}^{c}\). <br /> (\(l_c\) is the length of content input <strong>c</strong>, d - embedding dimension).</p>

\[K^c, V^c \in \mathbb{R}^{(l_c) \times d}\]
  </li>
  <li>
    <p>Concatenate Key and Value vectors from the above steps.</p>

\[K^{'}= [K^c, K] \ \&amp; \ V^{'} = [V^c, V] \in \mathbb{R}^{(l_c + t - 1) \times d} \tag{4}\]
  </li>
  <li>
    <p>Create attention matrix \(A\). Feed it to a Feed-forward network to create \(h^{'}_{t-1}\).</p>

\[A = Softmax(QK^{'T})V^{'} \in \mathbb{R}^{(t - 1) \times d }\]

\[h^{'}_{t-1} = FF(A) \in \mathbb{R}^{(t - 1) \times d }\]
  </li>
</ol>

<h3 id="multiple-content-inputs">Multiple content inputs</h3>
<p>If there are \(n\) content inputs, Eq. (4) can be changed to:</p>

\[K^{'} = [K^{c^1} K^{c^2} ... K^{c^n}; K] \; \;   \; \; V^{'} = [V^{c^1} V^{c^2} ... V^{c^n}; V]\]

<p>and the flexibility of the CoCon enables the rest of the equation to be the same.</p>

<h3 id="content-conditioning">Content Conditioning</h3>
<p>Additionally, \(\tau_{content}\) can be used to vary the extent of content conditioning by biasing the attention weights \(W = QK^{'T}\). Making \(\tau_{content}\) more positive makes the generated text align more closely with the content input, while making it more negative keeps the output of the CoCon block closer to that of an unconditioned LM.</p>

<h3 id="model-training">Model training</h3>
<p>The CoCon block is trained using self-supervised learning with the output generated by the language model (LM which is used adjacent to the CoCon block). Given any text \(x\) of length \(l\), \(x = [x_1, x_2, .... x_{t-1}, x_{t}, .... , x_l]\), the sequence can be divided into two parts,</p>

\[x^a = \{x_1, ... x_{t-1}\}\]

\[x^b = \{x_{t}, ... , x_l\}\]

<p>Where, \(x = [x^a; x^b]\). In the real world, multiple sentences can follow \(x^a\). So without any information about \(x^b\), the probability of reconstructing \(x^b\) from \(x^a\) is very low. To incorporate conditional modeling and alleviate text reconstruction issues, the paper introduces four losses:</p>

<p><strong>Self reconstruction loss:</strong>
For reconstructing the original sentence \(x\), the CoCon block is provided with the intermediate representations of both \(x\) and the content \(c = x^b\).</p>

\[\mathbf{h_{:l}} = LM_{\alpha}(x_{:l}), \; \; \; \mathbf{h^{(c)}_{:l_c}} = LM_{\alpha}(x_{t:l})\]

<p>The CoCon block then, by utilizing the representation \(\mathbf{h^{(c)}_{:l_c}}\) generates the intermediate representation auto-regressively \(\forall i \geq t-1\). (Here all the representations after \(i\) will be masked out so that CoCon does not see the future terms in \(h_{:l}\))</p>

\[h^{'}_{i} = CoCon(h^{(c)}_{:l_c},  h_{:i}), \; \; \; \forall i \geq t-1\]

<p>Using (2) and (3), next tokens are generated and the corresponding word token is generated by applying softmax.</p>

\[\widetilde{o}_{i+1} = LM_{\beta}([h_{:t-2}, \; h^{'}_{t-1:i}]),  \; \; \; p_{\theta, \psi}(\widetilde{x}_{i+1}|c, x_{:i}) = Softmax(\widetilde{o}_{i+1}), \; \; \; \forall i \geq t-1\]

<p style="width: 100%;" class="center"><img src="/public/images/2021-12-01-Deep-dive-into-CoCon-text-generation/CoCon-self-reconstruction.png" alt="Self reconstruction loss with example text*" /></p>
<p class="image-caption"><em>Figure 3. Example representing Self-reconstruction loss</em></p>

<p>Now the training loss for self-reconstruction is the sum of the negative log-likelihoods \(\forall i \in [t, \ l]\), where the content input is the second part of the text sequence \(x\) and the training label is \(x^b\).</p>

\[{L_{self}} = - \sum_{i = t}^{l}log \ p_{\psi, \theta}(x_i | (c = x^b), \{x_1, ..., x_{i-1}\})\]

<p><strong>Null content loss</strong>
The main aim of this loss is to make the text generation as fluent as possible. This loss removes the hard dependency on the presence of content when generating text from the prompt (i.e. in the absence of any content) and makes CoCon generate text similar to that of an unconditioned LM.</p>

\[{L_{null}} = - \sum_{i = t}^{l}log \ p_{\psi, \theta}(x_i | (c = \emptyset), \{x_1, ..., x_{i-1}\})\]

<p style="width: 100%;" class="center"><img src="/public/images/2021-12-01-Deep-dive-into-CoCon-text-generation/Null-content-loss.png" alt="Null Content loss with example text*" /></p>
<p class="image-caption"><em>Figure 4. Example representing Null content loss. The content intermediate representations \(h_1^c,..h_{l_c}^c\) will be empty as no content has been provided.</em></p>

<p><strong>Cycle reconstruction loss</strong>
We can express the CoCon’s autoregressive generation as :</p>

\[y = f_{\theta, \psi}(c, p)\]

<p>where, \(c\) is the content, and \(p\) is the prompt text. To make the CoCon block more generalizable for text where both \(c\) and \(p\) are from
divergent sources, the paper uses two sentences \(x\) and \(x^{'}\) to create two pairs of \(c\) and \(p\) respectively. Splitting both the text sequence x and x’ :</p>

\[x = [x^{a}; x^{b}]\]

\[x^{'} = [x^{'a}; x^{'b}]\]

<p>Steps :</p>
<ul>
  <li>Generate text sequence \(y_{x, x^{'}}\) with content input \(c\) from \(x\) and prompt text from \(x^{'}\).</li>
</ul>

\[y_{x, x^{'}} = f_{\theta, \psi}((c = x^b), p = x^{'a})\]

<ul>
  <li>Next step involves using \(y_{x, x^{'}}\) as a content and \(x^{a}\) as the prompt text which generates \(y_{cycle}\).</li>
</ul>

\[y_{cycle} = f_{\theta, \psi}((c = y_{x, x^{'}}), p = x^{a})\]

<p>Now, \(x^b\) acts as a training label for the generated \(y_{cycle}\) and provides us the Cycle reconstruction loss for training the CoCon block.</p>

\[L_{cycle} = - \sum_{i = t}^{l}log \ p_{\psi, \theta}(y_{cycle} = x^{b} | (c = y_{x, x^{'}}), (p = x^a))\]

<p style="width: 100%;" class="center"><img src="/public/images/2021-12-01-Deep-dive-into-CoCon-text-generation/cocon-cycle-reconstruction.jpg" alt="cycle reconstruction loss with example text*" /></p>
<p class="image-caption"><em>Figure 5. Example representing CoCon cycle reconstruction step</em></p>

<p><strong>Adversarial loss</strong></p>

<p>To match the output texts’ representation \(LM_{\alpha}(y)\) with those of training samples \(LM_{\alpha}(x)\), the generator (here the LM with CoCon) is trained adversarially against a discriminator network \(f_{disc}\). The expression used follows the GAN loss from [3] (Goodfellow et al., 2014).</p>

\[L_{adv} = \mathbb{E}_{x}[log \ f_{disc}(LM_{\alpha}(x))] + \mathbb{E}_{y}[log \ (1 - f_{disc}(LM_{\alpha}(y)))] \tag{5}\]

<p>The \(f_{disc}\) is trained to maximize the above loss to distinguish the two representations better. This forces the Generator to output representation similar to training samples. The \(f_{disc}\) is parameterized by \(\phi\), so the training objective is:</p>

\[\phi^{*} = \underset{\phi}{arg \ max} \ L_{adv}\]

<p style="width: 100%;" class="center"><img src="/public/images/2021-12-01-Deep-dive-into-CoCon-text-generation/CoCon-gan-adv-loss.png" alt="Adversarial loss with example text*" /></p>
<p class="image-caption"><em>Figure 6. Adversarial loss training with CoCon</em></p>

<p>This part of the training acts as a method of strengthening the discriminator to distinguish between input and generated representations.</p>

<p><strong>Full training</strong></p>

<p>Full training of the CoCon block is done by minimizing all the four loss terms through stochastic gradient descent.</p>

\[\theta^{*} = \underset{\theta}{arg \ min} (\lambda_{self}L_{self} + \lambda_{null}L_{null} + \lambda_{cycle}L_{cycle} + \lambda_{adv}L_{adv})\]

<p>Here, the constant \(\lambda\) is used to weigh the losses. The loss \(L_{adv}\) acts as a part of adversarial training. It pushes the CoCon block to generate a similar intermediate representation for training input \(x\) and the fully generated sentence \(y\) by minimizing the loss as per Eq (5) and eventually makes it difficult for \(f_{disc}\) to distinguish.</p>

<h2 id="results">Results</h2>
<p>The experiments on CoCon generated text have been extensively compared against some of the related works on the conditional text generation [4] PPLM (Dathathri et.al 2019) and [6] CTRL (Keskar et.al 2019).</p>

<h3 id="cocon-setup">CoCon Setup</h3>
<p>The pretrained LM used for CoCon experiments is a GPT-2 medium architecture. The \(LM_{\alpha}\) consists of 7 transformer blocks and the remaining 17 blocks comprise \(LM_{\beta}\). The dimension size of the embeddings in the CoCon block is 1024 and mirrors that of the pretrained LM. The training samples are 30-BPE-long segments <a href="https://leimao.github.io/blog/Byte-Pair-Encoding/">(More info on BPE)</a>. The split point is sampled between the 8th and 12th BPE token: the tokens before it form \(x^a\) and the rest constitutes \(x^b\).</p>

<p>CoCon text generation is evaluated against these three features :</p>
<ol>
  <li>Content Similarity</li>
  <li>Topic Relevance</li>
  <li>Sentiment control</li>
</ol>

<h3 id="content-similarity">Content Similarity.</h3>
<p>This is used to validate the similarity in the text generated by CoCon against the provided content input <strong>c</strong>. Results on several ablations are mentioned in detail in appendix <strong>A.1</strong> of the paper.</p>

<p style="width: 100%;" class="center"><img src="/public/images/2021-12-01-Deep-dive-into-CoCon-text-generation/CoCon-2.png" alt="CoCon content similarity on different ablation*" /></p>
<p class="image-caption"><em>Figure 7. CoCon content similarity on different ablation</em></p>

<p>Summary :</p>
<ol>
  <li>CoCon conditional text generation fares better than vanilla GPT-2 LM.</li>
  <li>Ablated variants (e.g. without \(L_{null}\)) do seem to incorporate <strong>c</strong>’s content better than vanilla CoCon, at the cost of worse perplexity.</li>
  <li>Removing \(L_{adv}\) as an ablation does seem to improve the perplexity and human evaluation. Authors speculate this is due to the presence of non-LM loss type for adversarial training.</li>
</ol>

<h3 id="topic-relevance">Topic relevance.</h3>
<p>Topic relevance is evaluated by providing a single token topic word as content. This has been evaluated against PPLM, CTRL, PPLM-BSR (a stronger PPLM where 10 baseline PPLM were generated and the best is chosen based on topic/sentiment), and CoCon+.  CoCon+ has a GPT output on top of a content token fed into CoCon to investigate whether CoCon can simultaneously condition on a target topic and content of a text passage.</p>

<p style="width: 100%;" class="center"><img src="/public/images/2021-12-01-Deep-dive-into-CoCon-text-generation/CoCon-3.png" alt="CoCon topic relevance against other models*" /></p>
<p class="image-caption"><em>Figure 8. CoCon topic relevance against other models</em></p>

<p>Summary:</p>
<ol>
  <li>All the models do better than Vanilla GPT-2.</li>
  <li>CoCon outperforms other models in its localized topic generation.</li>
  <li>Larger variance in other models like PPLM and CTRL in topic relevance as they control high-level attributes (sentiment/topic).</li>
</ol>

<h3 id="sentiment-control">Sentiment control</h3>

<p>Content inputs used for steering sentiment : <br /> 
    1. Positive sentiment : <em>is perfect</em> <br />
    2. Negative sentiment : <em>is horrible</em></p>

<p>Using a classifier trained on the IMDB dataset [5] (Maas et.al, 2011), the results on sentiment classification for the generated sentences are validated. The results are compared against previous work :  (PPLM, CTRL, etc.)</p>

<p style="width: 100%;" class="center"><img src="/public/images/2021-12-01-Deep-dive-into-CoCon-text-generation/CoCon-4.png" alt="CoCon sentiment generation capability against other models*" /></p>
<p class="image-caption"><em>Figure 9. CoCon sentiment generation capability against other models</em></p>

<p>Summary:</p>
<ol>
  <li>All models fare better at steering sentiment than Vanilla GPT-2 models.</li>
  <li>CoCon fares better against other methods in sentiment steering with a slight decrease in perplexity in the process.</li>
</ol>

<h3 id="extended-experiments">Extended experiments</h3>
<p>Using some additional prompt text, controlled text generation using CoCon was produced and the results are as follows:</p>

<p style="width: 100%;" class="center"><img src="/public/images/2021-12-01-Deep-dive-into-CoCon-text-generation/CoCon-handmade-isperfect.png" alt="CoCon sentiment generation capability using &quot;is perfect&quot;*" /></p>
<p class="image-caption"><em>Figure 10. CoCon sentiment generation with additional prompt text and content string - “is perfect”</em></p>

<p style="width: 100%;" class="center"><img src="/public/images/2021-12-01-Deep-dive-into-CoCon-text-generation/CoCon-handmade-isnegative.png" alt="CoCon sentiment generation capability using &quot;is horrible&quot;*" /></p>
<p class="image-caption"><em>Figure 11. CoCon sentiment generation with additional prompt text and content string - “is horrible”</em></p>

<p style="width: 100%;" class="center"><img src="/public/images/2021-12-01-Deep-dive-into-CoCon-text-generation/CoCon-handmade-gpt2.png" alt="GPT-2 generated text*" /></p>
<p class="image-caption"><em>Figure 12. Text generation using vanilla GPT-2 Medium</em></p>

<p>The above text using CoCon block was generated by the code : <a href="https://github.com/alvinchangw/COCON_ICLR2021">Github-link</a> and GPT-2 output was generated from the website : <a href="https://transformer.huggingface.co/doc/gpt2-large">Write-with-transformers</a> by setting the model size to GPT-2 medium. The underlined text at the beginning of each sentence represents the prompt text provided as an input to the model.</p>

<p>Non sentiment based text generation :</p>

<p><strong>Content (c)</strong> : is male <br />
<strong>Generated text</strong> : <em><u>The Nurse was tired</u>, dusty, and surprised that her out-of-town secret wasn’t being kept from her. I imagine it took her some amount of contemplation for things to unravel the way they did.\n\nThe person who comes up with the nurse’s deal with the monster isn’t a doctor or even a psychoanalyst, but an unstable Chinese businessman. How does she get what she wants? The killer”,</em></p>

<p>Observations:</p>

<ol>
  <li>More than half of the generated text, conditioned to target a positive sentiment with “is perfect” content, has the sentiment phrase “is perfect” present in the generated sentence.</li>
  <li>Having “Nurse” as a token in prompt text generated a stereotypical sentence. Introducing “is male” as a content text did not improve in changing the gender stereotype in the generated sentence.</li>
  <li>The CoCon block was successfully able to incorporate the sentiments provided. However, the amount of coherence between the prompt text and the conditionally generated text was low.</li>
</ol>

<h2 id="conclusion">Conclusion</h2>

<p>This method introduces a transformer block that can be used alongside a large pretrained language model and can steer the text generation to a particular context. However, research on fine-grained conditional text generation that maintains coherence between prompt and provided content still needs a lot of exploration.</p>

<h2 id="references">References</h2>

<p>[1] Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez,
Łukasz Kaiser, and Illia Polosukhin. Attention is all you need. In Advances in neural information
processing systems, pp. 5998–6008, 2017.</p>

<p>[2] Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, and Ilya Sutskever. Language
models are unsupervised multitask learners. OpenAI Blog, 1(8):9, 2019</p>

<p>[3] Ian Goodfellow, Jean Pouget-Abadie, Mehdi Mirza, Bing Xu, David Warde-Farley, Sherjil Ozair,
Aaron Courville, and Yoshua Bengio. Generative adversarial nets. In Advances in neural information processing systems, pp. 2672–2680, 2014.</p>

<p>[4] Sumanth Dathathri, Andrea Madotto, Janice Lan, Jane Hung, Eric Frank, Piero Molino, Jason Yosinski, and Rosanne Liu. Plug and play language models: a simple approach to controlled text generation. arXiv preprint arXiv:1912.02164, 2019.</p>

<p>[5] Andrew L. Maas, Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts. (2011). Learning Word Vectors for Sentiment Analysis. The 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).</p>

<p>[6] Nitish Shirish Keskar, Bryan McCann, Lav R Varshney, Caiming Xiong, and Richard Socher.
Ctrl: A conditional transformer language model for controllable generation. arXiv preprint
arXiv:1909.05858, 2019.</p>

</div>

<div id="bibtex-container" class="related">
  For attribution in academic contexts, please cite this work as
  <pre id="bibtex-academic-attribution">

  </pre>

  BibTeX citation
  <pre id="bibtex-box">

  </pre>
</div>
<script>
  // Fills the two <pre> boxes in #bibtex-container with a plain-text
  // attribution and a BibTeX entry, both derived from the author line
  // (#iclr-post-authors) and the post title (#iclr-post-title).
  let authorsSpan = document.getElementById("iclr-post-authors");
  let authorsText = authorsSpan.textContent;
  // Authors are separated by ";" and each author's fields by ",".
  // NOTE(review): the variable names suggest the field order is
  // "last name, first name, institution" - confirm against the post front matter.
  let lnameFnameInstitution = authorsText.split(";");
  let lfiList = lnameFnameInstitution.map(lfi => lfi.split(",").map(item => item.trim()));
  // Join only the name fields that are present, so a single-field author
  // such as "Anonymous" is not rendered as "Anonymous, undefined".
  let bibtexLFI = lfiList.map(lfi => lfi.slice(0, 2).filter(Boolean).join(", ")).join(" and ");
  let academicLFI = lfiList.map(lfi => lfi[0]);
  {
    // Collapse the author list: "A, et al." for 3+, "A & B" for 2, "A" for 1.
    if(academicLFI.length > 2) academicLFI = academicLFI[0] + ", et al.";
    else if(academicLFI.length == 2) academicLFI = academicLFI[0] + " & " + academicLFI[1];
    else academicLFI = academicLFI[0];
  }

  let titleSpan = document.getElementById("iclr-post-title");
  let titleText = titleSpan.textContent.trim();
  // Citation key: <author field><year><first three title words>, lower-cased
  // with whitespace and punctuation removed. Falls back to the first author
  // field when the second one is missing, so the key never starts with "undefined".
  let bibtexTitleShorthand = ((lfiList[0][1] || lfiList[0][0])+
    "2022"+
    titleText.split(/\s+/).slice(0, 3).join("")
  ).replace(/\s+/g, "").replace(/[\p{P}$+<=>^`|~]/gu, '').toLowerCase().trim();

  // The entry's opening brace stays open until the final "}" line; closing it
  // right after the key would produce an invalid BibTeX entry.
  let bibtexTemplate = `
@inproceedings{${bibtexTitleShorthand},
  author = {${bibtexLFI}},
  title = {${titleText}},
  booktitle = {ICLR Blog Track},
  year = {2022},
  note = {${window.location.href}},
  url  = {${window.location.href}}
}
  `.trim();
  document.getElementById("bibtex-box").innerText = bibtexTemplate;

  let academicTemplate = `
${academicLFI}, "${titleText}", ICLR Blog Track, 2022.
`.trim();
  document.getElementById("bibtex-academic-attribution").innerText = academicTemplate;

</script>


<div class="related">
  <h2>Related posts</h2>
  <ul class="related-posts">
    
      <li>
        <h3>
          <!-- Date and tag links sit next to the title link, not inside it:
               an <a> element must not contain another <a>. -->
          <a href="/2021/09/01/sample-submission/">
            Sample Submission
          </a>
          <small>01 Sep 2021 |
            <a class="content-tag" href="/tags/#natural-language-processing"> Natural Language Processing </a>

            <a class="content-tag" href="/tags/#transformers"> Transformers </a>

            <a class="content-tag" href="/tags/#conditonal-language-modeling"> Conditonal Language Modeling </a>
          </small>
        </h3>
      </li>
    
      <li>
        <h3>
          <!-- Date and tag links sit next to the title link, not inside it:
               an <a> element must not contain another <a>. -->
          <a href="/2020/04/02/example-content/">
            Example content (Basic Markdown)
          </a>
          <small>02 Apr 2020 |
            <a class="content-tag" href="/tags/#natural-language-processing"> Natural Language Processing </a>

            <a class="content-tag" href="/tags/#transformers"> Transformers </a>

            <a class="content-tag" href="/tags/#conditonal-language-modeling"> Conditonal Language Modeling </a>
          </small>
        </h3>
      </li>
    
  </ul>
</div>


<script src="https://utteranc.es/client.js"
        repo="iclr-blog-track/iclr-blog-track.github.io"
        issue-term="pathname"
        label="utterance"
        theme="boxy-light"
        crossorigin="anonymous"
        >
</script>


      </div>
    </div>

    <label for="sidebar-checkbox" class="sidebar-toggle"></label>

    <script src='/public/js/script.js'></script>
  </body>
</html>
