<!doctype html>
<html lang="en">
<head>
  <!-- The doctype selects standards mode; the charset is declared first so it is seen before any text. -->
  <meta charset="utf-8">
  <title>VW Validator</title>

  <style>
    /* Page-wide typography and background colour. */
    body {
      background-color: #e6e987;
      font-family: "Helvetica Neue","Helvetica","Arial",sans-serif;
    }

    /* Paragraphs carry no default spacing; spacing is set inline where needed. */
    p {
      margin: 0;
      padding: 0;
    }

    /* The input panel: a padded box with a thin white outline. */
    #vwForm {
      width: 65%;
      padding: 1em;
      border: 1px solid white;
    }

    /* Class placed on every feedback paragraph; intentionally has no declarations. */
    .vwFeedbackMsg {
    }

    /* Pieces of the pasted data that are quoted back inside a feedback message. */
    .vwFeedbackMsgExample {
      font-family: monospace;
      font-style: normal;
      font-weight: bold;
    }
  </style>

  <!-- jQuery is fetched over HTTPS so it is not blocked as mixed content when this page is served securely. -->
  <script src="https://code.jquery.com/jquery-1.8.0.min.js"></script>
  <script type="text/javascript">
  
  // True until the first message of a validation run has been written:
  // addFeedback() emits the heading and clears this flag, resetFeedback() sets it again.
  var FIRST_FEEDBACK = true;
  
  /**
   * Starts a fresh validation run: re-arms the "first message" flag and
   * empties the #vwFeedback element.
   */
  function resetFeedback() {
    FIRST_FEEDBACK = true;
    var feedbackEl = $("#vwFeedback");
    feedbackEl.html("");
  }
  
  /**
   * Adds one message paragraph to the #vwFeedback element.
   *
   * On the first call after resetFeedback() the element's content is replaced
   * by the "Validation Feedback" heading before the message is added.
   * A message whose text ends in "!" is rendered in bold.
   *
   * @param {string} msgHtml HTML of the message; inserted as-is, without escaping.
   * @param {(string|number)} [margin=0] CSS margin-left value for the paragraph.
   */
  function addFeedback(msgHtml, margin) {
    // Several callers omit the margin; without a default the inline style
    // would read "margin-left: undefined;", which is not valid CSS.
    if (margin === undefined || margin === null) {
      margin = 0;
    }

    var feedbackEl = $("#vwFeedback");
    if (FIRST_FEEDBACK) {
      feedbackEl.html("<h4>Validation Feedback</h4>");
      FIRST_FEEDBACK = false;
    }

    var bodyHtml = /!$/.test(msgHtml) ? "<b>" + msgHtml + "</b>" : msgHtml;

    // Append the new paragraph instead of reading back and re-writing the whole
    // container, which re-parsed every earlier message on each call.
    feedbackEl.append("<p class=\"vwFeedbackMsg\" style=\"margin-left: " + margin + ";\">" + bodyHtml + "</p>");
  }
  
  /**
   * Reports a message about a whole example, prefixed with its 1-based number
   * and shown without indentation.
   *
   * @param {number} exIndex 0-based index of the example.
   * @param {string} msg HTML of the message.
   */
  function addExampleFeedback(exIndex, msg) {
    var exampleNumber = exIndex + 1;
    var locationHtml = "<small>(example #" + exampleNumber + ")</small> ";
    addFeedback(locationHtml + msg, 0);
  }

  /**
   * Reports a message about one namespace of an example, prefixed with the
   * 1-based example and namespace numbers and indented by 2.5em.
   *
   * @param {number} exIndex 0-based index of the example.
   * @param {number} namespaceIndex 0-based index of the namespace within the example.
   * @param {string} msg HTML of the message.
   */
  function addExampleNamespaceFeedback(exIndex, namespaceIndex, msg) {
    var exampleNumber = exIndex + 1;
    var namespaceNumber = namespaceIndex + 1;
    var locationHtml = "<small>(example #" + exampleNumber + ", namespace #" + namespaceNumber + ")</small> ";
    addFeedback(locationHtml + msg, "2.5em");
  }
  /**
   * Reports a message about one feature of a namespace, prefixed with the
   * 1-based example, namespace and feature numbers and indented by 5em.
   *
   * @param {number} exIndex 0-based index of the example.
   * @param {number} namespaceIndex 0-based index of the namespace within the example.
   * @param {number} featIndex 0-based index of the feature within the namespace.
   * @param {string} msg HTML of the message.
   */
  function addExampleNamespaceFeatureFeedback(exIndex, namespaceIndex, featIndex, msg) {
    var exampleNumber = exIndex + 1;
    var namespaceNumber = namespaceIndex + 1;
    var featureNumber = featIndex + 1;
    var locationHtml = "<small>(example #" + exampleNumber + ", namespace #" + namespaceNumber + ", feature #" + featureNumber + ")</small> ";
    addFeedback(locationHtml + msg, "5em");
  }

  /**
   * Produces an HTML-safe, display-sized version of a piece of pasted text.
   *
   * Strings shorter than 120 characters are kept whole; longer ones are reduced
   * to their first and last 16 characters joined by an ellipsis. The text parts
   * are HTML-escaped because every caller inserts the result into markup, so a
   * "<" or "&" in the pasted data must show up literally rather than be parsed.
   *
   * @param {string} s Raw text to summarize.
   * @return {string} HTML fragment safe to concatenate into a feedback message.
   */
  function summarizeString(s) {
    // Escapes the five characters that are significant in HTML text and attributes.
    function escapeHtml(text) {
      return text
        .replace(/&/g, "&amp;")
        .replace(/</g, "&lt;")
        .replace(/>/g, "&gt;")
        .replace(/"/g, "&quot;")
        .replace(/'/g, "&#39;");
    }

    if (s.length < 120) {
      return escapeHtml(s);
    }
    // Truncate first, then escape, so an entity is never cut in half.
    var head = s.substring(0, 16);
    var tail = s.substring(s.length - 16);
    return escapeHtml(head) + "&hellip; " + escapeHtml(tail);
  }
  
  /**
   * Tests whether a string is an optionally signed run of decimal digits.
   *
   * @param {string} s Text to test.
   * @return {?Array} The regular-expression match (truthy) when s is an
   *     integer, otherwise null.
   */
  function isInteger(s) {
    var integerPattern = /^(\+|-)?\d+$/;
    var found = s.match(integerPattern);
    return found;
  }
  
  /**
   * Tests whether a string is a decimal floating-point literal.
   *
   * Accepts an optional sign, then either digits with an optional fractional
   * part ("3", "3.", "3.28") or a bare fraction (".23"), optionally followed by
   * an exponent ("1e-3"). The earlier pattern required digits on both sides of
   * the dot and had no exponent, so it flagged values such as ".5" as errors.
   *
   * @param {string} s Text to test.
   * @return {?Array} The regular-expression match (truthy) when s is a float,
   *     otherwise null.
   */
  function isFloat(s) {
    return s.match(/^[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$/);
  }
  
  /**
   * Splits the text that precedes the first pipe of an example into its label,
   * importance, base and tag parts.
   *
   * The known prefix layouts are tried in order, most specific first; the first
   * pattern that matches wins. A prefix that matches none of them is treated as
   * ambiguous and split on single spaces into label, importance and tag.
   *
   * @param {string} vwExPrefix Text of the example up to, not including, the first "|".
   * @return {{label: ?string, importance: ?string, base: ?string, tag: ?string,
   *     isLegacyTag: boolean, description: string}} The parts found (null when
   *     absent) and the HTML feedback message describing the detected layout.
   */
  function parseVwExamplePrefix(vwExPrefix) {
    // `fields` names the capture groups of `pattern`, in order. `isLegacyTag`
    // marks the layouts where the tag touches the pipe without a leading quote.
    var prefixFormats = [
      { pattern: /^([^' ]+) ([^' ]+) ([^' ]+) '([^' ]+) $/,
        shape: "&lt;label&gt; &lt;importance&gt; [base] ['tag] |&hellip;",
        fields: ["label", "importance", "base", "tag"],
        isLegacyTag: false },
      { pattern: /^([^' ]+) ([^' ]+) ([^' ]+) ([^' ]+)$/,
        shape: "&lt;label&gt; &lt;importance&gt; [base] [tag]|&hellip;",
        fields: ["label", "importance", "base", "tag"],
        isLegacyTag: true },
      { pattern: /^([^' ]+) ([^' ]+) '([^' ]+) $/,
        shape: "&lt;label&gt; [importance] ['tag] |&hellip;",
        fields: ["label", "importance", "tag"],
        isLegacyTag: false },
      { pattern: /^([^' ]+) ([^' ]+) ([^' ]+)$/,
        shape: "&lt;label&gt; [importance] [tag]|&hellip;",
        fields: ["label", "importance", "tag"],
        isLegacyTag: true },
      { pattern: /^([^' ]+) '([^' ]+) $/,
        shape: "[label] ['tag] |&hellip;",
        fields: ["label", "tag"],
        isLegacyTag: false },
      // Unquoted tag, like the two other "[tag]|" layouts above, so it is
      // flagged as legacy too (this flag used to be missing for this layout).
      { pattern: /^([^' ]+) ([^' ]+)$/,
        shape: "[label] [tag]|&hellip;",
        fields: ["label", "tag"],
        isLegacyTag: true },
      { pattern: /^([^' ]+) $/,
        shape: "[label] |&hellip;",
        fields: ["label"],
        isLegacyTag: false }
    ];

    var parsed = {
      label: null,
      importance: null,
      base: null,
      tag: null,
      isLegacyTag: false,
      description: null
    };

    for (var f = 0; f < prefixFormats.length; f++) {
      var format = prefixFormats[f];
      var formatMatch = format.pattern.exec(vwExPrefix);
      if (formatMatch == null) {
        continue;
      }
      for (var g = 0; g < format.fields.length; g++) {
        parsed[format.fields[g]] = formatMatch[g + 1];
      }
      parsed.isLegacyTag = format.isLegacyTag;
      parsed.description = "Found &ldquo;" + format.shape + "&rdquo; prefix format.";
      return parsed;
    }

    // No known layout matched: fall back to a positional guess.
    // $.trim is used (as elsewhere in this file) rather than String.prototype.trim.
    var ambPrefixEls = $.trim(vwExPrefix).split(" ");
    parsed.description = "Found &ldquo;<span class=\"vwFeedbackMsgExample\">" + summarizeString(vwExPrefix) + "</span>&rdquo; ambiguous prefix with " + ambPrefixEls.length + " elements.";
    parsed.label = ambPrefixEls[0];
    if (ambPrefixEls.length > 1) {
      parsed.importance = ambPrefixEls[1];
    }
    if (ambPrefixEls.length > 2) {
      // Drop the leading quote of a 'tag, if there is one.
      parsed.tag = ambPrefixEls[2].replace(/^'/, "");
    }
    return parsed;
  }

  /**
   * Validates the text in the #vwData textarea as Vowpal Wabbit training data
   * and writes a line-by-line report through the feedback helpers.
   *
   * Each line is one example. For every non-blank example the function reports
   * the prefix (label, importance, base, tag) found before the first "|", then
   * every namespace (split on " |") with its name, scale and features.
   * Takes no arguments and returns nothing; it is the Validate button's handler.
   */
  function validateVowpalWabbit() {
    // Wraps a value in the curly quotes and highlighting span used by all messages.
    function quoted(valueHtml) {
      return "&ldquo;<span class=\"vwFeedbackMsgExample\">" + valueHtml + "</span>&rdquo;";
    }

    resetFeedback();
    var vwData = $("#vwData").val();

    if (/\r\n/.test(vwData)) {
      addFeedback("Microsoft Windows -style newlines detected, please replace with Unix style (\\r\\n to just \\n)!", 0);
    }
    // Only a carriage return that is NOT part of \r\n counts here, so Windows
    // newlines are not reported a second time by this check.
    if (/\r(?!\n)/.test(vwData)) {
      addFeedback("Carriage returns detected, please replace with Unix style newlines (\\r to just \\n)!", 0);
    }

    var vwExamples = vwData.split(/\n/);
    addFeedback("Total of " + vwExamples.length + " examples pasted.", 0);
    addFeedback("&nbsp;", 0);

    for (var i = 0; i < vwExamples.length; i++) {
      var vwExample = vwExamples[i];
      if ($.trim(vwExample) == "") {
        addExampleFeedback(i, "Empty example, probably a dangling newline!");
        continue;
      }
      addExampleFeedback(i, "Example " + quoted(summarizeString(vwExample)) + ".");

      // indexOf() signals "not found" with -1 (never null), so test for that.
      var vwExPipeIdx = vwExample.indexOf("|");
      if (vwExPipeIdx < 0) {
        addExampleFeedback(i, "Missing pipe ('|') separator in example.");
        continue;
      }
      var vwExPrefix = vwExample.substring(0, vwExPipeIdx);
      var vwExNamespaces = vwExample.substring(vwExPipeIdx + 1);

      // ---- Prefix: label, importance, base, tag ----
      var prefix = parseVwExamplePrefix(vwExPrefix);
      addExampleFeedback(i, prefix.description);

      addExampleFeedback(i, "Example label / response / class is " + quoted(summarizeString(prefix.label)) + ".");

      if (prefix.importance == null) {
        addExampleFeedback(i, "Example has default " + quoted("1.0") + " importance weight.");
      }
      else if (isFloat(prefix.importance)) {
        addExampleFeedback(i, "Importance weight is " + quoted(summarizeString(prefix.importance)) + ".");
      }
      else {
        addExampleFeedback(i, "Importance weight " + quoted(summarizeString(prefix.importance)) + " must be a float!");
      }

      if (prefix.base == null) {
        addExampleFeedback(i, "Example has default " + quoted("0") + " base.");
      }
      else if (isFloat(prefix.base)) {
        addExampleFeedback(i, "Base is " + quoted(summarizeString(prefix.base)) + ".");
      }
      else {
        addExampleFeedback(i, "Base " + quoted(summarizeString(prefix.base)) + " must be a float!");
      }

      if (prefix.tag != null) {
        if (prefix.isLegacyTag) {
          addExampleFeedback(i, "Tag is " + quoted(summarizeString(prefix.tag)) + " in the legacy format, should be preceded by a single quote!");
        }
        else {
          addExampleFeedback(i, "Tag is " + quoted(summarizeString(prefix.tag)) + ".");
        }
      }

      // ---- Namespaces and their features ----
      var vwExNamespaceEls = vwExNamespaces.split(" |");
      var isDefaultUsed = false;
      for (var j = 0; j < vwExNamespaceEls.length; j++) {
        // Group 1 is the "name[:scale]" header (empty for the default
        // namespace); group 2 is the space-separated feature list.
        var namespaceMatch = /^(\S*) (.*)/.exec(vwExNamespaceEls[j]);
        if (namespaceMatch == null) {
          addExampleNamespaceFeedback(i, j, "Badly formed namespace.");
          continue;
        }

        var vwExNamespaceHeader = namespaceMatch[1];
        var vwExNamespaceFeatures = namespaceMatch[2].split(" ");

        if (vwExNamespaceHeader != "") {
          var headerEls = vwExNamespaceHeader.split(":");
          var vwExNamespaceName = headerEls[0];
          var vwExNamespaceScale = headerEls.length > 1 ? headerEls[1] : null;

          addExampleNamespaceFeedback(i, j, "Namespace found with name " + quoted(summarizeString(vwExNamespaceName)) + ".");
          if (vwExNamespaceScale == null) {
            addExampleNamespaceFeedback(i, j, "Using default scale of " + quoted("1.0") + " for namespace.");
          }
          else {
            addExampleNamespaceFeedback(i, j, "Scale of namespace is " + quoted(summarizeString(vwExNamespaceScale)) + ".");
          }
        }
        else if (isDefaultUsed) {
          addExampleNamespaceFeedback(i, j, "Default namespace used more than once!");
        }
        else {
          addExampleNamespaceFeedback(i, j, "Using default namespace.");
          isDefaultUsed = true;
        }

        addExampleNamespaceFeedback(i, j, "Found " + vwExNamespaceFeatures.length + " feature(s).");
        for (var k = 0; k < vwExNamespaceFeatures.length; k++) {
          // A feature is "label" or "label:value".
          var vwFeature = vwExNamespaceFeatures[k].split(":");
          addExampleNamespaceFeatureFeedback(i, j, k, "Label " + quoted(summarizeString(vwFeature[0])) + ".");
          if (isInteger(vwFeature[0])) {
            addExampleNamespaceFeatureFeedback(i, j, k, "Label is an integer, and will be used instead of the feature label hashing.");
          }

          switch (vwFeature.length) {
          case 1:
            addExampleNamespaceFeatureFeedback(i, j, k, "Using default value of " + quoted("1") + " for feature.");
            break;
          case 2:
            addExampleNamespaceFeatureFeedback(i, j, k, "Value " + quoted(summarizeString(vwFeature[1])) + ".");
            break;
          default:
            addExampleNamespaceFeatureFeedback(i, j, k, "Badly formatted feature, too many colon : sections!");
          }
        }
      }

      // Blank spacer line between the reports of consecutive examples.
      addFeedback("&nbsp;", 0);
    }
  }
  
  </script>

</head>
<body>

  <!-- Input panel: introduction, the training-data textarea (pre-filled with sample examples) and the Validate button. -->
  <div id="vwForm">
    <h3>Vowpal Wabbit Data Format Validation</h3>
    <p><small>by <a href="http://blog.someben.com/">Ben Gimpert</a></small>  </p>
    
    <p style="width: 80%; margin-top: 1em;">Not sure if your training data is in the right format for Vowpal Wabbit? Not seeing the results you expect from Vowpal Wabbit, in general? Paste a few lines of your training data into this text box, and then hit the &ldquo;Validate&rdquo; button below. Note that the only unambiguous prefixes for training examples have exactly one or four elements.</p>
    
    <textarea id="vwData" aria-label="Vowpal Wabbit training data" style="width: 80%; height: 12em; font-family: monospace; margin-top: 1.5em; margin-bottom: 1.5em;">1 1.0 |MetricFeatures:3.28 height:1.5 length:2.0 |Says black with white stripes |OtherFeatures NumberOfLegs:4.0 HasStripes
1 1.0 zebra|MetricFeatures:3.28 height:1.5 length:2.0 |Says black with white stripes |OtherFeatures NumberOfLegs:4.0 HasStripes
1 2 'tag| a:2 b:3
1 2 'tag | a:2 b:3
1 2 tag | a:2 b:3
0 | price:.23 sqft:.25 age:.05 2006
1 2 'second_house | price:.18 sqft:.15 age:.35 1976
0 1 0.5 'third_house | price:.53 sqft:.32 age:.87 1924</textarea><br>
    <button type="button" onclick="validateVowpalWabbit();">Validate</button>
  </div>
  
  <!-- Results area: emptied by resetFeedback() and filled with message paragraphs by addFeedback(). -->
  <div id="vwFeedback" style="width: 80%; margin-left: 0.75em;">
  </div>
  
</body>
</html>

