/* Copyright (c) 2007, Google Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *     * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following disclaimer
 * in the documentation and/or other materials provided with the
 * distribution.
 *     * Neither the name of Google Inc. nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * ---
 *
 * Author: falmeida@google.com (Filipe Almeida)
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

#include "statemachine.h"
#include "jsparser.h"

/* So we can support both C and C++ compilers, we use the CAST() macro instead
 * of using C style casts or static_cast<>() directly.
 */
#ifdef __cplusplus
  #define CAST(type, expression) (static_cast<type>(expression))
#else
  #define CAST(type, expression) ((type)(expression))
#endif

#ifdef __cplusplus
namespace HTMLPARSER_NAMESPACE {
#endif /* __cplusplus */

/* Generated state machine definition. */
#include "jsparser_fsm.h"

/* List of keywords that can precede a regular expression literal. Taken from:
 * http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html
 *
 * 'void' was added to this list.
 * The list is used as input to a binary search, so it must be kept in a sorted
 * form.
 * There are a large number of keywords in here that don't exist in
 * ecmascript 3, either because they are reserved or because they are part of
 * ecmascript 4. However they weren't removed in order to keep the list in sync
 * with the previous document.
 */
static const char *regexp_token_prefix[] = {
  "abstract",
  "break",
  "case",
  "catch",
  "class",
  "const",
  "continue",
  "debugger",
  "default",
  "delete",
  "do",
  "else",
  "enum",
  "eval",
  "export",
  "extends",
  "field",
  "final",
  "finally",
  "for",
  "function",
  "goto",
  "if",
  "implements",
  "import",
  "in",
  "instanceof",
  "native",
  "new",
  "package",
  "private",
  "protected",
  "public",
  "return",
  "static",
  "switch",
  "synchronized",
  "throw",
  "throws",
  "transient",
  "try",
  "typeof",
  "var",
  "void",
  "volatile",
  "while",
  "with"
};

/* Utility functions */

/* Converts the internal state into the external superstate.
 */
static inline int state_external(int state)
{
  assert(state < JSPARSER_NUM_STATES);
  assert(state >= 0);
  return jsparser_states_external[state];
}

/* Returns true if the character is an ecmascript whitespace or line terminator
 * character. Includes the characters in sections 7.2 and 7.3 of ECMAScript 3
 * with the exception of unicode space and line terminators:
 * http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf
 */
static inline int js_is_whitespace(char c)
{
  return c == '\t' ||   /* Tab 0x09 */
         c == '\v' ||   /* Vertical Tab 0x0B */
         c == '\f' ||   /* Form Feed 0x0C */
         c == ' ' ||    /* Space 0x20 */
         c == '\xa0' || /* No-Break Space 0xA0 */
         c == '\n' ||   /* line Feed 0x0A */
         c == '\r';     /* Carriage Return 0x0D */
}

/* Returns true if the character is part of a javascript identifier. The rules
 * are pretty relaxed here since we don't accept unicode and don't care if the
 * first character doesn't contain numbers or not.
 *
 * For more detail on the limitations of having this relaxed set of characters
 * please see the comments in_state_js_text().
 */
static inline int js_is_identifier(char c) {
  return (c >= 'a' && c <= 'z') ||
         (c >= 'A' && c <= 'Z') ||
         (c >= '0' && c <= '9') ||
         c == '_' ||
         c == '$';
}

/* Appends a character to the ring buffer. Sequences of whitespace and newlines
 * are folded into one.
 *
 * js->buffer_start points to the first character in the buffer and
 * js->buffer_end points to the position of the next character to be written,
 * or one plus the last character written. If the buffer is full there will be
 * an empty slot position so we can diferentiate an empty buffer from a full
 * buffer.
 *
 * If the buffer is empty, then:
 *   js->buffer_start == js->buffer_end.
 * If the buffer is full, then:
 *   (js->buffer_end + 1) % JSPARSER_RING_BUFFER_SIZE == js->buffer_start.
 *
 */
void jsparser_buffer_append_chr(jsparser_ctx *js, char chr)
{
  /* Fold whitespace so we have enough space in the buffer. */
  if (js_is_whitespace(chr) &&
      js_is_whitespace(jsparser_buffer_get(js, -1))) {
    return;
  }

  js->buffer[js->buffer_end] = chr;
  js->buffer_end = (js->buffer_end + 1) % JSPARSER_RING_BUFFER_SIZE;
  if (js->buffer_end == js->buffer_start) {
    js->buffer_start = (js->buffer_end + 1) %
        JSPARSER_RING_BUFFER_SIZE;
  }
}

/* Appends a string to the ring buffer. Sequences of whitespace and newlines
 * are folded into one.
 */
void jsparser_buffer_append_str(jsparser_ctx *js, const char *str)
{
  assert(js != NULL);
  assert(str != NULL);

  while(*str != '\0') {
    jsparser_buffer_append_chr(js, *str++);
  }
}

/* Returns the position relative to the start of the buffer or -1 if past the
 * size of the buffer..
 */
static inline int jsparser_buffer_absolute_pos(jsparser_ctx *js, int pos)
{
  int absolute_pos;
  int buffer_len;
  assert(pos < 0);

  if(pos <= -JSPARSER_RING_BUFFER_SIZE) {
    return -1;
  }

  buffer_len = js->buffer_end - js->buffer_start;
  if (buffer_len < 0) {
    buffer_len += JSPARSER_RING_BUFFER_SIZE;
  }

  if (pos < -buffer_len) {
    return -1;
  }

  absolute_pos = (pos + js->buffer_end) % JSPARSER_RING_BUFFER_SIZE;
  if (absolute_pos < 0) {
    absolute_pos += JSPARSER_RING_BUFFER_SIZE;
  }

  return absolute_pos;
}

/* Returns the last appended character and removes it from the buffer. If the
 * buffer is empty, then it returns ASCII 0 ('\0').
 */
char jsparser_buffer_pop(jsparser_ctx *js)
{
  if (js->buffer_start == js->buffer_end) {
    return '\0';
  }

  js->buffer_end--;
  if (js->buffer_end < 0) {
    js->buffer_end += JSPARSER_RING_BUFFER_SIZE;
  }

  return js->buffer[js->buffer_end];
}

/* Returns the value of the character at a certain index in the buffer or an
 * ASCII 0 ('\0') character if the index is outside the buffer boundaries.
 *
 * Index positions are negative, were -1 is the last character appended to the
 * buffer. Using positive integers for the index will result in undefined
 * behaviour.
 */
char jsparser_buffer_get(jsparser_ctx *js, int pos)
{
  int absolute_pos;
  assert(pos < 0);

  absolute_pos = jsparser_buffer_absolute_pos(js, pos);
  if (absolute_pos < 0) {
    return '\0';
  }

  return js->buffer[absolute_pos];
}

/* Sets the value of the character at a certain index in the buffer. Returns
 * true if the write was successful or false if there was an attempt to write
 * outside of the buffer boundaries.
 *
 * Index positions are negative, were -1 is the last character appended to the
 * buffer. Using positive integers for the index will result in undefined
 * behaviour.
 */
int jsparser_buffer_set(jsparser_ctx *js, int pos, char value)
{
  int absolute_pos;
  assert(pos < 0);

  absolute_pos = jsparser_buffer_absolute_pos(js, pos);
  if (absolute_pos < 0) {
    return 0;
  }

  js->buffer[absolute_pos] = value;
  return 1;
}

/* Copies a slice of the buffer to the string pointed to by output. start and
 * end are the indexes of the sliced region. If start extends beyond the
 * beginning of the buffer, the slice will only contain character from the
 * beginning of the buffer.
 */
void jsparser_buffer_slice(jsparser_ctx *js, char *output, int start, int end)
{
  int pos;

  assert(start <= end);
  assert(start < 0);
  assert(end < 0);

  for (pos = start; pos <= end; ++pos) {
    char c;
    c = jsparser_buffer_get(js, pos);
    if (c != '\0') {
      *output++ = jsparser_buffer_get(js, pos);
    }
  }
  *output++ = '\0';
}

/* Copy the last javascript identifier or keyword found in the buffer to the
 * string pointed by identifier.
 *
 * For simplicity, we consider an identifier to be a sequence of alphanumeric
 * characters (as opposed to a digit followed by an alphanumeric character.
 *
 * Returns 0 if no identifier was matched, in which case the identifier
 * argument is replaced with an empty string, or non 0 if the identifier was
 * found.
 */
int jsparser_buffer_last_identifier(jsparser_ctx *js, char *identifier)
{
  int end;
  int pos;

  assert(identifier != NULL);

  end = -1;
  /* Ignore the optional whitespace delimiter */
  if (js_is_whitespace(jsparser_buffer_get(js, -1))) {
    --end;
  }

  /* Find the beginning of the identifier. This loop ends either when we find a
   * character that doesn't belong to an identifier, or when we find a '\0'
   * character, which means we reached the end of the buffer. */
  for(pos = end; js_is_identifier(jsparser_buffer_get(js, pos)); --pos) {
  }
  if (pos + 1 >= end) {
    identifier[0] = '\0';
    return 0;
  }

  jsparser_buffer_slice(js, identifier, pos + 1, end);
  return 1;
}

/* Callback used in bsearch() for comparing a string against an array of
 * strings.
 */
static int bsearch_strcmp(const void *a, const void *b)
{
  return strcmp(CAST(const char*, a), *CAST(const char * const *, b));
}

/* Returns true if the token argument can be a token prefix to a javascript
 * regular expression.
 *
 * The token argument is compared against a list of identifiers that can
 * precede a regular expression in the javascript grammar, and returns true if
 * the argument is found on that list.
 */
static inline int is_regexp_token_prefix(char *token)
{
  assert(token != NULL);

  return bsearch(token,
                 regexp_token_prefix,
                 sizeof(regexp_token_prefix) / sizeof(char *),
                 sizeof(char *), bsearch_strcmp) != NULL;
}

/* Called for every character in state text.
 *
 * We copy every character we find when we are in state text to the ring
 * buffer. This has the side effect of also pushing slash characters that are
 * part of comments into the buffer, although for parsing purposes these should
 * be treated as whitespace. This issue is addressed in
 * enter_state_js_comment_ml_after().
 */
static void in_state_js_text(statemachine_ctx *ctx, int start, char chr,
                             int end)
{
  jsparser_ctx *js = CAST(jsparser_ctx *, ctx->user);
  assert(js != NULL);

  jsparser_buffer_append_chr(js, chr);
}

/* This function is called every time we find a slash ('/') character in the
 * javascript text (except for slashes that close comments or regexp literals).
 *
 * Implements the logic to figure out if this slash character is a division
 * operator or if it opens a regular expression literal. This is heavily
 * inspired by the syntactic resynchronization for javascript 2.0:
 * http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html
 *
 * When we receive a '/', we look at the previous non space character to figure
 * out if it's the ending of a punctuator that can precede a regexp literal, in
 * which case we assume the current '/' is part of a regular expression literal
 * (or the opening of a javascript comment, but that part is dealt with in the
 * state machine). The exceptions to this are unary operators, so we look back
 * a second character to rule out '++' and '--'. Although it is not
 * straightforward to figure out if the binary operator is a postfix of the
 * previous expression or a prefix of the regular expression, we rule out the
 * later as it is an uncommon practice.
 *
 * If we ruled out the previous token to be a valid regexp preceding
 * punctuator, we extract the last identifier in the buffer and match against a
 * list of keywords that are known to precede expressions in the grammar. If we
 * get a match on any of these keywords, then we are opening a regular
 * expression, if not, then we have a division operator.
 *
 * Known cases that are accepted by the grammar but we handle differently,
 * although I don't believe there is a legitimate usage for those:
 *
 * Division of a regular expression:
 * var result = /test/ / 5;
 *
 * Prefix unary increment of a regular expression:
 * var result = ++/test/;
 *
 * Division of an object literal:
 * { a: 1 } /x/.exec('x');
 *
 * We only support ascii right now, so unicode characters in identifiers will
 * be treated as delimiters, effectively breaking the identifier name where
 * they appear, and this may cause issues in very specific situations. Namely,
 * if we have a unicode character in an identifier directly preceding a suffix
 * that matches one of the keywords in regexp_token_prefix[], if this
 * identifier precedes a / (slash) character:
 *
 * var x = test<unicode char>return / 5;
 *
 * We will interpret that slash as the start of a regular expression, when in
 * reality it is a division operator.
 */
static void enter_state_js_slash(statemachine_ctx *ctx, int start, char chr,
                                 int end)
{
  jsparser_ctx *js;
  char buffer[JSPARSER_RING_BUFFER_SIZE];
  int pos;

  assert(ctx != NULL);
  assert(ctx->user != NULL);

  js = CAST(jsparser_ctx *, ctx->user);
  assert(js != NULL);

  pos = -1;
  /* Consume the last whitespace. */
  if (js_is_whitespace(jsparser_buffer_get(js, pos))) {
    --pos;
  }

  switch (jsparser_buffer_get(js, pos)) {
    /* Ignore unary increment */
    case '+':
      if (jsparser_buffer_get(js, pos - 1) != '+') {
        ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH;
      }
      break;

    /* Ignore unary decrement */
    case '-':
      if (jsparser_buffer_get(js, pos - 1) != '-') {
        ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH;
      }
      break;

    /* List of punctuator endings except ), ], }, + and - */
    case '=':
    case '<':
    case '>':
    case '&':
    case '|':
    case '!':
    case '%':
    case '*':
    case '/':
    case ',':
    case ';':
    case '?':
    case ':':
    case '^':
    case '~':
    case '{':
    case '(':
    case '[':
    case '}':
    case '\0':
      ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH;
      break;

    default:
      if (jsparser_buffer_last_identifier(js, buffer) &&
          is_regexp_token_prefix(buffer)) {
        ctx->next_state = JSPARSER_STATE_INT_JS_REGEXP_SLASH;
      }
  }

  jsparser_buffer_append_chr(js, chr);
}

/* Called at the end of a javascript comment.
 *
 * When we open a comment, the initial '/' was inserted into the ring buffer,
 * but it is not a token and should be considered whitespace for parsing
 * purposes.
 *
 * When we first saw the '/' character, we didn't yet know if it was the
 * beginning of a comment, a division operator, or a regexp.
 *
 * In this function we just replace the inital '/' with a whitespace character,
 * unless we had a preceding whitespace character, in which case we just remove
 * the '/'. This is needed to ensure all spaces in the buffer are correctly
 * folded.
 */
static void enter_state_js_comment_after(statemachine_ctx *ctx, int start,
                                         char chr, int end)
{
  jsparser_ctx *js;

  assert(ctx != NULL);
  assert(ctx->user != NULL);

  js = CAST(jsparser_ctx *, ctx->user);

  if (js_is_whitespace(jsparser_buffer_get(js, -2))) {
    (void)jsparser_buffer_pop(js);
  } else {
    jsparser_buffer_set(js, -1, ' ');
  }
}

static statemachine_definition *create_statemachine_definition()
{
  statemachine_definition *def;
  def = statemachine_definition_new(JSPARSER_NUM_STATES);
  if (def == NULL)
    return NULL;

  /* TODO(falmeida): Check return value. */
  statemachine_definition_populate(def, jsparser_state_transitions,
                                   jsparser_states_internal_names);

  statemachine_in_state(def, JSPARSER_STATE_INT_JS_TEXT,
                        in_state_js_text);
  statemachine_enter_state(def, JSPARSER_STATE_INT_JS_SLASH,
                           enter_state_js_slash);
  statemachine_enter_state(def, JSPARSER_STATE_INT_JS_COMMENT_AFTER,
                           enter_state_js_comment_after);

  return def;
}


/* Initializes a new jsparser instance.
 *
 * Returns a pointer to the new instance or NULL if the initialization
 * fails.
 *
 * Initialization failure is fatal, and if this function fails it may not
 * deallocate all previsouly allocated memory.
 */

jsparser_ctx *jsparser_new()
{
    jsparser_ctx *js;

    js = CAST(jsparser_ctx *, calloc(1, sizeof(jsparser_ctx)));
    if (js == NULL)
      return NULL;

    js->statemachine_def = create_statemachine_definition();
    if (js->statemachine_def == NULL)
      return NULL;

    js->statemachine = statemachine_new(js->statemachine_def, js);
    if (js->statemachine == NULL)
      return NULL;

    jsparser_reset(js);

    return js;
}

/* Returns a pointer to a context which is a duplicate of the jsparser src.
 */
jsparser_ctx *jsparser_duplicate(jsparser_ctx *src)
{
  jsparser_ctx *dst;
  assert(src != NULL);

  dst = jsparser_new();
  if (dst == NULL)
    return NULL;

  jsparser_copy(dst, src);

  return dst;
}

/* Copies the context of the jsparser pointed to by src to the jsparser dst.
 *
 * The state machine definition is preserved since it is read only.
 */
void jsparser_copy(jsparser_ctx *dst, jsparser_ctx *src)
{

  dst->buffer_start = src->buffer_start;
  dst->buffer_end = src->buffer_end;
  memcpy(dst->buffer, src->buffer, sizeof(src->buffer));

  statemachine_copy(dst->statemachine,
                    src->statemachine,
                    dst->statemachine_def,
                    dst);

}

void jsparser_reset(jsparser_ctx *ctx)
{
  assert(ctx != NULL);
  ctx->statemachine->current_state = 0;
  ctx->buffer_start = 0;
  ctx->buffer_end = 0;
}

int jsparser_state(jsparser_ctx *ctx)
{
  return state_external(ctx->statemachine->current_state);
}

int jsparser_parse(jsparser_ctx *ctx, const char *str, int size)
{
    int internal_state;
    internal_state = statemachine_parse(ctx->statemachine, str, size);
    return state_external(internal_state);
}

void jsparser_delete(jsparser_ctx *ctx)
{
    assert(ctx != NULL);
    statemachine_delete(ctx->statemachine);
    statemachine_definition_delete(ctx->statemachine_def);
    free(ctx);
}

#ifdef __cplusplus
}  /* namespace security_streamhtmlparser */
#endif /* __cplusplus */
