1 files changed, 420 insertions, 0 deletions
diff --git a/include/inja/lexer.hpp b/include/inja/lexer.hpp
new file mode 100644
index 0000000..e31c3d6
--- /dev/null
+++ b/include/inja/lexer.hpp
@@ -0,0 +1,420 @@
+// Copyright (c) 2020 Pantor. All rights reserved.
+
+#ifndef INCLUDE_INJA_LEXER_HPP_
+#define INCLUDE_INJA_LEXER_HPP_
+
+#include <cctype>
+#include <locale>
+
+#include "config.hpp"
+#include "token.hpp"
+#include "utils.hpp"
+
+namespace inja {
+
+/*!
+ * \brief Class for lexing an inja Template.
+ */
+class Lexer {
+  enum class State { // Where scan() currently is inside the template; selects the branch of its switch.
+    Text, // Plain text; scan() searches for the next opening sequence.
+    ExpressionStart, // config.expression_open was matched; the next step emits ExpressionOpen.
+    ExpressionStartForceLstrip, // config.expression_open_force_lstrip was matched; emits ExpressionOpen.
+    ExpressionBody, // Inside an expression; tokens come from scan_body() until the expression close.
+    LineStart, // config.line_statement was matched at the start of a line; emits LineStatementOpen.
+    LineBody, // Inside a line statement; scan_body() closes it at "\n".
+    StatementStart, // config.statement_open was matched; emits StatementOpen.
+    StatementStartNoLstrip, // config.statement_open_no_lstrip was matched; emits StatementOpen.
+    StatementStartForceLstrip, // config.statement_open_force_lstrip was matched; emits StatementOpen.
+    StatementBody, // Inside a statement; tokens come from scan_body() until the statement close.
+    CommentStart, // config.comment_open was matched; emits CommentOpen.
+    CommentBody, // Inside a comment; scan() jumps ahead to config.comment_close.
+  };
+
+  enum class MinusState { // How scan_body() treats the next '-' character.
+    Operator, // '-' is returned as a Minus token (set after identifiers, digits and ')', ']', '}').
+    Number, // '-' begins a numeric literal and is handed to scan_number().
+  };
+
+  const LexerConfig &config; // Delimiters and strip options; held by reference, not copied.
+
+  State state; // Current state of the scan() state machine.
+  MinusState minus_state; // Whether the next '-' is an operator or the start of a number.
+  nonstd::string_view m_in; // Input being lexed; assigned by start().
+  size_t tok_start; // Offset in m_in where the token being built begins.
+  size_t pos; // Scan cursor in m_in; make_token() slices from tok_start to pos.
+
+
+  // Lex one token inside an expression, statement or line statement body, starting at tok_start.
+  // `close` ends the body and is returned as `closeKind`. A non-empty `close_trim` is tried first; when it
+  // matches, all following whitespace and newlines are skipped as well. With `trim` set, the plain `close`
+  // skips trailing spaces/tabs and one line break. Either close switches the lexer back to State::Text.
+  // Returns Eof if the input ends before a token starts; unrecognized characters are returned as Unknown.
+  Token scan_body(nonstd::string_view close, Token::Kind closeKind, nonstd::string_view close_trim = nonstd::string_view(), bool trim = false) {
+  again:
+    // skip whitespace (except for \n as it might be a close)
+    if (tok_start >= m_in.size()) {
+      return make_token(Token::Kind::Eof);
+    }
+    char ch = m_in[tok_start];
+    if (ch == ' ' || ch == '\t' || ch == '\r') {
+      tok_start += 1;
+      goto again;
+    }
+
+    // check for close: the force-rstrip variant first, then the plain closing sequence
+    if (!close_trim.empty() && inja::string_view::starts_with(m_in.substr(tok_start), close_trim)) {
+      state = State::Text;
+      pos = tok_start + close_trim.size();
+      Token tok = make_token(closeKind); // built before skipping, so the skipped whitespace is in no token
+      skip_whitespaces_and_newlines();
+      return tok;
+    }
+
+    if (inja::string_view::starts_with(m_in.substr(tok_start), close)) {
+      state = State::Text;
+      pos = tok_start + close.size();
+      Token tok = make_token(closeKind); // built before skipping, so the skipped whitespace is in no token
+      if (trim) {
+        skip_whitespaces_and_first_newline();
+      }
+      return tok;
+    }
+
+    // skip \n (only reached when the newline is not itself the closing sequence)
+    if (ch == '\n') {
+      tok_start += 1;
+      goto again;
+    }
+
+    pos = tok_start + 1;
+    // <cctype> classifiers are undefined for negative char values, so bytes >= 0x80 (e.g. UTF-8 text)
+    // must be converted to unsigned char before the call.
+    if (std::isalpha(static_cast<unsigned char>(ch))) {
+      minus_state = MinusState::Operator;
+      return scan_id();
+    }
+
+    // Default for what follows this token: '-' starts a number. The ')', ']', '}', digit and id cases override it.
+    MinusState current_minus_state = minus_state;
+    if (minus_state == MinusState::Operator) {
+      minus_state = MinusState::Number;
+    }
+
+    switch (ch) {
+    case '+':
+      return make_token(Token::Kind::Plus);
+    case '-':
+      if (current_minus_state == MinusState::Operator) {
+        return make_token(Token::Kind::Minus);
+      }
+      return scan_number();
+    case '*':
+      return make_token(Token::Kind::Times);
+    case '/':
+      return make_token(Token::Kind::Slash);
+    case '^':
+      return make_token(Token::Kind::Power);
+    case '%':
+      return make_token(Token::Kind::Percent);
+    case '.':
+      return make_token(Token::Kind::Dot);
+    case ',':
+      return make_token(Token::Kind::Comma);
+    case ':':
+      return make_token(Token::Kind::Colon);
+    case '(':
+      return make_token(Token::Kind::LeftParen);
+    case ')':
+      minus_state = MinusState::Operator;
+      return make_token(Token::Kind::RightParen);
+    case '[':
+      return make_token(Token::Kind::LeftBracket);
+    case ']':
+      minus_state = MinusState::Operator;
+      return make_token(Token::Kind::RightBracket);
+    case '{':
+      return make_token(Token::Kind::LeftBrace);
+    case '}':
+      minus_state = MinusState::Operator;
+      return make_token(Token::Kind::RightBrace);
+    case '>':
+      if (pos < m_in.size() && m_in[pos] == '=') {
+        pos += 1;
+        return make_token(Token::Kind::GreaterEqual);
+      }
+      return make_token(Token::Kind::GreaterThan);
+    case '<':
+      if (pos < m_in.size() && m_in[pos] == '=') {
+        pos += 1;
+        return make_token(Token::Kind::LessEqual);
+      }
+      return make_token(Token::Kind::LessThan);
+    case '=':
+      if (pos < m_in.size() && m_in[pos] == '=') {
+        pos += 1;
+        return make_token(Token::Kind::Equal);
+      }
+      return make_token(Token::Kind::Unknown); // a lone '=' is not a token
+    case '!':
+      if (pos < m_in.size() && m_in[pos] == '=') {
+        pos += 1;
+        return make_token(Token::Kind::NotEqual);
+      }
+      return make_token(Token::Kind::Unknown); // a lone '!' is not a token
+    case '\"':
+      return scan_string();
+    case '0': case '1': case '2': case '3': case '4':
+    case '5': case '6': case '7': case '8': case '9':
+      minus_state = MinusState::Operator;
+      return scan_number();
+    case '_':
+    case '@':
+    case '$':
+      minus_state = MinusState::Operator;
+      return scan_id();
+    default:
+      return make_token(Token::Kind::Unknown);
+    }
+  }
+
+  // Extend the current token over an identifier and return it as Id. Letters, digits, '.', '/', '_' and '-'
+  // are accepted, so dotted or slash-separated names lex as a single token.
+  Token scan_id() {
+    while (pos < m_in.size()) {
+      // std::isalnum is undefined for negative char values (bytes >= 0x80), so classify as unsigned char.
+      const char ch = m_in[pos];
+      if (!std::isalnum(static_cast<unsigned char>(ch)) && ch != '.' && ch != '/' && ch != '_' && ch != '-') {
+        break;
+      }
+      pos += 1;
+    }
+    return make_token(Token::Kind::Id);
+  }
+
+  // Extend the current token over a numeric literal and return it as Number.
+  // Deliberately permissive: any run of digits, '.', 'e', 'E', '+' and '-' is accepted here, and malformed
+  // numbers are expected to be caught later, when the text is converted.
+  Token scan_number() {
+    while (pos < m_in.size()) {
+      const char ch = m_in[pos];
+      // std::isdigit is undefined for negative char values (bytes >= 0x80), so classify as unsigned char.
+      if (!std::isdigit(static_cast<unsigned char>(ch)) && ch != '.' && ch != 'e' && ch != 'E' && ch != '+' && ch != '-') {
+        break;
+      }
+      pos += 1;
+    }
+    return make_token(Token::Kind::Number);
+  }
+
+  // Extend the current token over a quoted string (quote char = m_in[tok_start]) and return it as String,
+  // closing quote included. If the input ends first, the unterminated rest is still returned as String.
+  Token scan_string() {
+    bool escape {false};
+    while (pos < m_in.size()) {
+      const char ch = m_in[pos++];
+      if (ch == '\\') {
+        // Toggle, don't set: in "\\" the second backslash is itself escaped and must not escape the quote.
+        escape = !escape;
+      } else if (!escape && ch == m_in[tok_start]) {
+        break;
+      } else {
+        escape = false;
+      }
+    }
+    return make_token(Token::Kind::String);
+  }
+
+  Token make_token(Token::Kind kind) const { return Token(kind, string_view::slice(m_in, tok_start, pos)); } // Token of `kind` whose text is the slice of m_in delimited by tok_start and pos.
+
+  // Move pos forward over every consecutive ' ', '\t', '\n' and '\r'; stops at any other byte or at end of input.
+  void skip_whitespaces_and_newlines() {
+    for (; pos < m_in.size(); pos += 1) {
+      const char c = m_in[pos];
+      if (c != ' ' && c != '\t' && c != '\n' && c != '\r') { break; }
+    }
+  }
+
+  // Move pos forward over trailing blanks and at most one line break: first every consecutive ' ' and
+  // '\t', then a single "\n", "\r\n" or lone "\r" if one follows. Later line breaks are left untouched.
+  void skip_whitespaces_and_first_newline() {
+    const size_t end = m_in.size();
+    for (; pos < end; pos += 1) {
+      if (m_in[pos] != ' ' && m_in[pos] != '\t') { break; }
+    }
+    if (pos >= end) {
+      return; // input exhausted, no line break to consume
+    }
+
+    const bool is_cr = (m_in[pos] == '\r');
+    if (is_cr || m_in[pos] == '\n') {
+      pos += 1;
+    }
+    if (is_cr && pos < end && m_in[pos] == '\n') {
+      pos += 1; // the '\n' half of a "\r\n" pair
+    }
+  }
+
+  // If the last line of `text` consists solely of spaces/tabs, return `text` without that trailing run
+  // (the '\n' or '\r' that ends the previous line is kept; an all-blank text becomes empty).
+  // Otherwise the final line has real content and `text` is returned unchanged.
+  static nonstd::string_view clear_final_line_if_whitespace(nonstd::string_view text) {
+    size_t keep = text.size();
+    while (keep > 0 && (text[keep - 1] == ' ' || text[keep - 1] == '\t')) {
+      keep -= 1;
+    }
+    // A non-blank, non-newline character before the trailing blanks means the last line has content.
+    if (keep > 0 && text[keep - 1] != '\n' && text[keep - 1] != '\r') {
+      return text;
+    }
+    return text.substr(0, keep);
+  }
+
+public:
+  explicit Lexer(const LexerConfig &config) : config(config), state(State::Text), minus_state(MinusState::Number), tok_start(0), pos(0) {} // Binds `config` by reference (it must outlive the lexer); offsets are zeroed so scan()/current_position() are well-defined before start().
+
+  // Source location of tok_start (the start of the token currently being scanned) within the input,
+  // as computed by get_source_location().
+  SourceLocation current_position() const { return get_source_location(m_in, tok_start); }
+
+  // Begin lexing a new template: both state machines and both offsets return to their initial values,
+  // so one Lexer can be reused for several inputs.
+  void start(nonstd::string_view input) {
+    state = State::Text;
+    minus_state = MinusState::Number;
+    tok_start = 0;
+    pos = 0;
+
+    // A leading UTF-8 byte order mark (EF BB BF) is dropped so it never reaches the token stream.
+    const bool has_bom = inja::string_view::starts_with(input, "\xEF\xBB\xBF");
+    m_in = has_bom ? input.substr(3) : input;
+  }
+
+  Token scan() { // Returns the next token of the input, dispatching on `state`; Eof once the input is exhausted.
+    tok_start = pos; // the new token begins where the previous one ended
+
+  again:
+    if (tok_start >= m_in.size()) {
+      return make_token(Token::Kind::Eof);
+    }
+
+    switch (state) {
+    default:
+    case State::Text: {
+      // fast-scan to the first character that could begin an opening sequence (config.open_chars)
+      size_t open_start = m_in.substr(pos).find_first_of(config.open_chars);
+      if (open_start == nonstd::string_view::npos) {
+        // no candidate left: everything up to the end of the input is returned as one Text token
+        pos = m_in.size();
+        return make_token(Token::Kind::Text);
+      }
+      pos += open_start;
+
+      // try to match one of the opening sequences; the kind that matched is recorded in `state`
+      nonstd::string_view open_str = m_in.substr(pos);
+      bool must_lstrip = false; // true when blanks on the final line of the preceding text must be dropped
+      if (inja::string_view::starts_with(open_str, config.expression_open)) {
+        if (inja::string_view::starts_with(open_str, config.expression_open_force_lstrip)) { // NOTE(review): nested, so the force-lstrip opener presumably extends expression_open — confirm in LexerConfig
+          state = State::ExpressionStartForceLstrip;
+          must_lstrip = true;
+        } else {
+          state = State::ExpressionStart;
+        }
+      } else if (inja::string_view::starts_with(open_str, config.statement_open)) {
+        if (inja::string_view::starts_with(open_str, config.statement_open_no_lstrip)) {
+          state = State::StatementStartNoLstrip; // must_lstrip stays false even if config.lstrip_blocks is set
+        } else if (inja::string_view::starts_with(open_str, config.statement_open_force_lstrip )) {
+          state = State::StatementStartForceLstrip;
+          must_lstrip = true;
+        } else {
+          state = State::StatementStart;
+          must_lstrip = config.lstrip_blocks;
+        }
+      } else if (inja::string_view::starts_with(open_str, config.comment_open)) {
+        state = State::CommentStart;
+        must_lstrip = config.lstrip_blocks;
+      } else if ((pos == 0 || m_in[pos - 1] == '\n') && inja::string_view::starts_with(open_str, config.line_statement)) { // line statements only count at the start of the input or right after '\n'
+        state = State::LineStart;
+      } else {
+        pos += 1; // wasn't actually an opening sequence; resume the search one character further
+        goto again;
+      }
+
+      nonstd::string_view text = string_view::slice(m_in, tok_start, pos); // text preceding the opening sequence
+      if (must_lstrip) {
+        text = clear_final_line_if_whitespace(text);
+      }
+
+      if (text.empty()) {
+        goto again; // don't generate empty token; `state` is already a *Start state, so the open token comes next
+      }
+      return Token(Token::Kind::Text, text);
+    }
+    case State::ExpressionStart: { // consume the opener and switch to the expression body
+      state = State::ExpressionBody;
+      pos += config.expression_open.size();
+      return make_token(Token::Kind::ExpressionOpen);
+    }
+    case State::ExpressionStartForceLstrip: { // same token kind, longer opener
+      state = State::ExpressionBody;
+      pos += config.expression_open_force_lstrip.size();
+      return make_token(Token::Kind::ExpressionOpen);
+    }
+    case State::LineStart: { // consume the line-statement marker
+      state = State::LineBody;
+      pos += config.line_statement.size();
+      return make_token(Token::Kind::LineStatementOpen);
+    }
+    case State::StatementStart: { // consume the opener and switch to the statement body
+      state = State::StatementBody;
+      pos += config.statement_open.size();
+      return make_token(Token::Kind::StatementOpen);
+    }
+    case State::StatementStartNoLstrip: { // same token kind, opener of the no-lstrip variant
+      state = State::StatementBody;
+      pos += config.statement_open_no_lstrip.size();
+      return make_token(Token::Kind::StatementOpen);
+    }
+    case State::StatementStartForceLstrip: { // same token kind, opener of the force-lstrip variant
+      state = State::StatementBody;
+      pos += config.statement_open_force_lstrip.size();
+      return make_token(Token::Kind::StatementOpen);
+    }
+    case State::CommentStart: { // consume the comment opener
+      state = State::CommentBody;
+      pos += config.comment_open.size();
+      return make_token(Token::Kind::CommentOpen);
+    }
+    case State::ExpressionBody:
+      return scan_body(config.expression_close, Token::Kind::ExpressionClose, config.expression_close_force_rstrip);
+    case State::LineBody:
+      return scan_body("\n", Token::Kind::LineStatementClose); // a line statement is closed by the newline itself
+    case State::StatementBody:
+      return scan_body(config.statement_close, Token::Kind::StatementClose, config.statement_close_force_rstrip, config.trim_blocks);
+    case State::CommentBody: {
+      // fast-scan to comment close; the comment text itself is never tokenized
+      size_t end = m_in.substr(pos).find(config.comment_close);
+      if (end == nonstd::string_view::npos) {
+        pos = m_in.size(); // unterminated comment: consume the rest of the input
+        return make_token(Token::Kind::Eof);
+      }
+      // return the entire comment (body plus closing sequence) in the close token
+      state = State::Text;
+      pos += end + config.comment_close.size();
+      Token tok = make_token(Token::Kind::CommentClose); // built before trimming, so trimmed whitespace is in no token
+      if (config.trim_blocks) {
+        skip_whitespaces_and_first_newline();
+      }
+      return tok;
+    }
+    }
+  }
+
+  // Read-only access to the LexerConfig this lexer was constructed with.
+  // The lexer does not copy the configuration; it hands back the reference it holds.
+  const LexerConfig &get_config() const { return config; }
+};
+
+} // namespace inja
+
+#endif // INCLUDE_INJA_LEXER_HPP_