diff options
Diffstat (limited to 'include/inja/lexer.hpp')
-rw-r--r-- | include/inja/lexer.hpp | 420 |
1 files changed, 420 insertions, 0 deletions
diff --git a/include/inja/lexer.hpp b/include/inja/lexer.hpp new file mode 100644 index 0000000..e31c3d6 --- /dev/null +++ b/include/inja/lexer.hpp @@ -0,0 +1,420 @@ +// Copyright (c) 2020 Pantor. All rights reserved. + +#ifndef INCLUDE_INJA_LEXER_HPP_ +#define INCLUDE_INJA_LEXER_HPP_ + +#include <cctype> +#include <locale> + +#include "config.hpp" +#include "token.hpp" +#include "utils.hpp" + +namespace inja { + +/*! + * \brief Class for lexing an inja Template. + */ +class Lexer { + enum class State { + Text, + ExpressionStart, + ExpressionStartForceLstrip, + ExpressionBody, + LineStart, + LineBody, + StatementStart, + StatementStartNoLstrip, + StatementStartForceLstrip, + StatementBody, + CommentStart, + CommentBody, + }; + + enum class MinusState { + Operator, + Number, + }; + + const LexerConfig &config; + + State state; + MinusState minus_state; + nonstd::string_view m_in; + size_t tok_start; + size_t pos; + + + Token scan_body(nonstd::string_view close, Token::Kind closeKind, nonstd::string_view close_trim = nonstd::string_view(), bool trim = false) { + again: + // skip whitespace (except for \n as it might be a close) + if (tok_start >= m_in.size()) { + return make_token(Token::Kind::Eof); + } + char ch = m_in[tok_start]; + if (ch == ' ' || ch == '\t' || ch == '\r') { + tok_start += 1; + goto again; + } + + // check for close + if (!close_trim.empty() && inja::string_view::starts_with(m_in.substr(tok_start), close_trim)) { + state = State::Text; + pos = tok_start + close_trim.size(); + Token tok = make_token(closeKind); + skip_whitespaces_and_newlines(); + return tok; + } + + if (inja::string_view::starts_with(m_in.substr(tok_start), close)) { + state = State::Text; + pos = tok_start + close.size(); + Token tok = make_token(closeKind); + if (trim) { + skip_whitespaces_and_first_newline(); + } + return tok; + } + + // skip \n + if (ch == '\n') { + tok_start += 1; + goto again; + } + + pos = tok_start + 1; + if (std::isalpha(ch)) { + minus_state = MinusState::Operator; + return scan_id(); + } + + MinusState current_minus_state = minus_state; + if (minus_state == MinusState::Operator) { + minus_state = MinusState::Number; + } + + switch (ch) { + case '+': + return make_token(Token::Kind::Plus); + case '-': + if (current_minus_state == MinusState::Operator) { + return make_token(Token::Kind::Minus); + } + return scan_number(); + case '*': + return make_token(Token::Kind::Times); + case '/': + return make_token(Token::Kind::Slash); + case '^': + return make_token(Token::Kind::Power); + case '%': + return make_token(Token::Kind::Percent); + case '.': + return make_token(Token::Kind::Dot); + case ',': + return make_token(Token::Kind::Comma); + case ':': + return make_token(Token::Kind::Colon); + case '(': + return make_token(Token::Kind::LeftParen); + case ')': + minus_state = MinusState::Operator; + return make_token(Token::Kind::RightParen); + case '[': + return make_token(Token::Kind::LeftBracket); + case ']': + minus_state = MinusState::Operator; + return make_token(Token::Kind::RightBracket); + case '{': + return make_token(Token::Kind::LeftBrace); + case '}': + minus_state = MinusState::Operator; + return make_token(Token::Kind::RightBrace); + case '>': + if (pos < m_in.size() && m_in[pos] == '=') { + pos += 1; + return make_token(Token::Kind::GreaterEqual); + } + return make_token(Token::Kind::GreaterThan); + case '<': + if (pos < m_in.size() && m_in[pos] == '=') { + pos += 1; + return make_token(Token::Kind::LessEqual); + } + return make_token(Token::Kind::LessThan); + case '=': + if (pos < m_in.size() && m_in[pos] == '=') { + pos += 1; + return make_token(Token::Kind::Equal); + } + return make_token(Token::Kind::Unknown); + case '!': + if (pos < m_in.size() && m_in[pos] == '=') { + pos += 1; + return make_token(Token::Kind::NotEqual); + } + return make_token(Token::Kind::Unknown); + case '\"': + return scan_string(); + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + minus_state = MinusState::Operator; + return scan_number(); + case '_': + case '@': + case '$': + minus_state = MinusState::Operator; + return scan_id(); + default: + return make_token(Token::Kind::Unknown); + } + } + + Token scan_id() { + for (;;) { + if (pos >= m_in.size()) { + break; + } + char ch = m_in[pos]; + if (!std::isalnum(ch) && ch != '.' && ch != '/' && ch != '_' && ch != '-') { + break; + } + pos += 1; + } + return make_token(Token::Kind::Id); + } + + Token scan_number() { + for (;;) { + if (pos >= m_in.size()) { + break; + } + char ch = m_in[pos]; + // be very permissive in lexer (we'll catch errors when conversion happens) + if (!std::isdigit(ch) && ch != '.' && ch != 'e' && ch != 'E' && ch != '+' && ch != '-') { + break; + } + pos += 1; + } + return make_token(Token::Kind::Number); + } + + Token scan_string() { + bool escape {false}; + for (;;) { + if (pos >= m_in.size()) { + break; + } + char ch = m_in[pos++]; + if (ch == '\\') { + escape = true; + } else if (!escape && ch == m_in[tok_start]) { + break; + } else { + escape = false; + } + } + return make_token(Token::Kind::String); + } + + Token make_token(Token::Kind kind) const { return Token(kind, string_view::slice(m_in, tok_start, pos)); } + + void skip_whitespaces_and_newlines() { + if (pos < m_in.size()) { + while (pos < m_in.size() && (m_in[pos] == ' ' || m_in[pos] == '\t' || m_in[pos] == '\n' || m_in[pos] == '\r')) { + pos += 1; + } + } + } + + void skip_whitespaces_and_first_newline() { + if (pos < m_in.size()) { + while (pos < m_in.size() && (m_in[pos] == ' ' || m_in[pos] == '\t')) { + pos += 1; + } + } + + if (pos < m_in.size()) { + char ch = m_in[pos]; + if (ch == '\n') { + pos += 1; + } else if (ch == '\r') { + pos += 1; + if (pos < m_in.size() && m_in[pos] == '\n') { + pos += 1; + } + } + } + } + + static nonstd::string_view clear_final_line_if_whitespace(nonstd::string_view text) { + nonstd::string_view result = text; + while (!result.empty()) { + char ch = result.back(); + if (ch == ' ' || ch == '\t') { + result.remove_suffix(1); + } else if (ch == '\n' || ch == '\r') { + break; + } else { + return text; + } + } + return result; + } + +public: + explicit Lexer(const LexerConfig &config) : config(config), state(State::Text), minus_state(MinusState::Number) {} + + SourceLocation current_position() const { + return get_source_location(m_in, tok_start); + } + + void start(nonstd::string_view input) { + m_in = input; + tok_start = 0; + pos = 0; + state = State::Text; + minus_state = MinusState::Number; + + // Consume byte order mark (BOM) for UTF-8 + if (inja::string_view::starts_with(m_in, "\xEF\xBB\xBF")) { + m_in = m_in.substr(3); + } + } + + Token scan() { + tok_start = pos; + + again: + if (tok_start >= m_in.size()) { + return make_token(Token::Kind::Eof); + } + + switch (state) { + default: + case State::Text: { + // fast-scan to first open character + size_t open_start = m_in.substr(pos).find_first_of(config.open_chars); + if (open_start == nonstd::string_view::npos) { + // didn't find open, return remaining text as text token + pos = m_in.size(); + return make_token(Token::Kind::Text); + } + pos += open_start; + + // try to match one of the opening sequences, and get the close + nonstd::string_view open_str = m_in.substr(pos); + bool must_lstrip = false; + if (inja::string_view::starts_with(open_str, config.expression_open)) { + if (inja::string_view::starts_with(open_str, config.expression_open_force_lstrip)) { + state = State::ExpressionStartForceLstrip; + must_lstrip = true; + } else { + state = State::ExpressionStart; + } + } else if (inja::string_view::starts_with(open_str, config.statement_open)) { + if (inja::string_view::starts_with(open_str, config.statement_open_no_lstrip)) { + state = State::StatementStartNoLstrip; + } else if (inja::string_view::starts_with(open_str, config.statement_open_force_lstrip )) { + state = State::StatementStartForceLstrip; + must_lstrip = true; + } else { + state = State::StatementStart; + must_lstrip = config.lstrip_blocks; + } + } else if (inja::string_view::starts_with(open_str, config.comment_open)) { + state = State::CommentStart; + must_lstrip = config.lstrip_blocks; + } else if ((pos == 0 || m_in[pos - 1] == '\n') && inja::string_view::starts_with(open_str, config.line_statement)) { + state = State::LineStart; + } else { + pos += 1; // wasn't actually an opening sequence + goto again; + } + + nonstd::string_view text = string_view::slice(m_in, tok_start, pos); + if (must_lstrip) { + text = clear_final_line_if_whitespace(text); + } + + if (text.empty()) { + goto again; // don't generate empty token + } + return Token(Token::Kind::Text, text); + } + case State::ExpressionStart: { + state = State::ExpressionBody; + pos += config.expression_open.size(); + return make_token(Token::Kind::ExpressionOpen); + } + case State::ExpressionStartForceLstrip: { + state = State::ExpressionBody; + pos += config.expression_open_force_lstrip.size(); + return make_token(Token::Kind::ExpressionOpen); + } + case State::LineStart: { + state = State::LineBody; + pos += config.line_statement.size(); + return make_token(Token::Kind::LineStatementOpen); + } + case State::StatementStart: { + state = State::StatementBody; + pos += config.statement_open.size(); + return make_token(Token::Kind::StatementOpen); + } + case State::StatementStartNoLstrip: { + state = State::StatementBody; + pos += config.statement_open_no_lstrip.size(); + return make_token(Token::Kind::StatementOpen); + } + case State::StatementStartForceLstrip: { + state = State::StatementBody; + pos += config.statement_open_force_lstrip.size(); + return make_token(Token::Kind::StatementOpen); + } + case State::CommentStart: { + state = State::CommentBody; + pos += config.comment_open.size(); + return make_token(Token::Kind::CommentOpen); + } + case State::ExpressionBody: + return scan_body(config.expression_close, Token::Kind::ExpressionClose, config.expression_close_force_rstrip); + case State::LineBody: + return scan_body("\n", Token::Kind::LineStatementClose); + case State::StatementBody: + return scan_body(config.statement_close, Token::Kind::StatementClose, config.statement_close_force_rstrip, config.trim_blocks); + case State::CommentBody: { + // fast-scan to comment close + size_t end = m_in.substr(pos).find(config.comment_close); + if (end == nonstd::string_view::npos) { + pos = m_in.size(); + return make_token(Token::Kind::Eof); + } + // return the entire comment in the close token + state = State::Text; + pos += end + config.comment_close.size(); + Token tok = make_token(Token::Kind::CommentClose); + if (config.trim_blocks) { + skip_whitespaces_and_first_newline(); + } + return tok; + } + } + } + + const LexerConfig &get_config() const { + return config; + } +}; + +} // namespace inja + +#endif // INCLUDE_INJA_LEXER_HPP_ |