diff options
Diffstat (limited to 'flatcc/external/lex/luthor.c')
-rw-r--r-- | flatcc/external/lex/luthor.c | 1509 |
1 files changed, 1509 insertions, 0 deletions
diff --git a/flatcc/external/lex/luthor.c b/flatcc/external/lex/luthor.c new file mode 100644 index 0000000..fc81985 --- /dev/null +++ b/flatcc/external/lex/luthor.c @@ -0,0 +1,1509 @@ +/* + * Designed to be included in other C files which define emitter + * operations. The same source may thus be used to parse different + * grammars. + * + * The operators cover the most common operators i the C family. Each + * operator does not have a name, it is represent by a long token code + * with up to 4 ASCII characters embedded literally. This avoids any + * semantic meaning at the lexer level. Emitters macros can redefine + * this behavior. + * + * No real harm is done in accepting a superset, but the source is + * intended to be modified, have things flagged or removed, other things + * added. The real complicity is in numbers, identifiers, and comments, + * which should be fairly complete with flagging as is. + * + * Keyword handling is done at macroes, and described elsewhere, but for + * identifier compatible keywords, this is quite efficient to handle on + * a per language basis without modifying this source. + * + * The Lisp language family is somewhat different and not directly + * suited for this lexer, although it can easily be modified to suit. + * The main reason is ';' for comments, and operators used as part of + * the identifier symbol set, and no need for operator classification, + * and different handling of single character symbols. + * + * So overall, we more or less have one efficient unified lexer that can + * manage many languages - this is good, because it is a pain to write a + * new lexer by hand, and lexer tools are what they are. + */ + +#include "luthor.h" + +#ifdef LEX_C99_NUMERIC +#define LEX_C_NUMERIC +#define LEX_HEX_FLOAT_NUMERIC +#define LEX_BINARY_NUMERIC +#endif + +#ifdef LEX_C_NUMERIC +#define LEX_C_OCTAL_NUMERIC +#define LEX_HEX_NUMERIC +#endif + +#ifdef LEX_JULIA_NUMERIC +#ifdef LEX_C_OCTAL_NUMERIC +/* + * LEX_JULIA_OCTAL_NUMERIC and LEX_C_OCTAL_NUMERIC can technically + * coexist, but leading zeroes give C style leading zero numbers + * which can lead to incorrect values depending on expectations. + * Therefore the full LEX_JULIA_NUMERIC flag is designed to not allow this. + */ +#error "LEX_C_OCTAL_NUMERIC conflicts with LEX_JULIA_NUMERIC leading zero integers" +#endif + +/* + * Julia v0.3 insists on lower case, and has a different meaning for + * upper case. + */ +#define LEX_LOWER_CASE_NUMERIC_PREFIX +#define LEX_JULIA_OCTAL_NUMERIC +#define LEX_HEX_FLOAT_NUMERIC +#define LEX_BINARY_NUMERIC + +#endif + +#ifdef LEX_HEX_FLOAT_NUMERIC +#define LEX_HEX_NUMERIC +#endif + +/* + * Numeric and string constants do not accept prefixes such as u, l, L, + * U, ll, LL, f, or F in C, or various others in Julia strings. Use the + * parser to detect juxtaposition between identifier and constant. In + * Julia numeric suffix means multiplication, in C it is a type + * qualifier. Sign, such as defined in JSON, are also not accepted - + * they must be operators. See source for various flag to enable + * different token types. + */ + +/* + * Includes '_' in identifers by default. Defines follow characters in + * identifiers but not the lead character - it must be defined in switch + * cases. If the identifier allows for dash '-', it is probably better + * to handle it as an operator and flag surrounding space in the parser. + */ +#ifndef lex_isalnum + +/* + * NOTE: isalnum, isalpha, is locale dependent. We only want to + * to consider that ASCII-7 subset and treat everything else as utf-8. + * This table is not for leading identifiers, as it contains 0..9. + * + * For more correct handling of UTF-8, see: + * https://theantlrguy.atlassian.net/wiki/display/ANTLR4/Grammar+Lexicon + * based on Java Ident = NameStartChar NameChar* + * + * While the following is UTF-16, it can be adapted to UTF-8 easily. + + + fragment + NameChar + : NameStartChar + | '0'..'9' + | '_' + | '\u00B7' + | '\u0300'..'\u036F' + | '\u203F'..'\u2040' + ; + fragment + NameStartChar + : 'A'..'Z' | 'a'..'z' + | '\u00C0'..'\u00D6' + | '\u00D8'..'\u00F6' + | '\u00F8'..'\u02FF' + | '\u0370'..'\u037D' + | '\u037F'..'\u1FFF' + | '\u200C'..'\u200D' + | '\u2070'..'\u218F' + | '\u2C00'..'\u2FEF' + | '\u3001'..'\uD7FF' + | '\uF900'..'\uFDCF' + | '\uFDF0'..'\uFFFD' + ; + */ + +static const char lex_alnum[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0..9 */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, + /* A..O */ + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* P..Z, _ */ +#ifdef LEX_ID_WITHOUT_UNDERSCORE + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, +#else + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, +#endif + /* a..o */ + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* p..z */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, +#ifdef LEX_ID_WITH_UTF8 + /* utf-8 */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +#else + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +#endif +}; + +#define lex_isalnum(c) (lex_alnum[(unsigned char)(c)]) +#endif + +#ifndef lex_isbindigit +#define lex_isbindigit(c) ((c) == '0' || (c) == '1') +#endif + +#ifndef lex_isoctdigit +#define lex_isoctdigit(c) ((unsigned)((c) - '0') < 8) +#endif + +#ifndef lex_isdigit +#define lex_isdigit(c) ((c) >= '0' && (c) <= '9') +#endif + +#ifndef lex_ishexdigit +#define lex_ishexdigit(c) (((c) >= '0' && (c) <= '9') || ((c | 0x20) >= 'a' && (c | 0x20) <= 'f')) +#endif + +#ifndef lex_isctrl +#include <ctype.h> +#define lex_isctrl(c) ((c) < 0x20 || (c) == 0x7f) +#endif + +#ifndef lex_isblank +#define lex_isblank(c) ((c) == ' ' || (c) == '\t') +#endif + +#ifndef lex_iszterm +#define lex_iszterm(c) ((c) == '\0') +#endif + +/* + * If ZTERM is disabled, zero will be a LEX_CTRL token + * and allowed to be embedded in comments and strings, or + * elsewhere, as long as the parser accepts the token. + */ +#ifdef LEX_DISABLE_ZTERM +#undef lex_iszterm +#define lex_iszterm(c) (0) +#endif + +/* + * The mode is normally LEX_MODE_NORMAL = 0 initially, or the returned + * mode from a previous call, unless LEX_MODE_INVALID = 1 was returned. + * If a buffer stopped in the middle of a string or a comment, the mode + * will reflect that. In all cases some amount of recovery is needed + * before starting a new buffer - see detailed comments in header file. + * If only a single buffer is used, special handling is still needed if + * the last line contains a single line comment because it will not be + * terminated, but it amounts to replace the emitted unterminated + * comment token with an end of comment token. + * + * Instead of 0, the mode can initially also be LEX_MODE_BOM - it will + * an strip optional BOM before moving to normal mode. Currently only + * UTF-8 BOM is supported, and this is unlikely to change. + * + * The context variable is user-defined and available to emitter macros. + * It may be null if unused. + * + */ +static int lex(const char *buf, size_t len, int mode, void *context) +{ + const char *p, *q, *s, *d; +#if 0 + /* TODO: old, remove this */ + , *z, *f; +#endif + + p = buf; /* next char */ + q = p + len; /* end of buffer */ + s = p; /* start of token */ + d = p; /* end of integer part */ + +#if 0 + /* TODO: old, remove this */ + + /* Used for float and leading zero detection in numerics. */ + z = p; + f = p; +#endif + + /* + * Handle mid string and mid comment for reentering across + * buffer boundaries. Strip embedded counter from mode. + */ + switch(mode & (LEX_MODE_COUNT_BASE - 1)) { + + case LEX_MODE_NORMAL: + goto lex_mode_normal; + + case LEX_MODE_BOM: + goto lex_mode_bom; + +#ifdef LEX_C_STRING + case LEX_MODE_C_STRING: + goto lex_mode_c_string; +#endif +#ifdef LEX_PYTHON_BLOCK_STRING + case LEX_MODE_PYTHON_BLOCK_STRING: + goto lex_mode_python_block_string; +#endif +#ifdef LEX_C_STRING_SQ + case LEX_MODE_C_STRING_SQ: + goto lex_mode_c_string_sq; +#endif +#ifdef LEX_PYTHON_BLOCK_STRING_SQ + case LEX_MODE_PYTHON_BLOCK_STRING_SQ: + goto lex_mode_python_block_string_sq; +#endif +#ifdef LEX_C_BLOCK_COMMENT + case LEX_MODE_C_BLOCK_COMMENT: + goto lex_mode_c_block_comment; +#endif +#if defined(LEX_SHELL_LINE_COMMENT) || defined(LEX_C99_LINE_COMMENT) + case LEX_MODE_LINE_COMMENT: + goto lex_mode_line_comment; +#endif +#ifdef LEX_JULIA_NESTED_COMMENT + case LEX_MODE_JULIA_NESTED_COMMENT: + goto lex_mode_julia_nested_comment; +#endif + + default: + /* + * This is mostly to kill unused label warning when comments + * are disabled. + */ + goto lex_mode_exit; + } + +lex_mode_bom: + + mode = LEX_MODE_BOM; + + /* + * Special entry mode to consume utf-8 bom if present. We don't + * support other boms, but we would use the same token if we did. + * + * We generally expect no bom present, but it is here if needed + * without requiring ugly hacks elsewhere. + */ + if (p + 3 < q && p[0] == '\xef' && p[1] == '\xbb' && p[2] == '\xbf') { + p += 3; + lex_emit_bom(s, p); + } + goto lex_mode_normal; + +/* If source is updated, also update LEX_C_STRING_SQ accordingly. */ +#ifdef LEX_C_STRING +lex_mode_c_string: + + mode = LEX_MODE_C_STRING; + + for (;;) { + --p; + /* We do not allow blanks that are also control characters, such as \t. */ + while (++p != q && *p != '\\' && *p != '\"' && !lex_isctrl(*p)) { + } + if (s != p) { + lex_emit_string_part(s, p); + s = p; + } + if (*p == '\"') { + ++p; + lex_emit_string_end(s, p); + goto lex_mode_normal; + } + if (p == q || lex_iszterm(*p)) { + lex_emit_string_unterminated(p); + goto lex_mode_normal; + } + if (*p == '\\') { + ++p; + /* Escape is only itself, whatever is escped follows separately. */ + lex_emit_string_escape(s, p); + s = p; + if (p == q || lex_iszterm(*p)) { + lex_emit_string_unterminated(p); + goto lex_mode_normal; + } + if (*p == '\\' || *p == '\"') { + ++p; + continue; + } + /* + * Flag only relevant for single line strings, as it + * controls whether we fail on unterminated string at line + * ending with '\'. + * + * Julia does not support line continuation in strings + * (or elsewhere). C, Python, and Javascript do. + */ +#ifndef LEX_DISABLE_STRING_CONT + if (*p == '\n') { + if (++p != q && *p == '\r') { + ++p; + } + lex_emit_string_newline(s, p); + s = p; + continue; + } + if (*p == '\r') { + if (++p != q && *p == '\n') { + ++p; + } + lex_emit_string_newline(s, p); + s = p; + continue; + } +#endif + } + if (*p == '\n' || *p == '\r') { + lex_emit_string_unterminated(p); + goto lex_mode_normal; + } + ++p; + lex_emit_string_ctrl(s); + s = p; + } +#endif + +/* + * This is a copy if LEX_C_STRING with single quote. It's not DRY, but + * no reason to parameterized inner loops, just because. Recopy of + * changes are to the above. + * + * Even if single quote is only used for CHAR types, it makes sense to + * parse as a full string since there can be all sorts of unicocde + * escapes and line continuations, newlines to report and unexpected + * control characters to deal with. + */ +#ifdef LEX_C_STRING_SQ +lex_mode_c_string_sq: + + mode = LEX_MODE_C_STRING_SQ; + + for (;;) { + --p; + while (++p != q && *p != '\\' && *p != '\'' && !lex_isctrl(*p)) { + } + if (s != p) { + lex_emit_string_part(s, p); + s = p; + } + if (*p == '\'') { + ++p; + lex_emit_string_end(s, p); + goto lex_mode_normal; + } + if (p == q || lex_iszterm(*p)) { + lex_emit_string_unterminated(p); + goto lex_mode_normal; + } + if (*p == '\\') { + ++p; + /* Escape is only itself, whatever is escped follows separately. */ + lex_emit_string_escape(s, p); + s = p; + if (p == q || lex_iszterm(*p)) { + lex_emit_string_unterminated(p); + goto lex_mode_normal; + } + if (*p == '\\' || *p == '\'') { + ++p; + continue; + } + /* + * Flag only relevant for single line strings, as it + * controls whether we fail on unterminated string at line + * ending with '\'. + * + * Julia does not support line continuation in strings + * (or elsewhere). C, Python, and Javascript do. + */ +#ifndef LEX_DISABLE_STRING_CONT + if (*p == '\n') { + if (++p != q && *p == '\r') { + ++p; + } + lex_emit_string_newline(s, p); + s = p; + continue; + } + if (*p == '\r') { + if (++p != q && *p == '\n') { + ++p; + } + lex_emit_string_newline(s, p); + s = p; + continue; + } +#endif + } + if (*p == '\n' || *p == '\r') { + lex_emit_string_unterminated(p); + goto lex_mode_normal; + } + ++p; + lex_emit_string_ctrl(s); + s = p; + } +#endif + +/* + * """ Triple quoted Python block strings. """ + * Single quoted version (''') is a direct copy, update both places + * if a changed is needed. + * + * Note: there is no point in disabling line continuation + * for block strings, since it only affects unterminated + * string errors at newline. It all comes down to how + * escaped newline is interpreted by the parser. + */ +#ifdef LEX_PYTHON_BLOCK_STRING +lex_mode_python_block_string: + + mode = LEX_MODE_PYTHON_BLOCK_STRING; + + for (;;) { + --p; + while (++p != q && *p != '\\' && !lex_isctrl(*p)) { + if (*p == '\"' && p + 2 < q && p[1] == '\"' && p[2] == '\"') { + break; + } + } + if (s != p) { + lex_emit_string_part(s, p); + s = p; + } + if (p == q || lex_iszterm(*p)) { + lex_emit_string_unterminated(p); + goto lex_mode_normal; + } + if (*p == '\"') { + p += 3; + lex_emit_string_end(s, p); + goto lex_mode_normal; + } + if (*p == '\\') { + /* Escape is only itself, allowing parser to interpret and validate. */ + ++p; + lex_emit_string_escape(s, p); + s = p; + if (p + 1 != q && (*p == '\\' || *p == '\"')) { + ++p; + } + continue; + } + if (*p == '\n') { + if (++p != q && *p == '\r') { + ++p; + } + lex_emit_string_newline(s, p); + s = p; + continue; + } + if (*p == '\r') { + if (++p != q && *p == '\n') { + ++p; + } + lex_emit_string_newline(s, p); + s = p; + continue; + } + ++p; + lex_emit_string_ctrl(s); + s = p; + } +#endif + +/* + * Python ''' style strings. + * Direct copy of """ quote version, update both if changed. + */ +#ifdef LEX_PYTHON_BLOCK_STRING_SQ +lex_mode_python_block_string_sq: + + mode = LEX_MODE_PYTHON_BLOCK_STRING_SQ; + + for (;;) { + --p; + while (++p != q && *p != '\\' && !lex_isctrl(*p)) { + if (*p == '\'' && p + 2 < q && p[1] == '\'' && p[2] == '\'') { + break; + } + } + if (s != p) { + lex_emit_string_part(s, p); + s = p; + } + if (p == q || lex_iszterm(*p)) { + lex_emit_string_unterminated(p); + goto lex_mode_normal; + } + if (*p == '\'') { + p += 3; + lex_emit_string_end(s, p); + goto lex_mode_normal; + } + if (*p == '\\') { + /* Escape is only itself, allowing parser to interpret and validate. */ + ++p; + lex_emit_string_escape(s, p); + s = p; + if (p + 1 != q && (*p == '\\' || *p == '\'')) { + ++p; + } + continue; + } + if (*p == '\n') { + if (++p != q && *p == '\r') { + ++p; + } + lex_emit_string_newline(s, p); + s = p; + continue; + } + if (*p == '\r') { + if (++p != q && *p == '\n') { + ++p; + } + lex_emit_string_newline(s, p); + s = p; + continue; + } + ++p; + lex_emit_string_ctrl(s); + s = p; + } +#endif + +/* + * We don't really care if it is a shell style comment or a C99, + * or any other line oriented commment, as the termination is + * the same. + */ +#if defined(LEX_SHELL_LINE_COMMENT) || defined(LEX_C99_LINE_COMMENT) +lex_mode_line_comment: + + mode = LEX_MODE_LINE_COMMENT; + + for (;;) { + --p; + while (++p != q && (!lex_isctrl(*p))) { + } + if (s != p) { + lex_emit_comment_part(s, p); + s = p; + } + if (p == q || lex_iszterm(*p)) { + /* + * Unterminated comment here is not necessarily true, + * not even likely, nor possible, but we do this to + * handle buffer switch consistently: any non-normal + * mode exit will have an unterminated token to fix up. + * Here it would be conversion to end of comment, which + * we cannot know yet, since the line might continue in + * the next buffer. This is a zero length token. + */ + lex_emit_comment_unterminated(p); + goto lex_mode_exit; + } + if (*p == '\n' || *p == '\r') { + lex_emit_comment_end(s, p); + goto lex_mode_normal; + } + ++p; + lex_emit_comment_ctrl(s); + s = p; + } +#endif + +#ifdef LEX_C_BLOCK_COMMENT +lex_mode_c_block_comment: + + mode = LEX_MODE_C_BLOCK_COMMENT; + + for (;;) { + --p; + while (++p != q && (!lex_isctrl(*p))) { + if (*p == '/' && p[-1] == '*') { + --p; + break; + } + } + if (s != p) { + lex_emit_comment_part(s, p); + s = p; + } + if (p == q || lex_iszterm(*p)) { + lex_emit_comment_unterminated(p); + goto lex_mode_exit; + } + if (*p == '\n') { + if (++p != q && *p == '\r') { + ++p; + } + lex_emit_newline(s, p); + s = p; + continue; + } + if (*p == '\r') { + if (++p != q && *p == '\n') { + ++p; + } + lex_emit_newline(s, p); + s = p; + continue; + } + if (lex_isctrl(*p)) { + ++p; + lex_emit_comment_ctrl(s); + s = p; + continue; + } + p += 2; + lex_emit_comment_end(s, p); + s = p; + goto lex_mode_normal; + } +#endif + + /* Julia nests block comments as #= ... #= ...=# ... =# across multiple lines. */ +#ifdef LEX_JULIA_NESTED_COMMENT +lex_mode_julia_nested_comment: + + /* Preserve nesting level on re-entrance. */ + if ((mode & (LEX_MODE_COUNT_BASE - 1)) != LEX_MODE_JULIA_NESTED_COMMENT) { + mode = LEX_MODE_JULIA_NESTED_COMMENT; + } + /* We have already entered. */ + mode += LEX_MODE_COUNT_BASE; + + for (;;) { + --p; + while (++p != q && !lex_isctrl(*p)) { + if (*p == '#') { + if (p[-1] == '=') { + --p; + break; + } + if (p + 1 != q && p[1] == '=') { + break; + } + } + } + if (s != p) { + lex_emit_comment_part(s, p); + s = p; + } + if (p == q || lex_iszterm(*p)) { + lex_emit_comment_unterminated(p); + goto lex_mode_exit; + } + if (*p == '\n') { + if (++p != q && *p == '\r') { + ++p; + } + lex_emit_newline(s, p); + s = p; + continue; + } + if (*p == '\r') { + if (++p != q && *p == '\n') { + ++p; + } + lex_emit_newline(s, p); + s = p; + continue; + } + if (lex_isctrl(*p)) { + ++p; + lex_emit_comment_ctrl(s); + s = p; + continue; + } + if (*p == '=') { + p += 2; + lex_emit_comment_end(s, p); + s = p; + mode -= LEX_MODE_COUNT_BASE; + if (mode / LEX_MODE_COUNT_BASE > 0) { + continue; + } + goto lex_mode_normal; + } + /* The upper bits are used as counter. */ + mode += LEX_MODE_COUNT_BASE; + p += 2; + lex_emit_comment_begin(s, p, 0); + s = p; + if (mode / LEX_MODE_COUNT_BASE > LEX_MAX_NESTING_LEVELS) { + /* Prevent malicious input from overflowing counter. */ + lex_emit_comment_deeply_nested(p); + lex_emit_abort(p); + return mode; + } + } +#endif + +/* Unlike other modes, we can always jump here without updating token start `s` first. */ +lex_mode_normal: + + mode = LEX_MODE_NORMAL; + + while (p != q) { + s = p; + + switch(*p) { + +#ifndef LEX_DISABLE_ZTERM + case '\0': + lex_emit_eos(s, p); + return mode; +#endif + + /* \v, \f etc. are covered by the CTRL token, don't put it here. */ + case '\t': case ' ': + while (++p != q && lex_isblank(*p)) { + } + lex_emit_blank(s, p); + continue; + + /* + * Newline should be emitter in all constructs, also comments + * and strings which have their own newline handling. + * Only one line is emitted at a time permitting simple line + * counting. + */ + case '\n': + if (++p != q && *p == '\r') { + ++p; + } + lex_emit_newline(s, p); + continue; + + case '\r': + if (++p != q && *p == '\n') { + ++p; + } + lex_emit_newline(s, p); + continue; + + /* + * C-style string, and Python style triple double quote + * delimited multi-line string. Prefix and suffix symbols + * should be parsed separately, e.g. L"hello" are two + * tokens. + */ +#if defined(LEX_C_STRING) || defined(LEX_PYTHON_BLOCK_STRING) + case '\"': +#ifdef LEX_PYTHON_BLOCK_STRING + if (p + 2 < q && p[1] == '\"' && p[2] == '\"') { + p += 3; + lex_emit_string_begin(s, p); + s = p; + goto lex_mode_python_block_string; + } +#endif +#ifdef LEX_C_STRING + ++p; + lex_emit_string_begin(s, p); + s = p; + goto lex_mode_c_string; +#endif +#endif + + /* + * Single quoted version of strings, otherwise identical + * behavior. Can also be used for char constants if checked + * by parser subsequently. + */ +#if defined(LEX_C_STRING_SQ) || defined(LEX_PYTHON_BLOCK_STRING_SQ) + case '\'': +#ifdef LEX_PYTHON_BLOCK_STRING_SQ + if (p + 2 < q && p[1] == '\'' && p[2] == '\'') { + p += 3; + lex_emit_string_begin(s, p); + s = p; + goto lex_mode_python_block_string_sq; + } +#endif +#ifdef LEX_C_STRING_SQ + ++p; + lex_emit_string_begin(s, p); + s = p; + goto lex_mode_c_string_sq; +#endif +#endif + +#if defined(LEX_SHELL_LINE_COMMENT) || defined(LEX_JULIA_NESTED_COMMENT) + /* + * Line comment excluding terminal line break. + * + * See also C99 line comment `//`. + * + * Julia uses `#=` and `=#` for nested block comments. + * (According to Julia developers, '#=` is motivated by `=` + * not being likely to start anything that you would put a + * comment around, unlike `#{`, `}#` or `#(`, `)#`)). + * + * Some known doc comment formats are identified and + * included in the comment_begin token. + */ + case '#': + ++p; +#ifdef LEX_JULIA_NESTED_COMMENT + if (p != q && *p == '=') { + ++p; + lex_emit_comment_begin(s, p, 0); + s = p; + goto lex_mode_julia_nested_comment; + } +#endif + lex_emit_comment_begin(s, p, 0); + s = p; + goto lex_mode_line_comment; +#endif + + case '/': + ++p; + if (p != q) { + switch (*p) { +#ifdef LEX_C99_LINE_COMMENT + case '/': + ++p; + p += p != q && (*p == '/' || *p == '!'); + lex_emit_comment_begin(s, p, (p - s == 3)); + s = p; + goto lex_mode_line_comment; +#endif +#ifdef LEX_C_BLOCK_COMMENT + case '*': + ++p; + p += p != q && (*p == '*' || *p == '!'); + lex_emit_comment_begin(s, p, (p - s == 3)); + s = p; + goto lex_mode_c_block_comment; +#endif + case '=': + ++p; + lex_emit_compound_op('/', '=', s, p); + continue; + default: + break; + } + } + lex_emit_op('/', s, p); + continue; + + case '(': case ')': case '[': case ']': case '{': case '}': + case ',': case ';': case '\\': case '?': + ++p; + lex_emit_op(*s, s, p); + continue; + + case '%': case '!': case '~': case '^': + ++p; + if (p != q && *p == '=') { + ++p; + lex_emit_compound_op(*s, '=', s, p); + continue; + } + lex_emit_op(*s, s, p); + continue; + + case '|': + ++p; + if (p != q) { + switch (*p) { + case '=': + ++p; + lex_emit_compound_op('|', '=', s, p); + continue; + case '|': + ++p; + lex_emit_compound_op('|', '|', s, p); + break; + default: + break; + } + } + lex_emit_op('|', s, p); + continue; + + case '&': + ++p; + if (p != q) { + switch (*p) { + case '=': + ++p; + lex_emit_compound_op('&', '=', s, p); + continue; + case '&': + ++p; + lex_emit_compound_op('&', '&', s, p); + break; + default: + break; + } + } + lex_emit_op('&', s, p); + continue; + + case '=': + ++p; + if (p != q) { + switch (*p) { + case '>': + ++p; + lex_emit_compound_op('=', '>', s, p); + continue; + case '=': + ++p; + if (p != q && *p == '=') { + ++p; + lex_emit_tricompound_op('=', '=', '=', s, p); + continue; + } + lex_emit_compound_op('=', '=', s, p); + break; + default: + break; + } + } + lex_emit_op('=', s, p); + continue; + + case ':': + ++p; + if (p != q) { + switch (*p) { + case '=': + ++p; + lex_emit_compound_op(':', '=', s, p); + continue; + case ':': + ++p; + if (p != q && *p == '=') { + ++p; + lex_emit_tricompound_op(':', ':', '=', s, p); + continue; + } + lex_emit_compound_op(':', ':', s, p); + continue; + default: + break; + } + } + lex_emit_op(':', s, p); + continue; + + case '*': + ++p; + if (p != q) { + switch (*p) { + case '=': + lex_emit_compound_op('*', '=', s, p); + continue; + case '*': + /* **= hardly used anywhere? */ + lex_emit_compound_op('*', '*', s, p); + continue; + default: + break; + } + } + lex_emit_op('*', s, p); + continue; + + case '<': + ++p; + if (p != q) { + switch (*p) { + case '-': + ++p; + lex_emit_compound_op('<', '-', s, p); + continue; + case '=': + ++p; + lex_emit_compound_op('<', '=', s, p); + continue; + case '<': + ++p; + if (p != q) { + switch (*p) { + case '=': + ++p; + lex_emit_tricompound_op('<', '<', '=', s, p); + continue; + case '<': + ++p; + if (p != q && *p == '=') { + ++p; + lex_emit_quadcompound_op('<', '<', '<', '=', s, p); + continue; + } + lex_emit_tricompound_op('<', '<', '<', s, p); + continue; + default: + break; + } + } + lex_emit_compound_op('<', '<', s, p); + continue; + default: + break; + } + } + lex_emit_op('<', s, p); + continue; + + case '>': + ++p; + if (p != q) { + switch (*p) { + case '=': + ++p; + lex_emit_compound_op('>', '=', s, p); + continue; + case '>': + ++p; + if (p != q) { + switch (*p) { + case '=': + ++p; + lex_emit_tricompound_op('>', '>', '=', s, p); + continue; + case '>': + ++p; + if (p != q && *p == '=') { + ++p; + lex_emit_quadcompound_op('>', '>', '>', '=', s, p); + continue; + } + lex_emit_tricompound_op('>', '>', '>', s, p); + continue; + default: + break; + } + } + lex_emit_compound_op('>', '>', s, p); + continue; + default: + break; + } + } + lex_emit_op('>', s, p); + continue; + + case '-': + ++p; + if (p != q) { + switch (*p) { + case '=': + ++p; + lex_emit_compound_op('-', '=', s, p); + continue; + case '-': + ++p; + lex_emit_compound_op('-', '-', s, p); + continue; + case '>': + ++p; + lex_emit_compound_op('-', '>', s, p); + continue; + default: + break; + } + } + lex_emit_op('-', s, p); + continue; + + case '+': + ++p; + if (p != q) { + switch (*p) { + case '=': + ++p; + lex_emit_compound_op('+', '=', s, p); + continue; + + case '+': + ++p; + lex_emit_compound_op('+', '+', s, p); + continue; + default: + break; + } + } + lex_emit_op('+', s, p); + continue; + + case '.': + ++p; + if (p != q) { + switch (*p) { + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + d = s; + goto lex_dot_to_fraction_part; + case '.': + ++p; + if (p != q && *p == '.') { + ++p; + lex_emit_tricompound_op('.', '.', '.', s, p); + continue; + } + lex_emit_compound_op('.', '.', s, p); + continue; + default: + break; + } + } + lex_emit_op('.', s, p); + continue; + + case '0': + if (++p != q) { + switch (*p) { +#ifdef LEX_C_OCTAL_NUMERIC + + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + while (++p != q && lex_isoctdigit(*p)) { + } + d = p; + if (p != q) { + /* + * Leading zeroes like 00.10 are valid C + * floating point constants. + */ + if (*p == '.') { + goto lex_c_octal_to_fraction_part; + } + if (*p == 'e' || *p == 'E') { + goto lex_c_octal_to_exponent_part; + } + } + lex_emit_octal(s, p); + /* + * If we have a number like 0079, it becomes + * 007(octal), 9(decimal). The parser should + * deal with this. + * + * To add to confusion i64 is a C integer suffix + * like in 007i64, but 2+2i is a Go complex + * constant. (Not specific to octals). + * + * This can all be handled by having the parser inspect + * following identifier or numeric, parser + * here meaning a lexer post processing step, not + * necessarily the parser itself. + */ + + continue; +#else + /* + * All integers reach default and enter + * integer part. As a result, leading zeroes are + * mapped to floats and integers which matches + * Julia behavior. Other languages should decide + * if leading zero is valid or not. JSON + * disallows leading zero. + */ +#endif + +#ifdef LEX_JULIA_OCTAL_NUMERIC + /* + * This is the style of octal, not 100% Julia + * compatible. Also define Julia numeric to enforce + * lower case. + */ +#ifndef LEX_LOWER_CASE_NUMERIC_PREFIX + /* See also hex 0X. Julia v.0.3 uses lower case only here. */ + case 'O': +#endif + /* + * Julia accepts 0o700 as octal and 0b100 as + * binary, and 0xa00 as hex, and 0100 as + * integer, and 1e2 as 64 bit float and 1f2 as + * 32 bit float. Julia 0.3 does not support + * octal and binary fractions. + */ + case 'o': + while (++p != q && lex_isoctdigit(*p)) { + } + lex_emit_octal(s, p); + /* Avoid hitting int fall through. */ + continue; +#endif +#ifdef LEX_BINARY_NUMERIC + /* Binary in C++14. */ + case 'b': +#ifndef LEX_LOWER_CASE_NUMERIC_PREFIX + /* See also hex 0X. Julia v.0.3 uses lower case only here. */ + case 'B': +#endif + while (++p != q && lex_isbindigit(*p)) { + } + lex_emit_binary(s, p); + /* Avoid hitting int fall through. */ + continue; +#endif +#ifdef LEX_HEX_NUMERIC + case 'x': +#ifndef LEX_LOWER_CASE_NUMERIC_PREFIX + /* + * Julia v0.3 does not allow this, it thinks 0X1 is + * 0 * X1, X1 being an identifier. + * while 0x1 is a hex value due to precedence. + * + * TODO: This might change. + */ + + case 'X': +#endif + while (++p != q && lex_ishexdigit(*p)) { + } +#ifdef LEX_HEX_FLOAT_NUMERIC + /* + * Most hexadecimal floating poing conversion + * functions, including Pythons + * float.fromhex("0x1.0"), Julias parse + * function, and and C strtod on + * supporting platforms, will parse without + * exponent. The same languages do not support + * literal constants without the p exponent. + * First it is named p because e is a hex digit, + * second, the float suffix f is also a hex + * digit: 0x1.f is ambigious in C without that + * rule. Conversions have no such ambiguity. + * In Julia, juxtaposition means that 0x1.f + * could mean 0x1p0 * f or 0x1.fp0. + * + * Since we are not doing conversion here but + * lexing a stream, we opt to require the p + * suffix because making it optional could end + * up consuming parts of the next token. + * + * But, we also make a flag to make the exponent + * optional, anyway. It could be used for better + * error reporting than just consuming the hex + * part since we likely should accept the ambigous + * syntax either way. + */ + d = p; + if (p != q && *p == '.') { + while (++p != q && lex_ishexdigit(*p)) { + } + } + if (p != q && (*p == 'p' || *p == 'P')) { + if (++p != q && *p != '+' && *p != '-') { + --p; + } + /* The exponent is a decimal power of 2. */ + while (++p != q && lex_isdigit(*p)) { + } + lex_emit_hex_float(s, p); + continue; + } +#ifdef LEX_HEX_FLOAT_OPTIONAL_EXPONENT + if (d != p) { + lex_emit_hex_float(s, p); + continue; + } +#else + /* + * Backtrack to decimal point. We require p to + * be present because we could otherwise consume + * part of the next token. + */ + p = d; +#endif +#endif /* LEX_HEX_FLOAT_NUMERIC */ + lex_emit_hex(s, p); + continue; +#endif /* LEX_HEX_NUMERIC */ + + default: + /* + * This means leading zeroes like 001 or 001.0 are + * treated like like int and float respectively, + * iff C octals are flaggged out. Otherwise they + * become 001(octal), and 001(octal),.0(float) + * which should be treated as an error because + * future extensions might allow octal floats. + * (Not likely, but interpretion is ambigious). + */ + break; + } /* Switch under '0' case. */ + + /* + * Pure single digit '0' is an octal number in the C + * spec. We have the option to treat it as an integer, + * or as an octal. For strict C behavior, this can be + * flagged in, but is disabled by default. It only + * applies to single digit 0. Thus, with C octal + * enabled, leading zeroes always go octal. + */ + } /* If condition around switch under '0' case. */ + --p; + goto lex_fallthrough_1; /* silence warning */ + + lex_fallthrough_1: + /* Leading integer digit in C integers. */ + case '1': case '2': case '3': case '4': case '5': + case '6': case '7': case '8': case '9': + while (++p && lex_isdigit(*p)) { + } + d = p; + if (*p == '.') { +/* Silence unused label warnings when features are disabled. */ +#ifdef LEX_C_OCTAL_NUMERIC +lex_c_octal_to_fraction_part: +#endif +lex_dot_to_fraction_part: + while (++p != q && lex_isdigit(*p)) { + } + } + if (p != q && (*p == 'e' || *p == 'E')) { +/* Silence unused label warnings when features are disabled. */ +#ifdef LEX_C_OCTAL_NUMERIC +lex_c_octal_to_exponent_part: +#endif + if (++p != q && *p != '+' && *p != '-') { + --p; + } + while (++p != q && lex_isdigit(*p)) { + } + } + if (d != p) { + lex_emit_float(s, p); + } else { +#ifdef LEX_C_OCTAL_NUMERIC + if (*s == '0') { + lex_emit_octal(s, p); + continue; + } +#endif + lex_emit_int(s, p); + } + continue; + +#ifndef LEX_ID_WITHOUT_UNDERSCORE + case '_': +#endif + case 'A': case 'B': case 'C': case 'D': case 'E': + case 'F': case 'G': case 'H': case 'I': case 'J': + case 'K': case 'L': case 'M': case 'N': case 'O': + case 'P': case 'Q': case 'R': case 'S': case 'T': + case 'U': case 'V': case 'W': case 'X': case 'Y': + case 'Z': + case 'a': case 'b': case 'c': case 'd': case 'e': + case 'f': case 'g': case 'h': case 'i': case 'j': + case 'k': case 'l': case 'm': case 'n': case 'o': + case 'p': case 'q': case 'r': case 's': case 't': + case 'u': case 'v': case 'w': case 'x': case 'y': + case 'z': + + /* + * We do not try to ensure utf-8 is terminated correctly nor + * that any unicode character above ASCII is a character + * suitable for identifiers. + * + * tag is calculated for keyword lookup, and we assume these + * are always ASCII-7bit. It has the form: length, first + * char, second, char, last char in lsb to msb order. If the + * second char is missing, it becomes '\0'. The tag is not + * entirely unique, but suitable for fast lookup. + * + * If utf-8 appears in tag, the tag is undefined except the + * length is valid or overflows (meaning longer than any + * keyword and thus safe to compare against if tag matches). + * + * If the grammar is case insensitive, the tag be can + * downcased trivially by or'ring with 0x20202000 which + * preserves the length field (clever design by ASCII + * designers). After tag matching, a case insentive + * compare is obviously also needed against the full lexeme. + */ + + { + unsigned long tag; + + tag = (unsigned long)*p << 8; + if (++p != q && lex_isalnum(*p)) { + tag |= (unsigned long)*p << 16; + while (++p != q && lex_isalnum(*p)) { + } + } + tag |= (unsigned long)p[-1] << 24; + tag |= (unsigned char)(p - s) + (unsigned long)'0'; + lex_emit_id(s, p, tag); + continue; + } + + default: + +#ifdef LEX_ID_WITH_UTF8 + /* + * Identifier again, in case it starts with a utf-8 lead + * character. This time we can ignore the tag, except the + * length char must be valid to avoid buffer overruns + * on potential kw check upstream. + */ + if (*p & '\x80') { + unsigned long tag; + + while (++p != q && lex_isalnum(*p)) { + } + tag = (unsigned char)(p - s) + '0'; + lex_emit_id(s, p, tag); + continue; + } +#endif + ++p; + /* normally 0x7f DEL and 0x00..0x1f incl. */ + if (lex_isctrl(*s) && !lex_isblank(*s)) { + lex_emit_ctrl(s); + } else { + lex_emit_symbol(*s, s, p); + } + continue; + } /* Main switch in normal mode. */ + } /* Main while loop in normal mode. */ + +lex_mode_exit: + if (mode == LEX_MODE_INVALID) { + return mode; + } + +#ifndef LEX_DISABLE_ZTERM + if (p != q && lex_iszterm(*p)) { + lex_emit_eos(s, p); + return mode; + } +#endif + lex_emit_eob(p); + return mode; +} + |