diff options
Diffstat (limited to 'flatcc/external/lex')
-rw-r--r-- | flatcc/external/lex/LICENSE | 21 | ||||
-rw-r--r-- | flatcc/external/lex/README.md | 3 | ||||
-rw-r--r-- | flatcc/external/lex/luthor.c | 1509 | ||||
-rw-r--r-- | flatcc/external/lex/luthor.h | 472 | ||||
-rw-r--r-- | flatcc/external/lex/tokens.h | 554 |
5 files changed, 2559 insertions, 0 deletions
diff --git a/flatcc/external/lex/LICENSE b/flatcc/external/lex/LICENSE new file mode 100644 index 0000000..8e84a48 --- /dev/null +++ b/flatcc/external/lex/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015 Mikkel F. Jørgensen, dvide.com + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/flatcc/external/lex/README.md b/flatcc/external/lex/README.md new file mode 100644 index 0000000..3144091 --- /dev/null +++ b/flatcc/external/lex/README.md @@ -0,0 +1,3 @@ +Essential files extracted from the luthor scanner - a generic scanner +similar to a handwritten scanner, but covering many common cases by +default. diff --git a/flatcc/external/lex/luthor.c b/flatcc/external/lex/luthor.c new file mode 100644 index 0000000..fc81985 --- /dev/null +++ b/flatcc/external/lex/luthor.c @@ -0,0 +1,1509 @@ +/* + * Designed to be included in other C files which define emitter + * operations. The same source may thus be used to parse different + * grammars. 
 *
 * The operators cover the most common operators in the C family. Each
 * operator does not have a name; it is represented by a long token code
 * with up to 4 ASCII characters embedded literally. This avoids any
 * semantic meaning at the lexer level. Emitter macros can redefine
 * this behavior.
 *
 * No real harm is done in accepting a superset, but the source is
 * intended to be modified, have things flagged or removed, other things
 * added. The real complexity is in numbers, identifiers, and comments,
 * which should be fairly complete with flagging as is.
 *
 * Keyword handling is done with macros, and described elsewhere, but for
 * identifier compatible keywords, this is quite efficient to handle on
 * a per language basis without modifying this source.
 *
 * The Lisp language family is somewhat different and not directly
 * suited for this lexer, although it can easily be modified to suit.
 * The main reason is ';' for comments, and operators used as part of
 * the identifier symbol set, and no need for operator classification,
 * and different handling of single character symbols.
 *
 * So overall, we more or less have one efficient unified lexer that can
 * manage many languages - this is good, because it is a pain to write a
 * new lexer by hand, and lexer tools are what they are.
 */

#include "luthor.h"

/*
 * Feature-flag hierarchy: LEX_C99_NUMERIC implies LEX_C_NUMERIC plus
 * hex-float and binary literals; LEX_C_NUMERIC implies C octal and hex.
 */
#ifdef LEX_C99_NUMERIC
#define LEX_C_NUMERIC
#define LEX_HEX_FLOAT_NUMERIC
#define LEX_BINARY_NUMERIC
#endif

#ifdef LEX_C_NUMERIC
#define LEX_C_OCTAL_NUMERIC
#define LEX_HEX_NUMERIC
#endif

#ifdef LEX_JULIA_NUMERIC
#ifdef LEX_C_OCTAL_NUMERIC
/*
 * LEX_JULIA_OCTAL_NUMERIC and LEX_C_OCTAL_NUMERIC can technically
 * coexist, but leading zeroes give C style leading zero numbers
 * which can lead to incorrect values depending on expectations.
 * Therefore the full LEX_JULIA_NUMERIC flag is designed to not allow this.
 */
#error "LEX_C_OCTAL_NUMERIC conflicts with LEX_JULIA_NUMERIC leading zero integers"
#endif

/*
 * Julia v0.3 insists on lower case, and has a different meaning for
 * upper case.
 */
#define LEX_LOWER_CASE_NUMERIC_PREFIX
#define LEX_JULIA_OCTAL_NUMERIC
#define LEX_HEX_FLOAT_NUMERIC
#define LEX_BINARY_NUMERIC

#endif

/* Hex floats such as 0x1.8p1 require plain hex literal support as well. */
#ifdef LEX_HEX_FLOAT_NUMERIC
#define LEX_HEX_NUMERIC
#endif

/*
 * Numeric and string constants do not accept prefixes such as u, l, L,
 * U, ll, LL, f, or F in C, or various others in Julia strings. Use the
 * parser to detect juxtaposition between identifier and constant. In
 * Julia a numeric suffix means multiplication, in C it is a type
 * qualifier. Signs, such as defined in JSON, are also not accepted -
 * they must be operators. See source for various flags to enable
 * different token types.
 */

/*
 * Includes '_' in identifiers by default. Defines follow characters in
 * identifiers but not the lead character - it must be defined in switch
 * cases. If the identifier allows for dash '-', it is probably better
 * to handle it as an operator and flag surrounding space in the parser.
 */
#ifndef lex_isalnum

/*
 * NOTE: isalnum, isalpha, are locale dependent. We only want to
 * consider the ASCII-7 subset and treat everything else as utf-8.
 * This table is not for leading identifiers, as it contains 0..9.
 *
 * For more correct handling of UTF-8, see:
 * https://theantlrguy.atlassian.net/wiki/display/ANTLR4/Grammar+Lexicon
 * based on Java Ident = NameStartChar NameChar*
 *
 * While the following is UTF-16, it can be adapted to UTF-8 easily.

 fragment
 NameChar
     : NameStartChar
     | '0'..'9'
     | '_'
     | '\u00B7'
     | '\u0300'..'\u036F'
     | '\u203F'..'\u2040'
     ;
 fragment
 NameStartChar
     : 'A'..'Z' | 'a'..'z'
     | '\u00C0'..'\u00D6'
     | '\u00D8'..'\u00F6'
     | '\u00F8'..'\u02FF'
     | '\u0370'..'\u037D'
     | '\u037F'..'\u1FFF'
     | '\u200C'..'\u200D'
     | '\u2070'..'\u218F'
     | '\u2C00'..'\u2FEF'
     | '\u3001'..'\uD7FF'
     | '\uF900'..'\uFDCF'
     | '\uFDF0'..'\uFFFD'
     ;
 */

/*
 * Identifier-continuation table indexed by unsigned char: 1 marks a
 * character allowed after the leading identifier character.
 */
static const char lex_alnum[256] = {
    /* 0x00..0x2F: control characters, space, and punctuation. */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0..9 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
    /* A..O ('@' at 0x40 is excluded) */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* P..Z, _ ('_' at 0x5F is included unless flagged out) */
#ifdef LEX_ID_WITHOUT_UNDERSCORE
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
#else
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
#endif
    /* a..o ('`' at 0x60 is excluded) */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* p..z */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
#ifdef LEX_ID_WITH_UTF8
    /* utf-8: all lead and continuation bytes continue an identifier */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
#else
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#endif
};

#define lex_isalnum(c) (lex_alnum[(unsigned char)(c)])
#endif

#ifndef lex_isbindigit
+#define lex_isbindigit(c) ((c) == '0' || (c) == '1') +#endif + +#ifndef lex_isoctdigit +#define lex_isoctdigit(c) ((unsigned)((c) - '0') < 8) +#endif + +#ifndef lex_isdigit +#define lex_isdigit(c) ((c) >= '0' && (c) <= '9') +#endif + +#ifndef lex_ishexdigit +#define lex_ishexdigit(c) (((c) >= '0' && (c) <= '9') || ((c | 0x20) >= 'a' && (c | 0x20) <= 'f')) +#endif + +#ifndef lex_isctrl +#include <ctype.h> +#define lex_isctrl(c) ((c) < 0x20 || (c) == 0x7f) +#endif + +#ifndef lex_isblank +#define lex_isblank(c) ((c) == ' ' || (c) == '\t') +#endif + +#ifndef lex_iszterm +#define lex_iszterm(c) ((c) == '\0') +#endif + +/* + * If ZTERM is disabled, zero will be a LEX_CTRL token + * and allowed to be embedded in comments and strings, or + * elsewhere, as long as the parser accepts the token. + */ +#ifdef LEX_DISABLE_ZTERM +#undef lex_iszterm +#define lex_iszterm(c) (0) +#endif + +/* + * The mode is normally LEX_MODE_NORMAL = 0 initially, or the returned + * mode from a previous call, unless LEX_MODE_INVALID = 1 was returned. + * If a buffer stopped in the middle of a string or a comment, the mode + * will reflect that. In all cases some amount of recovery is needed + * before starting a new buffer - see detailed comments in header file. + * If only a single buffer is used, special handling is still needed if + * the last line contains a single line comment because it will not be + * terminated, but it amounts to replace the emitted unterminated + * comment token with an end of comment token. + * + * Instead of 0, the mode can initially also be LEX_MODE_BOM - it will + * an strip optional BOM before moving to normal mode. Currently only + * UTF-8 BOM is supported, and this is unlikely to change. + * + * The context variable is user-defined and available to emitter macros. + * It may be null if unused. 
+ * + */ +static int lex(const char *buf, size_t len, int mode, void *context) +{ + const char *p, *q, *s, *d; +#if 0 + /* TODO: old, remove this */ + , *z, *f; +#endif + + p = buf; /* next char */ + q = p + len; /* end of buffer */ + s = p; /* start of token */ + d = p; /* end of integer part */ + +#if 0 + /* TODO: old, remove this */ + + /* Used for float and leading zero detection in numerics. */ + z = p; + f = p; +#endif + + /* + * Handle mid string and mid comment for reentering across + * buffer boundaries. Strip embedded counter from mode. + */ + switch(mode & (LEX_MODE_COUNT_BASE - 1)) { + + case LEX_MODE_NORMAL: + goto lex_mode_normal; + + case LEX_MODE_BOM: + goto lex_mode_bom; + +#ifdef LEX_C_STRING + case LEX_MODE_C_STRING: + goto lex_mode_c_string; +#endif +#ifdef LEX_PYTHON_BLOCK_STRING + case LEX_MODE_PYTHON_BLOCK_STRING: + goto lex_mode_python_block_string; +#endif +#ifdef LEX_C_STRING_SQ + case LEX_MODE_C_STRING_SQ: + goto lex_mode_c_string_sq; +#endif +#ifdef LEX_PYTHON_BLOCK_STRING_SQ + case LEX_MODE_PYTHON_BLOCK_STRING_SQ: + goto lex_mode_python_block_string_sq; +#endif +#ifdef LEX_C_BLOCK_COMMENT + case LEX_MODE_C_BLOCK_COMMENT: + goto lex_mode_c_block_comment; +#endif +#if defined(LEX_SHELL_LINE_COMMENT) || defined(LEX_C99_LINE_COMMENT) + case LEX_MODE_LINE_COMMENT: + goto lex_mode_line_comment; +#endif +#ifdef LEX_JULIA_NESTED_COMMENT + case LEX_MODE_JULIA_NESTED_COMMENT: + goto lex_mode_julia_nested_comment; +#endif + + default: + /* + * This is mostly to kill unused label warning when comments + * are disabled. + */ + goto lex_mode_exit; + } + +lex_mode_bom: + + mode = LEX_MODE_BOM; + + /* + * Special entry mode to consume utf-8 bom if present. We don't + * support other boms, but we would use the same token if we did. + * + * We generally expect no bom present, but it is here if needed + * without requiring ugly hacks elsewhere. 
+ */ + if (p + 3 < q && p[0] == '\xef' && p[1] == '\xbb' && p[2] == '\xbf') { + p += 3; + lex_emit_bom(s, p); + } + goto lex_mode_normal; + +/* If source is updated, also update LEX_C_STRING_SQ accordingly. */ +#ifdef LEX_C_STRING +lex_mode_c_string: + + mode = LEX_MODE_C_STRING; + + for (;;) { + --p; + /* We do not allow blanks that are also control characters, such as \t. */ + while (++p != q && *p != '\\' && *p != '\"' && !lex_isctrl(*p)) { + } + if (s != p) { + lex_emit_string_part(s, p); + s = p; + } + if (*p == '\"') { + ++p; + lex_emit_string_end(s, p); + goto lex_mode_normal; + } + if (p == q || lex_iszterm(*p)) { + lex_emit_string_unterminated(p); + goto lex_mode_normal; + } + if (*p == '\\') { + ++p; + /* Escape is only itself, whatever is escped follows separately. */ + lex_emit_string_escape(s, p); + s = p; + if (p == q || lex_iszterm(*p)) { + lex_emit_string_unterminated(p); + goto lex_mode_normal; + } + if (*p == '\\' || *p == '\"') { + ++p; + continue; + } + /* + * Flag only relevant for single line strings, as it + * controls whether we fail on unterminated string at line + * ending with '\'. + * + * Julia does not support line continuation in strings + * (or elsewhere). C, Python, and Javascript do. + */ +#ifndef LEX_DISABLE_STRING_CONT + if (*p == '\n') { + if (++p != q && *p == '\r') { + ++p; + } + lex_emit_string_newline(s, p); + s = p; + continue; + } + if (*p == '\r') { + if (++p != q && *p == '\n') { + ++p; + } + lex_emit_string_newline(s, p); + s = p; + continue; + } +#endif + } + if (*p == '\n' || *p == '\r') { + lex_emit_string_unterminated(p); + goto lex_mode_normal; + } + ++p; + lex_emit_string_ctrl(s); + s = p; + } +#endif + +/* + * This is a copy if LEX_C_STRING with single quote. It's not DRY, but + * no reason to parameterized inner loops, just because. Recopy of + * changes are to the above. 
+ * + * Even if single quote is only used for CHAR types, it makes sense to + * parse as a full string since there can be all sorts of unicocde + * escapes and line continuations, newlines to report and unexpected + * control characters to deal with. + */ +#ifdef LEX_C_STRING_SQ +lex_mode_c_string_sq: + + mode = LEX_MODE_C_STRING_SQ; + + for (;;) { + --p; + while (++p != q && *p != '\\' && *p != '\'' && !lex_isctrl(*p)) { + } + if (s != p) { + lex_emit_string_part(s, p); + s = p; + } + if (*p == '\'') { + ++p; + lex_emit_string_end(s, p); + goto lex_mode_normal; + } + if (p == q || lex_iszterm(*p)) { + lex_emit_string_unterminated(p); + goto lex_mode_normal; + } + if (*p == '\\') { + ++p; + /* Escape is only itself, whatever is escped follows separately. */ + lex_emit_string_escape(s, p); + s = p; + if (p == q || lex_iszterm(*p)) { + lex_emit_string_unterminated(p); + goto lex_mode_normal; + } + if (*p == '\\' || *p == '\'') { + ++p; + continue; + } + /* + * Flag only relevant for single line strings, as it + * controls whether we fail on unterminated string at line + * ending with '\'. + * + * Julia does not support line continuation in strings + * (or elsewhere). C, Python, and Javascript do. + */ +#ifndef LEX_DISABLE_STRING_CONT + if (*p == '\n') { + if (++p != q && *p == '\r') { + ++p; + } + lex_emit_string_newline(s, p); + s = p; + continue; + } + if (*p == '\r') { + if (++p != q && *p == '\n') { + ++p; + } + lex_emit_string_newline(s, p); + s = p; + continue; + } +#endif + } + if (*p == '\n' || *p == '\r') { + lex_emit_string_unterminated(p); + goto lex_mode_normal; + } + ++p; + lex_emit_string_ctrl(s); + s = p; + } +#endif + +/* + * """ Triple quoted Python block strings. """ + * Single quoted version (''') is a direct copy, update both places + * if a changed is needed. + * + * Note: there is no point in disabling line continuation + * for block strings, since it only affects unterminated + * string errors at newline. 
It all comes down to how + * escaped newline is interpreted by the parser. + */ +#ifdef LEX_PYTHON_BLOCK_STRING +lex_mode_python_block_string: + + mode = LEX_MODE_PYTHON_BLOCK_STRING; + + for (;;) { + --p; + while (++p != q && *p != '\\' && !lex_isctrl(*p)) { + if (*p == '\"' && p + 2 < q && p[1] == '\"' && p[2] == '\"') { + break; + } + } + if (s != p) { + lex_emit_string_part(s, p); + s = p; + } + if (p == q || lex_iszterm(*p)) { + lex_emit_string_unterminated(p); + goto lex_mode_normal; + } + if (*p == '\"') { + p += 3; + lex_emit_string_end(s, p); + goto lex_mode_normal; + } + if (*p == '\\') { + /* Escape is only itself, allowing parser to interpret and validate. */ + ++p; + lex_emit_string_escape(s, p); + s = p; + if (p + 1 != q && (*p == '\\' || *p == '\"')) { + ++p; + } + continue; + } + if (*p == '\n') { + if (++p != q && *p == '\r') { + ++p; + } + lex_emit_string_newline(s, p); + s = p; + continue; + } + if (*p == '\r') { + if (++p != q && *p == '\n') { + ++p; + } + lex_emit_string_newline(s, p); + s = p; + continue; + } + ++p; + lex_emit_string_ctrl(s); + s = p; + } +#endif + +/* + * Python ''' style strings. + * Direct copy of """ quote version, update both if changed. + */ +#ifdef LEX_PYTHON_BLOCK_STRING_SQ +lex_mode_python_block_string_sq: + + mode = LEX_MODE_PYTHON_BLOCK_STRING_SQ; + + for (;;) { + --p; + while (++p != q && *p != '\\' && !lex_isctrl(*p)) { + if (*p == '\'' && p + 2 < q && p[1] == '\'' && p[2] == '\'') { + break; + } + } + if (s != p) { + lex_emit_string_part(s, p); + s = p; + } + if (p == q || lex_iszterm(*p)) { + lex_emit_string_unterminated(p); + goto lex_mode_normal; + } + if (*p == '\'') { + p += 3; + lex_emit_string_end(s, p); + goto lex_mode_normal; + } + if (*p == '\\') { + /* Escape is only itself, allowing parser to interpret and validate. 
*/ + ++p; + lex_emit_string_escape(s, p); + s = p; + if (p + 1 != q && (*p == '\\' || *p == '\'')) { + ++p; + } + continue; + } + if (*p == '\n') { + if (++p != q && *p == '\r') { + ++p; + } + lex_emit_string_newline(s, p); + s = p; + continue; + } + if (*p == '\r') { + if (++p != q && *p == '\n') { + ++p; + } + lex_emit_string_newline(s, p); + s = p; + continue; + } + ++p; + lex_emit_string_ctrl(s); + s = p; + } +#endif + +/* + * We don't really care if it is a shell style comment or a C99, + * or any other line oriented commment, as the termination is + * the same. + */ +#if defined(LEX_SHELL_LINE_COMMENT) || defined(LEX_C99_LINE_COMMENT) +lex_mode_line_comment: + + mode = LEX_MODE_LINE_COMMENT; + + for (;;) { + --p; + while (++p != q && (!lex_isctrl(*p))) { + } + if (s != p) { + lex_emit_comment_part(s, p); + s = p; + } + if (p == q || lex_iszterm(*p)) { + /* + * Unterminated comment here is not necessarily true, + * not even likely, nor possible, but we do this to + * handle buffer switch consistently: any non-normal + * mode exit will have an unterminated token to fix up. + * Here it would be conversion to end of comment, which + * we cannot know yet, since the line might continue in + * the next buffer. This is a zero length token. 
+ */ + lex_emit_comment_unterminated(p); + goto lex_mode_exit; + } + if (*p == '\n' || *p == '\r') { + lex_emit_comment_end(s, p); + goto lex_mode_normal; + } + ++p; + lex_emit_comment_ctrl(s); + s = p; + } +#endif + +#ifdef LEX_C_BLOCK_COMMENT +lex_mode_c_block_comment: + + mode = LEX_MODE_C_BLOCK_COMMENT; + + for (;;) { + --p; + while (++p != q && (!lex_isctrl(*p))) { + if (*p == '/' && p[-1] == '*') { + --p; + break; + } + } + if (s != p) { + lex_emit_comment_part(s, p); + s = p; + } + if (p == q || lex_iszterm(*p)) { + lex_emit_comment_unterminated(p); + goto lex_mode_exit; + } + if (*p == '\n') { + if (++p != q && *p == '\r') { + ++p; + } + lex_emit_newline(s, p); + s = p; + continue; + } + if (*p == '\r') { + if (++p != q && *p == '\n') { + ++p; + } + lex_emit_newline(s, p); + s = p; + continue; + } + if (lex_isctrl(*p)) { + ++p; + lex_emit_comment_ctrl(s); + s = p; + continue; + } + p += 2; + lex_emit_comment_end(s, p); + s = p; + goto lex_mode_normal; + } +#endif + + /* Julia nests block comments as #= ... #= ...=# ... =# across multiple lines. */ +#ifdef LEX_JULIA_NESTED_COMMENT +lex_mode_julia_nested_comment: + + /* Preserve nesting level on re-entrance. */ + if ((mode & (LEX_MODE_COUNT_BASE - 1)) != LEX_MODE_JULIA_NESTED_COMMENT) { + mode = LEX_MODE_JULIA_NESTED_COMMENT; + } + /* We have already entered. 
*/ + mode += LEX_MODE_COUNT_BASE; + + for (;;) { + --p; + while (++p != q && !lex_isctrl(*p)) { + if (*p == '#') { + if (p[-1] == '=') { + --p; + break; + } + if (p + 1 != q && p[1] == '=') { + break; + } + } + } + if (s != p) { + lex_emit_comment_part(s, p); + s = p; + } + if (p == q || lex_iszterm(*p)) { + lex_emit_comment_unterminated(p); + goto lex_mode_exit; + } + if (*p == '\n') { + if (++p != q && *p == '\r') { + ++p; + } + lex_emit_newline(s, p); + s = p; + continue; + } + if (*p == '\r') { + if (++p != q && *p == '\n') { + ++p; + } + lex_emit_newline(s, p); + s = p; + continue; + } + if (lex_isctrl(*p)) { + ++p; + lex_emit_comment_ctrl(s); + s = p; + continue; + } + if (*p == '=') { + p += 2; + lex_emit_comment_end(s, p); + s = p; + mode -= LEX_MODE_COUNT_BASE; + if (mode / LEX_MODE_COUNT_BASE > 0) { + continue; + } + goto lex_mode_normal; + } + /* The upper bits are used as counter. */ + mode += LEX_MODE_COUNT_BASE; + p += 2; + lex_emit_comment_begin(s, p, 0); + s = p; + if (mode / LEX_MODE_COUNT_BASE > LEX_MAX_NESTING_LEVELS) { + /* Prevent malicious input from overflowing counter. */ + lex_emit_comment_deeply_nested(p); + lex_emit_abort(p); + return mode; + } + } +#endif + +/* Unlike other modes, we can always jump here without updating token start `s` first. */ +lex_mode_normal: + + mode = LEX_MODE_NORMAL; + + while (p != q) { + s = p; + + switch(*p) { + +#ifndef LEX_DISABLE_ZTERM + case '\0': + lex_emit_eos(s, p); + return mode; +#endif + + /* \v, \f etc. are covered by the CTRL token, don't put it here. */ + case '\t': case ' ': + while (++p != q && lex_isblank(*p)) { + } + lex_emit_blank(s, p); + continue; + + /* + * Newline should be emitter in all constructs, also comments + * and strings which have their own newline handling. + * Only one line is emitted at a time permitting simple line + * counting. 
+ */ + case '\n': + if (++p != q && *p == '\r') { + ++p; + } + lex_emit_newline(s, p); + continue; + + case '\r': + if (++p != q && *p == '\n') { + ++p; + } + lex_emit_newline(s, p); + continue; + + /* + * C-style string, and Python style triple double quote + * delimited multi-line string. Prefix and suffix symbols + * should be parsed separately, e.g. L"hello" are two + * tokens. + */ +#if defined(LEX_C_STRING) || defined(LEX_PYTHON_BLOCK_STRING) + case '\"': +#ifdef LEX_PYTHON_BLOCK_STRING + if (p + 2 < q && p[1] == '\"' && p[2] == '\"') { + p += 3; + lex_emit_string_begin(s, p); + s = p; + goto lex_mode_python_block_string; + } +#endif +#ifdef LEX_C_STRING + ++p; + lex_emit_string_begin(s, p); + s = p; + goto lex_mode_c_string; +#endif +#endif + + /* + * Single quoted version of strings, otherwise identical + * behavior. Can also be used for char constants if checked + * by parser subsequently. + */ +#if defined(LEX_C_STRING_SQ) || defined(LEX_PYTHON_BLOCK_STRING_SQ) + case '\'': +#ifdef LEX_PYTHON_BLOCK_STRING_SQ + if (p + 2 < q && p[1] == '\'' && p[2] == '\'') { + p += 3; + lex_emit_string_begin(s, p); + s = p; + goto lex_mode_python_block_string_sq; + } +#endif +#ifdef LEX_C_STRING_SQ + ++p; + lex_emit_string_begin(s, p); + s = p; + goto lex_mode_c_string_sq; +#endif +#endif + +#if defined(LEX_SHELL_LINE_COMMENT) || defined(LEX_JULIA_NESTED_COMMENT) + /* + * Line comment excluding terminal line break. + * + * See also C99 line comment `//`. + * + * Julia uses `#=` and `=#` for nested block comments. + * (According to Julia developers, '#=` is motivated by `=` + * not being likely to start anything that you would put a + * comment around, unlike `#{`, `}#` or `#(`, `)#`)). + * + * Some known doc comment formats are identified and + * included in the comment_begin token. 
+ */ + case '#': + ++p; +#ifdef LEX_JULIA_NESTED_COMMENT + if (p != q && *p == '=') { + ++p; + lex_emit_comment_begin(s, p, 0); + s = p; + goto lex_mode_julia_nested_comment; + } +#endif + lex_emit_comment_begin(s, p, 0); + s = p; + goto lex_mode_line_comment; +#endif + + case '/': + ++p; + if (p != q) { + switch (*p) { +#ifdef LEX_C99_LINE_COMMENT + case '/': + ++p; + p += p != q && (*p == '/' || *p == '!'); + lex_emit_comment_begin(s, p, (p - s == 3)); + s = p; + goto lex_mode_line_comment; +#endif +#ifdef LEX_C_BLOCK_COMMENT + case '*': + ++p; + p += p != q && (*p == '*' || *p == '!'); + lex_emit_comment_begin(s, p, (p - s == 3)); + s = p; + goto lex_mode_c_block_comment; +#endif + case '=': + ++p; + lex_emit_compound_op('/', '=', s, p); + continue; + default: + break; + } + } + lex_emit_op('/', s, p); + continue; + + case '(': case ')': case '[': case ']': case '{': case '}': + case ',': case ';': case '\\': case '?': + ++p; + lex_emit_op(*s, s, p); + continue; + + case '%': case '!': case '~': case '^': + ++p; + if (p != q && *p == '=') { + ++p; + lex_emit_compound_op(*s, '=', s, p); + continue; + } + lex_emit_op(*s, s, p); + continue; + + case '|': + ++p; + if (p != q) { + switch (*p) { + case '=': + ++p; + lex_emit_compound_op('|', '=', s, p); + continue; + case '|': + ++p; + lex_emit_compound_op('|', '|', s, p); + break; + default: + break; + } + } + lex_emit_op('|', s, p); + continue; + + case '&': + ++p; + if (p != q) { + switch (*p) { + case '=': + ++p; + lex_emit_compound_op('&', '=', s, p); + continue; + case '&': + ++p; + lex_emit_compound_op('&', '&', s, p); + break; + default: + break; + } + } + lex_emit_op('&', s, p); + continue; + + case '=': + ++p; + if (p != q) { + switch (*p) { + case '>': + ++p; + lex_emit_compound_op('=', '>', s, p); + continue; + case '=': + ++p; + if (p != q && *p == '=') { + ++p; + lex_emit_tricompound_op('=', '=', '=', s, p); + continue; + } + lex_emit_compound_op('=', '=', s, p); + break; + default: + break; + } + } + 
lex_emit_op('=', s, p); + continue; + + case ':': + ++p; + if (p != q) { + switch (*p) { + case '=': + ++p; + lex_emit_compound_op(':', '=', s, p); + continue; + case ':': + ++p; + if (p != q && *p == '=') { + ++p; + lex_emit_tricompound_op(':', ':', '=', s, p); + continue; + } + lex_emit_compound_op(':', ':', s, p); + continue; + default: + break; + } + } + lex_emit_op(':', s, p); + continue; + + case '*': + ++p; + if (p != q) { + switch (*p) { + case '=': + lex_emit_compound_op('*', '=', s, p); + continue; + case '*': + /* **= hardly used anywhere? */ + lex_emit_compound_op('*', '*', s, p); + continue; + default: + break; + } + } + lex_emit_op('*', s, p); + continue; + + case '<': + ++p; + if (p != q) { + switch (*p) { + case '-': + ++p; + lex_emit_compound_op('<', '-', s, p); + continue; + case '=': + ++p; + lex_emit_compound_op('<', '=', s, p); + continue; + case '<': + ++p; + if (p != q) { + switch (*p) { + case '=': + ++p; + lex_emit_tricompound_op('<', '<', '=', s, p); + continue; + case '<': + ++p; + if (p != q && *p == '=') { + ++p; + lex_emit_quadcompound_op('<', '<', '<', '=', s, p); + continue; + } + lex_emit_tricompound_op('<', '<', '<', s, p); + continue; + default: + break; + } + } + lex_emit_compound_op('<', '<', s, p); + continue; + default: + break; + } + } + lex_emit_op('<', s, p); + continue; + + case '>': + ++p; + if (p != q) { + switch (*p) { + case '=': + ++p; + lex_emit_compound_op('>', '=', s, p); + continue; + case '>': + ++p; + if (p != q) { + switch (*p) { + case '=': + ++p; + lex_emit_tricompound_op('>', '>', '=', s, p); + continue; + case '>': + ++p; + if (p != q && *p == '=') { + ++p; + lex_emit_quadcompound_op('>', '>', '>', '=', s, p); + continue; + } + lex_emit_tricompound_op('>', '>', '>', s, p); + continue; + default: + break; + } + } + lex_emit_compound_op('>', '>', s, p); + continue; + default: + break; + } + } + lex_emit_op('>', s, p); + continue; + + case '-': + ++p; + if (p != q) { + switch (*p) { + case '=': + ++p; + 
lex_emit_compound_op('-', '=', s, p); + continue; + case '-': + ++p; + lex_emit_compound_op('-', '-', s, p); + continue; + case '>': + ++p; + lex_emit_compound_op('-', '>', s, p); + continue; + default: + break; + } + } + lex_emit_op('-', s, p); + continue; + + case '+': + ++p; + if (p != q) { + switch (*p) { + case '=': + ++p; + lex_emit_compound_op('+', '=', s, p); + continue; + + case '+': + ++p; + lex_emit_compound_op('+', '+', s, p); + continue; + default: + break; + } + } + lex_emit_op('+', s, p); + continue; + + case '.': + ++p; + if (p != q) { + switch (*p) { + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + d = s; + goto lex_dot_to_fraction_part; + case '.': + ++p; + if (p != q && *p == '.') { + ++p; + lex_emit_tricompound_op('.', '.', '.', s, p); + continue; + } + lex_emit_compound_op('.', '.', s, p); + continue; + default: + break; + } + } + lex_emit_op('.', s, p); + continue; + + case '0': + if (++p != q) { + switch (*p) { +#ifdef LEX_C_OCTAL_NUMERIC + + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + while (++p != q && lex_isoctdigit(*p)) { + } + d = p; + if (p != q) { + /* + * Leading zeroes like 00.10 are valid C + * floating point constants. + */ + if (*p == '.') { + goto lex_c_octal_to_fraction_part; + } + if (*p == 'e' || *p == 'E') { + goto lex_c_octal_to_exponent_part; + } + } + lex_emit_octal(s, p); + /* + * If we have a number like 0079, it becomes + * 007(octal), 9(decimal). The parser should + * deal with this. + * + * To add to confusion i64 is a C integer suffix + * like in 007i64, but 2+2i is a Go complex + * constant. (Not specific to octals). + * + * This can all be handled by having the parser inspect + * following identifier or numeric, parser + * here meaning a lexer post processing step, not + * necessarily the parser itself. + */ + + continue; +#else + /* + * All integers reach default and enter + * integer part. 
As a result, leading zeroes are + * mapped to floats and integers which matches + * Julia behavior. Other languages should decide + * if leading zero is valid or not. JSON + * disallows leading zero. + */ +#endif + +#ifdef LEX_JULIA_OCTAL_NUMERIC + /* + * This is the style of octal, not 100% Julia + * compatible. Also define Julia numeric to enforce + * lower case. + */ +#ifndef LEX_LOWER_CASE_NUMERIC_PREFIX + /* See also hex 0X. Julia v.0.3 uses lower case only here. */ + case 'O': +#endif + /* + * Julia accepts 0o700 as octal and 0b100 as + * binary, and 0xa00 as hex, and 0100 as + * integer, and 1e2 as 64 bit float and 1f2 as + * 32 bit float. Julia 0.3 does not support + * octal and binary fractions. + */ + case 'o': + while (++p != q && lex_isoctdigit(*p)) { + } + lex_emit_octal(s, p); + /* Avoid hitting int fall through. */ + continue; +#endif +#ifdef LEX_BINARY_NUMERIC + /* Binary in C++14. */ + case 'b': +#ifndef LEX_LOWER_CASE_NUMERIC_PREFIX + /* See also hex 0X. Julia v.0.3 uses lower case only here. */ + case 'B': +#endif + while (++p != q && lex_isbindigit(*p)) { + } + lex_emit_binary(s, p); + /* Avoid hitting int fall through. */ + continue; +#endif +#ifdef LEX_HEX_NUMERIC + case 'x': +#ifndef LEX_LOWER_CASE_NUMERIC_PREFIX + /* + * Julia v0.3 does not allow this, it thinks 0X1 is + * 0 * X1, X1 being an identifier. + * while 0x1 is a hex value due to precedence. + * + * TODO: This might change. + */ + + case 'X': +#endif + while (++p != q && lex_ishexdigit(*p)) { + } +#ifdef LEX_HEX_FLOAT_NUMERIC + /* + * Most hexadecimal floating poing conversion + * functions, including Pythons + * float.fromhex("0x1.0"), Julias parse + * function, and and C strtod on + * supporting platforms, will parse without + * exponent. The same languages do not support + * literal constants without the p exponent. + * First it is named p because e is a hex digit, + * second, the float suffix f is also a hex + * digit: 0x1.f is ambigious in C without that + * rule. 
Conversions have no such ambiguity. + * In Julia, juxtaposition means that 0x1.f + * could mean 0x1p0 * f or 0x1.fp0. + * + * Since we are not doing conversion here but + * lexing a stream, we opt to require the p + * suffix because making it optional could end + * up consuming parts of the next token. + * + * But, we also make a flag to make the exponent + * optional, anyway. It could be used for better + * error reporting than just consuming the hex + * part since we likely should accept the ambigous + * syntax either way. + */ + d = p; + if (p != q && *p == '.') { + while (++p != q && lex_ishexdigit(*p)) { + } + } + if (p != q && (*p == 'p' || *p == 'P')) { + if (++p != q && *p != '+' && *p != '-') { + --p; + } + /* The exponent is a decimal power of 2. */ + while (++p != q && lex_isdigit(*p)) { + } + lex_emit_hex_float(s, p); + continue; + } +#ifdef LEX_HEX_FLOAT_OPTIONAL_EXPONENT + if (d != p) { + lex_emit_hex_float(s, p); + continue; + } +#else + /* + * Backtrack to decimal point. We require p to + * be present because we could otherwise consume + * part of the next token. + */ + p = d; +#endif +#endif /* LEX_HEX_FLOAT_NUMERIC */ + lex_emit_hex(s, p); + continue; +#endif /* LEX_HEX_NUMERIC */ + + default: + /* + * This means leading zeroes like 001 or 001.0 are + * treated like like int and float respectively, + * iff C octals are flaggged out. Otherwise they + * become 001(octal), and 001(octal),.0(float) + * which should be treated as an error because + * future extensions might allow octal floats. + * (Not likely, but interpretion is ambigious). + */ + break; + } /* Switch under '0' case. */ + + /* + * Pure single digit '0' is an octal number in the C + * spec. We have the option to treat it as an integer, + * or as an octal. For strict C behavior, this can be + * flagged in, but is disabled by default. It only + * applies to single digit 0. Thus, with C octal + * enabled, leading zeroes always go octal. 
+ */ + } /* If condition around switch under '0' case. */ + --p; + goto lex_fallthrough_1; /* silence warning */ + + lex_fallthrough_1: + /* Leading integer digit in C integers. */ + case '1': case '2': case '3': case '4': case '5': + case '6': case '7': case '8': case '9': + while (++p && lex_isdigit(*p)) { + } + d = p; + if (*p == '.') { +/* Silence unused label warnings when features are disabled. */ +#ifdef LEX_C_OCTAL_NUMERIC +lex_c_octal_to_fraction_part: +#endif +lex_dot_to_fraction_part: + while (++p != q && lex_isdigit(*p)) { + } + } + if (p != q && (*p == 'e' || *p == 'E')) { +/* Silence unused label warnings when features are disabled. */ +#ifdef LEX_C_OCTAL_NUMERIC +lex_c_octal_to_exponent_part: +#endif + if (++p != q && *p != '+' && *p != '-') { + --p; + } + while (++p != q && lex_isdigit(*p)) { + } + } + if (d != p) { + lex_emit_float(s, p); + } else { +#ifdef LEX_C_OCTAL_NUMERIC + if (*s == '0') { + lex_emit_octal(s, p); + continue; + } +#endif + lex_emit_int(s, p); + } + continue; + +#ifndef LEX_ID_WITHOUT_UNDERSCORE + case '_': +#endif + case 'A': case 'B': case 'C': case 'D': case 'E': + case 'F': case 'G': case 'H': case 'I': case 'J': + case 'K': case 'L': case 'M': case 'N': case 'O': + case 'P': case 'Q': case 'R': case 'S': case 'T': + case 'U': case 'V': case 'W': case 'X': case 'Y': + case 'Z': + case 'a': case 'b': case 'c': case 'd': case 'e': + case 'f': case 'g': case 'h': case 'i': case 'j': + case 'k': case 'l': case 'm': case 'n': case 'o': + case 'p': case 'q': case 'r': case 's': case 't': + case 'u': case 'v': case 'w': case 'x': case 'y': + case 'z': + + /* + * We do not try to ensure utf-8 is terminated correctly nor + * that any unicode character above ASCII is a character + * suitable for identifiers. + * + * tag is calculated for keyword lookup, and we assume these + * are always ASCII-7bit. It has the form: length, first + * char, second, char, last char in lsb to msb order. 
If the + * second char is missing, it becomes '\0'. The tag is not + * entirely unique, but suitable for fast lookup. + * + * If utf-8 appears in tag, the tag is undefined except the + * length is valid or overflows (meaning longer than any + * keyword and thus safe to compare against if tag matches). + * + * If the grammar is case insensitive, the tag be can + * downcased trivially by or'ring with 0x20202000 which + * preserves the length field (clever design by ASCII + * designers). After tag matching, a case insentive + * compare is obviously also needed against the full lexeme. + */ + + { + unsigned long tag; + + tag = (unsigned long)*p << 8; + if (++p != q && lex_isalnum(*p)) { + tag |= (unsigned long)*p << 16; + while (++p != q && lex_isalnum(*p)) { + } + } + tag |= (unsigned long)p[-1] << 24; + tag |= (unsigned char)(p - s) + (unsigned long)'0'; + lex_emit_id(s, p, tag); + continue; + } + + default: + +#ifdef LEX_ID_WITH_UTF8 + /* + * Identifier again, in case it starts with a utf-8 lead + * character. This time we can ignore the tag, except the + * length char must be valid to avoid buffer overruns + * on potential kw check upstream. + */ + if (*p & '\x80') { + unsigned long tag; + + while (++p != q && lex_isalnum(*p)) { + } + tag = (unsigned char)(p - s) + '0'; + lex_emit_id(s, p, tag); + continue; + } +#endif + ++p; + /* normally 0x7f DEL and 0x00..0x1f incl. */ + if (lex_isctrl(*s) && !lex_isblank(*s)) { + lex_emit_ctrl(s); + } else { + lex_emit_symbol(*s, s, p); + } + continue; + } /* Main switch in normal mode. */ + } /* Main while loop in normal mode. 
*/ + +lex_mode_exit: + if (mode == LEX_MODE_INVALID) { + return mode; + } + +#ifndef LEX_DISABLE_ZTERM + if (p != q && lex_iszterm(*p)) { + lex_emit_eos(s, p); + return mode; + } +#endif + lex_emit_eob(p); + return mode; +} + diff --git a/flatcc/external/lex/luthor.h b/flatcc/external/lex/luthor.h new file mode 100644 index 0000000..6ca373d --- /dev/null +++ b/flatcc/external/lex/luthor.h @@ -0,0 +1,472 @@ +/* + * Mostly generic lexer that can be hacked to suit specific syntax. See + * more detailed comments further down in this file. + * + * Normally include luthor.c instead of luthor.h so emitter functions + * can be custom defined, and optionally also fast keyword definitions. + * + * At the very minimum, define lex_emit which other emitters default to. + * + * Create a wrapper function to drive the lex function in said file. + * + * Use this header in separate parser logic to access the token values + * if relevant. + */ + +#ifndef LUTHOR_H +#define LUTHOR_H + +#ifdef LEX_KEYWORDS +#include <string.h> /* memcmp for kw match */ +#endif + +#include "tokens.h" + +#ifndef lex_emit +#define lex_emit(token, first, last) ((void)0) +#endif + +/* + * Default for comments, bom, and other things that are not necessarily + * of interest to the parser, but may be to buffer wrap handling, + * debugging, and pretty printers. + */ +#ifndef lex_emit_other +#define lex_emit_other(token, first, last) ((void)0) +#endif + +#ifndef lex_emit_eof +#define lex_emit_eof(pos) lex_emit(LEX_TOK_EOF, pos, pos) +#endif + +#ifndef lex_emit_abort +#define lex_emit_abort(pos) lex_emit(LEX_TOK_ABORT, pos, pos) +#endif + +#ifndef lex_emit_eob +#define lex_emit_eob(pos) lex_emit(LEX_TOK_EOB, pos, pos) +#endif + +#ifndef lex_emit_eos +#define lex_emit_eos(first, last) lex_emit(LEX_TOK_EOS, first, last) +#endif + +#ifndef lex_emit_bom +#define lex_emit_bom(first, last) lex_emit_other(LEX_TOK_BOM, first, last) +#endif + +#ifndef lex_emit_id +#ifdef LEX_KEYWORDS +/* LEX_KW_TABLE_BEGIN .. 
LEX_KEYWORD_TABLE_END defines lex_match_kw. */ +#define lex_emit_id(first, last, tag) lex_emit(lex_match_kw(tag, first), first, last) +#else +#define lex_emit_id(first, last, tag) lex_emit(LEX_TOK_ID, first, last) +#endif +#endif + +/* + * This is a default for unknown symbols. It may be treated as an error, + * or it can be processed further by the parser instead of customizing + * the lexer. It ensures that there is always a token for every part of + * the input stream. + */ +#ifndef lex_emit_symbol +#define lex_emit_symbol(token, first, last) lex_emit(LEX_TOK_SYMBOL, first, last) +#endif + +/* + * Control characters 0x01 .. 0x1f, 0x7f(DEL), excluding \0\r\n\t which have + * separate tokens. + * + * Control characters in strings and comments are passed on as body + * elements, except \0\r\n which breaks the string up. + */ +#ifndef lex_emit_ctrl +#define lex_emit_ctrl(pos) lex_emit(LEX_TOK_CTRL, pos, pos + 1) +#endif + +#ifndef lex_emit_string_ctrl +#define lex_emit_string_ctrl(pos) lex_emit(LEX_TOK_STRING_CTRL, pos, pos + 1) +#endif + +#ifndef lex_emit_comment_ctrl +#define lex_emit_comment_ctrl(pos) lex_emit_other(LEX_TOK_COMMENT_CTRL, pos, pos + 1) +#endif + +/* + * This enables user to both count lines, and to calculate character + * offset for subsequent lexemes. New line starts a lexeme, line break + * symbol is located at lexeme - skipped and with have length 2 if \r\n + * or \n\r break, and 1 otherwise. 
+ */ +#ifndef lex_emit_newline +#define lex_emit_newline(first, last) lex_emit(LEX_TOK_NEWLINE, first, last) +#endif + +#ifndef lex_emit_string_newline +#define lex_emit_string_newline(first, last) lex_emit(LEX_TOK_STRING_NEWLINE, first, last) +#endif + +#ifndef lex_emit_int +#define lex_emit_int(first, last) lex_emit(LEX_TOK_INT, first, last) +#endif + +#ifndef lex_emit_float +#define lex_emit_float(first, last) lex_emit(LEX_TOK_FLOAT, first, last) +#endif + +#ifndef lex_emit_int_suffix +#define lex_emit_int_suffix(first, last) lex_emit(LEX_TOK_INT_SUFFIX, first, last) +#endif + +#ifndef lex_emit_float_suffix +#define lex_emit_floatint_suffix(first, last) lex_emit(LEX_TOK_FLOAT_SUFFIX, first, last) +#endif + +#ifndef lex_emit_binary +#define lex_emit_binary(first, last) lex_emit(LEX_TOK_BINARY, first, last) +#endif + +#ifndef lex_emit_octal +#define lex_emit_octal(first, last) lex_emit(LEX_TOK_OCTAL, first, last) +#endif + +#ifndef lex_emit_hex +#define lex_emit_hex(first, last) lex_emit(LEX_TOK_HEX, first, last) +#endif + +#ifndef lex_emit_hex_float +#define lex_emit_hex_float(first, last) lex_emit(LEX_TOK_HEX_FLOAT, first, last) +#endif + +/* + * The comment token can be used to aid backtracking during buffer + * switch. 
+ */ +#ifndef lex_emit_comment_begin +#define lex_emit_comment_begin(first, last, is_doc) \ + lex_emit_other(LEX_TOK_COMMENT_BEGIN, first, last) +#endif + +#ifndef lex_emit_comment_part +#define lex_emit_comment_part(first, last) lex_emit_other(LEX_TOK_COMMENT_PART, first, last) +#endif + +#ifndef lex_emit_comment_end +#define lex_emit_comment_end(first, last) lex_emit_other(LEX_TOK_COMMENT_END, first, last) +#endif + +#ifndef lex_emit_comment_unterminated +#define lex_emit_comment_unterminated(pos) \ + lex_emit_other(LEX_TOK_COMMENT_UNTERMINATED, pos, pos) +#endif + +#ifndef lex_emit_comment_deeply_nested +#define lex_emit_comment_deeply_nested(pos) \ + lex_emit_other(LEX_TOK_COMMENT_DEEPLY_NESTED, pos, pos) +#endif + +#ifndef lex_emit_string_begin +#define lex_emit_string_begin(first, last) lex_emit(LEX_TOK_STRING_BEGIN, first, last) +#endif + +#ifndef lex_emit_string_part +#define lex_emit_string_part(first, last) lex_emit(LEX_TOK_STRING_PART, first, last) +#endif + +#ifndef lex_emit_string_end +#define lex_emit_string_end(first, last) lex_emit(LEX_TOK_STRING_END, first, last) +#endif + +#ifndef lex_emit_string_escape +#define lex_emit_string_escape(first, last) lex_emit(LEX_TOK_STRING_ESCAPE, first, last) +#endif + +#ifndef lex_emit_string_unterminated +#define lex_emit_string_unterminated(pos) \ + lex_emit(LEX_TOK_STRING_UNTERMINATED, pos, pos) +#endif + +#ifndef lex_emit_blank +#define lex_emit_blank(first, last) \ + lex_emit_other(LEX_TOK_BLANK, first, last) +#endif + +#ifndef lex_emit_op +#define lex_emit_op(op, first, last) lex_emit((long)(op), first, last) +#endif + +#ifndef lex_emit_compound_op +#define lex_emit_compound_op(op1, op2, first, last) \ + lex_emit(((long)(op1) | ((long)(op2) << 8)), first, last) +#endif + +#ifndef lex_emit_tricompound_op +#define lex_emit_tricompound_op(op1, op2, op3, first, last) \ + lex_emit(((long)(op1) | ((long)(op2) << 8)) | \ + ((long)(op3)<<16), first, last) +#endif + +#ifndef lex_emit_quadcompound_op +#define 
lex_emit_quadcompound_op(op1, op2, op3, op4, first, last) \ + lex_emit(((long)(op1) | ((long)(op2) << 8)) | \ + ((long)(op3) << 16) | ((long)(op4) << 24), first, last) +#endif + +/* Used to limit number of nested comment level. */ +#ifndef LEX_MAX_NESTING_LEVELS +#define LEX_MAX_NESTING_LEVELS 100 +#endif + + +/* Keyword handling macros, see `keywords.c` for an example usage. */ +#ifdef LEX_KEYWORDS + +/* + * This implements a switch statement branching on the 4 character + * keyword tag (unsigned long value) which is produced by the lexers id + * recognizer. A final check is needed with to ensure an exact + * match with a given id. Two keywords rarely conflicts, but it is + * possible, and therefore kw_begin kw_match kw_match ... kw_end is used + * to cover this. + * + * See example usage elsewhere for details. + * + * The first element x0 is length '0'..'9' and ensure comparisons will + * not overrun the buffer where the lexeme is stored during string + * comparison, iff the keywords report the length correctly. + * + * The next elements in the tag are the first, second, and last + * character of lexeme / keyword, replacing second character with '\0' + * on single length keywords, so keyword 'e' is tagged '1', 'e', '\0', 'e', + * and 'while' is tagged '5' 'w', 'h', 'e', where the length is lsb + * and last chararacter is msb. + * + * An enum with tok_kw_<name> elements is expected to provide return + * values on match. These should start at LEX_TOK_KW_BASE and are + * negative. 
+ * + */ +#define lex_kw_begin(x0, x1, x2, x3) \ + case \ + ((unsigned long)(x0) | \ + ((unsigned long)(x1) << 8) | \ + ((unsigned long)(x2) << 16) | \ + ((unsigned long)(x3) << 24)) : + +#define lex_kw_match(kw) \ + if (memcmp(#kw, lexeme, sizeof(#kw) - 1) == 0) \ + return tok_kw_##kw; + +#define lex_kw_end() \ + break; + +#define lex_kw(kw, x0, x1, x2, x3) \ + lex_kw_begin(x0, x1, x2, x3) \ + lex_kw_match(kw) \ + lex_kw_end() + +static long lex_match_kw(unsigned long tag, const char *lexeme); + +/* Static so multiple grammers are possible in a single program. */ +#define LEX_KW_TABLE_BEGIN \ +static long lex_match_kw(unsigned long tag, const char *lexeme) \ +{ \ + switch (tag) { \ + +#define LEX_KW_TABLE_END \ + default: \ + break; \ + } \ + return LEX_TOK_KW_NOT_FOUND; \ +} + +#else + +/* Allow flagging in and out without unused warning or missing macros */ +#define lex_kw_begin(x0, x1, x2, x3) +#define lex_kw_match(kw) +#define lex_kw_end() +#define lex_kw(kw, x0, x1, x2, x3) +#define LEX_KEYWORD_TABLE_BEGIN +#define LEX_KEYWORD_TABLE_END + +#endif /* LEX_KEYWORDS */ + + + +/* + * Modes used for recovery when switching to a new buffer and handling + * internal state changes for strings and comments. + */ +enum { + /* Always 0, is initial lexer state. */ + LEX_MODE_NORMAL = 0, + + /* Returned if lex is given unsupported mode. */ + LEX_MODE_INVALID = 1, + + /* + * Can be used in place of normal mode to consume optional bom + * marker at buffer start. Only utf-8 bom is supported. + */ + LEX_MODE_BOM, + + /* + * Returned at end of buffer if mid string or mid comment, may also + * be larger for nested comments as nesting level is encoded. + */ + LEX_MODE_C_STRING, + LEX_MODE_C_STRING_SQ, + LEX_MODE_PYTHON_BLOCK_STRING, + LEX_MODE_PYTHON_BLOCK_STRING_SQ, + LEX_MODE_C_BLOCK_COMMENT, + LEX_MODE_LINE_COMMENT, + LEX_MODE_JULIA_NESTED_COMMENT, + + + /* Counter embedded in mode. 
*/ + LEX_MODE_COUNT_BASE = 16, +}; + + + +/* ON CALLING AND USING LEX FUNCTION + * + * If utf-8 BOM possible, detect this before calling the lexer and + * advance the buffer. JSON explititly disallows BOM, but recommends + * consuming it if present. If some other Unicode BOM is found, convert + * the buffer first. The lexer assumes ALL non-ascii characters are + * valid trailing identifiers which mostly works well. Strings with + * broken utf-8 are passed on as is. utf-8 identifiers must be enabled + * with #define LEX_ENABLE_UTF8_ID + * + * If required, postprocess identifiers and strings for valid utf-8. It + * is assumed that all keywords are at most 9 characters long and always + * ASCII. Otherwise post process them in a hash table on identifier + * event. This enables a fast compiled trie lookup of keywords. + * + * Newline and control characters are always emitted, also inside + * strings and comments. The exception is \r, \n, \t, \0 which are + * handled specially, or if the lexer is adapted to handle certain + * control characters specially. + * + * Each token is not guaranteed correct, only to be delimited correct, + * if it is indeed correct. Only very few tokens can be zero length, for + * example, the parser can rely on string part token not being empty + * which is important in dealing with line continuation. The end of + * buffer token is empty, and so is the unterminates string token, and + * also the comment end token for single line tokens, but not the + * multi-line version. There is a token for every part of the input + * stream, but the parser can easily define some to be ignored and have + * them optimized out. + * + * Strings have start token, and optionally sequences of control, + * escape, and newline tokens, followed by either string end token or + * string unterminated token. Strings delimiters can be one + * (single-line) or three double quotes (multi-line, like python, but + * cannot be single quotes, unlike Python. 
Python, C and Javascript + * string continuation is handled by having the parser observing string + * escape followed by newline token. Escape is always a single + * character '\' token, and the parser is responsible for consuming the + * following content. If string syntax with double delimiter is used to + * define escaped delimiter, this will occur as two separate strings + * with no space between. The parser can handle this on its own; if, in + * such strings, '\"' does not mean escaped delimiter, the string will + * not terminate correctly, and the lexer must be apapted. Unterminated + * string may happen at end of buffer, also for single line comments. + * This is because the string might continue in a new buffer. The parser + * should deal with this. + * + * Comments always start with a start token, followed by zero or more + * comment part tokens interleaved with control and newline tokens, + * terminated by either comment end token, or unterminated comment + * token. If the comment is single, the unterminated comment token may + * appear at the last line instead of the expected end of comment token + * because the comment might continue in a new buffer. The parser + * should deal with this. Escapes and line continuations have no effects + * in comments, unlike strings. + * + * The lexer only carries one state variable: the mode. The mode can be + * normal (default and equals zero), or single or multi string or + * comment modes. These modes are used to to recover after switching + * buffers as discussed below. + * + * The lexer can run to completion without involving the parser and + * could be used to pipeline tokens into another thread for concurrent + * parsing which is safe since the input buffer is considered read-only. + * + * + * KEYWORDS + * + * Keywords are treated as identifiers by default. By including a + * keyword table the `lex_emit_id` macro will check if the id is a + * keyword and translate the token if it is. 
Using the provided keyword + * table macros is just one way to do it. This is better explained by + * looking at an example. Keyword lookup based on the precomputed keyword + * tag provided to the lookup function are limited to 9 characters, but a + * custom lookup function need not use it and then the tag precomputation + * will be optimized out. + * + * Keywords are defined by the lookup function and should be negative + * starting at LEX_TOK_KW_BASE to avoid conflicts with other token types. + * + * + * WRAPPING MULTIPLE BUFFERS + * + * The user may need to deal with multiple buffers because data may + * arrive asynchronously over the network, and may have many concurrent + * lexing jobs. The emitter part is not difficult since a ring buffer + * can grow, or the parser can be called directly (except queuing a few + * tokens for backtracking as we shall see). + * + * If the lexer were an explicit statemachine as in Flex, we could get + * an yywrap event to fill buffers, but our state is on the stack and in + * registers for optimization. We may use co-routines, but it doesn't + * cover all issues, and, as it turns out is not necessary with the + * following restrictions on syntax: + * + * All variable length tokens such as numerics and identifiers are + * limited in length. Strings and comments are not, but are broken into + * zero, one, or several body tokens per line. ANSI-C limits line length + * to 509 characters (allowing for continuation and two byte linebreaks + * in a 512 byte buffer). But JSON has no line continuation for strings + * and may (and often do) store everything on a single line. Whitespace + * can also extend beyond given limit. + * + * If we ignore whitespace, strings and comments, we can discard the + * last token (or last two in case there are paired tokens, such as + * leading zero followed by numeric. Parsing can then resume in a new + * buffer where the first 512 bytes (or similar) are duplicated from the + * previous buffer. 
The lexer is then restarted at the last token (pair) + * start which may turn out to change the length or even introduce a + * different result such introducing leading zero. The lexer need no + * specific state to do this. + * + * For strings and comments, we need a flag to allow entering the lexer + * mid string or mid comment. The newline and line continuation tokens + * need to be dropped, and the last body may need to be truncated as it + * can embed a partial delimiter. The simplest way to deal with this is + * to backtrack tokens until the last token begins at a safe position, + * about 3-6 charaters earlier, and truncating body segments that span + * this barrier. Whitespace can also be truncated. + * + * We can generalize this further by going at least K bytes back in an N + * overlap buffer region and require non-strings (and non-comments) to + * not exceed N-K bytes, where K and N are specific to the syntax and + * the I/O topology. + * + * We can add flags to tokens that can help decide how to enter + * backtracking mode without covering every possible scanner loop - i.e. + * are we mid string, mid comment, single-line or multi-line. + * + * All the lexer needs to do then, is to receive the backtracking mode + * flags. A wrapping driver can deal with backtrack logic, which is + * specific to how tokens are emitted. Whitespace need no recovery mode + * but perhaps new whitespace should extend existing to simplify + * parsing. + */ + + +#endif /* LUTHOR_H */ + diff --git a/flatcc/external/lex/tokens.h b/flatcc/external/lex/tokens.h new file mode 100644 index 0000000..2bdbd7c --- /dev/null +++ b/flatcc/external/lex/tokens.h @@ -0,0 +1,554 @@ +#ifndef LEX_TOKENS_H +#define LEX_TOKENS_H + +/* Define LEX_DEBUG to enable token printing and describing functions. */ + + +enum { + + /* + * EOF is not emitted by lexer, but may be used by driver after + * last buffer is processed. 
+ */ + LEX_TOK_EOF = 0, + + /* + * Either EOB or EOS is emitted as the last token before exit, + * or also ABORT in some lexers. Unterminated string or comment + * will be emitted immediately one of these when relevant. + * + * It may be useful to redefine lex_emit_eos and lex_emit_eob to + * produce LEX_TOK_EOF or error directly for simple string lexing. + */ + LEX_TOK_EOB = 1, + LEX_TOK_EOS = 2, + + /* + * ABORT can be used for early exit by some lexers while other + * lexers may choose to run to buffer end regardless of input (with + * the exception of deeply nested comments). + */ + LEX_TOK_ABORT = 3, + + /* + * Byte order marker. Only happen if lexer was started in bom mode + * and the input stream contains a leading bom marker. + * The token can only be the first token in the stream. Utf-8 is the + * only supported bom, but the lexeme may be checked in case other + * boms are added later. Normally it is routed to lex_emit_other + * along with comments so it just ignores the bom if present. It is + * generally recommended to consume utf-8 bom for interoperability, + * but also to not store it for the same reason. + */ + LEX_TOK_BOM, + + /* + * Any control character that is not newline or blank will be + * emitted as single character token here. This token is discussed + * in several comments below. For strings and comments, also + * blank control characters will be emitted since they are usually + * not desired unexpectd. + */ + LEX_TOK_CTRL, + LEX_TOK_STRING_CTRL, + LEX_TOK_COMMENT_CTRL, + + /* + * Any printable ASCII character that is not otherwise consumed will + * be issued as a single length symbol token. Further discussion + * below. The symbol and CTRL tokens ensure that the entire input + * stream is covered by tokens. If utf-8 identifies have not been + * flagged, utf-8 leading characters may also end up here, and so + * my utf-8 characters in general, that are not viewed as valid + * identifiers (depending on configuration). 
+ */ + LEX_TOK_SYMBOL, + + /* + * Variable length identifier starting with (_A-Za-z) by default and + * followed by zero or more (_A-Za-z0-9) characters. (_) can be + * flagged out. utf-8 can be flagged in. Be default any non-ASCII + * character (0x80 and above), is treated as part of an identifier + * for simplicity and speed, but this may be redefined. Any broken + * utf-8 is not sanitized, thus 0x80 would be a valid identifier + * token with utf-8 identifiers enabled, and otherwise it would be a + * symbol token. + * + * The ID does a magic trick: It maps the lexeme to a very simple + * and fast 32 bit hash code called a tag. The tag is emitted with + * the id token and can be used for fast keyword lookup. The + * hash tag is: + * + * (length)(first char)(second char)(last char) + * + * where length is ASCII '0' .. '9' where any length overflow is an + * arbitrary value, but such that the length is never longer than + * the lexeme. The last char is the last char regardless of length. + * For short identifiers, the second char may be the first char + * duplicated, and the last char may be first char. + * + * This code is very simple to write by hand: "5whe" means while, + * and can be used in a case switch before a strcmp with "while". + * Conflicts are possible, but then several keywords are tested like + * any other hash conflict. This keyword lookup is user driven, but + * can follow example code quite straightforward. + * + * The lex_emit_id macro can be implemented to provide the above + * lookup and inject a keyword token instead. By convention such + * tokens have negative values to avoid conflicts with lexer + * generated tokens. + * + * The ID also has a special role in prefixes and suffixes: C string + * literals like (L"hello") and numeric literals like (42f) are + * lexed as two tokens, one of which is an ID. The parser must + * process this and observe absence of whitespace where such syntax + * is relevant. 
+ * + * While not specific to ID, the emitter macroes can be designed to + * keep track of start of lines and end of whitespace and attach + * state flags to each token (at line start, after whitespace). The + * whitespace tokens can then be dropped. This might help parsing + * things like suffixes efficiently. + */ + LEX_TOK_ID, + + /* + * C-int :: pos-dec-digit dec-digit * + * Julia-int ::= dec-digit+ + * + * pos-dec-digit ::= '1'..'9' + * dec-digit ::= '0'..'9' + * + * Floating point numbers take precedence when possible so 00.10 is + * always a deciaml floating point value when decimal floats are + * enabled. + * + * The C-int is automatically enabled if C-octals are enabled, and + * disabled otherwise. There is no specific Julia-int type - we just + * use the terminology to represent integers with leading zeroes. + * + * Julia style integers accept leading zeroes. C style integers with + * leading zeroes are consumed as C style octal numbers, so 0019 is + * parsed as either 0019(Julia-int), or 001(C-octal), 9(C-int). + * + * Single digit '0' maps to octal when C-octals are enabled and to + * Julia-int otherwise. (Yes, integers are not that simple, it + * seems). + * + * Both C and Julia octal numbers (see octal token) can be active + * simultaneously. This can be used to control leading zero + * behavior, even if C-octal numbers are not part of the grammar + * being parsed. For example, a language might use 0o777 octal + * numbers and disallow 0777 integers. Enabling C-octals makes this + * easy to detect (but should accept octal 0). + * + * There is no destinction between the styles in the int token, but + * leading zeroes are easily detected in the lexeme. + * + * Constant suffixes like 1L are treated as 1(INT), and L(ID). The + * same goes for other numeric values. + * + * Parser should check for leading zeroes and decide if it is valid, + * a warning, or an error (it is in JSON). This also goes for float. 
+ * + * Numericals, not limited to INT, may appear shorter than they are + * due to buffer splits. Special recovery is required, but will only + * happen just before EOS or EOB tokens (i.e. buffer split events). + */ + LEX_TOK_INT, + + /* + * float ::= (int ['.' dec-digits*] dec-exponent) + * | ([int] '.' dec-digits* [dec-exponent]) + * dec-exponents ::= ('e' | 'E') ['+' | '-'] dec-digits* + * dec-digits ::= '0'..'9' + * int ::= dec-digits* + * + * Consumes a superset of C float representation without suffix. + * Some invalid tokens such as 0.E are accepted. Valid tokens such + * as 00.10 take precedence over octal numbers even if it is a + * prefix, and the same is obviously true with respect to decimal + * integers. + * + * JSON does not allow leading zeroes, and also not leading '.'. + * This can easily be checked in the lexeme. + * + * The octal notation affecting integer leading zeroes is not + * relevant to floats because floats take precedence over octal and + * decimal int when containing '.', 'e' or 'E'. + */ + LEX_TOK_FLOAT, + + /* + * binary ::= (0b | 0B) ('0' | '1')* + * + * 0b100 or just 0b, parser must check that digits are present, + * otherwise it may be interpreted as zero, just like octal zero + * in C. + * + * Like 0X hex, 0B can be flagged out because Julia v0.3 does not + * support uppercase 0B. + */ + LEX_TOK_BINARY, + + /* + * C-octal ::= 0 octal-digit* + * octal-digits ::= '0'..'7' + * + * Julia-octal ::= 0o octal-digits* + * octal-digits ::= '0'..'7' + * + * 0777 for C style octal numbers, or 0o777 for Julia syntax. Julia + * v.0.3 does not allow uppercase 0O777, it would mean 0 * O777. + * + * When enabled, decimal floating points take precedence: 00.10 is + * parsed as 00.10(decimal float), as per C standard. + * + * NOTE: It is possible for both styles to be active simultaneously. + * This may be relevant in order to control handling of leading + * zeroes in decimal integers. 
+ * + * If C-octal numbers are flagged out, leading zeroes are mapped to + * integers and the numerical value may change. Julia behaves this + * way. Nothing prevents support of both C and Julia octal numbers, + * but leading zeroes will then be interpreted the C way - it is not + * recommended to do this. + */ + LEX_TOK_OCTAL, + + /* + * hex ::= hex-int + * hex-digits ::= 'a'..'f'| 'A'..'f' | '0'..'9' + * hex-int ::= (0x | 0X) hex_digts* + * + * where hex_digits are customizable (e.g. all lower case), and hex + * prefix 0x can be flagged to be lower case only (as in Julia). + * + * If hex floats are enabled, they take precedence: + * 0x1.0(hex-float), if not, 0x1.0 will parse as: 0x1(hex) followed + * by .0(decimal float). + * + * The lead prefix 0x may by flagged to be lower case only because + * this is required by Julia v0.3 where 0X means 0 * X. Julia + * accepts uppercase in the remaining hex digits (and exponent for + * floats). This could possibly change in future versions. + * + * The zero length sequence (0x | 0X) is accepted and left to the + * parser since the lexer emits a token for everything it sees. + * Conceptually it may be interpreted as zero, equivalent to 0 being + * both octal prefix and numeric 0 in C style octal representation. + * Or it may be an error. + */ + LEX_TOK_HEX, + + /* + * hex_float ::= hex-int ['.' hex_digit*] hex-exponent + * hex-exponent ::= ('p' | 'P') ['+' | '-'] decimal-digit* + * decimal-digit ::= '0'..'9' + * + * A superset of IEEE-754-2008 Hexadecimal Floating Point notation. + * + * We require the exponent to be present, but does not ensure the + * value is otherwise complete, e.g. 0x1p+ would be accepted. The p + * is needed because otherwise 0x1.f could be accepted, and f is a + * float suffix in C, and juxtapostion factor (0x1. * f) in Julia, + * at least, that is one possible interpretation. 
+ * + * The exponent can be flagged optional in which case 0x1.f will be + * consumed as a single hex float toke as a single hex float token. + * This may either simply be accepted in some grammars, or used to + * provide an error message. If the exponent is required, 0x1.f will + * be lexed as three tokens: + * + * <'0x1'(hex int), '.'(op), 'f'(id)>. + * + * Thus it may be a good idea to allow the exponent to be optional + * anyway and issue an error message or warning if the p is absent + * later in the parsing stage. + * + * Note that, as per IEEE-754, the exponent is a decimal power of + * two. In other words, the number of bits to shift the + * (hexa)decimal point. Also note that it is p and not e because e + * is a hex digit. + */ + LEX_TOK_HEX_FLOAT, + + /* + * blank ::= ('\t' | '\x20')+ + * + * Longest run in buffer holding only '\t' and '\x20' (space). + * + * buffer splits may generate adjacent blanks depending on recovery + * processing. (The same goes for other line oriented runs such as + * string parts and comment parts). + */ + LEX_TOK_BLANK, + + /* newline ::= '\r' | '\n' | '\r\n' | '\n\r' + * + * Will always appear, also inside strings and comments. Can be used + * to track line starts and counts reliably as only one newline is + * issued at a time, and it is issued everywhere, also in strings + * and comments. + * + * May be preceeded by string escape token inside strings. This can + * be interpreted as line continuation within strings specifically, + * as is the case in Python and Javascript (and in C via + * pre-processor). + * + * The LEX_TOK_STRING_NEWLINE is emitted inside strings so the ordinary + * newline may be ignored in comments and other non-string content. + */ + LEX_TOK_NEWLINE, + LEX_TOK_STRING_NEWLINE, + + /* + * string ::= string_start + * (string_part | string_escape | + * string_ctrl | string_newline)* + * (string_end | string_unterminated) + * + * There are several optional string styles. They all start with + * this token. 
The length and content provide the details. Python
+ * may start with """ or ''' and this token will then have length
+ * 3 and three quotes as lexeme content. If the lexer exits before
+ * the string end token, the returned lexer mode will remember the
+ * state and can be used for reentry - this also goes for comments.
+ *
+ * Strings can only contain part, escape, newline, and control
+ * tokens, and either string unterminated or string end token
+ * at last.
+ */
+ LEX_TOK_STRING_BEGIN,
+
+ /* Longest run without control characters, without (\), without
+ * newline, and without the relevant end delimiter. The run may be
+ * shortened due to buffer splits. The part may, as an exception,
+ * begin with an end delimiter character or a (\) if it was
+ * preceded by a string escape token. The escape character is
+ * always (\). Strings that use "" or '' as escape will be treated
+ * as start and end of separate strings. Strings that do not support
+ * (\) should just treat escape as a part of the string.
+ */
+ LEX_TOK_STRING_PART,
+
+ /*
+ * This is always a single character token (\) and only happens
+ * inside strings. See also string part token.
+ */
+ LEX_TOK_STRING_ESCAPE,
+
+ /* This token is similar to string start. It may be absent at buffer
+ * splits, but then an unterminated string token will be used
+ * just before the split event token.
+ */
+ LEX_TOK_STRING_END,
+
+ /*
+ * This is emitted before the buffer ends, or before unescaped
+ * newlines for line oriented string types (the usual strings).
+ * At buffer splits, recovery should clean it up. The returned
+ * mode allows parsing to continue in a new buffer with a slight
+ * content overlap.
+ *
+ * If a string like ("hello, world!") in C reaches end of line, it
+ * may be continued: ("hello, \)newline(world!"). If this line
+ * continuation is flagged out, this will lead to string
+ * unterminated, even if not at end of buffer. 
For block strings
+ * like """hello""", this only happens at end of buffer.
+ */
+ LEX_TOK_STRING_UNTERMINATED,
+
+ /*
+ *
+ * comment ::= comment_start
+ * (comment_part | ctrl | newline)*
+ * (comment_end | comment_unterminated)
+ *
+ *
+ * Comments work like strings in most respects. They emit parts, and
+ * control characters, but not escape characters, and cannot be
+ * continued at end of line. Block comments are like python block
+ * strings (''').
+ *
+ * Julia supports nested comments (#= ... #= =# =#). In this case
+ * a new start token can be emitted before an end token. If the
+ * parser exits due to buffer split, the mode has the nesting level
+ * encoded so it can be resumed in a new buffer.
+ *
+ * Line comments will have their end token just before newline, or
+ * unterminated comment just before buffer split token (EOB or EOS).
+ * (\) characters are consumed by the comment part tokens and do not
+ * affect the end of any comment.
+ *
+ * Comment begin may include extra characters when a doc comment is
+ * recognized. The emitter flags this. End comments are unaffected.
+ */
+ LEX_TOK_COMMENT_BEGIN,
+ LEX_TOK_COMMENT_PART,
+ LEX_TOK_COMMENT_END,
+ LEX_TOK_COMMENT_UNTERMINATED,
+
+ /*
+ * Issued before ABORT token if nesting level is above a predefined
+ * level. This is to protect against malicious and misguided
+ * content, otherwise the nesting level counter could wrap and
+ * generate a different interpretation, which could be bad. The
+ * parser would probably do similar things with nested tokens.
+ */
+ LEX_TOK_COMMENT_DEEPLY_NESTED,
+
+
+ /* Operators are all recognized single character symbols, or up to
+ * four characters. The token value is the ASCII codes shifted 8
+ * bits per extra character, by default, but the emitter macros
+ * can redefine this. Values below 32 are reserved token types as
+ * discussed above.
+ *
+ * What exactly represents an operator depends on what the lexer has
+ * enabled. 
+ * + * Printable ASCII symbols that are NOT recognized, are emitted as + * the SYMBOL token and is always length 1. The value can be derived + * from the lexeme, but not the token itself. This may be perfectly + * fine for the parser, or it may be used to indicate an error. + * There are no illegal characters per se. + * + * Non-printable ASCII characters that are not covered by newline or + * blank, are emitted as CTRL tokens. These act the same as the + * symbol token and may be used to indicate error, or to handle form + * feed and other whitespace not handled by default. Unlike symbol, + * however, CTRL also appear in strings and comments since they are + * generally not allowed and this makes it easy to capture (there is + * virtually no performance overhead in providing this service + * unless attempting to parse a binary format). + */ + + /* Don't bleed into this range. */ + LEX_TOK_OPERATOR_BASE = 32, + + + /* + * Operators use ASCII range. + * Compound operators use range 0x80 to 0x7fff + * and possibly above for triple sequences. + * Custom keywords are normally negative but can be mapped + * to any other. + * + * The layout is designed for efficient table lookup. + * Compound operators might benefit from remapping down to a smaller + * range for compact lookup tables, but it depends on the parser. + */ +}; + +/* + * Custom keyword token range is negative, and well below -99..0 where + * special codes are reserved. 
+ */ +#ifndef LEX_TOK_KW_BASE +#define LEX_TOK_KW_BASE -1000 +#endif + +#ifndef LEX_TOK_KW_NOT_FOUND +#define LEX_TOK_KW_NOT_FOUND LEX_TOK_ID +#endif + + +#ifdef LEX_DEBUG + +#include <stdio.h> +#include <string.h> + +static const char *lex_describe_token(long token) +{ + switch(token) { + case LEX_TOK_BOM: return "BOM marker"; + case LEX_TOK_EOF: return "EOF"; + case LEX_TOK_EOS: return "buffer zero terminated"; + case LEX_TOK_EOB: return "buffer exhausted"; + case LEX_TOK_ABORT: return "abort"; + case LEX_TOK_CTRL: return "control"; + case LEX_TOK_STRING_CTRL: return "string control"; + case LEX_TOK_COMMENT_CTRL: return "comment control"; + case LEX_TOK_SYMBOL: return "symbol"; + case LEX_TOK_ID: return "identifier"; + case LEX_TOK_INT: return "integer"; + case LEX_TOK_FLOAT: return "float"; + case LEX_TOK_BINARY: return "binary"; + case LEX_TOK_OCTAL: return "octal"; + case LEX_TOK_HEX: return "hex"; + case LEX_TOK_HEX_FLOAT: return "hex float"; + case LEX_TOK_BLANK: return "blank"; + case LEX_TOK_NEWLINE: return "newline"; + case LEX_TOK_STRING_NEWLINE: return "string newline"; + case LEX_TOK_STRING_BEGIN: return "string begin"; + case LEX_TOK_STRING_PART: return "string part"; + case LEX_TOK_STRING_END: return "string end"; + case LEX_TOK_STRING_ESCAPE: return "string escape"; + case LEX_TOK_STRING_UNTERMINATED: return "unterminated string"; + case LEX_TOK_COMMENT_BEGIN: return "comment begin"; + case LEX_TOK_COMMENT_PART: return "comment part"; + case LEX_TOK_COMMENT_END: return "comment end"; + case LEX_TOK_COMMENT_UNTERMINATED: return "unterminated comment"; + case LEX_TOK_COMMENT_DEEPLY_NESTED: return "deeply nested comment"; + + default: + if (token < LEX_TOK_EOF) { + return "keyword"; + } + if (token < 32) { + return "undefined"; + } + if (token < 0x100L) { + return "operator"; + } + if (token < 0x10000L) { + return "compound operator"; + } + if (token < 0x1000000L) { + return "tricompound operator"; + } + if (token < 0x7f0000000L) { + return 
"quadcompound operator"; + } + return "reserved"; + } +} + +static void lex_fprint_token(FILE *fp, + long token, + const char *first, const char *last, + int line, int pos) +{ + char buf[10]; + const char *lexeme = first; + int len = (int)(last - first); + switch (token) { + case LEX_TOK_EOS: + case LEX_TOK_CTRL: + sprintf(buf, "^%02x", (int)*first); + lexeme = buf; + len = strlen(buf); + break; + default: + break; + } + fprintf(fp, "%04d:%03d %s (0x%lx): `%.*s`\n", + line, pos, lex_describe_token(token), token, len, lexeme); +} + +#define lex_print_token(token, first, last, line, pos) \ + lex_fprint_token(stdout, token, first, last, line, pos) + +#else /* LEX_DEBUG */ + +#define lex_describe_token(token) "debug not available" +#define lex_fprint_token(fp, token, first, last, line, pos) ((void)0) +#define lex_print_token(token, first, last, line, pos) ((void)0) + +#endif /* LEX_DEBUG */ + + +#endif /* LEX_TOKENS_H */ + |