aboutsummaryrefslogtreecommitdiff
path: root/flatcc/external/lex
diff options
context:
space:
mode:
Diffstat (limited to 'flatcc/external/lex')
-rw-r--r--flatcc/external/lex/LICENSE21
-rw-r--r--flatcc/external/lex/README.md3
-rw-r--r--flatcc/external/lex/luthor.c1509
-rw-r--r--flatcc/external/lex/luthor.h472
-rw-r--r--flatcc/external/lex/tokens.h554
5 files changed, 2559 insertions, 0 deletions
diff --git a/flatcc/external/lex/LICENSE b/flatcc/external/lex/LICENSE
new file mode 100644
index 0000000..8e84a48
--- /dev/null
+++ b/flatcc/external/lex/LICENSE
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2015 Mikkel F. Jørgensen, dvide.com
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/flatcc/external/lex/README.md b/flatcc/external/lex/README.md
new file mode 100644
index 0000000..3144091
--- /dev/null
+++ b/flatcc/external/lex/README.md
@@ -0,0 +1,3 @@
+Essential files extracted from the luthor scanner - a generic scanner
+similar to a handwritten scanner, but covering many common cases by
+default.
diff --git a/flatcc/external/lex/luthor.c b/flatcc/external/lex/luthor.c
new file mode 100644
index 0000000..fc81985
--- /dev/null
+++ b/flatcc/external/lex/luthor.c
@@ -0,0 +1,1509 @@
+/*
+ * Designed to be included in other C files which define emitter
+ * operations. The same source may thus be used to parse different
+ * grammars.
+ *
+ * The operators cover the most common operators in the C family. Each
+ * operator does not have a name; it is represented by a long token code
+ * with up to 4 ASCII characters embedded literally. This avoids any
+ * semantic meaning at the lexer level. Emitters macros can redefine
+ * this behavior.
+ *
+ * No real harm is done in accepting a superset, but the source is
+ * intended to be modified, have things flagged or removed, other things
+ * added. The real complexity is in numbers, identifiers, and comments,
+ * which should be fairly complete with flagging as is.
+ *
+ * Keyword handling is done by macros, and described elsewhere, but for
+ * identifier compatible keywords, this is quite efficient to handle on
+ * a per language basis without modifying this source.
+ *
+ * The Lisp language family is somewhat different and not directly
+ * suited for this lexer, although it can easily be modified to suit.
+ * The main reason is ';' for comments, and operators used as part of
+ * the identifier symbol set, and no need for operator classification,
+ * and different handling of single character symbols.
+ *
+ * So overall, we more or less have one efficient unified lexer that can
+ * manage many languages - this is good, because it is a pain to write a
+ * new lexer by hand, and lexer tools are what they are.
+ */
+
+#include "luthor.h"
+
+#ifdef LEX_C99_NUMERIC
+#define LEX_C_NUMERIC
+#define LEX_HEX_FLOAT_NUMERIC
+#define LEX_BINARY_NUMERIC
+#endif
+
+#ifdef LEX_C_NUMERIC
+#define LEX_C_OCTAL_NUMERIC
+#define LEX_HEX_NUMERIC
+#endif
+
+#ifdef LEX_JULIA_NUMERIC
+#ifdef LEX_C_OCTAL_NUMERIC
+/*
+ * LEX_JULIA_OCTAL_NUMERIC and LEX_C_OCTAL_NUMERIC can technically
+ * coexist, but leading zeroes give C style leading zero numbers
+ * which can lead to incorrect values depending on expectations.
+ * Therefore the full LEX_JULIA_NUMERIC flag is designed to not allow this.
+ */
+#error "LEX_C_OCTAL_NUMERIC conflicts with LEX_JULIA_NUMERIC leading zero integers"
+#endif
+
+/*
+ * Julia v0.3 insists on lower case, and has a different meaning for
+ * upper case.
+ */
+#define LEX_LOWER_CASE_NUMERIC_PREFIX
+#define LEX_JULIA_OCTAL_NUMERIC
+#define LEX_HEX_FLOAT_NUMERIC
+#define LEX_BINARY_NUMERIC
+
+#endif
+
+#ifdef LEX_HEX_FLOAT_NUMERIC
+#define LEX_HEX_NUMERIC
+#endif
+
+/*
+ * Numeric and string constants do not accept prefixes such as u, l, L,
+ * U, ll, LL, f, or F in C, or various others in Julia strings. Use the
+ * parser to detect juxtaposition between identifier and constant. In
+ * Julia numeric suffix means multiplication, in C it is a type
+ * qualifier. Sign, such as defined in JSON, are also not accepted -
+ * they must be operators. See source for various flag to enable
+ * different token types.
+ */
+
+/*
+ * Includes '_' in identifiers by default. Defines follow characters in
+ * identifiers but not the lead character - it must be defined in switch
+ * cases. If the identifier allows for dash '-', it is probably better
+ * to handle it as an operator and flag surrounding space in the parser.
+ */
+#ifndef lex_isalnum
+
+/*
+ * NOTE: isalnum and isalpha are locale dependent. We only want
+ * to consider the ASCII-7 subset and treat everything else as utf-8.
+ * This table is not for leading identifiers, as it contains 0..9.
+ *
+ * For more correct handling of UTF-8, see:
+ * https://theantlrguy.atlassian.net/wiki/display/ANTLR4/Grammar+Lexicon
+ * based on Java Ident = NameStartChar NameChar*
+ *
+ * While the following is UTF-16, it can be adapted to UTF-8 easily.
+
+
+ fragment
+ NameChar
+ : NameStartChar
+ | '0'..'9'
+ | '_'
+ | '\u00B7'
+ | '\u0300'..'\u036F'
+ | '\u203F'..'\u2040'
+ ;
+ fragment
+ NameStartChar
+ : 'A'..'Z' | 'a'..'z'
+ | '\u00C0'..'\u00D6'
+ | '\u00D8'..'\u00F6'
+ | '\u00F8'..'\u02FF'
+ | '\u0370'..'\u037D'
+ | '\u037F'..'\u1FFF'
+ | '\u200C'..'\u200D'
+ | '\u2070'..'\u218F'
+ | '\u2C00'..'\u2FEF'
+ | '\u3001'..'\uD7FF'
+ | '\uF900'..'\uFDCF'
+ | '\uFDF0'..'\uFFFD'
+ ;
+ */
+
+static const char lex_alnum[256] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0..9 */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
+ /* A..O */
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* P..Z, _ */
+#ifdef LEX_ID_WITHOUT_UNDERSCORE
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
+#else
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
+#endif
+ /* a..o */
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ /* p..z */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
+#ifdef LEX_ID_WITH_UTF8
+ /* utf-8 */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+#else
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+#endif
+};
+
+#define lex_isalnum(c) (lex_alnum[(unsigned char)(c)])
+#endif
+
+#ifndef lex_isbindigit
+#define lex_isbindigit(c) ((c) == '0' || (c) == '1')
+#endif
+
+#ifndef lex_isoctdigit
+#define lex_isoctdigit(c) ((unsigned)((c) - '0') < 8)
+#endif
+
+#ifndef lex_isdigit
+#define lex_isdigit(c) ((c) >= '0' && (c) <= '9')
+#endif
+
+#ifndef lex_ishexdigit
+#define lex_ishexdigit(c) (((c) >= '0' && (c) <= '9') || ((c | 0x20) >= 'a' && (c | 0x20) <= 'f'))
+#endif
+
+#ifndef lex_isctrl
+#include <ctype.h>
+#define lex_isctrl(c) ((c) < 0x20 || (c) == 0x7f)
+#endif
+
+#ifndef lex_isblank
+#define lex_isblank(c) ((c) == ' ' || (c) == '\t')
+#endif
+
+#ifndef lex_iszterm
+#define lex_iszterm(c) ((c) == '\0')
+#endif
+
+/*
+ * If ZTERM is disabled, zero will be a LEX_CTRL token
+ * and allowed to be embedded in comments and strings, or
+ * elsewhere, as long as the parser accepts the token.
+ */
+#ifdef LEX_DISABLE_ZTERM
+#undef lex_iszterm
+#define lex_iszterm(c) (0)
+#endif
+
+/*
+ * The mode is normally LEX_MODE_NORMAL = 0 initially, or the returned
+ * mode from a previous call, unless LEX_MODE_INVALID = 1 was returned.
+ * If a buffer stopped in the middle of a string or a comment, the mode
+ * will reflect that. In all cases some amount of recovery is needed
+ * before starting a new buffer - see detailed comments in header file.
+ * If only a single buffer is used, special handling is still needed if
+ * the last line contains a single line comment because it will not be
+ * terminated, but it amounts to replace the emitted unterminated
+ * comment token with an end of comment token.
+ *
+ * Instead of 0, the mode can initially also be LEX_MODE_BOM - it will
+ * strip an optional BOM before moving to normal mode. Currently only
+ * UTF-8 BOM is supported, and this is unlikely to change.
+ *
+ * The context variable is user-defined and available to emitter macros.
+ * It may be null if unused.
+ *
+ */
+static int lex(const char *buf, size_t len, int mode, void *context)
+{
+ const char *p, *q, *s, *d;
+#if 0
+ /* TODO: old, remove this */
+ , *z, *f;
+#endif
+
+ p = buf; /* next char */
+ q = p + len; /* end of buffer */
+ s = p; /* start of token */
+ d = p; /* end of integer part */
+
+#if 0
+ /* TODO: old, remove this */
+
+ /* Used for float and leading zero detection in numerics. */
+ z = p;
+ f = p;
+#endif
+
+ /*
+ * Handle mid string and mid comment for reentering across
+ * buffer boundaries. Strip embedded counter from mode.
+ */
+ switch(mode & (LEX_MODE_COUNT_BASE - 1)) {
+
+ case LEX_MODE_NORMAL:
+ goto lex_mode_normal;
+
+ case LEX_MODE_BOM:
+ goto lex_mode_bom;
+
+#ifdef LEX_C_STRING
+ case LEX_MODE_C_STRING:
+ goto lex_mode_c_string;
+#endif
+#ifdef LEX_PYTHON_BLOCK_STRING
+ case LEX_MODE_PYTHON_BLOCK_STRING:
+ goto lex_mode_python_block_string;
+#endif
+#ifdef LEX_C_STRING_SQ
+ case LEX_MODE_C_STRING_SQ:
+ goto lex_mode_c_string_sq;
+#endif
+#ifdef LEX_PYTHON_BLOCK_STRING_SQ
+ case LEX_MODE_PYTHON_BLOCK_STRING_SQ:
+ goto lex_mode_python_block_string_sq;
+#endif
+#ifdef LEX_C_BLOCK_COMMENT
+ case LEX_MODE_C_BLOCK_COMMENT:
+ goto lex_mode_c_block_comment;
+#endif
+#if defined(LEX_SHELL_LINE_COMMENT) || defined(LEX_C99_LINE_COMMENT)
+ case LEX_MODE_LINE_COMMENT:
+ goto lex_mode_line_comment;
+#endif
+#ifdef LEX_JULIA_NESTED_COMMENT
+ case LEX_MODE_JULIA_NESTED_COMMENT:
+ goto lex_mode_julia_nested_comment;
+#endif
+
+ default:
+ /*
+ * This is mostly to kill unused label warning when comments
+ * are disabled.
+ */
+ goto lex_mode_exit;
+ }
+
+lex_mode_bom:
+
+ mode = LEX_MODE_BOM;
+
+ /*
+ * Special entry mode to consume utf-8 bom if present. We don't
+ * support other boms, but we would use the same token if we did.
+ *
+ * We generally expect no bom present, but it is here if needed
+ * without requiring ugly hacks elsewhere.
+ */
+ if (p + 3 < q && p[0] == '\xef' && p[1] == '\xbb' && p[2] == '\xbf') {
+ p += 3;
+ lex_emit_bom(s, p);
+ }
+ goto lex_mode_normal;
+
+/* If source is updated, also update LEX_C_STRING_SQ accordingly. */
+#ifdef LEX_C_STRING
+lex_mode_c_string:
+
+ mode = LEX_MODE_C_STRING;
+
+ for (;;) {
+ --p;
+ /* We do not allow blanks that are also control characters, such as \t. */
+ while (++p != q && *p != '\\' && *p != '\"' && !lex_isctrl(*p)) {
+ }
+ if (s != p) {
+ lex_emit_string_part(s, p);
+ s = p;
+ }
+ if (*p == '\"') {
+ ++p;
+ lex_emit_string_end(s, p);
+ goto lex_mode_normal;
+ }
+ if (p == q || lex_iszterm(*p)) {
+ lex_emit_string_unterminated(p);
+ goto lex_mode_normal;
+ }
+ if (*p == '\\') {
+ ++p;
+            /* Escape is only itself, whatever is escaped follows separately. */
+ lex_emit_string_escape(s, p);
+ s = p;
+ if (p == q || lex_iszterm(*p)) {
+ lex_emit_string_unterminated(p);
+ goto lex_mode_normal;
+ }
+ if (*p == '\\' || *p == '\"') {
+ ++p;
+ continue;
+ }
+ /*
+ * Flag only relevant for single line strings, as it
+ * controls whether we fail on unterminated string at line
+ * ending with '\'.
+ *
+ * Julia does not support line continuation in strings
+ * (or elsewhere). C, Python, and Javascript do.
+ */
+#ifndef LEX_DISABLE_STRING_CONT
+ if (*p == '\n') {
+ if (++p != q && *p == '\r') {
+ ++p;
+ }
+ lex_emit_string_newline(s, p);
+ s = p;
+ continue;
+ }
+ if (*p == '\r') {
+ if (++p != q && *p == '\n') {
+ ++p;
+ }
+ lex_emit_string_newline(s, p);
+ s = p;
+ continue;
+ }
+#endif
+ }
+ if (*p == '\n' || *p == '\r') {
+ lex_emit_string_unterminated(p);
+ goto lex_mode_normal;
+ }
+ ++p;
+ lex_emit_string_ctrl(s);
+ s = p;
+ }
+#endif
+
+/*
+ * This is a copy of LEX_C_STRING with single quote. It's not DRY, but
+ * no reason to parameterize inner loops, just because. Recopy any
+ * changes made to the above.
+ *
+ * Even if single quote is only used for CHAR types, it makes sense to
+ * parse as a full string since there can be all sorts of unicode
+ * escapes and line continuations, newlines to report and unexpected
+ * control characters to deal with.
+ */
+#ifdef LEX_C_STRING_SQ
+lex_mode_c_string_sq:
+
+ mode = LEX_MODE_C_STRING_SQ;
+
+ for (;;) {
+ --p;
+ while (++p != q && *p != '\\' && *p != '\'' && !lex_isctrl(*p)) {
+ }
+ if (s != p) {
+ lex_emit_string_part(s, p);
+ s = p;
+ }
+ if (*p == '\'') {
+ ++p;
+ lex_emit_string_end(s, p);
+ goto lex_mode_normal;
+ }
+ if (p == q || lex_iszterm(*p)) {
+ lex_emit_string_unterminated(p);
+ goto lex_mode_normal;
+ }
+ if (*p == '\\') {
+ ++p;
+            /* Escape is only itself, whatever is escaped follows separately. */
+ lex_emit_string_escape(s, p);
+ s = p;
+ if (p == q || lex_iszterm(*p)) {
+ lex_emit_string_unterminated(p);
+ goto lex_mode_normal;
+ }
+ if (*p == '\\' || *p == '\'') {
+ ++p;
+ continue;
+ }
+ /*
+ * Flag only relevant for single line strings, as it
+ * controls whether we fail on unterminated string at line
+ * ending with '\'.
+ *
+ * Julia does not support line continuation in strings
+ * (or elsewhere). C, Python, and Javascript do.
+ */
+#ifndef LEX_DISABLE_STRING_CONT
+ if (*p == '\n') {
+ if (++p != q && *p == '\r') {
+ ++p;
+ }
+ lex_emit_string_newline(s, p);
+ s = p;
+ continue;
+ }
+ if (*p == '\r') {
+ if (++p != q && *p == '\n') {
+ ++p;
+ }
+ lex_emit_string_newline(s, p);
+ s = p;
+ continue;
+ }
+#endif
+ }
+ if (*p == '\n' || *p == '\r') {
+ lex_emit_string_unterminated(p);
+ goto lex_mode_normal;
+ }
+ ++p;
+ lex_emit_string_ctrl(s);
+ s = p;
+ }
+#endif
+
+/*
+ * """ Triple quoted Python block strings. """
+ * Single quoted version (''') is a direct copy, update both places
+ * if a change is needed.
+ *
+ * Note: there is no point in disabling line continuation
+ * for block strings, since it only affects unterminated
+ * string errors at newline. It all comes down to how
+ * escaped newline is interpreted by the parser.
+ */
+#ifdef LEX_PYTHON_BLOCK_STRING
+lex_mode_python_block_string:
+
+ mode = LEX_MODE_PYTHON_BLOCK_STRING;
+
+ for (;;) {
+ --p;
+ while (++p != q && *p != '\\' && !lex_isctrl(*p)) {
+ if (*p == '\"' && p + 2 < q && p[1] == '\"' && p[2] == '\"') {
+ break;
+ }
+ }
+ if (s != p) {
+ lex_emit_string_part(s, p);
+ s = p;
+ }
+ if (p == q || lex_iszterm(*p)) {
+ lex_emit_string_unterminated(p);
+ goto lex_mode_normal;
+ }
+ if (*p == '\"') {
+ p += 3;
+ lex_emit_string_end(s, p);
+ goto lex_mode_normal;
+ }
+ if (*p == '\\') {
+ /* Escape is only itself, allowing parser to interpret and validate. */
+ ++p;
+ lex_emit_string_escape(s, p);
+ s = p;
+ if (p + 1 != q && (*p == '\\' || *p == '\"')) {
+ ++p;
+ }
+ continue;
+ }
+ if (*p == '\n') {
+ if (++p != q && *p == '\r') {
+ ++p;
+ }
+ lex_emit_string_newline(s, p);
+ s = p;
+ continue;
+ }
+ if (*p == '\r') {
+ if (++p != q && *p == '\n') {
+ ++p;
+ }
+ lex_emit_string_newline(s, p);
+ s = p;
+ continue;
+ }
+ ++p;
+ lex_emit_string_ctrl(s);
+ s = p;
+ }
+#endif
+
+/*
+ * Python ''' style strings.
+ * Direct copy of """ quote version, update both if changed.
+ */
+#ifdef LEX_PYTHON_BLOCK_STRING_SQ
+lex_mode_python_block_string_sq:
+
+ mode = LEX_MODE_PYTHON_BLOCK_STRING_SQ;
+
+ for (;;) {
+ --p;
+ while (++p != q && *p != '\\' && !lex_isctrl(*p)) {
+ if (*p == '\'' && p + 2 < q && p[1] == '\'' && p[2] == '\'') {
+ break;
+ }
+ }
+ if (s != p) {
+ lex_emit_string_part(s, p);
+ s = p;
+ }
+ if (p == q || lex_iszterm(*p)) {
+ lex_emit_string_unterminated(p);
+ goto lex_mode_normal;
+ }
+ if (*p == '\'') {
+ p += 3;
+ lex_emit_string_end(s, p);
+ goto lex_mode_normal;
+ }
+ if (*p == '\\') {
+ /* Escape is only itself, allowing parser to interpret and validate. */
+ ++p;
+ lex_emit_string_escape(s, p);
+ s = p;
+ if (p + 1 != q && (*p == '\\' || *p == '\'')) {
+ ++p;
+ }
+ continue;
+ }
+ if (*p == '\n') {
+ if (++p != q && *p == '\r') {
+ ++p;
+ }
+ lex_emit_string_newline(s, p);
+ s = p;
+ continue;
+ }
+ if (*p == '\r') {
+ if (++p != q && *p == '\n') {
+ ++p;
+ }
+ lex_emit_string_newline(s, p);
+ s = p;
+ continue;
+ }
+ ++p;
+ lex_emit_string_ctrl(s);
+ s = p;
+ }
+#endif
+
+/*
+ * We don't really care if it is a shell style comment or a C99,
+ * or any other line oriented comment, as the termination is
+ * the same.
+ */
+#if defined(LEX_SHELL_LINE_COMMENT) || defined(LEX_C99_LINE_COMMENT)
+lex_mode_line_comment:
+
+ mode = LEX_MODE_LINE_COMMENT;
+
+ for (;;) {
+ --p;
+ while (++p != q && (!lex_isctrl(*p))) {
+ }
+ if (s != p) {
+ lex_emit_comment_part(s, p);
+ s = p;
+ }
+ if (p == q || lex_iszterm(*p)) {
+ /*
+ * Unterminated comment here is not necessarily true,
+ * not even likely, nor possible, but we do this to
+ * handle buffer switch consistently: any non-normal
+ * mode exit will have an unterminated token to fix up.
+ * Here it would be conversion to end of comment, which
+ * we cannot know yet, since the line might continue in
+ * the next buffer. This is a zero length token.
+ */
+ lex_emit_comment_unterminated(p);
+ goto lex_mode_exit;
+ }
+ if (*p == '\n' || *p == '\r') {
+ lex_emit_comment_end(s, p);
+ goto lex_mode_normal;
+ }
+ ++p;
+ lex_emit_comment_ctrl(s);
+ s = p;
+ }
+#endif
+
+#ifdef LEX_C_BLOCK_COMMENT
+lex_mode_c_block_comment:
+
+ mode = LEX_MODE_C_BLOCK_COMMENT;
+
+ for (;;) {
+ --p;
+ while (++p != q && (!lex_isctrl(*p))) {
+ if (*p == '/' && p[-1] == '*') {
+ --p;
+ break;
+ }
+ }
+ if (s != p) {
+ lex_emit_comment_part(s, p);
+ s = p;
+ }
+ if (p == q || lex_iszterm(*p)) {
+ lex_emit_comment_unterminated(p);
+ goto lex_mode_exit;
+ }
+ if (*p == '\n') {
+ if (++p != q && *p == '\r') {
+ ++p;
+ }
+ lex_emit_newline(s, p);
+ s = p;
+ continue;
+ }
+ if (*p == '\r') {
+ if (++p != q && *p == '\n') {
+ ++p;
+ }
+ lex_emit_newline(s, p);
+ s = p;
+ continue;
+ }
+ if (lex_isctrl(*p)) {
+ ++p;
+ lex_emit_comment_ctrl(s);
+ s = p;
+ continue;
+ }
+ p += 2;
+ lex_emit_comment_end(s, p);
+ s = p;
+ goto lex_mode_normal;
+ }
+#endif
+
+ /* Julia nests block comments as #= ... #= ...=# ... =# across multiple lines. */
+#ifdef LEX_JULIA_NESTED_COMMENT
+lex_mode_julia_nested_comment:
+
+ /* Preserve nesting level on re-entrance. */
+ if ((mode & (LEX_MODE_COUNT_BASE - 1)) != LEX_MODE_JULIA_NESTED_COMMENT) {
+ mode = LEX_MODE_JULIA_NESTED_COMMENT;
+ }
+ /* We have already entered. */
+ mode += LEX_MODE_COUNT_BASE;
+
+ for (;;) {
+ --p;
+ while (++p != q && !lex_isctrl(*p)) {
+ if (*p == '#') {
+ if (p[-1] == '=') {
+ --p;
+ break;
+ }
+ if (p + 1 != q && p[1] == '=') {
+ break;
+ }
+ }
+ }
+ if (s != p) {
+ lex_emit_comment_part(s, p);
+ s = p;
+ }
+ if (p == q || lex_iszterm(*p)) {
+ lex_emit_comment_unterminated(p);
+ goto lex_mode_exit;
+ }
+ if (*p == '\n') {
+ if (++p != q && *p == '\r') {
+ ++p;
+ }
+ lex_emit_newline(s, p);
+ s = p;
+ continue;
+ }
+ if (*p == '\r') {
+ if (++p != q && *p == '\n') {
+ ++p;
+ }
+ lex_emit_newline(s, p);
+ s = p;
+ continue;
+ }
+ if (lex_isctrl(*p)) {
+ ++p;
+ lex_emit_comment_ctrl(s);
+ s = p;
+ continue;
+ }
+ if (*p == '=') {
+ p += 2;
+ lex_emit_comment_end(s, p);
+ s = p;
+ mode -= LEX_MODE_COUNT_BASE;
+ if (mode / LEX_MODE_COUNT_BASE > 0) {
+ continue;
+ }
+ goto lex_mode_normal;
+ }
+ /* The upper bits are used as counter. */
+ mode += LEX_MODE_COUNT_BASE;
+ p += 2;
+ lex_emit_comment_begin(s, p, 0);
+ s = p;
+ if (mode / LEX_MODE_COUNT_BASE > LEX_MAX_NESTING_LEVELS) {
+ /* Prevent malicious input from overflowing counter. */
+ lex_emit_comment_deeply_nested(p);
+ lex_emit_abort(p);
+ return mode;
+ }
+ }
+#endif
+
+/* Unlike other modes, we can always jump here without updating token start `s` first. */
+lex_mode_normal:
+
+ mode = LEX_MODE_NORMAL;
+
+ while (p != q) {
+ s = p;
+
+ switch(*p) {
+
+#ifndef LEX_DISABLE_ZTERM
+ case '\0':
+ lex_emit_eos(s, p);
+ return mode;
+#endif
+
+ /* \v, \f etc. are covered by the CTRL token, don't put it here. */
+ case '\t': case ' ':
+ while (++p != q && lex_isblank(*p)) {
+ }
+ lex_emit_blank(s, p);
+ continue;
+
+ /*
+         * Newline should be emitted in all constructs, also comments
+ * and strings which have their own newline handling.
+ * Only one line is emitted at a time permitting simple line
+ * counting.
+ */
+ case '\n':
+ if (++p != q && *p == '\r') {
+ ++p;
+ }
+ lex_emit_newline(s, p);
+ continue;
+
+ case '\r':
+ if (++p != q && *p == '\n') {
+ ++p;
+ }
+ lex_emit_newline(s, p);
+ continue;
+
+ /*
+ * C-style string, and Python style triple double quote
+ * delimited multi-line string. Prefix and suffix symbols
+ * should be parsed separately, e.g. L"hello" are two
+ * tokens.
+ */
+#if defined(LEX_C_STRING) || defined(LEX_PYTHON_BLOCK_STRING)
+ case '\"':
+#ifdef LEX_PYTHON_BLOCK_STRING
+ if (p + 2 < q && p[1] == '\"' && p[2] == '\"') {
+ p += 3;
+ lex_emit_string_begin(s, p);
+ s = p;
+ goto lex_mode_python_block_string;
+ }
+#endif
+#ifdef LEX_C_STRING
+ ++p;
+ lex_emit_string_begin(s, p);
+ s = p;
+ goto lex_mode_c_string;
+#endif
+#endif
+
+ /*
+ * Single quoted version of strings, otherwise identical
+ * behavior. Can also be used for char constants if checked
+ * by parser subsequently.
+ */
+#if defined(LEX_C_STRING_SQ) || defined(LEX_PYTHON_BLOCK_STRING_SQ)
+ case '\'':
+#ifdef LEX_PYTHON_BLOCK_STRING_SQ
+ if (p + 2 < q && p[1] == '\'' && p[2] == '\'') {
+ p += 3;
+ lex_emit_string_begin(s, p);
+ s = p;
+ goto lex_mode_python_block_string_sq;
+ }
+#endif
+#ifdef LEX_C_STRING_SQ
+ ++p;
+ lex_emit_string_begin(s, p);
+ s = p;
+ goto lex_mode_c_string_sq;
+#endif
+#endif
+
+#if defined(LEX_SHELL_LINE_COMMENT) || defined(LEX_JULIA_NESTED_COMMENT)
+ /*
+ * Line comment excluding terminal line break.
+ *
+ * See also C99 line comment `//`.
+ *
+ * Julia uses `#=` and `=#` for nested block comments.
+ * (According to Julia developers, '#=` is motivated by `=`
+ * not being likely to start anything that you would put a
+ * comment around, unlike `#{`, `}#` or `#(`, `)#`)).
+ *
+ * Some known doc comment formats are identified and
+ * included in the comment_begin token.
+ */
+ case '#':
+ ++p;
+#ifdef LEX_JULIA_NESTED_COMMENT
+ if (p != q && *p == '=') {
+ ++p;
+ lex_emit_comment_begin(s, p, 0);
+ s = p;
+ goto lex_mode_julia_nested_comment;
+ }
+#endif
+ lex_emit_comment_begin(s, p, 0);
+ s = p;
+ goto lex_mode_line_comment;
+#endif
+
+ case '/':
+ ++p;
+ if (p != q) {
+ switch (*p) {
+#ifdef LEX_C99_LINE_COMMENT
+ case '/':
+ ++p;
+ p += p != q && (*p == '/' || *p == '!');
+ lex_emit_comment_begin(s, p, (p - s == 3));
+ s = p;
+ goto lex_mode_line_comment;
+#endif
+#ifdef LEX_C_BLOCK_COMMENT
+ case '*':
+ ++p;
+ p += p != q && (*p == '*' || *p == '!');
+ lex_emit_comment_begin(s, p, (p - s == 3));
+ s = p;
+ goto lex_mode_c_block_comment;
+#endif
+ case '=':
+ ++p;
+ lex_emit_compound_op('/', '=', s, p);
+ continue;
+ default:
+ break;
+ }
+ }
+ lex_emit_op('/', s, p);
+ continue;
+
+ case '(': case ')': case '[': case ']': case '{': case '}':
+ case ',': case ';': case '\\': case '?':
+ ++p;
+ lex_emit_op(*s, s, p);
+ continue;
+
+ case '%': case '!': case '~': case '^':
+ ++p;
+ if (p != q && *p == '=') {
+ ++p;
+ lex_emit_compound_op(*s, '=', s, p);
+ continue;
+ }
+ lex_emit_op(*s, s, p);
+ continue;
+
+ case '|':
+ ++p;
+ if (p != q) {
+ switch (*p) {
+ case '=':
+ ++p;
+ lex_emit_compound_op('|', '=', s, p);
+ continue;
+ case '|':
+ ++p;
+ lex_emit_compound_op('|', '|', s, p);
+ break;
+ default:
+ break;
+ }
+ }
+ lex_emit_op('|', s, p);
+ continue;
+
+ case '&':
+ ++p;
+ if (p != q) {
+ switch (*p) {
+ case '=':
+ ++p;
+ lex_emit_compound_op('&', '=', s, p);
+ continue;
+ case '&':
+ ++p;
+ lex_emit_compound_op('&', '&', s, p);
+ break;
+ default:
+ break;
+ }
+ }
+ lex_emit_op('&', s, p);
+ continue;
+
+ case '=':
+ ++p;
+ if (p != q) {
+ switch (*p) {
+ case '>':
+ ++p;
+ lex_emit_compound_op('=', '>', s, p);
+ continue;
+ case '=':
+ ++p;
+ if (p != q && *p == '=') {
+ ++p;
+ lex_emit_tricompound_op('=', '=', '=', s, p);
+ continue;
+ }
+ lex_emit_compound_op('=', '=', s, p);
+ break;
+ default:
+ break;
+ }
+ }
+ lex_emit_op('=', s, p);
+ continue;
+
+ case ':':
+ ++p;
+ if (p != q) {
+ switch (*p) {
+ case '=':
+ ++p;
+ lex_emit_compound_op(':', '=', s, p);
+ continue;
+ case ':':
+ ++p;
+ if (p != q && *p == '=') {
+ ++p;
+ lex_emit_tricompound_op(':', ':', '=', s, p);
+ continue;
+ }
+ lex_emit_compound_op(':', ':', s, p);
+ continue;
+ default:
+ break;
+ }
+ }
+ lex_emit_op(':', s, p);
+ continue;
+
+ case '*':
+ ++p;
+ if (p != q) {
+ switch (*p) {
+ case '=':
+ lex_emit_compound_op('*', '=', s, p);
+ continue;
+ case '*':
+ /* **= hardly used anywhere? */
+ lex_emit_compound_op('*', '*', s, p);
+ continue;
+ default:
+ break;
+ }
+ }
+ lex_emit_op('*', s, p);
+ continue;
+
+ case '<':
+ ++p;
+ if (p != q) {
+ switch (*p) {
+ case '-':
+ ++p;
+ lex_emit_compound_op('<', '-', s, p);
+ continue;
+ case '=':
+ ++p;
+ lex_emit_compound_op('<', '=', s, p);
+ continue;
+ case '<':
+ ++p;
+ if (p != q) {
+ switch (*p) {
+ case '=':
+ ++p;
+ lex_emit_tricompound_op('<', '<', '=', s, p);
+ continue;
+ case '<':
+ ++p;
+ if (p != q && *p == '=') {
+ ++p;
+ lex_emit_quadcompound_op('<', '<', '<', '=', s, p);
+ continue;
+ }
+ lex_emit_tricompound_op('<', '<', '<', s, p);
+ continue;
+ default:
+ break;
+ }
+ }
+ lex_emit_compound_op('<', '<', s, p);
+ continue;
+ default:
+ break;
+ }
+ }
+ lex_emit_op('<', s, p);
+ continue;
+
+ case '>':
+ ++p;
+ if (p != q) {
+ switch (*p) {
+ case '=':
+ ++p;
+ lex_emit_compound_op('>', '=', s, p);
+ continue;
+ case '>':
+ ++p;
+ if (p != q) {
+ switch (*p) {
+ case '=':
+ ++p;
+ lex_emit_tricompound_op('>', '>', '=', s, p);
+ continue;
+ case '>':
+ ++p;
+ if (p != q && *p == '=') {
+ ++p;
+ lex_emit_quadcompound_op('>', '>', '>', '=', s, p);
+ continue;
+ }
+ lex_emit_tricompound_op('>', '>', '>', s, p);
+ continue;
+ default:
+ break;
+ }
+ }
+ lex_emit_compound_op('>', '>', s, p);
+ continue;
+ default:
+ break;
+ }
+ }
+ lex_emit_op('>', s, p);
+ continue;
+
+ case '-':
+ ++p;
+ if (p != q) {
+ switch (*p) {
+ case '=':
+ ++p;
+ lex_emit_compound_op('-', '=', s, p);
+ continue;
+ case '-':
+ ++p;
+ lex_emit_compound_op('-', '-', s, p);
+ continue;
+ case '>':
+ ++p;
+ lex_emit_compound_op('-', '>', s, p);
+ continue;
+ default:
+ break;
+ }
+ }
+ lex_emit_op('-', s, p);
+ continue;
+
+ case '+':
+ ++p;
+ if (p != q) {
+ switch (*p) {
+ case '=':
+ ++p;
+ lex_emit_compound_op('+', '=', s, p);
+ continue;
+
+ case '+':
+ ++p;
+ lex_emit_compound_op('+', '+', s, p);
+ continue;
+ default:
+ break;
+ }
+ }
+ lex_emit_op('+', s, p);
+ continue;
+
+ case '.':
+ ++p;
+ if (p != q) {
+ switch (*p) {
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ d = s;
+ goto lex_dot_to_fraction_part;
+ case '.':
+ ++p;
+ if (p != q && *p == '.') {
+ ++p;
+ lex_emit_tricompound_op('.', '.', '.', s, p);
+ continue;
+ }
+ lex_emit_compound_op('.', '.', s, p);
+ continue;
+ default:
+ break;
+ }
+ }
+ lex_emit_op('.', s, p);
+ continue;
+
+ case '0':
+ if (++p != q) {
+ switch (*p) {
+#ifdef LEX_C_OCTAL_NUMERIC
+
+ case '0': case '1': case '2': case '3':
+ case '4': case '5': case '6': case '7':
+ while (++p != q && lex_isoctdigit(*p)) {
+ }
+ d = p;
+ if (p != q) {
+ /*
+ * Leading zeroes like 00.10 are valid C
+ * floating point constants.
+ */
+ if (*p == '.') {
+ goto lex_c_octal_to_fraction_part;
+ }
+ if (*p == 'e' || *p == 'E') {
+ goto lex_c_octal_to_exponent_part;
+ }
+ }
+ lex_emit_octal(s, p);
+ /*
+ * If we have a number like 0079, it becomes
+ * 007(octal), 9(decimal). The parser should
+ * deal with this.
+ *
+ * To add to confusion i64 is a C integer suffix
+ * like in 007i64, but 2+2i is a Go complex
+ * constant. (Not specific to octals).
+ *
+ * This can all be handled by having the parser inspect
+ * following identifier or numeric, parser
+ * here meaning a lexer post processing step, not
+ * necessarily the parser itself.
+ */
+
+ continue;
+#else
+ /*
+ * All integers reach default and enter
+ * integer part. As a result, leading zeroes are
+ * mapped to floats and integers which matches
+ * Julia behavior. Other languages should decide
+ * if leading zero is valid or not. JSON
+ * disallows leading zero.
+ */
+#endif
+
+#ifdef LEX_JULIA_OCTAL_NUMERIC
+ /*
+ * This is the style of octal, not 100% Julia
+ * compatible. Also define Julia numeric to enforce
+ * lower case.
+ */
+#ifndef LEX_LOWER_CASE_NUMERIC_PREFIX
+ /* See also hex 0X. Julia v.0.3 uses lower case only here. */
+ case 'O':
+#endif
+ /*
+ * Julia accepts 0o700 as octal and 0b100 as
+ * binary, and 0xa00 as hex, and 0100 as
+ * integer, and 1e2 as 64 bit float and 1f2 as
+ * 32 bit float. Julia 0.3 does not support
+ * octal and binary fractions.
+ */
+ case 'o':
+ while (++p != q && lex_isoctdigit(*p)) {
+ }
+ lex_emit_octal(s, p);
+ /* Avoid hitting int fall through. */
+ continue;
+#endif
+#ifdef LEX_BINARY_NUMERIC
+ /* Binary in C++14. */
+ case 'b':
+#ifndef LEX_LOWER_CASE_NUMERIC_PREFIX
+ /* See also hex 0X. Julia v.0.3 uses lower case only here. */
+ case 'B':
+#endif
+ while (++p != q && lex_isbindigit(*p)) {
+ }
+ lex_emit_binary(s, p);
+ /* Avoid hitting int fall through. */
+ continue;
+#endif
+#ifdef LEX_HEX_NUMERIC
+ case 'x':
+#ifndef LEX_LOWER_CASE_NUMERIC_PREFIX
+ /*
+ * Julia v0.3 does not allow this, it thinks 0X1 is
+ * 0 * X1, X1 being an identifier.
+ * while 0x1 is a hex value due to precedence.
+ *
+ * TODO: This might change.
+ */
+
+ case 'X':
+#endif
+ while (++p != q && lex_ishexdigit(*p)) {
+ }
+#ifdef LEX_HEX_FLOAT_NUMERIC
+ /*
+                     * Most hexadecimal floating point conversion
+                     * functions, including Python's
+                     * float.fromhex("0x1.0"), Julia's parse
+                     * function, and C strtod on
+ * supporting platforms, will parse without
+ * exponent. The same languages do not support
+ * literal constants without the p exponent.
+ * First it is named p because e is a hex digit,
+ * second, the float suffix f is also a hex
+                     * digit: 0x1.f is ambiguous in C without that
+ * rule. Conversions have no such ambiguity.
+ * In Julia, juxtaposition means that 0x1.f
+ * could mean 0x1p0 * f or 0x1.fp0.
+ *
+ * Since we are not doing conversion here but
+ * lexing a stream, we opt to require the p
+ * suffix because making it optional could end
+ * up consuming parts of the next token.
+ *
+ * But, we also make a flag to make the exponent
+ * optional, anyway. It could be used for better
+ * error reporting than just consuming the hex
+                     * part since we likely should accept the ambiguous
+ * syntax either way.
+ */
+ d = p;
+ if (p != q && *p == '.') {
+ while (++p != q && lex_ishexdigit(*p)) {
+ }
+ }
+ if (p != q && (*p == 'p' || *p == 'P')) {
+ if (++p != q && *p != '+' && *p != '-') {
+ --p;
+ }
+ /* The exponent is a decimal power of 2. */
+ while (++p != q && lex_isdigit(*p)) {
+ }
+ lex_emit_hex_float(s, p);
+ continue;
+ }
+#ifdef LEX_HEX_FLOAT_OPTIONAL_EXPONENT
+ if (d != p) {
+ lex_emit_hex_float(s, p);
+ continue;
+ }
+#else
+ /*
+ * Backtrack to decimal point. We require p to
+ * be present because we could otherwise consume
+ * part of the next token.
+ */
+ p = d;
+#endif
+#endif /* LEX_HEX_FLOAT_NUMERIC */
+ lex_emit_hex(s, p);
+ continue;
+#endif /* LEX_HEX_NUMERIC */
+
+ default:
+ /*
+ * This means leading zeroes like 001 or 001.0 are
+ * treated like int and float respectively,
+ * iff C octals are flagged out. Otherwise they
+ * become 001(octal), and 001(octal),.0(float)
+ * which should be treated as an error because
+ * future extensions might allow octal floats.
+ * (Not likely, but interpretation is ambiguous).
+ */
+ break;
+ } /* Switch under '0' case. */
+
+ /*
+ * Pure single digit '0' is an octal number in the C
+ * spec. We have the option to treat it as an integer,
+ * or as an octal. For strict C behavior, this can be
+ * flagged in, but is disabled by default. It only
+ * applies to single digit 0. Thus, with C octal
+ * enabled, leading zeroes always go octal.
+ */
+ } /* If condition around switch under '0' case. */
+ --p;
+ goto lex_fallthrough_1; /* silence warning */
+
+ lex_fallthrough_1:
+ /* Leading integer digit in C integers. */
+ case '1': case '2': case '3': case '4': case '5':
+ case '6': case '7': case '8': case '9':
+ while (++p != q && lex_isdigit(*p)) { /* bound scan by q like sibling loops */
+ }
+ d = p;
+ if (p != q && *p == '.') { /* guard against dereferencing q, cf. 'e'/'E' check */
+/* Silence unused label warnings when features are disabled. */
+#ifdef LEX_C_OCTAL_NUMERIC
+lex_c_octal_to_fraction_part:
+#endif
+lex_dot_to_fraction_part:
+ while (++p != q && lex_isdigit(*p)) {
+ }
+ }
+ if (p != q && (*p == 'e' || *p == 'E')) {
+/* Silence unused label warnings when features are disabled. */
+#ifdef LEX_C_OCTAL_NUMERIC
+lex_c_octal_to_exponent_part:
+#endif
+ if (++p != q && *p != '+' && *p != '-') {
+ --p;
+ }
+ while (++p != q && lex_isdigit(*p)) {
+ }
+ }
+ if (d != p) {
+ lex_emit_float(s, p);
+ } else {
+#ifdef LEX_C_OCTAL_NUMERIC
+ if (*s == '0') {
+ lex_emit_octal(s, p);
+ continue;
+ }
+#endif
+ lex_emit_int(s, p);
+ }
+ continue;
+
+#ifndef LEX_ID_WITHOUT_UNDERSCORE
+ case '_':
+#endif
+ case 'A': case 'B': case 'C': case 'D': case 'E':
+ case 'F': case 'G': case 'H': case 'I': case 'J':
+ case 'K': case 'L': case 'M': case 'N': case 'O':
+ case 'P': case 'Q': case 'R': case 'S': case 'T':
+ case 'U': case 'V': case 'W': case 'X': case 'Y':
+ case 'Z':
+ case 'a': case 'b': case 'c': case 'd': case 'e':
+ case 'f': case 'g': case 'h': case 'i': case 'j':
+ case 'k': case 'l': case 'm': case 'n': case 'o':
+ case 'p': case 'q': case 'r': case 's': case 't':
+ case 'u': case 'v': case 'w': case 'x': case 'y':
+ case 'z':
+
+ /*
+ * We do not try to ensure utf-8 is terminated correctly nor
+ * that any unicode character above ASCII is a character
+ * suitable for identifiers.
+ *
+ * tag is calculated for keyword lookup, and we assume these
+ * are always ASCII-7bit. It has the form: length, first
+ * char, second, char, last char in lsb to msb order. If the
+ * second char is missing, it becomes '\0'. The tag is not
+ * entirely unique, but suitable for fast lookup.
+ *
+ * If utf-8 appears in tag, the tag is undefined except the
+ * length is valid or overflows (meaning longer than any
+ * keyword and thus safe to compare against if tag matches).
+ *
+ * If the grammar is case insensitive, the tag can be
+ * downcased trivially by or'ring with 0x20202000 which
+ * preserves the length field (clever design by ASCII
+ * designers). After tag matching, a case insensitive
+ * compare is obviously also needed against the full lexeme.
+ */
+
+ {
+ unsigned long tag;
+
+ tag = (unsigned long)*p << 8;
+ if (++p != q && lex_isalnum(*p)) {
+ tag |= (unsigned long)*p << 16;
+ while (++p != q && lex_isalnum(*p)) {
+ }
+ }
+ tag |= (unsigned long)p[-1] << 24;
+ tag |= (unsigned char)(p - s) + (unsigned long)'0';
+ lex_emit_id(s, p, tag);
+ continue;
+ }
+
+ default:
+
+#ifdef LEX_ID_WITH_UTF8
+ /*
+ * Identifier again, in case it starts with a utf-8 lead
+ * character. This time we can ignore the tag, except the
+ * length char must be valid to avoid buffer overruns
+ * on potential kw check upstream.
+ */
+ if (*p & '\x80') {
+ unsigned long tag;
+
+ while (++p != q && lex_isalnum(*p)) {
+ }
+ tag = (unsigned char)(p - s) + '0';
+ lex_emit_id(s, p, tag);
+ continue;
+ }
+#endif
+ ++p;
+ /* normally 0x7f DEL and 0x00..0x1f incl. */
+ if (lex_isctrl(*s) && !lex_isblank(*s)) {
+ lex_emit_ctrl(s);
+ } else {
+ lex_emit_symbol(*s, s, p);
+ }
+ continue;
+ } /* Main switch in normal mode. */
+ } /* Main while loop in normal mode. */
+
+lex_mode_exit:
+ if (mode == LEX_MODE_INVALID) {
+ return mode;
+ }
+
+#ifndef LEX_DISABLE_ZTERM
+ if (p != q && lex_iszterm(*p)) {
+ lex_emit_eos(s, p);
+ return mode;
+ }
+#endif
+ lex_emit_eob(p);
+ return mode;
+}
+
diff --git a/flatcc/external/lex/luthor.h b/flatcc/external/lex/luthor.h
new file mode 100644
index 0000000..6ca373d
--- /dev/null
+++ b/flatcc/external/lex/luthor.h
@@ -0,0 +1,472 @@
+/*
+ * Mostly generic lexer that can be hacked to suit specific syntax. See
+ * more detailed comments further down in this file.
+ *
+ * Normally include luthor.c instead of luthor.h so emitter functions
+ * can be custom defined, and optionally also fast keyword definitions.
+ *
+ * At the very minimum, define lex_emit which other emitters default to.
+ *
+ * Create a wrapper function to drive the lex function in said file.
+ *
+ * Use this header in separate parser logic to access the token values
+ * if relevant.
+ */
+
+#ifndef LUTHOR_H
+#define LUTHOR_H
+
+#ifdef LEX_KEYWORDS
+#include <string.h> /* memcmp for kw match */
+#endif
+
+#include "tokens.h"
+
+#ifndef lex_emit
+#define lex_emit(token, first, last) ((void)0)
+#endif
+
+/*
+ * Default for comments, bom, and other things that are not necessarily
+ * of interest to the parser, but may be to buffer wrap handling,
+ * debugging, and pretty printers.
+ */
+#ifndef lex_emit_other
+#define lex_emit_other(token, first, last) ((void)0)
+#endif
+
+#ifndef lex_emit_eof
+#define lex_emit_eof(pos) lex_emit(LEX_TOK_EOF, pos, pos)
+#endif
+
+#ifndef lex_emit_abort
+#define lex_emit_abort(pos) lex_emit(LEX_TOK_ABORT, pos, pos)
+#endif
+
+#ifndef lex_emit_eob
+#define lex_emit_eob(pos) lex_emit(LEX_TOK_EOB, pos, pos)
+#endif
+
+#ifndef lex_emit_eos
+#define lex_emit_eos(first, last) lex_emit(LEX_TOK_EOS, first, last)
+#endif
+
+#ifndef lex_emit_bom
+#define lex_emit_bom(first, last) lex_emit_other(LEX_TOK_BOM, first, last)
+#endif
+
+#ifndef lex_emit_id
+#ifdef LEX_KEYWORDS
+/* LEX_KW_TABLE_BEGIN .. LEX_KW_TABLE_END defines lex_match_kw. */
+#define lex_emit_id(first, last, tag) lex_emit(lex_match_kw(tag, first), first, last)
+#else
+#define lex_emit_id(first, last, tag) lex_emit(LEX_TOK_ID, first, last)
+#endif
+#endif
+
+/*
+ * This is a default for unknown symbols. It may be treated as an error,
+ * or it can be processed further by the parser instead of customizing
+ * the lexer. It ensures that there is always a token for every part of
+ * the input stream.
+ */
+#ifndef lex_emit_symbol
+#define lex_emit_symbol(token, first, last) lex_emit(LEX_TOK_SYMBOL, first, last)
+#endif
+
+/*
+ * Control characters 0x01 .. 0x1f, 0x7f(DEL), excluding \0\r\n\t which have
+ * separate tokens.
+ *
+ * Control characters in strings and comments are passed on as body
+ * elements, except \0\r\n which breaks the string up.
+ */
+#ifndef lex_emit_ctrl
+#define lex_emit_ctrl(pos) lex_emit(LEX_TOK_CTRL, pos, pos + 1)
+#endif
+
+#ifndef lex_emit_string_ctrl
+#define lex_emit_string_ctrl(pos) lex_emit(LEX_TOK_STRING_CTRL, pos, pos + 1)
+#endif
+
+#ifndef lex_emit_comment_ctrl
+#define lex_emit_comment_ctrl(pos) lex_emit_other(LEX_TOK_COMMENT_CTRL, pos, pos + 1)
+#endif
+
+/*
+ * This enables user to both count lines, and to calculate character
+ * offset for subsequent lexemes. New line starts a lexeme, line break
+ * symbol is located at lexeme - skipped and will have length 2 if \r\n
+ * or \n\r break, and 1 otherwise.
+ */
+#ifndef lex_emit_newline
+#define lex_emit_newline(first, last) lex_emit(LEX_TOK_NEWLINE, first, last)
+#endif
+
+#ifndef lex_emit_string_newline
+#define lex_emit_string_newline(first, last) lex_emit(LEX_TOK_STRING_NEWLINE, first, last)
+#endif
+
+#ifndef lex_emit_int
+#define lex_emit_int(first, last) lex_emit(LEX_TOK_INT, first, last)
+#endif
+
+#ifndef lex_emit_float
+#define lex_emit_float(first, last) lex_emit(LEX_TOK_FLOAT, first, last)
+#endif
+
+#ifndef lex_emit_int_suffix
+#define lex_emit_int_suffix(first, last) lex_emit(LEX_TOK_INT_SUFFIX, first, last)
+#endif
+
+#ifndef lex_emit_float_suffix
+#define lex_emit_float_suffix(first, last) lex_emit(LEX_TOK_FLOAT_SUFFIX, first, last)
+#endif
+
+#ifndef lex_emit_binary
+#define lex_emit_binary(first, last) lex_emit(LEX_TOK_BINARY, first, last)
+#endif
+
+#ifndef lex_emit_octal
+#define lex_emit_octal(first, last) lex_emit(LEX_TOK_OCTAL, first, last)
+#endif
+
+#ifndef lex_emit_hex
+#define lex_emit_hex(first, last) lex_emit(LEX_TOK_HEX, first, last)
+#endif
+
+#ifndef lex_emit_hex_float
+#define lex_emit_hex_float(first, last) lex_emit(LEX_TOK_HEX_FLOAT, first, last)
+#endif
+
+/*
+ * The comment token can be used to aid backtracking during buffer
+ * switch.
+ */
+#ifndef lex_emit_comment_begin
+#define lex_emit_comment_begin(first, last, is_doc) \
+ lex_emit_other(LEX_TOK_COMMENT_BEGIN, first, last)
+#endif
+
+#ifndef lex_emit_comment_part
+#define lex_emit_comment_part(first, last) lex_emit_other(LEX_TOK_COMMENT_PART, first, last)
+#endif
+
+#ifndef lex_emit_comment_end
+#define lex_emit_comment_end(first, last) lex_emit_other(LEX_TOK_COMMENT_END, first, last)
+#endif
+
+#ifndef lex_emit_comment_unterminated
+#define lex_emit_comment_unterminated(pos) \
+ lex_emit_other(LEX_TOK_COMMENT_UNTERMINATED, pos, pos)
+#endif
+
+#ifndef lex_emit_comment_deeply_nested
+#define lex_emit_comment_deeply_nested(pos) \
+ lex_emit_other(LEX_TOK_COMMENT_DEEPLY_NESTED, pos, pos)
+#endif
+
+#ifndef lex_emit_string_begin
+#define lex_emit_string_begin(first, last) lex_emit(LEX_TOK_STRING_BEGIN, first, last)
+#endif
+
+#ifndef lex_emit_string_part
+#define lex_emit_string_part(first, last) lex_emit(LEX_TOK_STRING_PART, first, last)
+#endif
+
+#ifndef lex_emit_string_end
+#define lex_emit_string_end(first, last) lex_emit(LEX_TOK_STRING_END, first, last)
+#endif
+
+#ifndef lex_emit_string_escape
+#define lex_emit_string_escape(first, last) lex_emit(LEX_TOK_STRING_ESCAPE, first, last)
+#endif
+
+#ifndef lex_emit_string_unterminated
+#define lex_emit_string_unterminated(pos) \
+ lex_emit(LEX_TOK_STRING_UNTERMINATED, pos, pos)
+#endif
+
+#ifndef lex_emit_blank
+#define lex_emit_blank(first, last) \
+ lex_emit_other(LEX_TOK_BLANK, first, last)
+#endif
+
+#ifndef lex_emit_op
+#define lex_emit_op(op, first, last) lex_emit((long)(op), first, last)
+#endif
+
+#ifndef lex_emit_compound_op
+#define lex_emit_compound_op(op1, op2, first, last) \
+ lex_emit(((long)(op1) | ((long)(op2) << 8)), first, last)
+#endif
+
+#ifndef lex_emit_tricompound_op
+#define lex_emit_tricompound_op(op1, op2, op3, first, last) \
+ lex_emit(((long)(op1) | ((long)(op2) << 8)) | \
+ ((long)(op3)<<16), first, last)
+#endif
+
+#ifndef lex_emit_quadcompound_op
+#define lex_emit_quadcompound_op(op1, op2, op3, op4, first, last) \
+ lex_emit(((long)(op1) | ((long)(op2) << 8)) | \
+ ((long)(op3) << 16) | ((long)(op4) << 24), first, last)
+#endif
+
+/* Used to limit number of nested comment level. */
+#ifndef LEX_MAX_NESTING_LEVELS
+#define LEX_MAX_NESTING_LEVELS 100
+#endif
+
+
+/* Keyword handling macros, see `keywords.c` for an example usage. */
+#ifdef LEX_KEYWORDS
+
+/*
+ * This implements a switch statement branching on the 4 character
+ * keyword tag (unsigned long value) which is produced by the lexers id
+ * recognizer. A final check is needed with to ensure an exact
+ * match with a given id. Two keywords rarely conflicts, but it is
+ * possible, and therefore kw_begin kw_match kw_match ... kw_end is used
+ * to cover this.
+ *
+ * See example usage elsewhere for details.
+ *
+ * The first element x0 is length '0'..'9' and ensure comparisons will
+ * not overrun the buffer where the lexeme is stored during string
+ * comparison, iff the keywords report the length correctly.
+ *
+ * The next elements in the tag are the first, second, and last
+ * character of lexeme / keyword, replacing second character with '\0'
+ * on single length keywords, so keyword 'e' is tagged '1', 'e', '\0', 'e',
+ * and 'while' is tagged '5' 'w', 'h', 'e', where the length is lsb
+ * and last character is msb.
+ *
+ * An enum with tok_kw_<name> elements is expected to provide return
+ * values on match. These should start at LEX_TOK_KW_BASE and are
+ * negative.
+ *
+ */
+#define lex_kw_begin(x0, x1, x2, x3) \
+ case \
+ ((unsigned long)(x0) | \
+ ((unsigned long)(x1) << 8) | \
+ ((unsigned long)(x2) << 16) | \
+ ((unsigned long)(x3) << 24)) :
+
+#define lex_kw_match(kw) \
+ if (memcmp(#kw, lexeme, sizeof(#kw) - 1) == 0) \
+ return tok_kw_##kw;
+
+#define lex_kw_end() \
+ break;
+
+#define lex_kw(kw, x0, x1, x2, x3) \
+ lex_kw_begin(x0, x1, x2, x3) \
+ lex_kw_match(kw) \
+ lex_kw_end()
+
+static long lex_match_kw(unsigned long tag, const char *lexeme);
+
+/* Static so multiple grammars are possible in a single program. */
+#define LEX_KW_TABLE_BEGIN \
+static long lex_match_kw(unsigned long tag, const char *lexeme) \
+{ \
+ switch (tag) { \
+
+#define LEX_KW_TABLE_END \
+ default: \
+ break; \
+ } \
+ return LEX_TOK_KW_NOT_FOUND; \
+}
+
+#else
+
+/* Allow flagging in and out without unused warning or missing macros */
+#define lex_kw_begin(x0, x1, x2, x3)
+#define lex_kw_match(kw)
+#define lex_kw_end()
+#define lex_kw(kw, x0, x1, x2, x3)
+#define LEX_KW_TABLE_BEGIN
+#define LEX_KW_TABLE_END
+
+#endif /* LEX_KEYWORDS */
+
+
+
+/*
+ * Modes used for recovery when switching to a new buffer and handling
+ * internal state changes for strings and comments.
+ */
+enum {
+ /* Always 0, is initial lexer state. */
+ LEX_MODE_NORMAL = 0,
+
+ /* Returned if lex is given unsupported mode. */
+ LEX_MODE_INVALID = 1,
+
+ /*
+ * Can be used in place of normal mode to consume optional bom
+ * marker at buffer start. Only utf-8 bom is supported.
+ */
+ LEX_MODE_BOM,
+
+ /*
+ * Returned at end of buffer if mid string or mid comment, may also
+ * be larger for nested comments as nesting level is encoded.
+ */
+ LEX_MODE_C_STRING,
+ LEX_MODE_C_STRING_SQ,
+ LEX_MODE_PYTHON_BLOCK_STRING,
+ LEX_MODE_PYTHON_BLOCK_STRING_SQ,
+ LEX_MODE_C_BLOCK_COMMENT,
+ LEX_MODE_LINE_COMMENT,
+ LEX_MODE_JULIA_NESTED_COMMENT,
+
+
+ /* Counter embedded in mode. */
+ LEX_MODE_COUNT_BASE = 16,
+};
+
+
+
+/* ON CALLING AND USING LEX FUNCTION
+ *
+ * If utf-8 BOM possible, detect this before calling the lexer and
+ * advance the buffer. JSON explicitly disallows BOM, but recommends
+ * consuming it if present. If some other Unicode BOM is found, convert
+ * the buffer first. The lexer assumes ALL non-ascii characters are
+ * valid trailing identifiers which mostly works well. Strings with
+ * broken utf-8 are passed on as is. utf-8 identifiers must be enabled
+ * with #define LEX_ID_WITH_UTF8
+ *
+ * If required, postprocess identifiers and strings for valid utf-8. It
+ * is assumed that all keywords are at most 9 characters long and always
+ * ASCII. Otherwise post process them in a hash table on identifier
+ * event. This enables a fast compiled trie lookup of keywords.
+ *
+ * Newline and control characters are always emitted, also inside
+ * strings and comments. The exception is \r, \n, \t, \0 which are
+ * handled specially, or if the lexer is adapted to handle certain
+ * control characters specially.
+ *
+ * Each token is not guaranteed correct, only to be delimited correct,
+ * if it is indeed correct. Only very few tokens can be zero length, for
+ * example, the parser can rely on string part token not being empty
+ * which is important in dealing with line continuation. The end of
+ * buffer token is empty, and so is the unterminated string token, and
+ * also the comment end token for single line tokens, but not the
+ * multi-line version. There is a token for every part of the input
+ * stream, but the parser can easily define some to be ignored and have
+ * them optimized out.
+ *
+ * Strings have start token, and optionally sequences of control,
+ * escape, and newline tokens, followed by either string end token or
+ * string unterminated token. Strings delimiters can be one
+ * (single-line) or three double quotes (multi-line, like python, but
+ * cannot be single quotes, unlike Python. Python, C and Javascript
+ * string continuation is handled by having the parser observing string
+ * escape followed by newline token. Escape is always a single
+ * character '\' token, and the parser is responsible for consuming the
+ * following content. If string syntax with double delimiter is used to
+ * define escaped delimiter, this will occur as two separate strings
+ * with no space between. The parser can handle this on its own; if, in
+ * such strings, '\"' does not mean escaped delimiter, the string will
+ * not terminate correctly, and the lexer must be adapted. Unterminated
+ * string may happen at end of buffer, also for single line comments.
+ * This is because the string might continue in a new buffer. The parser
+ * should deal with this.
+ *
+ * Comments always start with a start token, followed by zero or more
+ * comment part tokens interleaved with control and newline tokens,
+ * terminated by either comment end token, or unterminated comment
+ * token. If the comment is single, the unterminated comment token may
+ * appear at the last line instead of the expected end of comment token
+ * because the comment might continue in a new buffer. The parser
+ * should deal with this. Escapes and line continuations have no effects
+ * in comments, unlike strings.
+ *
+ * The lexer only carries one state variable: the mode. The mode can be
+ * normal (default and equals zero), or single or multi string or
+ * comment modes. These modes are used to recover after switching
+ * buffers as discussed below.
+ *
+ * The lexer can run to completion without involving the parser and
+ * could be used to pipeline tokens into another thread for concurrent
+ * parsing which is safe since the input buffer is considered read-only.
+ *
+ *
+ * KEYWORDS
+ *
+ * Keywords are treated as identifiers by default. By including a
+ * keyword table the `lex_emit_id` macro will check if the id is a
+ * keyword and translate the token if it is. Using the provided keyword
+ * table macros is just one way to do it. This is better explained by
+ * looking at an example. Keyword lookup based on the precomputed keyword
+ * tag provided to the lookup function are limited to 9 characters, but a
+ * custom lookup function need not use it and then the tag precomputation
+ * will be optimized out.
+ *
+ * Keywords are defined by the lookup function and should be negative
+ * starting at LEX_TOK_KW_BASE to avoid conflicts with other token types.
+ *
+ *
+ * WRAPPING MULTIPLE BUFFERS
+ *
+ * The user may need to deal with multiple buffers because data may
+ * arrive asynchronously over the network, and may have many concurrent
+ * lexing jobs. The emitter part is not difficult since a ring buffer
+ * can grow, or the parser can be called directly (except queuing a few
+ * tokens for backtracking as we shall see).
+ *
+ * If the lexer were an explicit statemachine as in Flex, we could get
+ * an yywrap event to fill buffers, but our state is on the stack and in
+ * registers for optimization. We may use co-routines, but it doesn't
+ * cover all issues, and, as it turns out is not necessary with the
+ * following restrictions on syntax:
+ *
+ * All variable length tokens such as numerics and identifiers are
+ * limited in length. Strings and comments are not, but are broken into
+ * zero, one, or several body tokens per line. ANSI-C limits line length
+ * to 509 characters (allowing for continuation and two byte linebreaks
+ * in a 512 byte buffer). But JSON has no line continuation for strings
+ * and may (and often do) store everything on a single line. Whitespace
+ * can also extend beyond given limit.
+ *
+ * If we ignore whitespace, strings and comments, we can discard the
+ * last token (or last two in case there are paired tokens, such as
+ * leading zero followed by numeric. Parsing can then resume in a new
+ * buffer where the first 512 bytes (or similar) are duplicated from the
+ * previous buffer. The lexer is then restarted at the last token (pair)
+ * start which may turn out to change the length or even introduce a
+ * different result such introducing leading zero. The lexer need no
+ * specific state to do this.
+ *
+ * For strings and comments, we need a flag to allow entering the lexer
+ * mid string or mid comment. The newline and line continuation tokens
+ * need to be dropped, and the last body may need to be truncated as it
+ * can embed a partial delimiter. The simplest way to deal with this is
+ * to backtrack tokens until the last token begins at a safe position,
+ * about 3-6 characters earlier, and truncating body segments that span
+ * this barrier. Whitespace can also be truncated.
+ *
+ * We can generalize this further by going at least K bytes back in an N
+ * overlap buffer region and require non-strings (and non-comments) to
+ * not exceed N-K bytes, where K and N are specific to the syntax and
+ * the I/O topology.
+ *
+ * We can add flags to tokens that can help decide how to enter
+ * backtracking mode without covering every possible scanner loop - i.e.
+ * are we mid string, mid comment, single-line or multi-line.
+ *
+ * All the lexer needs to do then, is to receive the backtracking mode
+ * flags. A wrapping driver can deal with backtrack logic, which is
+ * specific to how tokens are emitted. Whitespace need no recovery mode
+ * but perhaps new whitespace should extend existing to simplify
+ * parsing.
+ */
+
+
+#endif /* LUTHOR_H */
+
diff --git a/flatcc/external/lex/tokens.h b/flatcc/external/lex/tokens.h
new file mode 100644
index 0000000..2bdbd7c
--- /dev/null
+++ b/flatcc/external/lex/tokens.h
@@ -0,0 +1,554 @@
+#ifndef LEX_TOKENS_H
+#define LEX_TOKENS_H
+
+/* Define LEX_DEBUG to enable token printing and describing functions. */
+
+
+enum {
+
+ /*
+ * EOF is not emitted by lexer, but may be used by driver after
+ * last buffer is processed.
+ */
+ LEX_TOK_EOF = 0,
+
+ /*
+ * Either EOB or EOS is emitted as the last token before exit,
+ * or also ABORT in some lexers. Unterminated string or comment
+ * will be emitted immediately before one of these when relevant.
+ *
+ * It may be useful to redefine lex_emit_eos and lex_emit_eob to
+ * produce LEX_TOK_EOF or error directly for simple string lexing.
+ */
+ LEX_TOK_EOB = 1,
+ LEX_TOK_EOS = 2,
+
+ /*
+ * ABORT can be used for early exit by some lexers while other
+ * lexers may choose to run to buffer end regardless of input (with
+ * the exception of deeply nested comments).
+ */
+ LEX_TOK_ABORT = 3,
+
+ /*
+ * Byte order marker. Only happen if lexer was started in bom mode
+ * and the input stream contains a leading bom marker.
+ * The token can only be the first token in the stream. Utf-8 is the
+ * only supported bom, but the lexeme may be checked in case other
+ * boms are added later. Normally it is routed to lex_emit_other
+ * along with comments so it just ignores the bom if present. It is
+ * generally recommended to consume utf-8 bom for interoperability,
+ * but also to not store it for the same reason.
+ */
+ LEX_TOK_BOM,
+
+ /*
+ * Any control character that is not newline or blank will be
+ * emitted as single character token here. This token is discussed
+ * in several comments below. For strings and comments, also
+ * blank control characters will be emitted since they are usually
+ * not desired when unexpected.
+ */
+ LEX_TOK_CTRL,
+ LEX_TOK_STRING_CTRL,
+ LEX_TOK_COMMENT_CTRL,
+
+ /*
+ * Any printable ASCII character that is not otherwise consumed will
+ * be issued as a single length symbol token. Further discussion
+ * below. The symbol and CTRL tokens ensure that the entire input
+ * stream is covered by tokens. If utf-8 identifiers have not been
+ * flagged, utf-8 leading characters may also end up here, and so
+ * may utf-8 characters in general, that are not viewed as valid
+ * identifiers (depending on configuration).
+ */
+ LEX_TOK_SYMBOL,
+
+ /*
+ * Variable length identifier starting with (_A-Za-z) by default and
+ * followed by zero or more (_A-Za-z0-9) characters. (_) can be
+ * flagged out. utf-8 can be flagged in. Be default any non-ASCII
+ * character (0x80 and above), is treated as part of an identifier
+ * for simplicity and speed, but this may be redefined. Any broken
+ * utf-8 is not sanitized, thus 0x80 would be a valid identifier
+ * token with utf-8 identifiers enabled, and otherwise it would be a
+ * symbol token.
+ *
+ * The ID does a magic trick: It maps the lexeme to a very simple
+ * and fast 32 bit hash code called a tag. The tag is emitted with
+ * the id token and can be used for fast keyword lookup. The
+ * hash tag is:
+ *
+ * (length)(first char)(second char)(last char)
+ *
+ * where length is ASCII '0' .. '9' where any length overflow is an
+ * arbitrary value, but such that the length is never longer than
+ * the lexeme. The last char is the last char regardless of length.
+ * For short identifiers, the second char may be the first char
+ * duplicated, and the last char may be first char.
+ *
+ * This code is very simple to write by hand: "5whe" means while,
+ * and can be used in a case switch before a strcmp with "while".
+ * Conflicts are possible, but then several keywords are tested like
+ * any other hash conflict. This keyword lookup is user driven, but
+ * can follow example code quite straightforward.
+ *
+ * The lex_emit_id macro can be implemented to provide the above
+ * lookup and inject a keyword token instead. By convention such
+ * tokens have negative values to avoid conflicts with lexer
+ * generated tokens.
+ *
+ * The ID also has a special role in prefixes and suffixes: C string
+ * literals like (L"hello") and numeric literals like (42f) are
+ * lexed as two tokens, one of which is an ID. The parser must
+ * process this and observe absence of whitespace where such syntax
+ * is relevant.
+ *
+ * While not specific to ID, the emitter macros can be designed to
+ * keep track of start of lines and end of whitespace and attach
+ * state flags to each token (at line start, after whitespace). The
+ * whitespace tokens can then be dropped. This might help parsing
+ * things like suffixes efficiently.
+ */
+ LEX_TOK_ID,
+
+ /*
+ * C-int :: pos-dec-digit dec-digit *
+ * Julia-int ::= dec-digit+
+ *
+ * pos-dec-digit ::= '1'..'9'
+ * dec-digit ::= '0'..'9'
+ *
+ * Floating point numbers take precedence when possible so 00.10 is
+ * always a decimal floating point value when decimal floats are
+ * enabled.
+ *
+ * The C-int is automatically enabled if C-octals are enabled, and
+ * disabled otherwise. There is no specific Julia-int type - we just
+ * use the terminology to represent integers with leading zeroes.
+ *
+ * Julia style integers accept leading zeroes. C style integers with
+ * leading zeroes are consumed as C style octal numbers, so 0019 is
+ * parsed as either 0019(Julia-int), or 001(C-octal), 9(C-int).
+ *
+ * Single digit '0' maps to octal when C-octals are enabled and to
+ * Julia-int otherwise. (Yes, integers are not that simple, it
+ * seems).
+ *
+ * Both C and Julia octal numbers (see octal token) can be active
+ * simultaneously. This can be used to control leading zero
+ * behavior, even if C-octal numbers are not part of the grammar
+ * being parsed. For example, a language might use 0o777 octal
+ * numbers and disallow 0777 integers. Enabling C-octals makes this
+ * easy to detect (but should accept octal 0).
+ *
+ * There is no distinction between the styles in the int token, but
+ * leading zeroes are easily detected in the lexeme.
+ *
+ * Constant suffixes like 1L are treated as 1(INT), and L(ID). The
+ * same goes for other numeric values.
+ *
+ * Parser should check for leading zeroes and decide if it is valid,
+ * a warning, or an error (it is in JSON). This also goes for float.
+ *
+ * Numericals, not limited to INT, may appear shorter than they are
+ * due to buffer splits. Special recovery is required, but will only
+ * happen just before EOS or EOB tokens (i.e. buffer split events).
+ */
+ LEX_TOK_INT,
+
+ /*
+ * float ::= (int ['.' dec-digits*] dec-exponent)
+ * | ([int] '.' dec-digits* [dec-exponent])
+ * dec-exponents ::= ('e' | 'E') ['+' | '-'] dec-digits*
+ * dec-digits ::= '0'..'9'
+ * int ::= dec-digits*
+ *
+ * Consumes a superset of C float representation without suffix.
+ * Some invalid tokens such as 0.E are accepted. Valid tokens such
+ * as 00.10 take precedence over octal numbers even if it is a
+ * prefix, and the same is obviously true with respect to decimal
+ * integers.
+ *
+ * JSON does not allow leading zeroes, and also not leading '.'.
+ * This can easily be checked in the lexeme.
+ *
+ * The octal notation affecting integer leading zeroes is not
+ * relevant to floats because floats take precedence over octal and
+ * decimal int when containing '.', 'e' or 'E'.
+ */
+ LEX_TOK_FLOAT,
+
+ /*
+ * binary ::= (0b | 0B) ('0' | '1')*
+ *
+ * 0b100 or just 0b, parser must check that digits are present,
+ * otherwise it may be interpreted as zero, just like octal zero
+ * in C.
+ *
+ * Like 0X hex, 0B can be flagged out because Julia v0.3 does not
+ * support uppercase 0B.
+ */
+ LEX_TOK_BINARY,
+
+ /*
+ * C-octal ::= 0 octal-digit*
+ * octal-digits ::= '0'..'7'
+ *
+ * Julia-octal ::= 0o octal-digits*
+ * octal-digits ::= '0'..'7'
+ *
+ * 0777 for C style octal numbers, or 0o777 for Julia syntax. Julia
+ * v.0.3 does not allow uppercase 0O777, it would mean 0 * O777.
+ *
+ * When enabled, decimal floating points take precedence: 00.10 is
+ * parsed as 00.10(decimal float), as per C standard.
+ *
+ * NOTE: It is possible for both styles to be active simultaneously.
+ * This may be relevant in order to control handling of leading
+ * zeroes in decimal integers.
+ *
+ * If C-octal numbers are flagged out, leading zeroes are mapped to
+ * integers and the numerical value may change. Julia behaves this
+ * way. Nothing prevents support of both C and Julia octal numbers,
+ * but leading zeroes will then be interpreted the C way - it is not
+ * recommended to do this.
+ */
+ LEX_TOK_OCTAL,
+
+ /*
+ * hex ::= hex-int
+ * hex-digits ::= 'a'..'f' | 'A'..'F' | '0'..'9'
+ * hex-int ::= (0x | 0X) hex-digits*
+ *
+ * where hex_digits are customizable (e.g. all lower case), and hex
+ * prefix 0x can be flagged to be lower case only (as in Julia).
+ *
+ * If hex floats are enabled, they take precedence:
+ * 0x1.0(hex-float), if not, 0x1.0 will parse as: 0x1(hex) followed
+ * by .0(decimal float).
+ *
+ * The lead prefix 0x may by flagged to be lower case only because
+ * this is required by Julia v0.3 where 0X means 0 * X. Julia
+ * accepts uppercase in the remaining hex digits (and exponent for
+ * floats). This could possibly change in future versions.
+ *
+ * The zero length sequence (0x | 0X) is accepted and left to the
+ * parser since the lexer emits a token for everything it sees.
+ * Conceptually it may be interpreted as zero, equivalent to 0 being
+ * both octal prefix and numeric 0 in C style octal representation.
+ * Or it may be an error.
+ */
+ LEX_TOK_HEX,
+
+ /*
+ * hex_float ::= hex-int ['.' hex_digit*] hex-exponent
+ * hex-exponent ::= ('p' | 'P') ['+' | '-'] decimal-digit*
+ * decimal-digit ::= '0'..'9'
+ *
+ * A superset of IEEE-754-2008 Hexadecimal Floating Point notation.
+ *
+ * We require the exponent to be present, but do not ensure the
+ * value is otherwise complete, e.g. 0x1p+ would be accepted. The p
+ * is needed because otherwise 0x1.f could be accepted, and f is a
+ * float suffix in C, and a juxtaposition factor (0x1. * f) in
+ * Julia, at least, that is one possible interpretation.
+ *
+ * The exponent can be flagged optional in which case 0x1.f will be
+ * consumed as a single hex float token.
+ * This may either simply be accepted in some grammars, or used to
+ * provide an error message. If the exponent is required, 0x1.f will
+ * be lexed as three tokens:
+ *
+ * <'0x1'(hex int), '.'(op), 'f'(id)>.
+ *
+ * Thus it may be a good idea to allow the exponent to be optional
+ * anyway and issue an error message or warning if the p is absent
+ * later in the parsing stage.
+ *
+ * Note that, as per IEEE-754, the exponent is a decimal power of
+ * two. In other words, the number of bits to shift the
+ * (hexa)decimal point. Also note that it is p and not e because e
+ * is a hex digit.
+ */
+ LEX_TOK_HEX_FLOAT,
+
+ /*
+ * blank ::= ('\t' | '\x20')+
+ *
+ * Longest run in buffer holding only '\t' and '\x20' (space).
+ *
+ * buffer splits may generate adjacent blanks depending on recovery
+ * processing. (The same goes for other line oriented runs such as
+ * string parts and comment parts).
+ */
+ LEX_TOK_BLANK,
+
+ /* newline ::= '\r' | '\n' | '\r\n' | '\n\r'
+ *
+ * Will always appear, also inside strings and comments. Can be used
+ * to track line starts and counts reliably as only one newline is
+ * issued at a time, and it is issued everywhere, also in strings
+ * and comments.
+ *
+ * May be preceded by string escape token inside strings. This can
+ * be interpreted as line continuation within strings specifically,
+ * as is the case in Python and Javascript (and in C via
+ * pre-processor).
+ *
+ * The LEX_TOK_STRING_NEWLINE is emitted inside strings so the ordinary
+ * newline may be ignored in comments and other non-string content.
+ */
+ LEX_TOK_NEWLINE,
+ LEX_TOK_STRING_NEWLINE,
+
+ /*
+ * string ::= string_start
+ * (string_part | string_escape |
+ * string_ctrl | string_newline)*
+ * (string_end | string_unterminated)
+ *
+ * There are several optional string styles. They all start with
+ * this token. The length and content provided details. Python
+ * may start with """ or ''' and this token will then have length
+ * 3 and three quotes as lexeme content. If the lexer exits before
+ * string end token, the returned lexer mode will remember the
+ * state and can be used for reentry - this also goes for comments.
+ *
+ * Strings can only contain part, escape, newline, and control
+ * tokens, and either string unterminated or string end token
+ * at last.
+ */
+ LEX_TOK_STRING_BEGIN,
+
+ /* Longest run without control characters, without (\), without
+ * newline, and without the relevant end delimiter. The run may be
+ * shortened due to buffer splits. The part may, as an exception,
+ * begin with an end delimiter character or a (\) if it was
+ * preceded by a string escape token. The escape character is
+ * always (\). Strings that use "" or '' as escape will be treated
+ * as start and end of separate strings. Strings that do not support
+ * (\) should just treat escape as a part of the string.
+ */
+ LEX_TOK_STRING_PART,
+
+ /*
+ * This is always a single character token (\) and only happens
+ * inside strings. See also string part token.
+ */
+ LEX_TOK_STRING_ESCAPE,
+
+ /* This token is similar to string start. It may be absent at buffer
+ * splits, in which case an unterminated string token will be used
+ * just before the split event token.
+ */
+ LEX_TOK_STRING_END,
+
+ /*
+ * This is emitted before the buffer ends, or before unescaped
+ * newlines for line oriented string types (the usual strings).
+ * At buffer splits, recovery should clean it up. The returned
+ * mode allow parsing to continue in a new buffer with a slight
+ * content overlap.
+ *
+ * If a string like ("hello, world!") in C reaches end of line, it
+ * may be continued: ("hello, \)newline(world!"). If this line
+ * continuation is flagged out, this will lead to string
+ * unterminated, even if not at end of buffer. For block strings
+ * like """hello""", this only happens at end of buffer.
+ */
+ LEX_TOK_STRING_UNTERMINATED,
+
+ /*
+ *
+ * comment ::= comment_start
+ * (comment_part | ctrl | newline)*
+ * (comment_end | comment_unterminated)
+ *
+ *
+ * Comments work like strings in most respects. They emit parts, and
+ * control characters, but not escape characters, and cannot be
+ * continued at end of line. Block comments are like python block
+ * strings (''').
+ *
+ * Julia supports nested comments (#= ... #= =# =#). In this case
+ * a new start token can be emitted before an end token. If the
+ * parser exits due to buffer split, the mode has the nesting level
+ * encoded so it can be resumed in a new buffer.
+ *
+ * Line comments will have their end token just before newline, or
+ * unterminated comment just before buffer split token (EOB or EOS).
+ * (\) characters are consumed by the comment part tokens and do not
+ * affect the end of any comment.
+ *
+ * Comment begin may include extra characters when a doc comment is
+ * recognized. The emitter flags this. End comments are unaffected.
+ */
+ LEX_TOK_COMMENT_BEGIN,
+ LEX_TOK_COMMENT_PART,
+ LEX_TOK_COMMENT_END,
+ LEX_TOK_COMMENT_UNTERMINATED,
+
+ /*
+ * Issued before ABORT token if nesting level is above a predefined
+ * level. This is to protect against malicious and misguided
+ * content, otherwise the nesting level counter could wrap and
+ * generate a different interpretation, which could be bad. The
+ * parser would probably do similar things with nested tokens.
+ */
+ LEX_TOK_COMMENT_DEEPLY_NESTED,
+
+
+ /* Operators are all recognized single character symbols, or up to
+ * four characters. The token value is the ASCII codes shifted 8
+ * bits per extra character, by default, but the emitter macros
+ * can redefine this. Values below 32 are reserved token types as
+ * discussed above.
+ *
+ * What exactly represents an operator depends on what the lexer has
+ * enabled.
+ *
+ * Printable ASCII symbols that are NOT recognized, are emitted as
+ * the SYMBOL token and is always length 1. The value can be derived
+ * from the lexeme, but not the token itself. This may be perfectly
+ * fine for the parser, or it may be used to indicate an error.
+ * There are no illegal characters per se.
+ *
+ * Non-printable ASCII characters that are not covered by newline or
+ * blank, are emitted as CTRL tokens. These act the same as the
+ * symbol token and may be used to indicate error, or to handle form
+ * feed and other whitespace not handled by default. Unlike symbol,
+ * however, CTRL also appear in strings and comments since they are
+ * generally not allowed and this makes it easy to capture (there is
+ * virtually no performance overhead in providing this service
+ * unless attempting to parse a binary format).
+ */
+
+ /* Don't bleed into this range. */
+ LEX_TOK_OPERATOR_BASE = 32,
+
+
+ /*
+ * Operators use ASCII range.
+ * Compound operators use range 0x80 to 0x7fff
+ * and possibly above for triple sequences.
+ * Custom keywords are normally negative but can be mapped
+ * to any other.
+ *
+ * The layout is designed for efficient table lookup.
+ * Compound operators might benefit from remapping down to a smaller
+ * range for compact lookup tables, but it depends on the parser.
+ */
+};
+
+/*
+ * Custom keyword token range is negative, and well below -99..0 where
+ * special codes are reserved.
+ */
+#ifndef LEX_TOK_KW_BASE
+#define LEX_TOK_KW_BASE -1000
+#endif
+
+#ifndef LEX_TOK_KW_NOT_FOUND
+#define LEX_TOK_KW_NOT_FOUND LEX_TOK_ID
+#endif
+
+
+#ifdef LEX_DEBUG
+
+#include <stdio.h>
+#include <string.h>
+
+static const char *lex_describe_token(long token)
+{
+ switch(token) {
+ case LEX_TOK_BOM: return "BOM marker";
+ case LEX_TOK_EOF: return "EOF";
+ case LEX_TOK_EOS: return "buffer zero terminated";
+ case LEX_TOK_EOB: return "buffer exhausted";
+ case LEX_TOK_ABORT: return "abort";
+ case LEX_TOK_CTRL: return "control";
+ case LEX_TOK_STRING_CTRL: return "string control";
+ case LEX_TOK_COMMENT_CTRL: return "comment control";
+ case LEX_TOK_SYMBOL: return "symbol";
+ case LEX_TOK_ID: return "identifier";
+ case LEX_TOK_INT: return "integer";
+ case LEX_TOK_FLOAT: return "float";
+ case LEX_TOK_BINARY: return "binary";
+ case LEX_TOK_OCTAL: return "octal";
+ case LEX_TOK_HEX: return "hex";
+ case LEX_TOK_HEX_FLOAT: return "hex float";
+ case LEX_TOK_BLANK: return "blank";
+ case LEX_TOK_NEWLINE: return "newline";
+ case LEX_TOK_STRING_NEWLINE: return "string newline";
+ case LEX_TOK_STRING_BEGIN: return "string begin";
+ case LEX_TOK_STRING_PART: return "string part";
+ case LEX_TOK_STRING_END: return "string end";
+ case LEX_TOK_STRING_ESCAPE: return "string escape";
+ case LEX_TOK_STRING_UNTERMINATED: return "unterminated string";
+ case LEX_TOK_COMMENT_BEGIN: return "comment begin";
+ case LEX_TOK_COMMENT_PART: return "comment part";
+ case LEX_TOK_COMMENT_END: return "comment end";
+ case LEX_TOK_COMMENT_UNTERMINATED: return "unterminated comment";
+ case LEX_TOK_COMMENT_DEEPLY_NESTED: return "deeply nested comment";
+
+ default:
+ if (token < LEX_TOK_EOF) {
+ return "keyword";
+ }
+ if (token < 32) {
+ return "undefined";
+ }
+ if (token < 0x100L) {
+ return "operator";
+ }
+ if (token < 0x10000L) {
+ return "compound operator";
+ }
+ if (token < 0x1000000L) {
+ return "tricompound operator";
+ }
+ if (token < 0x7f0000000L) {
+ return "quadcompound operator";
+ }
+ return "reserved";
+ }
+}
+
+static void lex_fprint_token(FILE *fp,
+ long token,
+ const char *first, const char *last,
+ int line, int pos)
+{
+ char buf[10];
+ const char *lexeme = first;
+ int len = (int)(last - first);
+ switch (token) {
+ case LEX_TOK_EOS:
+ case LEX_TOK_CTRL:
+ sprintf(buf, "^%02x", (int)*first);
+ lexeme = buf;
+ len = strlen(buf);
+ break;
+ default:
+ break;
+ }
+ fprintf(fp, "%04d:%03d %s (0x%lx): `%.*s`\n",
+ line, pos, lex_describe_token(token), token, len, lexeme);
+}
+
+#define lex_print_token(token, first, last, line, pos) \
+ lex_fprint_token(stdout, token, first, last, line, pos)
+
+#else /* LEX_DEBUG */
+
+#define lex_describe_token(token) "debug not available"
+#define lex_fprint_token(fp, token, first, last, line, pos) ((void)0)
+#define lex_print_token(token, first, last, line, pos) ((void)0)
+
+#endif /* LEX_DEBUG */
+
+
+#endif /* LEX_TOKENS_H */
+