aboutsummaryrefslogtreecommitdiff
path: root/flatcc/external/lex/luthor.h
blob: 6ca373d263a242b91af33d97ae17aa7c28521dbc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
/*
 * Mostly generic lexer that can be hacked to suit specific syntax. See
 * more detailed comments further down in this file.
 *
 * Normally include luthor.c instead of luthor.h so emitter functions
 * can be custom defined, and optionally also fast keyword definitions.
 *
 * At the very minimum, define lex_emit which other emitters default to.
 *
 * Create a wrapper function to drive the lex function in said file.
 *
 * Use this header in separate parser logic to access the token values
 * if relevant.
 */

#ifndef LUTHOR_H
#define LUTHOR_H

#ifdef LEX_KEYWORDS
#include <string.h> /* memcmp for kw match */
#endif

#include "tokens.h"

#ifndef lex_emit
#define lex_emit(token, first, last) ((void)0)
#endif

/*
 * Default for comments, bom, and other things that are not necessarily
 * of interest to the parser, but may be to buffer wrap handling,
 * debugging, and pretty printers.
 */
#ifndef lex_emit_other
#define lex_emit_other(token, first, last) ((void)0)
#endif

#ifndef lex_emit_eof
#define lex_emit_eof(pos) lex_emit(LEX_TOK_EOF, pos, pos)
#endif

#ifndef lex_emit_abort
#define lex_emit_abort(pos) lex_emit(LEX_TOK_ABORT, pos, pos)
#endif

#ifndef lex_emit_eob
#define lex_emit_eob(pos) lex_emit(LEX_TOK_EOB, pos, pos)
#endif

#ifndef lex_emit_eos
#define lex_emit_eos(first, last) lex_emit(LEX_TOK_EOS, first, last)
#endif

#ifndef lex_emit_bom
#define lex_emit_bom(first, last) lex_emit_other(LEX_TOK_BOM, first, last)
#endif

#ifndef lex_emit_id
#ifdef LEX_KEYWORDS
/* LEX_KW_TABLE_BEGIN .. LEX_KEYWORD_TABLE_END defines lex_match_kw. */
#define lex_emit_id(first, last, tag) lex_emit(lex_match_kw(tag, first), first, last)
#else
#define lex_emit_id(first, last, tag) lex_emit(LEX_TOK_ID, first, last)
#endif
#endif

/*
 * This is a default for unknown symbols. It may be treated as an error,
 * or it can be processed further by the parser instead of customizing
 * the lexer. It ensures that there is always a token for every part of
 * the input stream.
 */
#ifndef lex_emit_symbol
#define lex_emit_symbol(token, first, last) lex_emit(LEX_TOK_SYMBOL, first, last)
#endif

/*
 * Control characters 0x01 .. 0x1f, 0x7f(DEL), excluding \0\r\n\t which have
 * separate tokens.
 *
 * Control characters in strings and comments are passed on as body
 * elements, except \0\r\n which breaks the string up.
 */
#ifndef lex_emit_ctrl
#define lex_emit_ctrl(pos) lex_emit(LEX_TOK_CTRL, pos, pos + 1)
#endif

#ifndef lex_emit_string_ctrl
#define lex_emit_string_ctrl(pos) lex_emit(LEX_TOK_STRING_CTRL, pos, pos + 1)
#endif

#ifndef lex_emit_comment_ctrl
#define lex_emit_comment_ctrl(pos) lex_emit_other(LEX_TOK_COMMENT_CTRL, pos, pos + 1)
#endif

/*
 * This enables user to both count lines, and to calculate character
 * offset for subsequent lexemes. New line starts a lexeme, line break
 * symbol is located at lexeme - skipped and with have length 2 if \r\n
 * or \n\r break, and 1 otherwise.
 */
#ifndef lex_emit_newline
#define lex_emit_newline(first, last) lex_emit(LEX_TOK_NEWLINE, first, last)
#endif

#ifndef lex_emit_string_newline
#define lex_emit_string_newline(first, last) lex_emit(LEX_TOK_STRING_NEWLINE, first, last)
#endif

#ifndef lex_emit_int
#define lex_emit_int(first, last) lex_emit(LEX_TOK_INT, first, last)
#endif

#ifndef lex_emit_float
#define lex_emit_float(first, last) lex_emit(LEX_TOK_FLOAT, first, last)
#endif

#ifndef lex_emit_int_suffix
#define lex_emit_int_suffix(first, last) lex_emit(LEX_TOK_INT_SUFFIX, first, last)
#endif

#ifndef lex_emit_float_suffix
#define lex_emit_floatint_suffix(first, last) lex_emit(LEX_TOK_FLOAT_SUFFIX, first, last)
#endif

#ifndef lex_emit_binary
#define lex_emit_binary(first, last) lex_emit(LEX_TOK_BINARY, first, last)
#endif

#ifndef lex_emit_octal
#define lex_emit_octal(first, last) lex_emit(LEX_TOK_OCTAL, first, last)
#endif

#ifndef lex_emit_hex
#define lex_emit_hex(first, last) lex_emit(LEX_TOK_HEX, first, last)
#endif

#ifndef lex_emit_hex_float
#define lex_emit_hex_float(first, last) lex_emit(LEX_TOK_HEX_FLOAT, first, last)
#endif

/*
 * The comment token can be used to aid backtracking during buffer
 * switch.
 */
#ifndef lex_emit_comment_begin
#define lex_emit_comment_begin(first, last, is_doc)                         \
    lex_emit_other(LEX_TOK_COMMENT_BEGIN, first, last)
#endif

#ifndef lex_emit_comment_part
#define lex_emit_comment_part(first, last) lex_emit_other(LEX_TOK_COMMENT_PART, first, last)
#endif

#ifndef lex_emit_comment_end
#define lex_emit_comment_end(first, last) lex_emit_other(LEX_TOK_COMMENT_END, first, last)
#endif

#ifndef lex_emit_comment_unterminated
#define lex_emit_comment_unterminated(pos)                                  \
        lex_emit_other(LEX_TOK_COMMENT_UNTERMINATED, pos, pos)
#endif

#ifndef lex_emit_comment_deeply_nested
#define lex_emit_comment_deeply_nested(pos)                                 \
        lex_emit_other(LEX_TOK_COMMENT_DEEPLY_NESTED, pos, pos)
#endif

#ifndef lex_emit_string_begin
#define lex_emit_string_begin(first, last) lex_emit(LEX_TOK_STRING_BEGIN, first, last)
#endif

#ifndef lex_emit_string_part
#define lex_emit_string_part(first, last) lex_emit(LEX_TOK_STRING_PART, first, last)
#endif

#ifndef lex_emit_string_end
#define lex_emit_string_end(first, last) lex_emit(LEX_TOK_STRING_END, first, last)
#endif

#ifndef lex_emit_string_escape
#define lex_emit_string_escape(first, last) lex_emit(LEX_TOK_STRING_ESCAPE, first, last)
#endif

#ifndef lex_emit_string_unterminated
#define lex_emit_string_unterminated(pos)                                   \
        lex_emit(LEX_TOK_STRING_UNTERMINATED, pos, pos)
#endif

#ifndef lex_emit_blank
#define lex_emit_blank(first, last)                                         \
        lex_emit_other(LEX_TOK_BLANK, first, last)
#endif

#ifndef lex_emit_op
#define lex_emit_op(op, first, last)  lex_emit((long)(op), first, last)
#endif

#ifndef lex_emit_compound_op
#define lex_emit_compound_op(op1, op2, first, last)                         \
        lex_emit(((long)(op1) | ((long)(op2) << 8)), first, last)
#endif

#ifndef lex_emit_tricompound_op
#define lex_emit_tricompound_op(op1, op2, op3, first, last)                 \
        lex_emit(((long)(op1) | ((long)(op2) << 8)) |                       \
                ((long)(op3)<<16), first, last)
#endif

#ifndef lex_emit_quadcompound_op
#define lex_emit_quadcompound_op(op1, op2, op3, op4, first, last)           \
        lex_emit(((long)(op1) | ((long)(op2) << 8)) |                       \
                ((long)(op3) << 16) | ((long)(op4) << 24), first, last)
#endif

/* Used to limit number of nested comment level. */
#ifndef LEX_MAX_NESTING_LEVELS
#define LEX_MAX_NESTING_LEVELS 100
#endif


/* Keyword handling macros, see `keywords.c` for an example usage. */
#ifdef LEX_KEYWORDS

/*
 * This implements a switch statement branching on the 4 character
 * keyword tag (unsigned long value) which is produced by the lexers id
 * recognizer. A final check is needed with to ensure an exact
 * match with a given id. Two keywords rarely conflicts, but it is
 * possible, and therefore kw_begin kw_match kw_match ... kw_end is used
 * to cover this.
 *
 * See example usage elsewhere for details.
 *
 * The first element x0 is length '0'..'9' and ensure comparisons will
 * not overrun the buffer where the lexeme is stored during string
 * comparison, iff the keywords report the length correctly.
 *
 * The next elements in the tag are the first, second, and last
 * character of lexeme / keyword, replacing second character with '\0'
 * on single length keywords, so keyword 'e' is tagged '1', 'e', '\0', 'e',
 * and 'while' is tagged '5' 'w', 'h', 'e', where the length is lsb
 * and last chararacter is msb.
 *
 * An enum with tok_kw_<name> elements is expected to provide return
 * values on match. These should start at LEX_TOK_KW_BASE and are
 * negative.
 *
 */
#define lex_kw_begin(x0, x1, x2, x3)                                    \
        case                                                            \
            ((unsigned long)(x0) |                                      \
            ((unsigned long)(x1) << 8) |                                \
            ((unsigned long)(x2) << 16) |                               \
            ((unsigned long)(x3) << 24)) :

#define lex_kw_match(kw)                                                \
        if (memcmp(#kw, lexeme, sizeof(#kw) - 1) == 0)                  \
                return tok_kw_##kw;

#define lex_kw_end()                                                    \
        break;

#define lex_kw(kw, x0, x1, x2, x3)                                      \
        lex_kw_begin(x0, x1, x2, x3)                                    \
            lex_kw_match(kw)                                            \
        lex_kw_end()

static long lex_match_kw(unsigned long tag, const char *lexeme);

/* Static so multiple grammers are possible in a single program. */
#define LEX_KW_TABLE_BEGIN                                              \
static long lex_match_kw(unsigned long tag, const char *lexeme)         \
{                                                                       \
    switch (tag) {                                                      \

#define LEX_KW_TABLE_END                                                \
    default:                                                            \
        break;                                                          \
    }                                                                   \
    return LEX_TOK_KW_NOT_FOUND;                                        \
}

#else

/* Allow flagging in and out without unused warning or missing macros */
#define lex_kw_begin(x0, x1, x2, x3)
#define lex_kw_match(kw)
#define lex_kw_end()
#define lex_kw(kw, x0, x1, x2, x3)
#define LEX_KEYWORD_TABLE_BEGIN
#define LEX_KEYWORD_TABLE_END

#endif /* LEX_KEYWORDS */



/*
 * Modes used for recovery when switching to a new buffer and handling
 * internal state changes for strings and comments.
 */
enum {
    /* Always 0, is initial lexer state. */
    LEX_MODE_NORMAL = 0,

    /* Returned if lex is given unsupported mode. */
    LEX_MODE_INVALID = 1,

    /*
     * Can be used in place of normal mode to consume optional bom
     * marker at buffer start. Only utf-8 bom is supported.
     */
    LEX_MODE_BOM,

    /*
     * Returned at end of buffer if mid string or mid comment, may also
     * be larger for nested comments as nesting level is encoded.
     */
    LEX_MODE_C_STRING,
    LEX_MODE_C_STRING_SQ,
    LEX_MODE_PYTHON_BLOCK_STRING,
    LEX_MODE_PYTHON_BLOCK_STRING_SQ,
    LEX_MODE_C_BLOCK_COMMENT,
    LEX_MODE_LINE_COMMENT,
    LEX_MODE_JULIA_NESTED_COMMENT,


    /* Counter embedded in mode. */
    LEX_MODE_COUNT_BASE = 16,
};



/* ON CALLING AND USING LEX FUNCTION
 *
 * If utf-8 BOM possible, detect this before calling the lexer and
 * advance the buffer. JSON explititly disallows BOM, but recommends
 * consuming it if present. If some other Unicode BOM is found, convert
 * the buffer first. The lexer assumes ALL non-ascii characters are
 * valid trailing identifiers which mostly works well. Strings with
 * broken utf-8 are passed on as is. utf-8 identifiers must be enabled
 * with #define LEX_ENABLE_UTF8_ID
 *
 * If required, postprocess identifiers and strings for valid utf-8.  It
 * is assumed that all keywords are at most 9 characters long and always
 * ASCII. Otherwise post process them in a hash table on identifier
 * event. This enables a fast compiled trie lookup of keywords.
 *
 * Newline and control characters are always emitted, also inside
 * strings and comments. The exception is \r, \n, \t, \0 which are
 * handled specially, or if the lexer is adapted to handle certain
 * control characters specially.
 *
 * Each token is not guaranteed correct, only to be delimited correct,
 * if it is indeed correct. Only very few tokens can be zero length, for
 * example, the parser can rely on string part token not being empty
 * which is important in dealing with line continuation. The end of
 * buffer token is empty, and so is the unterminates string token, and
 * also the comment end token for single line tokens, but not the
 * multi-line version. There is a token for every part of the input
 * stream, but the parser can easily define some to be ignored and have
 * them optimized out.
 *
 * Strings have start token, and optionally sequences of control,
 * escape, and newline tokens, followed by either string end token or
 * string unterminated token. Strings delimiters can be one
 * (single-line) or three double quotes (multi-line, like python, but
 * cannot be single quotes, unlike Python. Python, C and Javascript
 * string continuation is handled by having the parser observing string
 * escape followed by newline token. Escape is always a single
 * character '\' token, and the parser is responsible for consuming the
 * following content. If string syntax with double delimiter is used to
 * define escaped delimiter, this will occur as two separate strings
 * with no space between. The parser can handle this on its own; if, in
 * such strings, '\"' does not mean escaped delimiter, the string will
 * not terminate correctly, and the lexer must be apapted. Unterminated
 * string may happen at end of buffer, also for single line comments.
 * This is because the string might continue in a new buffer. The parser
 * should deal with this.
 *
 * Comments always start with a start token, followed by zero or more
 * comment part tokens interleaved with control and newline tokens,
 * terminated by either comment end token, or unterminated comment
 * token. If the comment is single, the unterminated comment token may
 * appear at the last line instead of the expected end of comment token
 * because the comment might continue in a new buffer. The parser
 * should deal with this. Escapes and line continuations have no effects
 * in comments, unlike strings.
 *
 * The lexer only carries one state variable: the mode. The mode can be
 * normal (default and equals zero), or single or multi string or
 * comment modes.  These modes are used to to recover after switching
 * buffers as discussed below.
 *
 * The lexer can run to completion without involving the parser and
 * could be used to pipeline tokens into another thread for concurrent
 * parsing which is safe since the input buffer is considered read-only.
 *
 *
 * KEYWORDS
 *
 * Keywords are treated as identifiers by default. By including a
 * keyword table the `lex_emit_id` macro will check if the id is a
 * keyword and translate the token if it is. Using the provided keyword
 * table macros is just one way to do it. This is better explained by
 * looking at an example. Keyword lookup based on the precomputed keyword
 * tag provided to the lookup function are limited to 9 characters, but a
 * custom lookup function need not use it and then the tag precomputation
 * will be optimized out.
 *
 * Keywords are defined by the lookup function and should be negative
 * starting at LEX_TOK_KW_BASE to avoid conflicts with other token types.
 *
 *
 * WRAPPING MULTIPLE BUFFERS
 *
 * The user may need to deal with multiple buffers because data may
 * arrive asynchronously over the network, and may have many concurrent
 * lexing jobs. The emitter part is not difficult since a ring buffer
 * can grow, or the parser can be called directly (except queuing a few
 * tokens for backtracking as we shall see).
 *
 * If the lexer were an explicit statemachine as in Flex, we could get
 * an yywrap event to fill buffers, but our state is on the stack and in
 * registers for optimization. We may use co-routines, but it doesn't
 * cover all issues, and, as it turns out is not necessary with the
 * following restrictions on syntax:
 *
 * All variable length tokens such as numerics and identifiers are
 * limited in length. Strings and comments are not, but are broken into
 * zero, one, or several body tokens per line. ANSI-C limits line length
 * to 509 characters (allowing for continuation and two byte linebreaks
 * in a 512 byte buffer). But JSON has no line continuation for strings
 * and may (and often do) store everything on a single line. Whitespace
 * can also extend beyond given limit.
 *
 * If we ignore whitespace, strings and comments, we can discard the
 * last token (or last two in case there are paired tokens, such as
 * leading zero followed by numeric. Parsing can then resume in a new
 * buffer where the first 512 bytes (or similar) are duplicated from the
 * previous buffer. The lexer is then restarted at the last token (pair)
 * start which may turn out to change the length or even introduce a
 * different result such introducing leading zero. The lexer need no
 * specific state to do this.
 *
 * For strings and comments, we need a flag to allow entering the lexer
 * mid string or mid comment. The newline and line continuation tokens
 * need to be dropped, and the last body may need to be truncated as it
 * can embed a partial delimiter. The simplest way to deal with this is
 * to backtrack tokens until the last token begins at a safe position,
 * about 3-6 charaters earlier, and truncating body segments that span
 * this barrier. Whitespace can also be truncated.
 *
 * We can generalize this further by going at least K bytes back in an N
 * overlap buffer region and require non-strings (and non-comments) to
 * not exceed N-K bytes, where K and N are specific to the syntax and
 * the I/O topology.
 *
 * We can add flags to tokens that can help decide how to enter
 * backtracking mode without covering every possible scanner loop - i.e.
 * are we mid string, mid comment, single-line or multi-line.
 *
 * All the lexer needs to do then, is to receive the backtracking mode
 * flags. A wrapping driver can deal with backtrack logic, which is
 * specific to how tokens are emitted. Whitespace need no recovery mode
 * but perhaps new whitespace should extend existing to simplify
 * parsing.
 */


#endif /* LUTHOR_H */