1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
|
#ifndef LEX_TOKENS_H
#define LEX_TOKENS_H
/* Define LEX_DEBUG to enable token printing and describing functions. */
enum {
/*
 * EOF is not emitted by lexer, but may be used by driver after
 * last buffer is processed.
 */
LEX_TOK_EOF = 0,
/*
 * Either EOB or EOS is emitted as the last token before exit,
 * or also ABORT in some lexers. An unterminated string or comment
 * will be emitted immediately before one of these when relevant.
 *
 * It may be useful to redefine lex_emit_eos and lex_emit_eob to
 * produce LEX_TOK_EOF or error directly for simple string lexing.
 */
LEX_TOK_EOB = 1,
LEX_TOK_EOS = 2,
/*
 * ABORT can be used for early exit by some lexers while other
 * lexers may choose to run to buffer end regardless of input (with
 * the exception of deeply nested comments).
 */
LEX_TOK_ABORT = 3,
/*
 * Byte order marker. Only happens if the lexer was started in bom
 * mode and the input stream contains a leading bom marker.
 * The token can only be the first token in the stream. Utf-8 is the
 * only supported bom, but the lexeme may be checked in case other
 * boms are added later. Normally it is routed to lex_emit_other
 * along with comments so it just ignores the bom if present. It is
 * generally recommended to consume utf-8 bom for interoperability,
 * but also to not store it for the same reason.
 */
LEX_TOK_BOM,
/*
 * Any control character that is not newline or blank will be
 * emitted as single character token here. This token is discussed
 * in several comments below. For strings and comments, also
 * blank control characters will be emitted since they are usually
 * unexpected and not desired there.
 */
LEX_TOK_CTRL,
LEX_TOK_STRING_CTRL,
LEX_TOK_COMMENT_CTRL,
/*
 * Any printable ASCII character that is not otherwise consumed will
 * be issued as a single length symbol token. Further discussion
 * below. The symbol and CTRL tokens ensure that the entire input
 * stream is covered by tokens. If utf-8 identifiers have not been
 * flagged, utf-8 leading characters may also end up here, and so
 * may utf-8 characters in general, that are not viewed as valid
 * identifiers (depending on configuration).
 */
LEX_TOK_SYMBOL,
/*
 * Variable length identifier starting with (_A-Za-z) by default and
 * followed by zero or more (_A-Za-z0-9) characters. (_) can be
 * flagged out. utf-8 can be flagged in. By default any non-ASCII
 * character (0x80 and above), is treated as part of an identifier
 * for simplicity and speed, but this may be redefined. Any broken
 * utf-8 is not sanitized, thus 0x80 would be a valid identifier
 * token with utf-8 identifiers enabled, and otherwise it would be a
 * symbol token.
 *
 * The ID does a magic trick: It maps the lexeme to a very simple
 * and fast 32 bit hash code called a tag. The tag is emitted with
 * the id token and can be used for fast keyword lookup. The
 * hash tag is:
 *
 * (length)(first char)(second char)(last char)
 *
 * where length is ASCII '0' .. '9' where any length overflow is an
 * arbitrary value, but such that the length is never longer than
 * the lexeme. The last char is the last char regardless of length.
 * For short identifiers, the second char may be the first char
 * duplicated, and the last char may be first char.
 *
 * This code is very simple to write by hand: "5whe" means while,
 * and can be used in a case switch before a strcmp with "while".
 * Conflicts are possible, but then several keywords are tested like
 * any other hash conflict. This keyword lookup is user driven, but
 * can follow example code quite straightforward.
 *
 * The lex_emit_id macro can be implemented to provide the above
 * lookup and inject a keyword token instead. By convention such
 * tokens have negative values to avoid conflicts with lexer
 * generated tokens.
 *
 * The ID also has a special role in prefixes and suffixes: C string
 * literals like (L"hello") and numeric literals like (42f) are
 * lexed as two tokens, one of which is an ID. The parser must
 * process this and observe absence of whitespace where such syntax
 * is relevant.
 *
 * While not specific to ID, the emitter macros can be designed to
 * keep track of start of lines and end of whitespace and attach
 * state flags to each token (at line start, after whitespace). The
 * whitespace tokens can then be dropped. This might help parsing
 * things like suffixes efficiently.
 */
LEX_TOK_ID,
/*
 * C-int ::= pos-dec-digit dec-digit *
 * Julia-int ::= dec-digit+
 *
 * pos-dec-digit ::= '1'..'9'
 * dec-digit ::= '0'..'9'
 *
 * Floating point numbers take precedence when possible so 00.10 is
 * always a decimal floating point value when decimal floats are
 * enabled.
 *
 * The C-int is automatically enabled if C-octals are enabled, and
 * disabled otherwise. There is no specific Julia-int type - we just
 * use the terminology to represent integers with leading zeroes.
 *
 * Julia style integers accept leading zeroes. C style integers with
 * leading zeroes are consumed as C style octal numbers, so 0019 is
 * parsed as either 0019(Julia-int), or 001(C-octal), 9(C-int).
 *
 * Single digit '0' maps to octal when C-octals are enabled and to
 * Julia-int otherwise. (Yes, integers are not that simple, it
 * seems).
 *
 * Both C and Julia octal numbers (see octal token) can be active
 * simultaneously. This can be used to control leading zero
 * behavior, even if C-octal numbers are not part of the grammar
 * being parsed. For example, a language might use 0o777 octal
 * numbers and disallow 0777 integers. Enabling C-octals makes this
 * easy to detect (but should accept octal 0).
 *
 * There is no distinction between the styles in the int token, but
 * leading zeroes are easily detected in the lexeme.
 *
 * Constant suffixes like 1L are treated as 1(INT), and L(ID). The
 * same goes for other numeric values.
 *
 * Parser should check for leading zeroes and decide if it is valid,
 * a warning, or an error (it is in JSON). This also goes for float.
 *
 * Numericals, not limited to INT, may appear shorter than they are
 * due to buffer splits. Special recovery is required, but will only
 * happen just before EOS or EOB tokens (i.e. buffer split events).
 */
LEX_TOK_INT,
/*
 * float ::= (int ['.' dec-digits*] dec-exponent)
 * | ([int] '.' dec-digits* [dec-exponent])
 * dec-exponent ::= ('e' | 'E') ['+' | '-'] dec-digits*
 * dec-digits ::= '0'..'9'
 * int ::= dec-digits*
 *
 * Consumes a superset of C float representation without suffix.
 * Some invalid tokens such as 0.E are accepted. Valid tokens such
 * as 00.10 take precedence over octal numbers even if it is a
 * prefix, and the same is obviously true with respect to decimal
 * integers.
 *
 * JSON does not allow leading zeroes, and also not leading '.'.
 * This can easily be checked in the lexeme.
 *
 * The octal notation affecting integer leading zeroes is not
 * relevant to floats because floats take precedence over octal and
 * decimal int when containing '.', 'e' or 'E'.
 */
LEX_TOK_FLOAT,
/*
 * binary ::= (0b | 0B) ('0' | '1')*
 *
 * 0b100 or just 0b, parser must check that digits are present,
 * otherwise it may be interpreted as zero, just like octal zero
 * in C.
 *
 * Like 0X hex, 0B can be flagged out because Julia v0.3 does not
 * support uppercase 0B.
 */
LEX_TOK_BINARY,
/*
 * C-octal ::= 0 octal-digit*
 * octal-digits ::= '0'..'7'
 *
 * Julia-octal ::= 0o octal-digits*
 * octal-digits ::= '0'..'7'
 *
 * 0777 for C style octal numbers, or 0o777 for Julia syntax. Julia
 * v.0.3 does not allow uppercase 0O777, it would mean 0 * O777.
 *
 * When enabled, decimal floating points take precedence: 00.10 is
 * parsed as 00.10(decimal float), as per C standard.
 *
 * NOTE: It is possible for both styles to be active simultaneously.
 * This may be relevant in order to control handling of leading
 * zeroes in decimal integers.
 *
 * If C-octal numbers are flagged out, leading zeroes are mapped to
 * integers and the numerical value may change. Julia behaves this
 * way. Nothing prevents support of both C and Julia octal numbers,
 * but leading zeroes will then be interpreted the C way - it is not
 * recommended to do this.
 */
LEX_TOK_OCTAL,
/*
 * hex ::= hex-int
 * hex-digits ::= 'a'..'f'| 'A'..'F' | '0'..'9'
 * hex-int ::= (0x | 0X) hex-digits*
 *
 * where hex-digits are customizable (e.g. all lower case), and hex
 * prefix 0x can be flagged to be lower case only (as in Julia).
 *
 * If hex floats are enabled, they take precedence:
 * 0x1.0(hex-float), if not, 0x1.0 will parse as: 0x1(hex) followed
 * by .0(decimal float).
 *
 * The lead prefix 0x may be flagged to be lower case only because
 * this is required by Julia v0.3 where 0X means 0 * X. Julia
 * accepts uppercase in the remaining hex digits (and exponent for
 * floats). This could possibly change in future versions.
 *
 * The zero length sequence (0x | 0X) is accepted and left to the
 * parser since the lexer emits a token for everything it sees.
 * Conceptually it may be interpreted as zero, equivalent to 0 being
 * both octal prefix and numeric 0 in C style octal representation.
 * Or it may be an error.
 */
LEX_TOK_HEX,
/*
 * hex-float ::= hex-int ['.' hex-digit*] hex-exponent
 * hex-exponent ::= ('p' | 'P') ['+' | '-'] decimal-digit*
 * decimal-digit ::= '0'..'9'
 *
 * A superset of IEEE-754-2008 Hexadecimal Floating Point notation.
 *
 * We require the exponent to be present, but do not ensure the
 * value is otherwise complete, e.g. 0x1p+ would be accepted. The p
 * is needed because otherwise 0x1.f could be accepted, and f is a
 * float suffix in C, and juxtaposition factor (0x1. * f) in Julia,
 * at least, that is one possible interpretation.
 *
 * The exponent can be flagged optional in which case 0x1.f will be
 * consumed as a single hex float token.
 * This may either simply be accepted in some grammars, or used to
 * provide an error message. If the exponent is required, 0x1.f will
 * be lexed as three tokens:
 *
 * <'0x1'(hex int), '.'(op), 'f'(id)>.
 *
 * Thus it may be a good idea to allow the exponent to be optional
 * anyway and issue an error message or warning if the p is absent
 * later in the parsing stage.
 *
 * Note that, as per IEEE-754, the exponent is a decimal power of
 * two. In other words, the number of bits to shift the
 * (hexa)decimal point. Also note that it is p and not e because e
 * is a hex digit.
 */
LEX_TOK_HEX_FLOAT,
/*
 * blank ::= ('\t' | '\x20')+
 *
 * Longest run in buffer holding only '\t' and '\x20' (space).
 *
 * buffer splits may generate adjacent blanks depending on recovery
 * processing. (The same goes for other line oriented runs such as
 * string parts and comment parts).
 */
LEX_TOK_BLANK,
/* newline ::= '\r' | '\n' | '\r\n' | '\n\r'
 *
 * Will always appear, also inside strings and comments. Can be used
 * to track line starts and counts reliably as only one newline is
 * issued at a time, and it is issued everywhere, also in strings
 * and comments.
 *
 * May be preceded by string escape token inside strings. This can
 * be interpreted as line continuation within strings specifically,
 * as is the case in Python and Javascript (and in C via
 * pre-processor).
 *
 * The LEX_TOK_STRING_NEWLINE is emitted inside strings so the ordinary
 * newline may be ignored in comments and other non-string content.
 */
LEX_TOK_NEWLINE,
LEX_TOK_STRING_NEWLINE,
/*
 * string ::= string_start
 * (string_part | string_escape |
 * string_ctrl | string_newline)*
 * (string_end | string_unterminated)
 *
 * There are several optional string styles. They all start with
 * this token. The length and content provide details. Python
 * may start with """ or ''' and this token will then have length
 * 3 and three quotes as lexeme content. If the lexer exits before
 * string end token, the returned lexer mode will remember the
 * state and can be used for reentry - this also goes for comments.
 *
 * Strings can only contain part, escape, newline, and control
 * tokens, and either string unterminated or string end token
 * at last.
 */
LEX_TOK_STRING_BEGIN,
/* Longest run without control characters, without (\), without
 * newline, and without the relevant end delimiter. The run may be
 * shortened due to buffer splits. The part may, as an exception,
 * begin with an end delimiter character or a (\) if it was
 * preceded by a string escape token. The escape character is
 * always (\). Strings that use "" or '' as escape will be treated
 * as start and end of separate strings. Strings that do not support
 * (\) should just treat escape as a part of the string.
 */
LEX_TOK_STRING_PART,
/*
 * This is always a single character token (\) and only happens
 * inside strings. See also string part token.
 */
LEX_TOK_STRING_ESCAPE,
/* This token is similar to string start. It may be absent at buffer
 * splits, in which case an unterminated string token is emitted
 * just before the split event token.
 *
 * */
LEX_TOK_STRING_END,
/*
 * This is emitted before the buffer ends, or before unescaped
 * newlines for line oriented string types (the usual strings).
 * At buffer splits, recovery should clean it up. The returned
 * mode allow parsing to continue in a new buffer with a slight
 * content overlap.
 *
 * If a string like ("hello, world!") in C, reaches end of line, it
 * may be continued: ("hello, \)newline(world!"). If this line
 * continuation is flagged out, this will lead to string
 * unterminated, even if not at end of buffer. For block strings
 * like """hello""", this only happens at end of buffer.
 */
LEX_TOK_STRING_UNTERMINATED,
/*
 *
 * comment ::= comment_start
 * (comment_part | ctrl | newline)*
 * (comment_end | comment_unterminated)
 *
 *
 * Comments work like strings in most respects. They emit parts, and
 * control characters, but not escape characters, and cannot be
 * continued at end of line. Block comments are like python block
 * strings (''').
 *
 * Julia supports nested comments (#= ... #= =# =#). In this case
 * a new start token can be emitted before an end token. If the
 * parser exits due to buffer split, the mode has the nesting level
 * encoded so it can be resumed in a new buffer.
 *
 * Line comments will have their end token just before newline, or
 * unterminated comment just before buffer split token (EOB or EOS).
 * (\) characters are consumed by the comment part tokens and do not
 * affect the end of any comment.
 *
 * Comment begin may include extra characters when a doc comment is
 * recognized. The emitter flags this. End comments are unaffected.
 */
LEX_TOK_COMMENT_BEGIN,
LEX_TOK_COMMENT_PART,
LEX_TOK_COMMENT_END,
LEX_TOK_COMMENT_UNTERMINATED,
/*
 * Issued before ABORT token if nesting level is above a predefined
 * level. This is to protect against malicious and misguided
 * content, otherwise the nesting level counter could wrap and
 * generate a different interpretation, which could be bad. The
 * parser would probably do similar things with nested tokens.
 */
LEX_TOK_COMMENT_DEEPLY_NESTED,
/* Operators are all recognized single character symbols, or up to
 * four characters. The token value is the ASCII codes shifted 8
 * bits per extra character, by default, but the emitter macros
 * can redefine this. Values below 32 are reserved token types as
 * discussed above.
 *
 * What exactly represents an operator depends on what the lexer has
 * enabled.
 *
 * Printable ASCII symbols that are NOT recognized, are emitted as
 * the SYMBOL token and are always length 1. The value can be derived
 * from the lexeme, but not the token itself. This may be perfectly
 * fine for the parser, or it may be used to indicate an error.
 * There are no illegal characters per se.
 *
 * Non-printable ASCII characters that are not covered by newline or
 * blank, are emitted as CTRL tokens. These act the same as the
 * symbol token and may be used to indicate error, or to handle form
 * feed and other whitespace not handled by default. Unlike symbol,
 * however, CTRL also appear in strings and comments since they are
 * generally not allowed and this makes it easy to capture (there is
 * virtually no performance overhead in providing this service
 * unless attempting to parse a binary format).
 */
/* Don't bleed into this range. */
LEX_TOK_OPERATOR_BASE = 32,
/*
 * Operators use ASCII range.
 * Compound operators use range 0x80 to 0x7fff
 * and possibly above for triple sequences.
 * Custom keywords are normally negative but can be mapped
 * to any other.
 *
 * The layout is designed for efficient table lookup.
 * Compound operators might benefit from remapping down to a smaller
 * range for compact lookup tables, but it depends on the parser.
 */
};
/*
 * Custom keyword tokens use a negative range that lies well below
 * -99..0, where special codes are reserved.
 */
#ifndef LEX_TOK_KW_BASE
#define LEX_TOK_KW_BASE -1000
#endif
#ifndef LEX_TOK_KW_NOT_FOUND
#define LEX_TOK_KW_NOT_FOUND LEX_TOK_ID
#endif
#ifdef LEX_DEBUG
#include <stdio.h>
#include <string.h>
/*
 * Map a token value to a short human readable description.
 *
 * Reserved token values (< 32) are matched individually; anything
 * else is classified by range: negative values are keywords,
 * ASCII-range values are operators, and wider values are compound
 * operator sequences.
 */
static const char *lex_describe_token(long token)
{
    switch (token) {
    case LEX_TOK_BOM: return "BOM marker";
    case LEX_TOK_EOF: return "EOF";
    case LEX_TOK_EOS: return "buffer zero terminated";
    case LEX_TOK_EOB: return "buffer exhausted";
    case LEX_TOK_ABORT: return "abort";
    case LEX_TOK_CTRL: return "control";
    case LEX_TOK_STRING_CTRL: return "string control";
    case LEX_TOK_COMMENT_CTRL: return "comment control";
    case LEX_TOK_SYMBOL: return "symbol";
    case LEX_TOK_ID: return "identifier";
    case LEX_TOK_INT: return "integer";
    case LEX_TOK_FLOAT: return "float";
    case LEX_TOK_BINARY: return "binary";
    case LEX_TOK_OCTAL: return "octal";
    case LEX_TOK_HEX: return "hex";
    case LEX_TOK_HEX_FLOAT: return "hex float";
    case LEX_TOK_BLANK: return "blank";
    case LEX_TOK_NEWLINE: return "newline";
    case LEX_TOK_STRING_NEWLINE: return "string newline";
    case LEX_TOK_STRING_BEGIN: return "string begin";
    case LEX_TOK_STRING_PART: return "string part";
    case LEX_TOK_STRING_END: return "string end";
    case LEX_TOK_STRING_ESCAPE: return "string escape";
    case LEX_TOK_STRING_UNTERMINATED: return "unterminated string";
    case LEX_TOK_COMMENT_BEGIN: return "comment begin";
    case LEX_TOK_COMMENT_PART: return "comment part";
    case LEX_TOK_COMMENT_END: return "comment end";
    case LEX_TOK_COMMENT_UNTERMINATED: return "unterminated comment";
    case LEX_TOK_COMMENT_DEEPLY_NESTED: return "deeply nested comment";
    default:
        break;
    }
    /* Negative values are user defined keyword tokens. */
    if (token < LEX_TOK_EOF) {
        return "keyword";
    }
    /* Remaining reserved token slots below the operator base. */
    if (token < 32) {
        return "undefined";
    }
    if (token < 0x100L) {
        return "operator";
    }
    if (token < 0x10000L) {
        return "compound operator";
    }
    if (token < 0x1000000L) {
        return "tricompound operator";
    }
    /*
     * NOTE(review): 0x7f0000000L is a 34-bit constant; on platforms
     * with 32-bit long, token can never reach it so the branch is
     * effectively "everything else" there - confirm the intended
     * upper bound for 4-character operators.
     */
    if (token < 0x7f0000000L) {
        return "quadcompound operator";
    }
    return "reserved";
}
/*
 * Print a token to fp as "line:pos description (0xvalue): `lexeme`".
 *
 * fp          - output stream
 * token       - token value (see enum above)
 * first, last - lexeme boundaries; last is one past the end
 * line, pos   - source position attached to the token
 *
 * For EOS and CTRL tokens the lexeme is replaced by a "^XX" hex
 * escape of the single character so control characters do not
 * corrupt the output stream.
 */
static void lex_fprint_token(FILE *fp,
    long token,
    const char *first, const char *last,
    int line, int pos)
{
    char buf[10];
    const char *lexeme = first;
    int len = (int)(last - first);

    switch (token) {
    case LEX_TOK_EOS:
    case LEX_TOK_CTRL:
        /*
         * Cast through unsigned char: a negative (signed) char would
         * sign-extend and make "%02x" print 8 hex digits, exactly
         * exhausting buf with no margin. snprintf bounds the write
         * regardless.
         */
        snprintf(buf, sizeof(buf), "^%02x", (unsigned char)*first);
        lexeme = buf;
        len = (int)strlen(buf);
        break;
    default:
        break;
    }
    fprintf(fp, "%04d:%03d %s (0x%lx): `%.*s`\n",
            line, pos, lex_describe_token(token), token, len, lexeme);
}
#define lex_print_token(token, first, last, line, pos) \
lex_fprint_token(stdout, token, first, last, line, pos)
#else /* LEX_DEBUG */
#define lex_describe_token(token) "debug not available"
#define lex_fprint_token(fp, token, first, last, line, pos) ((void)0)
#define lex_print_token(token, first, last, line, pos) ((void)0)
#endif /* LEX_DEBUG */
#endif /* LEX_TOKENS_H */
|