diff options
Diffstat (limited to 'external')
52 files changed, 8857 insertions, 0 deletions
diff --git a/external/grisu3/.gitignore b/external/grisu3/.gitignore new file mode 100644 index 0000000..567609b --- /dev/null +++ b/external/grisu3/.gitignore @@ -0,0 +1 @@ +build/ diff --git a/external/grisu3/LICENSE b/external/grisu3/LICENSE new file mode 100644 index 0000000..bb7ca57 --- /dev/null +++ b/external/grisu3/LICENSE @@ -0,0 +1,14 @@ +Copyright (c) 2016 Mikkel F. Jørgensen, dvide.com +Some files also Copyright author of MathGeoLib (https://github.com/juj) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. http://www.apache.org/licenses/LICENSE-2.0 diff --git a/external/grisu3/README.md b/external/grisu3/README.md new file mode 100644 index 0000000..5f5c62e --- /dev/null +++ b/external/grisu3/README.md @@ -0,0 +1,9 @@ +Implements the grisu3 floating point printing and parsing algorithm +based on earlier work: + +- <http://www.cs.tufts.edu/~nr/cs257/archive/florian-loitsch/printf.pdf> +- <https://github.com/google/double-conversion> +- <https://github.com/juj/MathGeoLib/blob/master/src/Math/grisu3.c> +- <http://www.exploringbinary.com/quick-and-dirty-floating-point-to-decimal-conversion/> + + diff --git a/external/grisu3/grisu3_math.h b/external/grisu3/grisu3_math.h new file mode 100644 index 0000000..cff6e8c --- /dev/null +++ b/external/grisu3/grisu3_math.h @@ -0,0 +1,329 @@ +/* + * Copyright (c) 2016 Mikkel F. 
Jørgensen, dvide.com + * Copyright author of MathGeoLib (https://github.com/juj) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. http://www.apache.org/licenses/LICENSE-2.0 + */ + +/* 2016-02-02: Updated by mikkelfj + * + * Extracted from MathGeoLib grisu3.c, Apache 2.0 license, and extended. + * + * This file is usually included via grisu3_print.h or grisu3_parse.h. + * + * The original MathGeoLib dtoa_grisu3 implementation is largely + * unchanged except for the uint64 to double cast. The remaining changes + * are file structure, name changes, and new additions for parsing: + * + * - Split into header files only: + * grisu3_math.h, grisu3_print.h, (added grisu3_parse.h) + * + * - names prefixed with grisu3_, grisu3_diy_fp_, GRISU3_. + * - added static to all functions. + * - disabled clang unused function warnings. + * - guarded <stdint.h> to allow for alternative impl. + * - added extra numeric constants needed for parsing. + * - added dec_pow, cast_double_from_diy_fp. + * - changed some function names for consistency. + * - moved printing specific grisu3 functions to grisu3_print.h. + * - changed double to uint64 cast to avoid aliasing. + * - added new grisu3_parse.h for parsing doubles. + * - grisu3_print_double (dtoa_grisu3) format .1 as 0.1 needed for valid JSON output + * and grisu3_parse_double wouldn't consume it. + * - grisu3_print_double changed formatting to prefer 0.012 over 1.2e-2. 
+ * + * These changes make it possible to include the files as headers only + * in other software libraries without risking name conflicts, and to + * extend the implementation with a port of Googles Double Conversion + * strtod functionality for parsing doubles. + * + * Extracted from: rev. 915501a / Dec 22, 2015 + * <https://github.com/juj/MathGeoLib/blob/master/src/Math/grisu3.c> + * MathGeoLib License: http://www.apache.org/licenses/LICENSE-2.0.html + */ + +#ifndef GRISU3_MATH_H +#define GRISU3_MATH_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* Guarded to allow inclusion of pstdint.h first, if stdint.h is not supported. */ +#ifndef UINT8_MAX +#include <stdint.h> /* uint64_t etc. */ +#endif + +#ifdef GRISU3_NO_ASSERT +#undef GRISU3_ASSERT +#define GRISU3_ASSERT(x) ((void)0) +#endif + +#ifndef GRISU3_ASSERT +#include <assert.h> /* assert */ +#define GRISU3_ASSERT(x) assert(x) +#endif + +#ifdef _MSC_VER +#pragma warning(disable : 4204) /* nonstandard extension used : non-constant aggregate initializer */ +#endif + +#define GRISU3_D64_SIGN 0x8000000000000000ULL +#define GRISU3_D64_EXP_MASK 0x7FF0000000000000ULL +#define GRISU3_D64_FRACT_MASK 0x000FFFFFFFFFFFFFULL +#define GRISU3_D64_IMPLICIT_ONE 0x0010000000000000ULL +#define GRISU3_D64_EXP_POS 52 +#define GRISU3_D64_EXP_BIAS 1075 +#define GRISU3_D64_DENORM_EXP (-GRISU3_D64_EXP_BIAS + 1) +#define GRISU3_DIY_FP_FRACT_SIZE 64 +#define GRISU3_D_1_LOG2_10 0.30102999566398114 /* 1 / lg(10) */ +#define GRISU3_MIN_TARGET_EXP -60 +#define GRISU3_MASK32 0xFFFFFFFFULL +#define GRISU3_MIN_CACHED_EXP -348 +#define GRISU3_MAX_CACHED_EXP 340 +#define GRISU3_CACHED_EXP_STEP 8 +#define GRISU3_D64_MAX_DEC_EXP 309 +#define GRISU3_D64_MIN_DEC_EXP -324 +#define GRISU3_D64_INF GRISU3_D64_EXP_MASK + +#define GRISU3_MIN(x,y) ((x) <= (y) ? (x) : (y)) +#define GRISU3_MAX(x,y) ((x) >= (y) ? 
(x) : (y)) + + +typedef struct grisu3_diy_fp +{ + uint64_t f; + int e; +} grisu3_diy_fp_t; + +typedef struct grisu3_diy_fp_power +{ + uint64_t fract; + int16_t b_exp, d_exp; +} grisu3_diy_fp_power_t; + +typedef union { + uint64_t u64; + double d64; +} grisu3_cast_double_t; + +static uint64_t grisu3_cast_uint64_from_double(double d) +{ + grisu3_cast_double_t cd; + cd.d64 = d; + return cd.u64; +} + +static double grisu3_cast_double_from_uint64(uint64_t u) +{ + grisu3_cast_double_t cd; + cd.u64 = u; + return cd.d64; +} + +#define grisu3_double_infinity grisu3_cast_double_from_uint64(GRISU3_D64_INF) +#define grisu3_double_nan grisu3_cast_double_from_uint64(GRISU3_D64_INF + 1) + +static const grisu3_diy_fp_power_t grisu3_diy_fp_pow_cache[] = +{ + { 0xfa8fd5a0081c0288ULL, -1220, -348 }, + { 0xbaaee17fa23ebf76ULL, -1193, -340 }, + { 0x8b16fb203055ac76ULL, -1166, -332 }, + { 0xcf42894a5dce35eaULL, -1140, -324 }, + { 0x9a6bb0aa55653b2dULL, -1113, -316 }, + { 0xe61acf033d1a45dfULL, -1087, -308 }, + { 0xab70fe17c79ac6caULL, -1060, -300 }, + { 0xff77b1fcbebcdc4fULL, -1034, -292 }, + { 0xbe5691ef416bd60cULL, -1007, -284 }, + { 0x8dd01fad907ffc3cULL, -980, -276 }, + { 0xd3515c2831559a83ULL, -954, -268 }, + { 0x9d71ac8fada6c9b5ULL, -927, -260 }, + { 0xea9c227723ee8bcbULL, -901, -252 }, + { 0xaecc49914078536dULL, -874, -244 }, + { 0x823c12795db6ce57ULL, -847, -236 }, + { 0xc21094364dfb5637ULL, -821, -228 }, + { 0x9096ea6f3848984fULL, -794, -220 }, + { 0xd77485cb25823ac7ULL, -768, -212 }, + { 0xa086cfcd97bf97f4ULL, -741, -204 }, + { 0xef340a98172aace5ULL, -715, -196 }, + { 0xb23867fb2a35b28eULL, -688, -188 }, + { 0x84c8d4dfd2c63f3bULL, -661, -180 }, + { 0xc5dd44271ad3cdbaULL, -635, -172 }, + { 0x936b9fcebb25c996ULL, -608, -164 }, + { 0xdbac6c247d62a584ULL, -582, -156 }, + { 0xa3ab66580d5fdaf6ULL, -555, -148 }, + { 0xf3e2f893dec3f126ULL, -529, -140 }, + { 0xb5b5ada8aaff80b8ULL, -502, -132 }, + { 0x87625f056c7c4a8bULL, -475, -124 }, + { 0xc9bcff6034c13053ULL, -449, -116 }, + { 
0x964e858c91ba2655ULL, -422, -108 }, + { 0xdff9772470297ebdULL, -396, -100 }, + { 0xa6dfbd9fb8e5b88fULL, -369, -92 }, + { 0xf8a95fcf88747d94ULL, -343, -84 }, + { 0xb94470938fa89bcfULL, -316, -76 }, + { 0x8a08f0f8bf0f156bULL, -289, -68 }, + { 0xcdb02555653131b6ULL, -263, -60 }, + { 0x993fe2c6d07b7facULL, -236, -52 }, + { 0xe45c10c42a2b3b06ULL, -210, -44 }, + { 0xaa242499697392d3ULL, -183, -36 }, + { 0xfd87b5f28300ca0eULL, -157, -28 }, + { 0xbce5086492111aebULL, -130, -20 }, + { 0x8cbccc096f5088ccULL, -103, -12 }, + { 0xd1b71758e219652cULL, -77, -4 }, + { 0x9c40000000000000ULL, -50, 4 }, + { 0xe8d4a51000000000ULL, -24, 12 }, + { 0xad78ebc5ac620000ULL, 3, 20 }, + { 0x813f3978f8940984ULL, 30, 28 }, + { 0xc097ce7bc90715b3ULL, 56, 36 }, + { 0x8f7e32ce7bea5c70ULL, 83, 44 }, + { 0xd5d238a4abe98068ULL, 109, 52 }, + { 0x9f4f2726179a2245ULL, 136, 60 }, + { 0xed63a231d4c4fb27ULL, 162, 68 }, + { 0xb0de65388cc8ada8ULL, 189, 76 }, + { 0x83c7088e1aab65dbULL, 216, 84 }, + { 0xc45d1df942711d9aULL, 242, 92 }, + { 0x924d692ca61be758ULL, 269, 100 }, + { 0xda01ee641a708deaULL, 295, 108 }, + { 0xa26da3999aef774aULL, 322, 116 }, + { 0xf209787bb47d6b85ULL, 348, 124 }, + { 0xb454e4a179dd1877ULL, 375, 132 }, + { 0x865b86925b9bc5c2ULL, 402, 140 }, + { 0xc83553c5c8965d3dULL, 428, 148 }, + { 0x952ab45cfa97a0b3ULL, 455, 156 }, + { 0xde469fbd99a05fe3ULL, 481, 164 }, + { 0xa59bc234db398c25ULL, 508, 172 }, + { 0xf6c69a72a3989f5cULL, 534, 180 }, + { 0xb7dcbf5354e9beceULL, 561, 188 }, + { 0x88fcf317f22241e2ULL, 588, 196 }, + { 0xcc20ce9bd35c78a5ULL, 614, 204 }, + { 0x98165af37b2153dfULL, 641, 212 }, + { 0xe2a0b5dc971f303aULL, 667, 220 }, + { 0xa8d9d1535ce3b396ULL, 694, 228 }, + { 0xfb9b7cd9a4a7443cULL, 720, 236 }, + { 0xbb764c4ca7a44410ULL, 747, 244 }, + { 0x8bab8eefb6409c1aULL, 774, 252 }, + { 0xd01fef10a657842cULL, 800, 260 }, + { 0x9b10a4e5e9913129ULL, 827, 268 }, + { 0xe7109bfba19c0c9dULL, 853, 276 }, + { 0xac2820d9623bf429ULL, 880, 284 }, + { 0x80444b5e7aa7cf85ULL, 907, 292 }, + { 
0xbf21e44003acdd2dULL, 933, 300 }, + { 0x8e679c2f5e44ff8fULL, 960, 308 }, + { 0xd433179d9c8cb841ULL, 986, 316 }, + { 0x9e19db92b4e31ba9ULL, 1013, 324 }, + { 0xeb96bf6ebadf77d9ULL, 1039, 332 }, + { 0xaf87023b9bf0ee6bULL, 1066, 340 } +}; + +/* Avoid dependence on lib math to get (int)ceil(v) */ +static int grisu3_iceil(double v) +{ + int k = (int)v; + if (v < 0) return k; + return v - k == 0 ? k : k + 1; +} + +static int grisu3_diy_fp_cached_pow(int exp, grisu3_diy_fp_t *p) +{ + int k = grisu3_iceil((exp+GRISU3_DIY_FP_FRACT_SIZE-1) * GRISU3_D_1_LOG2_10); + int i = (k-GRISU3_MIN_CACHED_EXP-1) / GRISU3_CACHED_EXP_STEP + 1; + p->f = grisu3_diy_fp_pow_cache[i].fract; + p->e = grisu3_diy_fp_pow_cache[i].b_exp; + return grisu3_diy_fp_pow_cache[i].d_exp; +} + +static grisu3_diy_fp_t grisu3_diy_fp_minus(grisu3_diy_fp_t x, grisu3_diy_fp_t y) +{ + grisu3_diy_fp_t d; d.f = x.f - y.f; d.e = x.e; + GRISU3_ASSERT(x.e == y.e && x.f >= y.f); + return d; +} + +static grisu3_diy_fp_t grisu3_diy_fp_multiply(grisu3_diy_fp_t x, grisu3_diy_fp_t y) +{ + uint64_t a, b, c, d, ac, bc, ad, bd, tmp; + grisu3_diy_fp_t r; + a = x.f >> 32; b = x.f & GRISU3_MASK32; + c = y.f >> 32; d = y.f & GRISU3_MASK32; + ac = a*c; bc = b*c; + ad = a*d; bd = b*d; + tmp = (bd >> 32) + (ad & GRISU3_MASK32) + (bc & GRISU3_MASK32); + tmp += 1U << 31; /* round */ + r.f = ac + (ad >> 32) + (bc >> 32) + (tmp >> 32); + r.e = x.e + y.e + 64; + return r; +} + +static grisu3_diy_fp_t grisu3_diy_fp_normalize(grisu3_diy_fp_t n) +{ + GRISU3_ASSERT(n.f != 0); + while(!(n.f & 0xFFC0000000000000ULL)) { n.f <<= 10; n.e -= 10; } + while(!(n.f & GRISU3_D64_SIGN)) { n.f <<= 1; --n.e; } + return n; +} + +static grisu3_diy_fp_t grisu3_cast_diy_fp_from_double(double d) +{ + grisu3_diy_fp_t fp; + uint64_t u64 = grisu3_cast_uint64_from_double(d); + if (!(u64 & GRISU3_D64_EXP_MASK)) { fp.f = u64 & GRISU3_D64_FRACT_MASK; fp.e = 1 - GRISU3_D64_EXP_BIAS; } + else { fp.f = (u64 & GRISU3_D64_FRACT_MASK) + GRISU3_D64_IMPLICIT_ONE; fp.e = 
(int)((u64 & GRISU3_D64_EXP_MASK) >> GRISU3_D64_EXP_POS) - GRISU3_D64_EXP_BIAS; } + return fp; +} + +static double grisu3_cast_double_from_diy_fp(grisu3_diy_fp_t n) +{ + const uint64_t hidden_bit = GRISU3_D64_IMPLICIT_ONE; + const uint64_t frac_mask = GRISU3_D64_FRACT_MASK; + const int denorm_exp = GRISU3_D64_DENORM_EXP; + const int exp_bias = GRISU3_D64_EXP_BIAS; + const int exp_pos = GRISU3_D64_EXP_POS; + + grisu3_diy_fp_t v = n; + uint64_t e_biased; + + while (v.f > hidden_bit + frac_mask) { + v.f >>= 1; + ++v.e; + } + if (v.e < denorm_exp) { + return 0.0; + } + while (v.e > denorm_exp && (v.f & hidden_bit) == 0) { + v.f <<= 1; + --v.e; + } + if (v.e == denorm_exp && (v.f & hidden_bit) == 0) { + e_biased = 0; + } else { + e_biased = (uint64_t)(v.e + exp_bias); + } + return grisu3_cast_double_from_uint64((v.f & frac_mask) | (e_biased << exp_pos)); +} + +/* pow10_cache[i] = 10^(i-1) */ +static const unsigned int grisu3_pow10_cache[] = { 0, 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000 }; + +static int grisu3_largest_pow10(uint32_t n, int n_bits, uint32_t *power) +{ + int guess = ((n_bits + 1) * 1233 >> 12) + 1/*skip first entry*/; + if (n < grisu3_pow10_cache[guess]) --guess; /* We don't have any guarantees that 2^n_bits <= n. */ + *power = grisu3_pow10_cache[guess]; + return guess; +} + +#ifdef __cplusplus +} +#endif + +#endif /* GRISU3_MATH_H */ diff --git a/external/grisu3/grisu3_parse.h b/external/grisu3/grisu3_parse.h new file mode 100644 index 0000000..3d67c9a --- /dev/null +++ b/external/grisu3/grisu3_parse.h @@ -0,0 +1,582 @@ +/* + * Copyright (c) 2016 Mikkel F. Jørgensen, dvide.com + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. http://www.apache.org/licenses/LICENSE-2.0 + */ + +/* + * Port of parts of Google Double Conversion strtod functionality + * but with fallback to strtod instead of a bignum implementation. + * + * Based on grisu3 math from MathGeoLib. + * + * See also grisu3_math.h comments. + */ + +#ifndef GRISU3_PARSE_H +#define GRISU3_PARSE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef UINT8_MAX +#include <stdint.h> +#endif + +#include <stdlib.h> +#include <limits.h> + +#include "grisu3_math.h" + + +/* + * The maximum number characters a valid number may contain. The parse + * fails if the input length is longer but the character after max len + * was part of the number. + * + * The length should not be set too high because it protects against + * overflow in the exponent part derived from the input length. + */ +#define GRISU3_NUM_MAX_LEN 1000 + +/* + * The lightweight "portable" C library recognizes grisu3 support if + * included first. + */ +#define grisu3_parse_double_is_defined 1 + +/* + * Disable to compare performance and to test diy_fp algorithm in + * broader range. + */ +#define GRISU3_PARSE_FAST_CASE + +/* May result in a one off error, otherwise when uncertain, fall back to strtod. */ +//#define GRISU3_PARSE_ALLOW_ERROR + + +/* + * The dec output exponent jumps in 8, so the result is offset at most + * by 7 when the input is within range. 
+ */ +static int grisu3_diy_fp_cached_dec_pow(int d_exp, grisu3_diy_fp_t *p) +{ + const int cached_offset = -GRISU3_MIN_CACHED_EXP; + const int d_exp_dist = GRISU3_CACHED_EXP_STEP; + int i, a_exp; + /* Stores the cached power of ten at or just below d_exp in *p (fraction and binary exponent) and returns that power's decimal exponent. The asserts require d_exp to lie within the cached range. */ + GRISU3_ASSERT(GRISU3_MIN_CACHED_EXP <= d_exp); + GRISU3_ASSERT(d_exp < GRISU3_MAX_CACHED_EXP + d_exp_dist); + /* Cache entries start at GRISU3_MIN_CACHED_EXP and are d_exp_dist decimal exponents apart. */ + i = (d_exp + cached_offset) / d_exp_dist; + a_exp = grisu3_diy_fp_pow_cache[i].d_exp; + p->f = grisu3_diy_fp_pow_cache[i].fract; + p->e = grisu3_diy_fp_pow_cache[i].b_exp; + /* The selected entry is at most d_exp_dist - 1 below the requested exponent. */ + GRISU3_ASSERT(a_exp <= d_exp); + GRISU3_ASSERT(d_exp < a_exp + d_exp_dist); + + return a_exp; +} + +/* + * Ported from google double conversion strtod using + * MathGeoLibs diy_fp functions for grisu3 in C. + * + * ulp_half_error is set if we needed to truncate non-zero trailing + * characters. + * + * The actual value we need to encode is: + * + * (sign ? -1 : 1) * fraction * 10 ^ (exponent - fraction_exp) + * where exponent is the base 10 exponent assuming the decimal point is + * after the first digit. fraction_exp is the base 10 magnitude of the + * fraction or number of significant digits - 1. + * + * If the exponent is between 0 and 22 and the fraction is encoded in + * the lower 53 bits (the largest bit is implicit in a double, but not + * in this fraction), then the value can be trivially converted to + * double without loss of precision. If the fraction was in fact + * multiplied by trailing zeroes that we didn't convert to exponent, + * then there are values larger than 53 bits that can also be encoded + * trivially - but then it is better to handle this during parsing + * if it is worthwhile. We do not optimize for this here, because it + * can be done in a simple check before calling, and because it might + * not be worthwhile to do at all since it very likely will fail for + * numbers printed to be convertible back to double without loss. + * + * Returns 0 if conversion was not exact. In that case the value is + * either one smaller than the correct one, or the correct one. 
+ * + * Exponents must be range protected before calling otherwise cached + * powers will blow up. + * + * Google Double Conversion seems to prefer the following notion: + * + * x >= 10^309 => +Inf + * x <= 10^-324 => 0, + * + * max double: HUGE_VAL = 1.7976931348623157 * 10^308 + * min double: 4.9406564584124654 * 10^-324 + * + * Values just below or above min/max representable number + * may round towards large/small non-Inf/non-neg values. + * + * but `strtod` seems to return +/-HUGE_VAL on overflow? + */ +static int grisu3_diy_fp_encode_double(uint64_t fraction, int exponent, int fraction_exp, int ulp_half_error, double *result) +{ + /* + * Error is measured in fractions of integers, so we scale up to get + * some resolution to represent error expressions. + */ + const int log2_error_one = 3; + const int error_one = 1 << log2_error_one; + const int denorm_exp = GRISU3_D64_DENORM_EXP; + const uint64_t hidden_bit = GRISU3_D64_IMPLICIT_ONE; + const int diy_size = GRISU3_DIY_FP_FRACT_SIZE; + const int max_digits = 19; + + int error = ulp_half_error ? error_one / 2 : 0; + int d_exp = (exponent - fraction_exp); + int a_exp; + int o_exp; + grisu3_diy_fp_t v = { fraction, 0 }; + grisu3_diy_fp_t cp; + grisu3_diy_fp_t rounded; + int mag; + int prec; + int prec_bits; + int half_way; + + /* When fractions in a double aren't stored with implicit msb fraction bit. */ + + /* Shift fraction to msb. */ + v = grisu3_diy_fp_normalize(v); + /* The half point error moves up while the exponent moves down. */ + error <<= -v.e; + + a_exp = grisu3_diy_fp_cached_dec_pow(d_exp, &cp); + + /* Interpolate between cached powers at distance 8. 
*/ + if (a_exp != d_exp) { + int adj_exp = d_exp - a_exp - 1; + static grisu3_diy_fp_t cp_10_lut[] = { + { 0xa000000000000000ULL, -60 }, + { 0xc800000000000000ULL, -57 }, + { 0xfa00000000000000ULL, -54 }, + { 0x9c40000000000000ULL, -50 }, + { 0xc350000000000000ULL, -47 }, + { 0xf424000000000000ULL, -44 }, + { 0x9896800000000000ULL, -40 }, + }; + GRISU3_ASSERT(adj_exp >= 0 && adj_exp < 7); + v = grisu3_diy_fp_multiply(v, cp_10_lut[adj_exp]); + + /* 20 decimal digits won't always fit in 64 bit. + * (`fraction_exp` is one less than significant decimal + * digits in fraction, e.g. 1 * 10e0). + * If we cannot fit, introduce 1/2 ulp error + * (says double conversion reference impl.) */ + if (1 + fraction_exp + adj_exp > max_digits) { + error += error_one / 2; + } + } + + v = grisu3_diy_fp_multiply(v, cp); + /* + * Google double conversion claims that: + * + * The error introduced by a multiplication of a*b equals + * error_a + error_b + error_a*error_b/2^64 + 0.5 + * Substituting a with 'input' and b with 'cached_power' we have + * error_b = 0.5 (all cached powers have an error of less than 0.5 ulp), + * error_ab = 0 or 1 / error_oner > error_a*error_b/ 2^64 + * + * which in our encoding becomes: + * error_a = error_one/2 + * error_ab = 1 / error_one (rounds up to 1 if error != 0, or 0 * otherwise) + * fixed_error = error_one/2 + * + * error += error_a + fixed_error + (error ? 1 : 0) + * + * (this isn't entirely clear, but that is as close as we get). + */ + error += error_one + (error ? 1 : 0); + + o_exp = v.e; + v = grisu3_diy_fp_normalize(v); + /* Again, if we shift the significant bits, the error moves along. */ + error <<= o_exp - v.e; + + /* + * The value `v` is bounded by 2^mag which is 64 + v.e. because we + * just normalized it by shifting towards msb. + */ + mag = diy_size + v.e; + + /* The effective magnitude of the IEEE double representation. */ + mag = mag >= diy_size + denorm_exp ? diy_size : mag <= denorm_exp ? 
0 : mag - denorm_exp; + prec = diy_size - mag; + if (prec + log2_error_one >= diy_size) { + int e_scale = prec + log2_error_one - diy_size - 1; + v.f >>= e_scale; + v.e += e_scale; + error = (error >> e_scale) + 1 + error_one; + prec -= e_scale; + } + rounded.f = v.f >> prec; + rounded.e = v.e + prec; + prec_bits = (int)(v.f & ((uint64_t)1 << (prec - 1))) * error_one; + half_way = (int)((uint64_t)1 << (prec - 1)) * error_one; + if (prec >= half_way + error) { + rounded.f++; + /* Prevent overflow. */ + if (rounded.f & (hidden_bit << 1)) { + rounded.f >>= 1; + rounded.e += 1; + } + } + *result = grisu3_cast_double_from_diy_fp(rounded); + return half_way - error >= prec_bits || prec_bits >= half_way + error; +} + +/* + * `end` is unchanged if number is handled natively, or it is the result + * of strtod parsing in case of fallback. + */ +static const char *grisu3_encode_double(const char *buf, const char *end, int sign, uint64_t fraction, int exponent, int fraction_exp, int ulp_half_error, double *result) +{ + const int max_d_exp = GRISU3_D64_MAX_DEC_EXP; + const int min_d_exp = GRISU3_D64_MIN_DEC_EXP; + + char *v_end; + + /* Both for user experience, and to protect internal power table lookups. */ + if (fraction == 0 || exponent < min_d_exp) { + *result = 0.0; + goto done; + } + if (exponent - 1 > max_d_exp) { + *result = grisu3_double_infinity; + goto done; + } + + /* + * `exponent` is the normalized value, fraction_exp is the size of + * the representation in the `fraction value`, or one less than + * number of significant digits. + * + * If the final value can be kept in 53 bits and we can avoid + * division, then we can convert to double quite fast. + * + * ulf_half_error only happens when fraction is maxed out, so + * fraction_exp > 22 by definition. + * + * fraction_exp >= 0 always. 
+ * + * http://www.exploringbinary.com/fast-path-decimal-to-floating-point-conversion/ + */ + + +#ifdef GRISU3_PARSE_FAST_CASE + if (fraction < (1ULL << 53) && exponent >= 0 && exponent <= 22) { + double v = (double)fraction; + /* Multiplying by 1e-k instead of dividing by 1ek results in rounding error. */ + switch (exponent - fraction_exp) { + case -22: v /= 1e22; break; + case -21: v /= 1e21; break; + case -20: v /= 1e20; break; + case -19: v /= 1e19; break; + case -18: v /= 1e18; break; + case -17: v /= 1e17; break; + case -16: v /= 1e16; break; + case -15: v /= 1e15; break; + case -14: v /= 1e14; break; + case -13: v /= 1e13; break; + case -12: v /= 1e12; break; + case -11: v /= 1e11; break; + case -10: v /= 1e10; break; + case -9: v /= 1e9; break; + case -8: v /= 1e8; break; + case -7: v /= 1e7; break; + case -6: v /= 1e6; break; + case -5: v /= 1e5; break; + case -4: v /= 1e4; break; + case -3: v /= 1e3; break; + case -2: v /= 1e2; break; + case -1: v /= 1e1; break; + case 0: break; + case 1: v *= 1e1; break; + case 2: v *= 1e2; break; + case 3: v *= 1e3; break; + case 4: v *= 1e4; break; + case 5: v *= 1e5; break; + case 6: v *= 1e6; break; + case 7: v *= 1e7; break; + case 8: v *= 1e8; break; + case 9: v *= 1e9; break; + case 10: v *= 1e10; break; + case 11: v *= 1e11; break; + case 12: v *= 1e12; break; + case 13: v *= 1e13; break; + case 14: v *= 1e14; break; + case 15: v *= 1e15; break; + case 16: v *= 1e16; break; + case 17: v *= 1e17; break; + case 18: v *= 1e18; break; + case 19: v *= 1e19; break; + case 20: v *= 1e20; break; + case 21: v *= 1e21; break; + case 22: v *= 1e22; break; + } + *result = v; + goto done; + } +#endif + + if (grisu3_diy_fp_encode_double(fraction, exponent, fraction_exp, ulp_half_error, result)) { + goto done; + } +#ifdef GRISU3_PARSE_ALLOW_ERROR + goto done; +#endif + *result = strtod(buf, &v_end); + if (v_end < end) { + return v_end; + } + return end; +done: + if (sign) { + *result = -*result; + } + return end; +} + +/* + * 
Returns buf if number wasn't matched, or null if number starts ok + * but contains invalid content. + */ +static const char *grisu3_parse_hex_fp(const char *buf, const char *end, int sign, double *result) +{ + (void)buf; + (void)end; + (void)sign; + *result = 0.0; + /* Not currently supported. */ + return buf; +} + +/* + * Returns end pointer on success, or null, or buf if start is not a number. + * Sets result to 0.0 on error. + * Reads up to len + 1 bytes from buffer where len + 1 must not be a + * valid part of a number, but all of buf, buf + len need not be a + * number. Leading whitespace is NOT valid. + * Very small numbers are truncated to +/-0.0 and numerically very large + * numbers are returns as +/-infinity. + * + * A value must not end or begin with '.' (like JSON), but can have + * leading zeroes (unlike JSON). A single leading zero followed by + * an encoding symbol may or may not be interpreted as a non-decimal + * encoding prefix, e.g. 0x, but a leading zero followed by a digit is + * NOT interpreted as octal. + * A single leading negative sign may appear before digits, but positive + * sign is not allowed and space after the sign is not allowed. + * At most the first 1000 characters of the input is considered. + */ +static const char *grisu3_parse_double(const char *buf, size_t len, double *result) +{ + const char *mark, *k, *end; + int sign = 0, esign = 0; + uint64_t fraction = 0; + int exponent = 0; + int ee = 0; + int fraction_exp = 0; + int ulp_half_error = 0; + + *result = 0.0; + + end = buf + len + 1; + + /* Failsafe for exponent overflow. */ + if (len > GRISU3_NUM_MAX_LEN) { + end = buf + GRISU3_NUM_MAX_LEN + 1; + } + + if (buf == end) { + return buf; + } + mark = buf; + if (*buf == '-') { + ++buf; + sign = 1; + if (buf == end) { + return 0; + } + } + if (*buf == '0') { + ++buf; + /* | 0x20 is lower case ASCII. 
*/ + if (buf != end && (*buf | 0x20) == 'x') { + k = grisu3_parse_hex_fp(buf, end, sign, result); + if (k == buf) { + return mark; + } + return k; + } + /* Not worthwhile, except for getting the scale of integer part. */ + while (buf != end && *buf == '0') { + ++buf; + } + } else { + if (*buf < '1' || *buf > '9') { + /* + * If we didn't see a sign, just don't recognize it as + * number, otherwise make it an error. + */ + return sign ? 0 : mark; + } + fraction = (uint64_t)(*buf++ - '0'); + } + k = buf; + /* + * We do not catch trailing zeroes when there is no decimal point. + * This misses an opportunity for moving the exponent down into the + * fast case. But it is unlikely to be worthwhile as it complicates + * parsing. + */ + while (buf != end && *buf >= '0' && *buf <= '9') { + if (fraction >= UINT64_MAX / 10) { + fraction += *buf >= '5'; + ulp_half_error = 1; + break; + } + fraction = fraction * 10 + (uint64_t)(*buf++ - '0'); + } + fraction_exp = (int)(buf - k); + /* Skip surplus digits. Trailing zero does not introduce error. */ + while (buf != end && *buf == '0') { + ++exponent; + ++buf; + } + if (buf != end && *buf >= '1' && *buf <= '9') { + ulp_half_error = 1; + ++exponent; + ++buf; + while (buf != end && *buf >= '0' && *buf <= '9') { + ++exponent; + ++buf; + } + } + if (buf != end && *buf == '.') { + ++buf; + k = buf; + if (*buf < '0' || *buf > '9') { + /* We don't accept numbers without leading or trailing digit. 
*/ + return 0; + } + while (buf != end && *buf >= '0' && *buf <= '9') { + if (fraction >= UINT64_MAX / 10) { + if (!ulp_half_error) { + fraction += *buf >= '5'; + ulp_half_error = 1; + } + break; + } + fraction = fraction * 10 + (uint64_t)(*buf++ - '0'); + --exponent; + } + fraction_exp += (int)(buf - k); + while (buf != end && *buf == '0') { + ++exponent; + ++buf; + } + if (buf != end && *buf >= '1' && *buf <= '9') { + ulp_half_error = 1; + ++buf; + while (buf != end && *buf >= '0' && *buf <= '9') { + ++buf; + } + } + } + /* + * Normalized exponent e.g: 1.23434e3 with fraction = 123434, + * fraction_exp = 5, exponent = 3. + * So value = fraction * 10^(exponent - fraction_exp) + */ + exponent += fraction_exp; + if (buf != end && (*buf | 0x20) == 'e') { + if (end - buf < 2) { + return 0; + } + ++buf; + if (*buf == '+') { + ++buf; + if (buf == end) { + return 0; + } + } else if (*buf == '-') { + esign = 1; + ++buf; + if (buf == end) { + return 0; + } + } + if (*buf < '0' || *buf > '9') { + return 0; + } + ee = *buf++ - '0'; + while (buf != end && *buf >= '0' && *buf <= '9') { + /* + * This test impacts performance and we do not need an + * exact value just one large enough to dominate the fraction_exp. + * Subsequent handling maps large absolute ee to 0 or infinity. + */ + if (ee <= 0x7fff) { + ee = ee * 10 + *buf - '0'; + } + ++buf; + } + } + exponent = exponent + (esign ? -ee : ee); + + /* + * Exponent is now a base 10 normalized exponent so the absolute value + * is less the 10^(exponent + 1) for positive exponents. For + * denormalized doubles (using 11 bit exponent 0 with a fraction + * shiftet down, extra small numbers can be achieved. + * + * https://en.wikipedia.org/wiki/Double-precision_floating-point_format + * + * 10^-324 holds the smallest normalized exponent (but not value) and + * 10^308 holds the largest exponent. Internally our lookup table is + * only safe to use within a range slightly larger than this. 
+ * Externally, a slightly larger/smaller value represents NaNs which + * are technically also possible to store as a number. + * + */ + + /* This also protects strod fallback parsing. */ + if (buf == end) { + return 0; + } + return grisu3_encode_double(mark, buf, sign, fraction, exponent, fraction_exp, ulp_half_error, result); +} + +#ifdef __cplusplus +} +#endif + +#endif /* GRISU3_PARSE_H */ diff --git a/external/grisu3/grisu3_print.h b/external/grisu3/grisu3_print.h new file mode 100644 index 0000000..d748408 --- /dev/null +++ b/external/grisu3/grisu3_print.h @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2016 Mikkel F. Jørgensen, dvide.com + * Copyright author of MathGeoLib (https://github.com/juj) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. http://www.apache.org/licenses/LICENSE-2.0 + */ + +/* + * Extracted from MathGeoLib. + * + * mikkelfj: + * - Fixed final output when printing single digit negative exponent to + * have leading zero (important for JSON). + * - Changed formatting to prefer 0.012 over 1.2-e-2. + * + * Large portions of the original grisu3.c file has been moved to + * grisu3_math.h, the rest is placed here. + * + * See also comments in grisu3_math.h. 
+ * + * MatGeoLib grisu3.c comment: + * + * This file is part of an implementation of the "grisu3" double to string + * conversion algorithm described in the research paper + * + * "Printing Floating-Point Numbers Quickly And Accurately with Integers" + * by Florian Loitsch, available at + * http://www.cs.tufts.edu/~nr/cs257/archive/florian-loitsch/printf.pdf + */ + +#ifndef GRISU3_PRINT_H +#define GRISU3_PRINT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdio.h> /* sprintf, only needed for fallback printing */ +#include <assert.h> /* assert */ + +#include "grisu3_math.h" + +/* + * The lightweight "portable" C library recognizes grisu3 support if + * included first. + */ +#define grisu3_print_double_is_defined 1 + +/* + * Not sure we have an exact definition, but we get up to 23 + * emperically. There is some math ensuring it does not go awol though, + * like 18 digits + exponent or so. + * This max should be safe size buffer for printing, including zero term. + */ +#define GRISU3_PRINT_MAX 30 + +static int grisu3_round_weed(char *buffer, int len, uint64_t wp_W, uint64_t delta, uint64_t rest, uint64_t ten_kappa, uint64_t ulp) +{ + uint64_t wp_Wup = wp_W - ulp; + uint64_t wp_Wdown = wp_W + ulp; + while(rest < wp_Wup && delta - rest >= ten_kappa + && (rest + ten_kappa < wp_Wup || wp_Wup - rest >= rest + ten_kappa - wp_Wup)) + { + --buffer[len-1]; + rest += ten_kappa; + } + if (rest < wp_Wdown && delta - rest >= ten_kappa + && (rest + ten_kappa < wp_Wdown || wp_Wdown - rest > rest + ten_kappa - wp_Wdown)) + return 0; + + return 2*ulp <= rest && rest <= delta - 4*ulp; +} + +static int grisu3_digit_gen(grisu3_diy_fp_t low, grisu3_diy_fp_t w, grisu3_diy_fp_t high, char *buffer, int *length, int *kappa) +{ + uint64_t unit = 1; + grisu3_diy_fp_t too_low = { low.f - unit, low.e }; + grisu3_diy_fp_t too_high = { high.f + unit, high.e }; + grisu3_diy_fp_t unsafe_interval = grisu3_diy_fp_minus(too_high, too_low); + grisu3_diy_fp_t one = { 1ULL << -w.e, w.e }; + 
uint32_t p1 = (uint32_t)(too_high.f >> -one.e); + uint64_t p2 = too_high.f & (one.f - 1); + uint32_t div; + *kappa = grisu3_largest_pow10(p1, GRISU3_DIY_FP_FRACT_SIZE + one.e, &div); + *length = 0; + + while(*kappa > 0) + { + uint64_t rest; + char digit = (char)(p1 / div); + buffer[*length] = '0' + digit; + ++*length; + p1 %= div; + --*kappa; + rest = ((uint64_t)p1 << -one.e) + p2; + if (rest < unsafe_interval.f) return grisu3_round_weed(buffer, *length, grisu3_diy_fp_minus(too_high, w).f, unsafe_interval.f, rest, (uint64_t)div << -one.e, unit); + div /= 10; + } + + for(;;) + { + char digit; + p2 *= 10; + unit *= 10; + unsafe_interval.f *= 10; + /* Integer division by one. */ + digit = (char)(p2 >> -one.e); + buffer[*length] = '0' + digit; + ++*length; + p2 &= one.f - 1; /* Modulo by one. */ + --*kappa; + if (p2 < unsafe_interval.f) return grisu3_round_weed(buffer, *length, grisu3_diy_fp_minus(too_high, w).f * unit, unsafe_interval.f, p2, one.f, unit); + } +} + +static int grisu3(double v, char *buffer, int *length, int *d_exp) +{ + int mk, kappa, success; + grisu3_diy_fp_t dfp = grisu3_cast_diy_fp_from_double(v); + grisu3_diy_fp_t w = grisu3_diy_fp_normalize(dfp); + + /* normalize boundaries */ + grisu3_diy_fp_t t = { (dfp.f << 1) + 1, dfp.e - 1 }; + grisu3_diy_fp_t b_plus = grisu3_diy_fp_normalize(t); + grisu3_diy_fp_t b_minus; + grisu3_diy_fp_t c_mk; /* Cached power of ten: 10^-k */ + uint64_t u64 = grisu3_cast_uint64_from_double(v); + assert(v > 0 && v <= 1.7976931348623157e308); /* Grisu only handles strictly positive finite numbers. */ + if (!(u64 & GRISU3_D64_FRACT_MASK) && (u64 & GRISU3_D64_EXP_MASK) != 0) { b_minus.f = (dfp.f << 2) - 1; b_minus.e = dfp.e - 2;} /* lower boundary is closer? 
*/ + else { b_minus.f = (dfp.f << 1) - 1; b_minus.e = dfp.e - 1; } + b_minus.f = b_minus.f << (b_minus.e - b_plus.e); + b_minus.e = b_plus.e; + + mk = grisu3_diy_fp_cached_pow(GRISU3_MIN_TARGET_EXP - GRISU3_DIY_FP_FRACT_SIZE - w.e, &c_mk); + + w = grisu3_diy_fp_multiply(w, c_mk); + b_minus = grisu3_diy_fp_multiply(b_minus, c_mk); + b_plus = grisu3_diy_fp_multiply(b_plus, c_mk); + + success = grisu3_digit_gen(b_minus, w, b_plus, buffer, length, &kappa); + *d_exp = kappa - mk; + return success; +} + +static int grisu3_i_to_str(int val, char *str) +{ + int len, i; + char *s; + char *begin = str; + if (val < 0) { *str++ = '-'; val = -val; } + s = str; + + for(;;) + { + int ni = val / 10; + int digit = val - ni*10; + *s++ = (char)('0' + digit); + if (ni == 0) + break; + val = ni; + } + *s = '\0'; + len = (int)(s - str); + for(i = 0; i < len/2; ++i) + { + char ch = str[i]; + str[i] = str[len-1-i]; + str[len-1-i] = ch; + } + + return (int)(s - begin); +} + +static int grisu3_print_nan(uint64_t v, char *dst) +{ + static char hexdigits[16] = "0123456789ABCDEF"; + int i = 0; + + dst[0] = 'N'; + dst[1] = 'a'; + dst[2] = 'N'; + dst[3] = '('; + dst[20] = ')'; + dst[21] = '\0'; + dst += 4; + for (i = 15; i >= 0; --i) { + dst[i] = hexdigits[v & 0x0F]; + v >>= 4; + } + return 21; +} + +static int grisu3_print_double(double v, char *dst) +{ + int d_exp, len, success, decimals, i; + uint64_t u64 = grisu3_cast_uint64_from_double(v); + char *s2 = dst; + assert(dst); + + /* Prehandle NaNs */ + if ((u64 << 1) > 0xFFE0000000000000ULL) return grisu3_print_nan(u64, dst); + /* Prehandle negative values. */ + if ((u64 & GRISU3_D64_SIGN) != 0) { *s2++ = '-'; v = -v; u64 ^= GRISU3_D64_SIGN; } + /* Prehandle zero. */ + if (!u64) { *s2++ = '0'; *s2 = '\0'; return (int)(s2 - dst); } + /* Prehandle infinity. 
*/ + if (u64 == GRISU3_D64_EXP_MASK) { *s2++ = 'i'; *s2++ = 'n'; *s2++ = 'f'; *s2 = '\0'; return (int)(s2 - dst); } + + success = grisu3(v, s2, &len, &d_exp); + /* If grisu3 was not able to convert the number to a string, then use old sprintf (suboptimal). */ + if (!success) return sprintf(s2, "%.17g", v) + (int)(s2 - dst); + + /* We now have an integer string of form "151324135" and a base-10 exponent for that number. */ + /* Next, decide the best presentation for that string by whether to use a decimal point, or the scientific exponent notation 'e'. */ + /* We don't pick the absolute shortest representation, but pick a balance between readability and shortness, e.g. */ + /* 1.545056189557677e-308 could be represented in a shorter form */ + /* 1545056189557677e-323 but that would be somewhat unreadable. */ + decimals = GRISU3_MIN(-d_exp, GRISU3_MAX(1, len-1)); + + /* mikkelfj: + * fix zero prefix .1 => 0.1, important for JSON export. + * prefer unscientific notation at same length: + * -1.2345e-4 over -1.00012345, + * -1.0012345 over -1.2345e-3 + */ + if (d_exp < 0 && (len + d_exp) > -3 && len <= -d_exp) + { + /* mikkelfj: fix zero prefix .1 => 0.1, and short exponents 1.3e-2 => 0.013. */ + memmove(s2 + 2 - d_exp - len, s2, (size_t)len); + s2[0] = '0'; + s2[1] = '.'; + for (i = 2; i < 2-d_exp-len; ++i) s2[i] = '0'; + len += i; + } + else if (d_exp < 0 && len > 1) /* Add decimal point? */ + { + for(i = 0; i < decimals; ++i) s2[len-i] = s2[len-i-1]; + s2[len++ - decimals] = '.'; + d_exp += decimals; + /* Need scientific notation as well? */ + if (d_exp != 0) { s2[len++] = 'e'; len += grisu3_i_to_str(d_exp, s2+len); } + } + /* Add scientific notation? */ + else if (d_exp < 0 || d_exp > 2) { s2[len++] = 'e'; len += grisu3_i_to_str(d_exp, s2+len); } + /* Add zeroes instead of scientific notation? */ + else if (d_exp > 0) { while(d_exp-- > 0) s2[len++] = '0'; } + s2[len] = '\0'; /* grisu3 doesn't null terminate, so ensure termination. 
*/ + return (int)(s2+len-dst); +} + +#ifdef __cplusplus +} +#endif + +#endif /* GRISU3_PRINT_H */ diff --git a/external/grisu3/grisu3_test.c b/external/grisu3/grisu3_test.c new file mode 100644 index 0000000..930e027 --- /dev/null +++ b/external/grisu3/grisu3_test.c @@ -0,0 +1,141 @@ +#include <inttypes.h> +#include <string.h> +#include <stdio.h> + +#include "grisu3_parse.h" +#include "grisu3_print.h" + +#define TEST(x, s) do { \ + if (!(x)) { \ + fprintf(stderr, \ + "fail: %s\n" \ + " input: %s\n" \ + " expected: %.17g\n" \ + " got: %.17g\n" \ + " binary xor: 0x%016"PRIx64"\n", \ + s, buf, expect, v, (a ^ b)); \ + return 1; \ + } \ + } while (0) + +static int test_parse_double(char *buf) +{ + const char *k, *end; + double v, expect; + uint64_t a = 0, b = 0; + int len = strlen(buf); + + end = buf + len; + + expect = strtod(buf, 0); + /* Include '\0' in bytes being parsed to make strtod safe. */ + k = grisu3_parse_double(buf, len, &v); + + /* Make sure we parsed and accepted everything. */ + TEST(k == end, "didn't parse to end"); + + a = grisu3_cast_uint64_from_double(expect); + b = grisu3_cast_uint64_from_double(v); + +#ifdef GRISU3_PARSE_ALLOW_ERROR + /* + * Just where exponent wraps, this assumption will be incorrect. + * TODO: need next higher double function. + */ + TEST((a > b ? a - b : b - a) <= 1, "binary representation differs by more than lsb"); +#else + /* Binary comparison should match. */ + TEST(expect == v, "double representation differs"); + TEST(a == b, "binary representation differs"); +#endif + +#if 0 + /* This will print the test data also when correct. */ + TEST(0, "test case passed, just debugging"); +#endif + + return 0; +} + +/* + * We currently do not test grisu3_print_double because + * it is a direct port of dtoa_grisu3 from grisu3.c + * which presumably has been tested in MathGeoLib. + * + * grisu3_parse_double is a new implementation. 
+ */ +int test_suite() +{ + char buf[50]; + int fail = 0; + + fail += test_parse_double("1.23434"); + fail += test_parse_double("1234.34"); + fail += test_parse_double("1234.34e4"); + fail += test_parse_double("1234.34e-4"); + fail += test_parse_double("1.23434E+4"); + fail += test_parse_double("3.2897984798741413E+194"); + fail += test_parse_double("-3.2897984798741413E-194"); + + sprintf(buf, "3289798479874141.314124124128497098e109"); + fail += test_parse_double(buf); + sprintf(buf, "3289798479874141.314124124128497098e209"); + fail += test_parse_double(buf); + sprintf(buf, "-3289798479874141.314124124128497098e209"); + fail += test_parse_double(buf); + sprintf(buf, "3289798479874141.314124124128497098e+209"); + fail += test_parse_double(buf); + sprintf(buf, "-3289798479874141.314124124128497098e-209"); + fail += test_parse_double(buf); + + return fail; +} + +void example() +{ + double v; + const char *buf = "1234.34e-4"; + const char *x, *end; + char result_buf[50]; + int len; + + fprintf(stderr, "grisu3_parse_double example:\n parsing '%s' as double\n", buf); + /* A non-numeric terminator (e.g. '\0') is required to ensure strtod fallback is safe. */ + len = strlen(buf); + end = buf + len; + x = grisu3_parse_double(buf, len, &v); + if (x == 0) { + fprintf(stderr, "syntax or range error\n"); + } else if (x == buf) { + fprintf(stderr, "parse double failed\n"); + } else if (x != end) { + fprintf(stderr, "parse double did not read everything\n"); + } else { + fprintf(stderr, "got: %.17g\n", v); + } + /* + * TODO: with the current example: the input "0.123434" is printed + * as "1.23434e-1" which is sub-optimal and different from sprintf. + * + * This is not the grisu3 algorithm but a post formatting step + * in grisu3_print_double (originally dtoa_grisu) and may be a bug + * in the logic choosing the best print format. 
+ * sprintf "%.17g" and "%g" both print as "0.123434" + */ + fprintf(stderr, "grisu3_print_double example:\n printing %g\n", v); + grisu3_print_double(v, result_buf); + fprintf(stderr, "got: %s\n", result_buf); +} + +int main() +{ + example(); + fprintf(stderr, "running tests\n"); + if (test_suite()) { + fprintf(stderr, "GRISU3 PARSE TEST FAILED\n"); + return -1; + } else { + fprintf(stderr, "GRISU3 PARSE TEST PASSED\n"); + return 0; + } +} diff --git a/external/grisu3/grisu3_test_dblcnv.c b/external/grisu3/grisu3_test_dblcnv.c new file mode 100644 index 0000000..f0e98cc --- /dev/null +++ b/external/grisu3/grisu3_test_dblcnv.c @@ -0,0 +1,482 @@ +/* + * Test cases from Google's Double Conversion Library + * + * https://github.com/google/double-conversion/blob/master/test/cctest/test-strtod.cc + * + * Added extra tests for grisu parse print roundtrip and negative sign. + */ + +#include <string.h> +#include <stdio.h> +#include <math.h> + +#include "grisu3_print.h" +#include "grisu3_parse.h" + +#define BEGIN_TEST(name) int test_ ## name() { \ + int fail = 0; char *id = #name; double v; char *vector; \ + char buf[1001]; + +#define END_TEST() return fail; } + + +void check_double(double x1, double x2, char *id, int line, int *fail) +{ + char tmp[50]; + const char *k; + int n; + int failed = 0; + double v; + + if (x1 != x2) { + failed = 1; + fprintf(stderr, "%d: fail (%s): %.17g != %.17g\n", + line, id, x1, x2); + } else { +#if 1 + n = grisu3_print_double(x1, tmp); + if (n >= GRISU3_PRINT_MAX) { /* Leave space for zterm. */ + failed = 1; + fprintf(stderr, "%d: fail (%s): print length exceeded max: %d, input: %.17g\n", + line, id, n, x1); + } else if ((int)strlen(tmp) != n) { + failed = 1; + fprintf(stderr, "%d: fail (%s): print length does not match strlen of output, input: %.17g, got: %s\n", + line, id, x1, tmp); + } else if (!isinf(x1)) { + /* We do not expect print/parse to handle inf. 
*/ + k = grisu3_parse_double(tmp, n, &v); + if (k == 0 || k == tmp) { + failed = 1; + fprintf(stderr, "%d: fail (%s): roundtrip parse failed " + "input: %g, printed value %s\n", + line, id, x1, tmp); + } else if (x1 != v) { + failed = 1; + fprintf(stderr, "%d: fail (%s): print/parse roundtrip mismatch for " + "input: %.17g, got %.17g\n", + line, id, x1, v); + } + } +#endif + } + *fail += failed; +} + +#define CHECK_EQ(v1, v2) check_double((v1), (v2), id, __LINE__, &fail) + +#define StringToVector(f) f + +#define Strtod(f, e) (sprintf(buf, "%se%d", f, e), \ + grisu3_parse_double(buf, strlen(buf), &v), v) + +#define StrtodChar(f, e) (sprintf(buf, "%se%d", f, e), \ + grisu3_parse_double(buf, strlen(buf), &v), v) + +#define double_infinity grisu3_double_infinity + +BEGIN_TEST(Strtod) + vector = StringToVector("0"); + CHECK_EQ(0.0, Strtod(vector, 1)); + CHECK_EQ(0.0, Strtod(vector, 2)); + CHECK_EQ(0.0, Strtod(vector, -2)); + CHECK_EQ(0.0, Strtod(vector, -999)); + CHECK_EQ(0.0, Strtod(vector, +999)); + + vector = StringToVector("1"); + CHECK_EQ(1.0, Strtod(vector, 0)); + CHECK_EQ(10.0, Strtod(vector, 1)); + CHECK_EQ(100.0, Strtod(vector, 2)); + CHECK_EQ(1e20, Strtod(vector, 20)); + CHECK_EQ(1e22, Strtod(vector, 22)); + CHECK_EQ(1e23, Strtod(vector, 23)); + + CHECK_EQ(1e35, Strtod(vector, 35)); + CHECK_EQ(1e36, Strtod(vector, 36)); + CHECK_EQ(1e37, Strtod(vector, 37)); + CHECK_EQ(1e-1, Strtod(vector, -1)); + CHECK_EQ(1e-2, Strtod(vector, -2)); + CHECK_EQ(1e-5, Strtod(vector, -5)); + CHECK_EQ(1e-20, Strtod(vector, -20)); + CHECK_EQ(1e-22, Strtod(vector, -22)); + CHECK_EQ(1e-23, Strtod(vector, -23)); + CHECK_EQ(1e-25, Strtod(vector, -25)); + CHECK_EQ(1e-39, Strtod(vector, -39)); + + vector = StringToVector("2"); + CHECK_EQ(2.0, Strtod(vector, 0)); + CHECK_EQ(20.0, Strtod(vector, 1)); + CHECK_EQ(200.0, Strtod(vector, 2)); + CHECK_EQ(2e20, Strtod(vector, 20)); + CHECK_EQ(2e22, Strtod(vector, 22)); + CHECK_EQ(2e23, Strtod(vector, 23)); + CHECK_EQ(2e35, Strtod(vector, 35)); + 
CHECK_EQ(2e36, Strtod(vector, 36)); + CHECK_EQ(2e37, Strtod(vector, 37)); + CHECK_EQ(2e-1, Strtod(vector, -1)); + CHECK_EQ(2e-2, Strtod(vector, -2)); + CHECK_EQ(2e-5, Strtod(vector, -5)); + CHECK_EQ(2e-20, Strtod(vector, -20)); + CHECK_EQ(2e-22, Strtod(vector, -22)); + CHECK_EQ(2e-23, Strtod(vector, -23)); + CHECK_EQ(2e-25, Strtod(vector, -25)); + CHECK_EQ(2e-39, Strtod(vector, -39)); + + vector = StringToVector("9"); + CHECK_EQ(9.0, Strtod(vector, 0)); + CHECK_EQ(90.0, Strtod(vector, 1)); + CHECK_EQ(900.0, Strtod(vector, 2)); + CHECK_EQ(9e20, Strtod(vector, 20)); + CHECK_EQ(9e22, Strtod(vector, 22)); + CHECK_EQ(9e23, Strtod(vector, 23)); + CHECK_EQ(9e35, Strtod(vector, 35)); + CHECK_EQ(9e36, Strtod(vector, 36)); + CHECK_EQ(9e37, Strtod(vector, 37)); + CHECK_EQ(9e-1, Strtod(vector, -1)); + CHECK_EQ(9e-2, Strtod(vector, -2)); + CHECK_EQ(9e-5, Strtod(vector, -5)); + CHECK_EQ(9e-20, Strtod(vector, -20)); + CHECK_EQ(9e-22, Strtod(vector, -22)); + CHECK_EQ(9e-23, Strtod(vector, -23)); + CHECK_EQ(9e-25, Strtod(vector, -25)); + CHECK_EQ(9e-39, Strtod(vector, -39)); + + vector = StringToVector("12345"); + CHECK_EQ(12345.0, Strtod(vector, 0)); + CHECK_EQ(123450.0, Strtod(vector, 1)); + CHECK_EQ(1234500.0, Strtod(vector, 2)); + CHECK_EQ(12345e20, Strtod(vector, 20)); + CHECK_EQ(12345e22, Strtod(vector, 22)); + CHECK_EQ(12345e23, Strtod(vector, 23)); + CHECK_EQ(12345e30, Strtod(vector, 30)); + CHECK_EQ(12345e31, Strtod(vector, 31)); + CHECK_EQ(12345e32, Strtod(vector, 32)); + CHECK_EQ(12345e35, Strtod(vector, 35)); + CHECK_EQ(12345e36, Strtod(vector, 36)); + CHECK_EQ(12345e37, Strtod(vector, 37)); + CHECK_EQ(12345e-1, Strtod(vector, -1)); + CHECK_EQ(12345e-2, Strtod(vector, -2)); + CHECK_EQ(12345e-5, Strtod(vector, -5)); + CHECK_EQ(12345e-20, Strtod(vector, -20)); + CHECK_EQ(12345e-22, Strtod(vector, -22)); + CHECK_EQ(12345e-23, Strtod(vector, -23)); + CHECK_EQ(12345e-25, Strtod(vector, -25)); + CHECK_EQ(12345e-39, Strtod(vector, -39)); + + vector = 
StringToVector("12345678901234"); + CHECK_EQ(12345678901234.0, Strtod(vector, 0)); + CHECK_EQ(123456789012340.0, Strtod(vector, 1)); + CHECK_EQ(1234567890123400.0, Strtod(vector, 2)); + CHECK_EQ(12345678901234e20, Strtod(vector, 20)); + CHECK_EQ(12345678901234e22, Strtod(vector, 22)); + CHECK_EQ(12345678901234e23, Strtod(vector, 23)); + CHECK_EQ(12345678901234e30, Strtod(vector, 30)); + CHECK_EQ(12345678901234e31, Strtod(vector, 31)); + CHECK_EQ(12345678901234e32, Strtod(vector, 32)); + CHECK_EQ(12345678901234e35, Strtod(vector, 35)); + CHECK_EQ(12345678901234e36, Strtod(vector, 36)); + CHECK_EQ(12345678901234e37, Strtod(vector, 37)); + CHECK_EQ(12345678901234e-1, Strtod(vector, -1)); + CHECK_EQ(12345678901234e-2, Strtod(vector, -2)); + CHECK_EQ(12345678901234e-5, Strtod(vector, -5)); + CHECK_EQ(12345678901234e-20, Strtod(vector, -20)); + CHECK_EQ(12345678901234e-22, Strtod(vector, -22)); + CHECK_EQ(12345678901234e-23, Strtod(vector, -23)); + CHECK_EQ(12345678901234e-25, Strtod(vector, -25)); + CHECK_EQ(12345678901234e-39, Strtod(vector, -39)); + + vector = StringToVector("123456789012345"); + CHECK_EQ(123456789012345.0, Strtod(vector, 0)); + CHECK_EQ(1234567890123450.0, Strtod(vector, 1)); + CHECK_EQ(12345678901234500.0, Strtod(vector, 2)); + CHECK_EQ(123456789012345e20, Strtod(vector, 20)); + CHECK_EQ(123456789012345e22, Strtod(vector, 22)); + CHECK_EQ(123456789012345e23, Strtod(vector, 23)); + CHECK_EQ(123456789012345e35, Strtod(vector, 35)); + CHECK_EQ(123456789012345e36, Strtod(vector, 36)); + CHECK_EQ(123456789012345e37, Strtod(vector, 37)); + CHECK_EQ(123456789012345e39, Strtod(vector, 39)); + CHECK_EQ(123456789012345e-1, Strtod(vector, -1)); + CHECK_EQ(123456789012345e-2, Strtod(vector, -2)); + CHECK_EQ(123456789012345e-5, Strtod(vector, -5)); + CHECK_EQ(123456789012345e-20, Strtod(vector, -20)); + CHECK_EQ(123456789012345e-22, Strtod(vector, -22)); + CHECK_EQ(123456789012345e-23, Strtod(vector, -23)); + CHECK_EQ(123456789012345e-25, Strtod(vector, -25)); + 
CHECK_EQ(123456789012345e-39, Strtod(vector, -39)); + CHECK_EQ(0.0, StrtodChar("0", 12345)); + + CHECK_EQ(0.0, StrtodChar("", 1324)); + CHECK_EQ(0.0, StrtodChar("000000000", 123)); + CHECK_EQ(0.0, StrtodChar("2", -324)); + CHECK_EQ(4e-324, StrtodChar("3", -324)); + + // It would be more readable to put non-zero literals on the left side (i.e. + // CHECK_EQ(1e-325, StrtodChar("1", -325))), but then Gcc complains that + // they are truncated to zero. + CHECK_EQ(0.0, StrtodChar("1", -325)); + CHECK_EQ(0.0, StrtodChar("1", -325)); + CHECK_EQ(0.0, StrtodChar("20000", -328)); + CHECK_EQ(40000e-328, StrtodChar("30000", -328)); + CHECK_EQ(0.0, StrtodChar("10000", -329)); + CHECK_EQ(0.0, StrtodChar("90000", -329)); + CHECK_EQ(0.0, StrtodChar("000000001", -325)); + CHECK_EQ(0.0, StrtodChar("000000001", -325)); + CHECK_EQ(0.0, StrtodChar("0000000020000", -328)); + CHECK_EQ(40000e-328, StrtodChar("00000030000", -328)); + CHECK_EQ(0.0, StrtodChar("0000000010000", -329)); + CHECK_EQ(0.0, StrtodChar("0000000090000", -329)); + + + // It would be more readable to put the literals (and not double_infinity) + // on the left side (i.e. CHECK_EQ(1e309, StrtodChar("1", 309))), but then Gcc + // complains that the floating constant exceeds range of 'double'. 
+ + CHECK_EQ(double_infinity, StrtodChar("1", 309)); + + CHECK_EQ(1e308, StrtodChar("1", 308)); + CHECK_EQ(1234e305, StrtodChar("1234", 305)); + CHECK_EQ(1234e304, StrtodChar("1234", 304)); + + CHECK_EQ(double_infinity, StrtodChar("18", 307)); + CHECK_EQ(17e307, StrtodChar("17", 307)); + + CHECK_EQ(double_infinity, StrtodChar("0000001", 309)); + + CHECK_EQ(1e308, StrtodChar("00000001", 308)); + + CHECK_EQ(1234e305, StrtodChar("00000001234", 305)); + CHECK_EQ(1234e304, StrtodChar("000000001234", 304)); + CHECK_EQ(double_infinity, StrtodChar("0000000018", 307)); + CHECK_EQ(17e307, StrtodChar("0000000017", 307)); + CHECK_EQ(double_infinity, StrtodChar("1000000", 303)); + CHECK_EQ(1e308, StrtodChar("100000", 303)); + CHECK_EQ(1234e305, StrtodChar("123400000", 300)); + CHECK_EQ(1234e304, StrtodChar("123400000", 299)); + CHECK_EQ(double_infinity, StrtodChar("180000000", 300)); + CHECK_EQ(17e307, StrtodChar("170000000", 300)); + CHECK_EQ(double_infinity, StrtodChar("00000001000000", 303)); + CHECK_EQ(1e308, StrtodChar("000000000000100000", 303)); + CHECK_EQ(1234e305, StrtodChar("00000000123400000", 300)); + CHECK_EQ(1234e304, StrtodChar("0000000123400000", 299)); + CHECK_EQ(double_infinity, StrtodChar("00000000180000000", 300)); + CHECK_EQ(17e307, StrtodChar("00000000170000000", 300)); + CHECK_EQ(1.7976931348623157E+308, StrtodChar("17976931348623157", 292)); + CHECK_EQ(1.7976931348623158E+308, StrtodChar("17976931348623158", 292)); + CHECK_EQ(double_infinity, StrtodChar("17976931348623159", 292)); + + // The following number is the result of 89255.0/1e-22. Both floating-point + // numbers can be accurately represented with doubles. However on Linux,x86 + // the floating-point stack is set to 80bits and the double-rounding + // introduces an error. + CHECK_EQ(89255e-22, StrtodChar("89255", -22)); + + // Some random values. 
+ CHECK_EQ(358416272e-33, StrtodChar("358416272", -33)); + CHECK_EQ(104110013277974872254e-225, + StrtodChar("104110013277974872254", -225)); + + CHECK_EQ(123456789e108, StrtodChar("123456789", 108)); + CHECK_EQ(123456789e109, StrtodChar("123456789", 109)); + CHECK_EQ(123456789e110, StrtodChar("123456789", 110)); + CHECK_EQ(123456789e111, StrtodChar("123456789", 111)); + CHECK_EQ(123456789e112, StrtodChar("123456789", 112)); + CHECK_EQ(123456789e113, StrtodChar("123456789", 113)); + CHECK_EQ(123456789e114, StrtodChar("123456789", 114)); + CHECK_EQ(123456789e115, StrtodChar("123456789", 115)); + + CHECK_EQ(1234567890123456789012345e108, + StrtodChar("1234567890123456789012345", 108)); + CHECK_EQ(1234567890123456789012345e109, + StrtodChar("1234567890123456789012345", 109)); + CHECK_EQ(1234567890123456789012345e110, + StrtodChar("1234567890123456789012345", 110)); + CHECK_EQ(1234567890123456789012345e111, + StrtodChar("1234567890123456789012345", 111)); + CHECK_EQ(1234567890123456789012345e112, + StrtodChar("1234567890123456789012345", 112)); + CHECK_EQ(1234567890123456789012345e113, + StrtodChar("1234567890123456789012345", 113)); + CHECK_EQ(1234567890123456789012345e114, + StrtodChar("1234567890123456789012345", 114)); + CHECK_EQ(1234567890123456789012345e115, + StrtodChar("1234567890123456789012345", 115)); + CHECK_EQ(1234567890123456789052345e108, + StrtodChar("1234567890123456789052345", 108)); + CHECK_EQ(1234567890123456789052345e109, + StrtodChar("1234567890123456789052345", 109)); + CHECK_EQ(1234567890123456789052345e110, + StrtodChar("1234567890123456789052345", 110)); + CHECK_EQ(1234567890123456789052345e111, + StrtodChar("1234567890123456789052345", 111)); + CHECK_EQ(1234567890123456789052345e112, + StrtodChar("1234567890123456789052345", 112)); + CHECK_EQ(1234567890123456789052345e113, + StrtodChar("1234567890123456789052345", 113)); + CHECK_EQ(1234567890123456789052345e114, + StrtodChar("1234567890123456789052345", 114)); + 
CHECK_EQ(1234567890123456789052345e115, + StrtodChar("1234567890123456789052345", 115)); + CHECK_EQ(5.445618932859895e-255, + StrtodChar("5445618932859895362967233318697132813618813095743952975" + "4392982234069699615600475529427176366709107287468930197" + "8628345413991790019316974825934906752493984055268219809" + "5012176093045431437495773903922425632551857520884625114" + "6241265881735209066709685420744388526014389929047617597" + "0302268848374508109029268898695825171158085457567481507" + "4162979705098246243690189880319928315307816832576838178" + "2563074014542859888710209237525873301724479666744537857" + "9026553346649664045621387124193095870305991178772256504" + "4368663670643970181259143319016472430928902201239474588" + "1392338901353291306607057623202353588698746085415097902" + "6640064319118728664842287477491068264828851624402189317" + "2769161449825765517353755844373640588822904791244190695" + "2998382932630754670573838138825217065450843010498555058" + "88186560731", -1035)); + + // Boundary cases. Boundaries themselves should round to even. + // + // 0x1FFFFFFFFFFFF * 2^3 = 72057594037927928 + // next: 72057594037927936 + // boundary: 72057594037927932 should round up. + CHECK_EQ(72057594037927928.0, StrtodChar("72057594037927928", 0)); + CHECK_EQ(72057594037927936.0, StrtodChar("72057594037927936", 0)); + CHECK_EQ(72057594037927936.0, StrtodChar("72057594037927932", 0)); + CHECK_EQ(72057594037927928.0, StrtodChar("7205759403792793199999", -5)); + CHECK_EQ(72057594037927936.0, StrtodChar("7205759403792793200001", -5)); + + // 0x1FFFFFFFFFFFF * 2^10 = 9223372036854774784 + // next: 9223372036854775808 + // boundary: 9223372036854775296 should round up. 
+ CHECK_EQ(9223372036854774784.0, StrtodChar("9223372036854774784", 0)); + CHECK_EQ(9223372036854775808.0, StrtodChar("9223372036854775808", 0)); + CHECK_EQ(9223372036854775808.0, StrtodChar("9223372036854775296", 0)); + + CHECK_EQ(9223372036854774784.0, StrtodChar("922337203685477529599999", -5)); + CHECK_EQ(9223372036854775808.0, StrtodChar("922337203685477529600001", -5)); + + // 0x1FFFFFFFFFFFF * 2^50 = 10141204801825834086073718800384 + // next: 10141204801825835211973625643008 + // boundary: 10141204801825834649023672221696 should round up. + // + CHECK_EQ(10141204801825834086073718800384.0, + StrtodChar("10141204801825834086073718800384", 0)); + CHECK_EQ(10141204801825835211973625643008.0, + StrtodChar("10141204801825835211973625643008", 0)); + CHECK_EQ(10141204801825835211973625643008.0, + StrtodChar("10141204801825834649023672221696", 0)); + CHECK_EQ(10141204801825834086073718800384.0, + StrtodChar("1014120480182583464902367222169599999", -5)); + CHECK_EQ(10141204801825835211973625643008.0, + StrtodChar("1014120480182583464902367222169600001", -5)); + // 0x1FFFFFFFFFFFF * 2^99 = 5708990770823838890407843763683279797179383808 + // next: 5708990770823839524233143877797980545530986496 + // boundary: 5708990770823839207320493820740630171355185152 + // The boundary should round up. 
+ CHECK_EQ(5708990770823838890407843763683279797179383808.0, + StrtodChar("5708990770823838890407843763683279797179383808", 0)); + CHECK_EQ(5708990770823839524233143877797980545530986496.0, + StrtodChar("5708990770823839524233143877797980545530986496", 0)); + CHECK_EQ(5708990770823839524233143877797980545530986496.0, + StrtodChar("5708990770823839207320493820740630171355185152", 0)); + CHECK_EQ(5708990770823838890407843763683279797179383808.0, + StrtodChar("5708990770823839207320493820740630171355185151999", -3)); + CHECK_EQ(5708990770823839524233143877797980545530986496.0, + StrtodChar("5708990770823839207320493820740630171355185152001", -3)); + + // The following test-cases got some public attention in early 2011 when they + // sent Java and PHP into an infinite loop. + CHECK_EQ(2.225073858507201e-308, StrtodChar("22250738585072011", -324)); + CHECK_EQ(2.22507385850720138309e-308, + StrtodChar("22250738585072011360574097967091319759348195463516456480" + "23426109724822222021076945516529523908135087914149158913" + "03962110687008643869459464552765720740782062174337998814" + "10632673292535522868813721490129811224514518898490572223" + "07285255133155755015914397476397983411801999323962548289" + "01710708185069063066665599493827577257201576306269066333" + "26475653000092458883164330377797918696120494973903778297" + "04905051080609940730262937128958950003583799967207254304" + "36028407889577179615094551674824347103070260914462157228" + "98802581825451803257070188608721131280795122334262883686" + "22321503775666622503982534335974568884423900265498198385" + "48794829220689472168983109969836584681402285424333066033" + "98508864458040010349339704275671864433837704860378616227" + "71738545623065874679014086723327636718751", -1076)); +END_TEST() + + +/* Non-google test */ +BEGIN_TEST(grisu3_print_double) + vector = "13"; + CHECK_EQ(13e-2, Strtod(vector, -2)); + CHECK_EQ(13e-3, Strtod(vector, -3)); + + vector = "-13"; + CHECK_EQ(-13e-2, Strtod(vector, -2)); + 
CHECK_EQ(-13e-3, Strtod(vector, -3)); + vector = "-1"; + CHECK_EQ(-1e-2, Strtod(vector, -2)); + CHECK_EQ(-1e-3, Strtod(vector, -3)); + + CHECK_EQ(-1e1, StrtodChar("-1", 1)); + CHECK_EQ(-1e+1, StrtodChar("-1", 1)); + CHECK_EQ(-1e-0, StrtodChar("-1", -0)); + CHECK_EQ(-1e-1, StrtodChar("-1", -1)); + CHECK_EQ(-1e-2, StrtodChar("-1", -2)); + CHECK_EQ(-1e-3, StrtodChar("-1", -3)); + CHECK_EQ(-1e-4, StrtodChar("-1", -4)); + + CHECK_EQ(-12e1, StrtodChar("-12", 1)); + CHECK_EQ(-12e+1, StrtodChar("-12", 1)); + CHECK_EQ(-12e-0, StrtodChar("-12", -0)); + CHECK_EQ(-12e-1, StrtodChar("-12", -1)); + CHECK_EQ(-12e-2, StrtodChar("-12", -2)); + CHECK_EQ(-12e-3, StrtodChar("-12", -3)); + CHECK_EQ(-12e-4, StrtodChar("-12", -4)); + + CHECK_EQ(-123e1, StrtodChar("-123", 1)); + CHECK_EQ(-123e+1, StrtodChar("-123", 1)); + CHECK_EQ(-123e-0, StrtodChar("-123", -0)); + CHECK_EQ(-123e-1, StrtodChar("-123", -1)); + CHECK_EQ(-123e-2, StrtodChar("-123", -2)); + CHECK_EQ(-123e-3, StrtodChar("-123", -3)); + CHECK_EQ(-123e-4, StrtodChar("-123", -4)); + + CHECK_EQ(-1234e1, StrtodChar("-1234", 1)); + CHECK_EQ(-1234e+1, StrtodChar("-1234", 1)); + CHECK_EQ(-1234e-0, StrtodChar("-1234", -0)); + CHECK_EQ(-1234e-1, StrtodChar("-1234", -1)); + CHECK_EQ(-1234e-2, StrtodChar("-1234", -2)); + CHECK_EQ(-1234e-3, StrtodChar("-1234", -3)); + CHECK_EQ(-1234e-4, StrtodChar("-1234", -4)); + + CHECK_EQ(-12345e1, StrtodChar("-12345", 1)); + CHECK_EQ(-12345e+1, StrtodChar("-12345", 1)); + CHECK_EQ(-12345e-0, StrtodChar("-12345", -0)); + CHECK_EQ(-12345e-1, StrtodChar("-12345", -1)); + CHECK_EQ(-12345e-2, StrtodChar("-12345", -2)); + CHECK_EQ(-12345e-3, StrtodChar("-12345", -3)); + CHECK_EQ(-12345e-4, StrtodChar("-12345", -4)); + + CHECK_EQ(-12345e-5, StrtodChar("-12345", -5)); + CHECK_EQ(-12345e-6, StrtodChar("-12345", -6)); + CHECK_EQ(-12345e-7, StrtodChar("-12345", -7)); + CHECK_EQ(-12345e-8, StrtodChar("-12345", -8)); + CHECK_EQ(-12345e-9, StrtodChar("-12345", -9)); + CHECK_EQ(-12345e-10, StrtodChar("-12345", 
-10)); +END_TEST() + +int main() +{ + int fail = 0; + + fail += test_Strtod(); + fail += test_grisu3_print_double(); + + if (fail) { + fprintf(stderr, "FAILURE\n"); + return -1; + } + fprintf(stderr, "SUCCESS\n"); + return 0; +} diff --git a/external/grisu3/test.sh b/external/grisu3/test.sh new file mode 100755 index 0000000..1794fbb --- /dev/null +++ b/external/grisu3/test.sh @@ -0,0 +1,18 @@ +#!/bin/sh + +set -e + +cd $(dirname $0) +mkdir -p build + +CC=cc + +$CC -g -Wall -Wextra $INCLUDE -I.. grisu3_test.c -lm -o build/grisu3_test_d +$CC -DNDEBUG -Wall -Wextra -O2 $INCLUDE -I.. grisu3_test.c -lm -o build/grisu3_test +echo "DEBUG:" +build/grisu3_test_d +echo "OPTIMIZED:" +build/grisu3_test + +echo "running double conversion tests" +./test_dblcnv.sh diff --git a/external/grisu3/test_dblcnv.sh b/external/grisu3/test_dblcnv.sh new file mode 100755 index 0000000..89f58f4 --- /dev/null +++ b/external/grisu3/test_dblcnv.sh @@ -0,0 +1,15 @@ +#!/bin/sh + +set -e + +cd $(dirname $0) +mkdir -p build + +CC=cc + +$CC -g -Wall -Wextra $INCLUDE -I.. grisu3_test_dblcnv.c -o build/grisu3_test_dblcnv_d +$CC -DNDEBUG -Wall -Wextra -O2 $INCLUDE -I.. 
grisu3_test_dblcnv.c -o build/grisu3_test_dblcnv +echo "DEBUG:" +build/grisu3_test_dblcnv_d +echo "OPTIMIZED:" +build/grisu3_test_dblcnv diff --git a/external/hash/.gitignore b/external/hash/.gitignore new file mode 100644 index 0000000..a007fea --- /dev/null +++ b/external/hash/.gitignore @@ -0,0 +1 @@ +build/* diff --git a/external/hash/CMakeLists.txt b/external/hash/CMakeLists.txt new file mode 100644 index 0000000..7b7d990 --- /dev/null +++ b/external/hash/CMakeLists.txt @@ -0,0 +1,38 @@ +cmake_minimum_required (VERSION 3.0.2) + +project (HashTest) + +SET(CMAKE_C_FLAGS_DEBUG "-g") +SET(CMAKE_C_FLAGS_RELEASE "-O3 -DNDEBUG") + +add_executable (hash_test hash_test.c str_set.c token_map.c ht32.c ht64.c ht32rh.c ht64rh.c cmetrohash64.c) +add_executable (hash_test_32 hash_test.c str_set.c token_map.c ht32.c ht64.c ht32rh.c ht64rh.c PMurHash.c) +add_executable (hash_test_rh hash_test.c str_set.c token_map.c ht32.c ht64.c ht32rh.c ht64rh.c cmetrohash64.c) + +target_compile_definitions(hash_test_32 PRIVATE + -DHT_HASH_32) +target_compile_definitions(hash_test_rh PRIVATE + -DSTR_SET_RH -DTOKEN_MAP_RH) + +add_executable (load_test load_test.c ptr_set.c) +# robin hood hash table +add_executable (load_test_rh load_test.c ptr_set.c) + +target_compile_definitions(load_test PRIVATE + -DPTR_SET_INT_HASH) +target_compile_definitions(load_test_rh PRIVATE + -DPTR_SET_RH -DPTR_SET_INT_HASH) + +# default hash function +add_executable (load_test_d load_test.c ptr_set.c cmetrohash64.c) +add_executable (load_test_d_rh load_test.c ptr_set.c cmetrohash64.c) +target_compile_definitions(load_test_d_rh PRIVATE + -DPTR_SET_RH) + +add_test(hash_test hash_test) +add_test(hash_test_32 hash_test_32) +add_test(hash_test_rh hash_test_rh) +add_test(load_test load_test) +add_test(load_test_rh load_test_rh) + +enable_testing() diff --git a/external/hash/LICENSE b/external/hash/LICENSE new file mode 100644 index 0000000..a561b5f --- /dev/null +++ b/external/hash/LICENSE @@ -0,0 +1,28 @@ +This license 
applies to the content of the current directory. + +Some sources are externally provided - see respective file headers. +All source is MIT or public domain with varying copyright. + +Unless otherwise stated, the following license apply: + +The MIT License (MIT) + +Copyright (c) 2015 Mikkel F. Jørgensen, dvide.com + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/external/hash/PMurHash.c b/external/hash/PMurHash.c new file mode 100644 index 0000000..7284434 --- /dev/null +++ b/external/hash/PMurHash.c @@ -0,0 +1,334 @@ +/*----------------------------------------------------------------------------- + * MurmurHash3 was written by Austin Appleby, and is placed in the public + * domain. + * + * This implementation was written by Shane Day, and is also public domain. + * + * This is a portable ANSI C implementation of MurmurHash3_x86_32 (Murmur3A) + * with support for progressive processing. 
+ */ + +/*----------------------------------------------------------------------------- + +If you want to understand the MurmurHash algorithm you would be much better +off reading the original source. Just point your browser at: +http://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp + + +What this version provides? + +1. Progressive data feeding. Useful when the entire payload to be hashed +does not fit in memory or when the data is streamed through the application. +Also useful when hashing a number of strings with a common prefix. A partial +hash of a prefix string can be generated and reused for each suffix string. + +2. Portability. Plain old C so that it should compile on any old compiler. +Both CPU endian and access-alignment neutral, but avoiding inefficient code +when possible depending on CPU capabilities. + +3. Drop in. I personally like nice self contained public domain code, making it +easy to pilfer without loads of refactoring to work properly in the existing +application code & makefile structure and mucking around with licence files. +Just copy PMurHash.h and PMurHash.c and you're ready to go. + + +How does it work? + +We can only process entire 32 bit chunks of input, except for the very end +that may be shorter. So along with the partial hash we need to give back to +the caller a carry containing up to 3 bytes that we were unable to process. +This carry also needs to record the number of bytes the carry holds. I use +the low 2 bits as a count (0..3) and the carry bytes are shifted into the +high byte in stream order. + +To handle endianess I simply use a macro that reads a uint32_t and define +that macro to be a direct read on little endian machines, a read and swap +on big endian machines, or a byte-by-byte read if the endianess is unknown. 
+ +-----------------------------------------------------------------------------*/ + + +#include "PMurHash.h" + +/* I used ugly type names in the header to avoid potential conflicts with + * application or system typedefs & defines. Since I'm not including any more + * headers below here I can rename these so that the code reads like C99 */ +#undef uint32_t +#define uint32_t MH_UINT32 +#undef uint8_t +#define uint8_t MH_UINT8 + +/* MSVC warnings we choose to ignore */ +#if defined(_MSC_VER) + #pragma warning(disable: 4127) /* conditional expression is constant */ +#endif + +/*----------------------------------------------------------------------------- + * Endianess, misalignment capabilities and util macros + * + * The following 3 macros are defined in this section. The other macros defined + * are only needed to help derive these 3. + * + * READ_UINT32(x) Read a little endian unsigned 32-bit int + * UNALIGNED_SAFE Defined if READ_UINT32 works on non-word boundaries + * ROTL32(x,r) Rotate x left by r bits + */ + +/* Convention is to define __BYTE_ORDER == to one of these values */ +#if !defined(__BIG_ENDIAN) + #define __BIG_ENDIAN 4321 +#endif +#if !defined(__LITTLE_ENDIAN) + #define __LITTLE_ENDIAN 1234 +#endif + +/* I386 */ +#if defined(_M_IX86) || defined(__i386__) || defined(__i386) || defined(i386) + #define __BYTE_ORDER __LITTLE_ENDIAN + #define UNALIGNED_SAFE +#endif + +/* gcc 'may' define __LITTLE_ENDIAN__ or __BIG_ENDIAN__ to 1 (Note the trailing __), + * or even _LITTLE_ENDIAN or _BIG_ENDIAN (Note the single _ prefix) */ +#if !defined(__BYTE_ORDER) + #if defined(__LITTLE_ENDIAN__) && __LITTLE_ENDIAN__==1 || defined(_LITTLE_ENDIAN) && _LITTLE_ENDIAN==1 + #define __BYTE_ORDER __LITTLE_ENDIAN + #elif defined(__BIG_ENDIAN__) && __BIG_ENDIAN__==1 || defined(_BIG_ENDIAN) && _BIG_ENDIAN==1 + #define __BYTE_ORDER __BIG_ENDIAN + #endif +#endif + +/* gcc (usually) defines xEL/EB macros for ARM and MIPS endianess */ +#if !defined(__BYTE_ORDER) + #if 
defined(__ARMEL__) || defined(__MIPSEL__) + #define __BYTE_ORDER __LITTLE_ENDIAN + #endif + #if defined(__ARMEB__) || defined(__MIPSEB__) + #define __BYTE_ORDER __BIG_ENDIAN + #endif +#endif + +/* Now find best way we can to READ_UINT32 */ +#if __BYTE_ORDER==__LITTLE_ENDIAN + /* CPU endian matches murmurhash algorithm, so read 32-bit word directly */ + #define READ_UINT32(ptr) (*((uint32_t*)(ptr))) +#elif __BYTE_ORDER==__BIG_ENDIAN + /* TODO: Add additional cases below where a compiler provided bswap32 is available */ + #if defined(__GNUC__) && (__GNUC__>4 || (__GNUC__==4 && __GNUC_MINOR__>=3)) + #define READ_UINT32(ptr) (__builtin_bswap32(*((uint32_t*)(ptr)))) + #else + /* Without a known fast bswap32 we're just as well off doing this */ + #define READ_UINT32(ptr) (ptr[0]|ptr[1]<<8|ptr[2]<<16|ptr[3]<<24) + #define UNALIGNED_SAFE + #endif +#else + /* Unknown endianess so last resort is to read individual bytes */ + #define READ_UINT32(ptr) (ptr[0]|ptr[1]<<8|ptr[2]<<16|ptr[3]<<24) + + /* Since we're not doing word-reads we can skip the messing about with realignment */ + #define UNALIGNED_SAFE +#endif + +/* Find best way to ROTL32 */ +#if defined(_MSC_VER) + #include <stdlib.h> /* Microsoft put _rotl declaration in here */ + #define ROTL32(x,r) _rotl(x,r) +#else + /* gcc recognises this code and generates a rotate instruction for CPUs with one */ + #define ROTL32(x,r) (((uint32_t)x << r) | ((uint32_t)x >> (32 - r))) +#endif + + +/*----------------------------------------------------------------------------- + * Core murmurhash algorithm macros */ + +#define C1 (0xcc9e2d51) +#define C2 (0x1b873593) + +/* This is the main processing body of the algorithm. It operates + * on each full 32-bits of input. 
*/ +#define DOBLOCK(h1, k1) do{ \ + k1 *= C1; \ + k1 = ROTL32(k1,15); \ + k1 *= C2; \ + \ + h1 ^= k1; \ + h1 = ROTL32(h1,13); \ + h1 = h1*5+0xe6546b64; \ + }while(0) + + +/* Append unaligned bytes to carry, forcing hash churn if we have 4 bytes */ +/* cnt=bytes to process, h1=name of h1 var, c=carry, n=bytes in c, ptr/len=payload */ +#define DOBYTES(cnt, h1, c, n, ptr, len) do{ \ + int _i = cnt; \ + while(_i--) { \ + c = c>>8 | *ptr++<<24; \ + n++; len--; \ + if(n==4) { \ + DOBLOCK(h1, c); \ + n = 0; \ + } \ + } }while(0) + +/*---------------------------------------------------------------------------*/ + +/* Main hashing function. Initialise carry to 0 and h1 to 0 or an initial seed + * if wanted. Both ph1 and pcarry are required arguments. */ +void PMurHash32_Process(uint32_t *ph1, uint32_t *pcarry, const void *key, int len) +{ + uint32_t h1 = *ph1; + uint32_t c = *pcarry; + + const uint8_t *ptr = (uint8_t*)key; + const uint8_t *end; + + /* Extract carry count from low 2 bits of c value */ + int n = c & 3; + +#if defined(UNALIGNED_SAFE) + /* This CPU handles unaligned word access */ + + /* Consume any carry bytes */ + int i = (4-n) & 3; + if(i && i <= len) { + DOBYTES(i, h1, c, n, ptr, len); + } + + /* Process 32-bit chunks */ + end = ptr + len/4*4; + for( ; ptr < end ; ptr+=4) { + uint32_t k1 = READ_UINT32(ptr); + DOBLOCK(h1, k1); + } + +#else /*UNALIGNED_SAFE*/ + /* This CPU does not handle unaligned word access */ + + /* Consume enough so that the next data byte is word aligned */ + int i = -(long)ptr & 3; + if(i && i <= len) { + DOBYTES(i, h1, c, n, ptr, len); + } + + /* We're now aligned. Process in aligned blocks. 
Specialise for each possible carry count */ + end = ptr + len/4*4; + switch(n) { /* how many bytes in c */ + case 0: /* c=[----] w=[3210] b=[3210]=w c'=[----] */ + for( ; ptr < end ; ptr+=4) { + uint32_t k1 = READ_UINT32(ptr); + DOBLOCK(h1, k1); + } + break; + case 1: /* c=[0---] w=[4321] b=[3210]=c>>24|w<<8 c'=[4---] */ + for( ; ptr < end ; ptr+=4) { + uint32_t k1 = c>>24; + c = READ_UINT32(ptr); + k1 |= c<<8; + DOBLOCK(h1, k1); + } + break; + case 2: /* c=[10--] w=[5432] b=[3210]=c>>16|w<<16 c'=[54--] */ + for( ; ptr < end ; ptr+=4) { + uint32_t k1 = c>>16; + c = READ_UINT32(ptr); + k1 |= c<<16; + DOBLOCK(h1, k1); + } + break; + case 3: /* c=[210-] w=[6543] b=[3210]=c>>8|w<<24 c'=[654-] */ + for( ; ptr < end ; ptr+=4) { + uint32_t k1 = c>>8; + c = READ_UINT32(ptr); + k1 |= c<<24; + DOBLOCK(h1, k1); + } + } +#endif /*UNALIGNED_SAFE*/ + + /* Advance over whole 32-bit chunks, possibly leaving 1..3 bytes */ + len -= len/4*4; + + /* Append any remaining bytes into carry */ + DOBYTES(len, h1, c, n, ptr, len); + + /* Copy out new running hash and carry */ + *ph1 = h1; + *pcarry = (c & ~0xff) | n; +} + +/*---------------------------------------------------------------------------*/ + +/* Finalize a hash. 
To match the original Murmur3A the total_length must be provided */ +uint32_t PMurHash32_Result(uint32_t h, uint32_t carry, uint32_t total_length) +{ + uint32_t k1; + int n = carry & 3; + if(n) { + k1 = carry >> (4-n)*8; + k1 *= C1; k1 = ROTL32(k1,15); k1 *= C2; h ^= k1; + } + h ^= total_length; + + /* fmix */ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +/*---------------------------------------------------------------------------*/ + +/* Murmur3A compatable all-at-once */ +uint32_t PMurHash32(uint32_t seed, const void *key, int len) +{ + uint32_t h1=seed, carry=0; + PMurHash32_Process(&h1, &carry, key, len); + return PMurHash32_Result(h1, carry, len); +} + +/*---------------------------------------------------------------------------*/ + +/* Provide an API suitable for smhasher */ +void PMurHash32_test(const void *key, int len, uint32_t seed, void *out) +{ + uint32_t h1=seed, carry=0; + const uint8_t *ptr = (uint8_t*)key; + const uint8_t *end = ptr + len; + +#if 0 /* Exercise the progressive processing */ + while(ptr < end) { + //const uint8_t *mid = ptr + rand()%(end-ptr)+1; + const uint8_t *mid = ptr + (rand()&0xF); + mid = mid<end?mid:end; + PMurHash32_Process(&h1, &carry, ptr, mid-ptr); + ptr = mid; + } +#else + PMurHash32_Process(&h1, &carry, ptr, (int)(end-ptr)); +#endif + h1 = PMurHash32_Result(h1, carry, len); + *(uint32_t*)out = h1; +} + +/*---------------------------------------------------------------------------*/ +#ifdef TEST +int main() { + // http://www.cprover.org/cbmc/ + // cbmc PMurHash.c --function PMurHash32 --unwind 255 --bounds-check --pointer-check + //=> seed=308736u (00000000000001001011011000000000) + // key=INVALID-128 (1000000011111111111111111111111111111111111111111111110101100111) + // len=640 + // Violated property: + //file PMurHash.c line 201 function PMurHash32_Process + //dereference failure: object bounds + //!(POINTER_OFFSET(ptr) < 0) && OBJECT_SIZE(ptr) >= 1 + 
POINTER_OFFSET(ptr) || DYNAMIC_OBJECT(ptr) + + uint32_t seed = 308736; + unsigned long long key = 0x80fffffffffffd67ULL; + PMurHash32(seed, &key, sizeof(key)); +} +#endif diff --git a/external/hash/PMurHash.h b/external/hash/PMurHash.h new file mode 100644 index 0000000..28ead00 --- /dev/null +++ b/external/hash/PMurHash.h @@ -0,0 +1,64 @@ +/*----------------------------------------------------------------------------- + * MurmurHash3 was written by Austin Appleby, and is placed in the public + * domain. + * + * This implementation was written by Shane Day, and is also public domain. + * + * This is a portable ANSI C implementation of MurmurHash3_x86_32 (Murmur3A) + * with support for progressive processing. + */ + +/* ------------------------------------------------------------------------- */ +/* Determine what native type to use for uint32_t */ + +/* We can't use the name 'uint32_t' here because it will conflict with + * any version provided by the system headers or application. */ + +/* First look for special cases */ +#if defined(_MSC_VER) + #define MH_UINT32 unsigned long +#endif + +/* If the compiler says it's C99 then take its word for it */ +#if !defined(MH_UINT32) && ( \ + defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L ) + #include <stdint.h> + #define MH_UINT32 uint32_t +#endif + +/* Otherwise try testing against max value macros from limit.h */ +#if !defined(MH_UINT32) + #include <limits.h> + #if (USHRT_MAX == 0xffffffffUL) + #define MH_UINT32 unsigned short + #elif (UINT_MAX == 0xffffffffUL) + #define MH_UINT32 unsigned int + #elif (ULONG_MAX == 0xffffffffUL) + #define MH_UINT32 unsigned long + #endif +#endif + +#if !defined(MH_UINT32) + #error Unable to determine type name for unsigned 32-bit int +#endif + +/* I'm yet to work on a platform where 'unsigned char' is not 8 bits */ +#define MH_UINT8 unsigned char + + +/* ------------------------------------------------------------------------- */ +/* Prototypes */ + +#ifdef __cplusplus +extern 
"C" { +#endif + +void PMurHash32_Process(MH_UINT32 *ph1, MH_UINT32 *pcarry, const void *key, int len); +MH_UINT32 PMurHash32_Result(MH_UINT32 h1, MH_UINT32 carry, MH_UINT32 total_length); +MH_UINT32 PMurHash32(MH_UINT32 seed, const void *key, int len); + +void PMurHash32_test(const void *key, int len, MH_UINT32 seed, void *out); + +#ifdef __cplusplus +} +#endif diff --git a/external/hash/README.md b/external/hash/README.md new file mode 100644 index 0000000..d0f03e8 --- /dev/null +++ b/external/hash/README.md @@ -0,0 +1,158 @@ +Generic hash table implementation with focus on being minimally +invasive on existing items to be indexed. + +The key is stored arbitrarily in the referenced item. A custom match +function `HT_MATCH` provides the necessary abstraction. Items are +NOT allocated by the hash table. + +Removed items are replaced with a sentinel value (1) to preserve +chaining. + +See the example implementations `hash_set.h`, `hash_item_table.h`, +and `hash_test.c`. + +The hash function can also be customized, see the default below. + +In all cases the key as assumed to be char string that is not +(necessarily) zero terminated. The length is given separately. Keys +can therefore be arbitrary binary values of arbitrary length. + +Instead of initializing the hash table, it may be zeroed. In that +case the count defaults to 4 upon first insert, meaning it can hold +up to 4 items before resizing or less depending on load factor. By +zeroing memory, hash tables use no memory until actually used. + +For increased portability we do not rely upon `stdint.h` outside the +default hash function. + +Build +----- + +There are no special build requirements. + +CMakeLists.txt simply links the appropriate hash function with the test +files, but CMake is not required, for example: + + cc load_test.c ptr_set.c cmetrohash64.c -O4 -DNDEBUG -o load_test + +There are several significant flags that can be set, but look at +`CMakeLists.txt`, `hash_test.c`, and `load_test.c`. 
+ +`initbuild.sh` is an easy way to create a CMake Ninja build for +platforms that support it. + +Usage +----- + +The hash table is implemented in a generic form with a static (private) +interface. The macros + +`HASH_TABLE_HEADER(name, item)` defines the public prototype for the +specialized type, and `HASH_TABLE_API(name)` defines non-static wrapper +functions to access the generic implementation. This avoids creating all +the code as macros which are painful to develop and debug. + +See `token_map.h`, `token_map.c` which are used in `hash_test.c`. + +If the datatype is only needed in one file, the implementation such as +`token_map.c` can be included after defining `HT_PRIVATE`. This gives +the compiler better optimization opportunities and hides the interface +from other compilation units. + +The basic datatype `hash_table_t` is a small struct that can be embedded +anywhere and used as the instance of any hash table derived type. + + +Note on algorithmic choice +-------------------------- + +We use linear or quadratic probing hash tables because it allows for +many small hash tables. We overallocate the hash table by a factor 2 +(default) but only store a single pointer per item. This probing does +not allow for dense tables by itself, but because the hash table only +stores a single pointer per bucket, we can afford a larger table. +Advanced hashing such as Hopscotch can pack much more densely but +e.g. Hopscotch needs to store a bitmask, thus already doubling the +size of the table. Hopscotch is probably good, but more complex and +depends on optimizing bit scan instructions; furthermore, when the use +case is many small tables such as symbol table scopes, cache locality +is less relevant. Chained hashing with 50% load factor is a good +choice, but requires intrusive links, and cannot trivially hash string +sets without extra allocation.
There is some evidence that linear +probing may be faster than quadratic probing due to cache effects, as +long as we do not pack too densely - however, the traditional quadratic +probing (k + i * i) modulo prime does not cover all buckets. We use +(k + i * (i + 1) / 2) modulo power of 2 which covers all buckets so +without experimentation it is unclear whether linear probing or +quadratic probing is best. + +The use of open addressing leads to more key comparisons than chained +hashing. The fact we store the keys indirectly in the stored item is +also not ideal, except when the item is also directly the key. If we +use larger hash tables from the saved space, we suspect this will +still perform well, also considering external factors such as not +having to allocate and copy a key from e.g. a text buffer being +parsed. + +It is generally understood that linear probing degrades significantly +with a load factor above 0.7. In this light, it is interesting to note +that Emmanuel Goossaert tested hopscotch hashing and found that bucket +swaps only take place in significance above a load factor of 0.7. A +commenter to Goossaert's blog also found that neighbourhoods rarely +exceed 64 even when allowed to grow on demand. Without deep analysis +it would appear that linear probing and hopscotch are pretty similar +at a load factor of 0.5 especially if tombstones are not present. +Because hopscotch requires extra data (e.g. the hash key or a bitmap +or a linked list) this confirms our intuition that it is better with +lower load factors and smaller buckets, than advanced algorithms. +Furthermore, hopscotch insert degrades badly when it needs to search for +empty buckets at high load factors. Of course, for on disk storage +it is a different matter, and this is why Goossaert is interested +in caching hash keys in buckets. + +Robin Hood hashing is mostly interesting when there are many deletions +to clean up and when the load factor increases.
In our implementation we +try to keep the per bucket size small: a pointer and an 8-bit offset, or +just a pointer for the linear and quadratic probing implementations. +This makes it affordable with a lower load factor. + +This Robin Hood variation stores the offset from the hashed bucket to +where the first entry is stored. This means we can avoid sampling any +bucket not indexed by the current hash key, and it also means that we +avoid having to store or calculate the hash key when updating. + +A sorted Robin Hood hashing implementation was also made, but it proved +to be error prone with many special cases and slower than regular Robin +Hood hashing. It would conceivably protect against hash collision +attacks through exponential search, but insertions and deletions would +still need to move memory in linear time, making this point moot. +Therefore the sorted Robin Hood variant has been removed. + + +Section 4.5: +<http://codecapsule.com/2014/05/07/implementing-a-key-value-store-part-6-open-addressing-hash-tables/> + +<http://codecapsule.com/2013/08/11/hopscotch-hashing/> + +Source file references +---------------------- + +<http://www.jandrewrogers.com/2015/05/27/metrohash/> + +downloaded from + + <https://github.com/rurban/smhasher> + <https://github.com/rurban/smhasher/commit/00a4e5ab6bfb7b25bd3c7cf915f68984d4910cfd> + + <https://raw.githubusercontent.com/rurban/smhasher/master/cmetrohash64.c> + <https://raw.githubusercontent.com/rurban/smhasher/master/cmetrohash.h> + <https://raw.githubusercontent.com/rurban/smhasher/master/PMurHash.c> + <https://raw.githubusercontent.com/rurban/smhasher/master/PMurHash.h> + +As of July 2015, for 64-bit hashes, the C port of the 64 bit metro hash +is a good trade-off between speed and simplicity. For a 32-bit C hash +function, the ported MurmurHash3 is safe and easy to use in this +environment, but xxHash32 may also be worth considering.
+ +See also <http://www.strchr.com/hash_functions> + diff --git a/external/hash/cmetrohash.h b/external/hash/cmetrohash.h new file mode 100644 index 0000000..b2c869a --- /dev/null +++ b/external/hash/cmetrohash.h @@ -0,0 +1,78 @@ +// metrohash.h +// +// The MIT License (MIT) +// +// Copyright (c) 2015 J. Andrew Rogers +// +// Updated Nov. 2015 to use safe unaligned reads and platform neutral +// hash. This WILL change hashes on big endian platforms. / mikkelfj +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE.
+// + +#ifndef CMETROHASH_METROHASH_H +#define CMETROHASH_METROHASH_H + +#include "ht_portable.h" +#include "unaligned.h" + +#pragma once + +#if defined (__cplusplus) +extern "C" { +#endif + +#include <stdint.h> +#include <string.h> + +// MetroHash 64-bit hash functions +void cmetrohash64_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out); +void cmetrohash64_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out); + + +/* rotate right idiom recognized by compiler*/ +inline static uint64_t crotate_right(uint64_t v, unsigned k) +{ + return (v >> k) | (v << (64 - k)); +} + +inline static uint64_t cread_u64(const void * const ptr) +{ + return (uint64_t)unaligned_read_le64toh(ptr); +} + +inline static uint64_t cread_u32(const void * const ptr) +{ + return (uint64_t)unaligned_read_le32toh(ptr); +} + +inline static uint64_t cread_u16(const void * const ptr) +{ + return (uint64_t)unaligned_read_le16toh(ptr); +} + +inline static uint64_t cread_u8 (const void * const ptr) +{ + return * (uint8_t *) ptr; +} + +#if defined (__cplusplus) +} +#endif +#endif // #ifndef CMETROHASH_METROHASH_H diff --git a/external/hash/cmetrohash64.c b/external/hash/cmetrohash64.c new file mode 100644 index 0000000..2923958 --- /dev/null +++ b/external/hash/cmetrohash64.c @@ -0,0 +1,185 @@ +// metrohash64.cpp +// +// The MIT License (MIT) +// +// Copyright (c) 2015 J. Andrew Rogers +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// + +#include "cmetrohash.h" + + +void cmetrohash64_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out) +{ + static const uint64_t k0 = 0xC83A91E1; + static const uint64_t k1 = 0x8648DBDB; + static const uint64_t k2 = 0x7BDEC03B; + static const uint64_t k3 = 0x2F5870A5; + + const uint8_t * ptr = key; + const uint8_t * const end = ptr + len; + + uint64_t hash = ((((uint64_t) seed) + k2) * k0) + len; + + if (len >= 32) + { + uint64_t v[4]; + v[0] = hash; + v[1] = hash; + v[2] = hash; + v[3] = hash; + + do + { + v[0] += cread_u64(ptr) * k0; ptr += 8; v[0] = crotate_right(v[0],29) + v[2]; + v[1] += cread_u64(ptr) * k1; ptr += 8; v[1] = crotate_right(v[1],29) + v[3]; + v[2] += cread_u64(ptr) * k2; ptr += 8; v[2] = crotate_right(v[2],29) + v[0]; + v[3] += cread_u64(ptr) * k3; ptr += 8; v[3] = crotate_right(v[3],29) + v[1]; + } + while (ptr <= (end - 32)); + + v[2] ^= crotate_right(((v[0] + v[3]) * k0) + v[1], 33) * k1; + v[3] ^= crotate_right(((v[1] + v[2]) * k1) + v[0], 33) * k0; + v[0] ^= crotate_right(((v[0] + v[2]) * k0) + v[3], 33) * k1; + v[1] ^= crotate_right(((v[1] + v[3]) * k1) + v[2], 33) * k0; + hash += v[0] ^ v[1]; + } + + if ((end - ptr) >= 16) + { + uint64_t v0, v1; + v0 = hash + (cread_u64(ptr) * k0); ptr += 8; v0 = crotate_right(v0,33) * k1; + v1 = hash + (cread_u64(ptr) * k1); ptr += 8; v1 = crotate_right(v1,33) * k2; + v0 ^= crotate_right(v0 * k0, 35) + v1; + v1 ^= crotate_right(v1 * k3, 35) + v0; + hash += v1; + } + + if ((end - ptr) >= 8) + { + hash 
+= cread_u64(ptr) * k3; ptr += 8; + hash ^= crotate_right(hash, 33) * k1; + + } + + if ((end - ptr) >= 4) + { + hash += cread_u32(ptr) * k3; ptr += 4; + hash ^= crotate_right(hash, 15) * k1; + } + + if ((end - ptr) >= 2) + { + hash += cread_u16(ptr) * k3; ptr += 2; + hash ^= crotate_right(hash, 13) * k1; + } + + if ((end - ptr) >= 1) + { + hash += cread_u8 (ptr) * k3; + hash ^= crotate_right(hash, 25) * k1; + } + + hash ^= crotate_right(hash, 33); + hash *= k0; + hash ^= crotate_right(hash, 33); + + memcpy(out, &hash, 8); +} + + +void cmetrohash64_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out) +{ + static const uint64_t k0 = 0xD6D018F5; + static const uint64_t k1 = 0xA2AA033B; + static const uint64_t k2 = 0x62992FC1; + static const uint64_t k3 = 0x30BC5B29; + + const uint8_t * ptr = key; + const uint8_t * const end = ptr + len; + + uint64_t hash = ((((uint64_t) seed) + k2) * k0) + len; + + if (len >= 32) + { + uint64_t v[4]; + v[0] = hash; + v[1] = hash; + v[2] = hash; + v[3] = hash; + + do + { + v[0] += cread_u64(ptr) * k0; ptr += 8; v[0] = crotate_right(v[0],29) + v[2]; + v[1] += cread_u64(ptr) * k1; ptr += 8; v[1] = crotate_right(v[1],29) + v[3]; + v[2] += cread_u64(ptr) * k2; ptr += 8; v[2] = crotate_right(v[2],29) + v[0]; + v[3] += cread_u64(ptr) * k3; ptr += 8; v[3] = crotate_right(v[3],29) + v[1]; + } + while (ptr <= (end - 32)); + + v[2] ^= crotate_right(((v[0] + v[3]) * k0) + v[1], 30) * k1; + v[3] ^= crotate_right(((v[1] + v[2]) * k1) + v[0], 30) * k0; + v[0] ^= crotate_right(((v[0] + v[2]) * k0) + v[3], 30) * k1; + v[1] ^= crotate_right(((v[1] + v[3]) * k1) + v[2], 30) * k0; + hash += v[0] ^ v[1]; + } + + if ((end - ptr) >= 16) + { + uint64_t v0, v1; + v0 = hash + (cread_u64(ptr) * k2); ptr += 8; v0 = crotate_right(v0,29) * k3; + v1 = hash + (cread_u64(ptr) * k2); ptr += 8; v1 = crotate_right(v1,29) * k3; + v0 ^= crotate_right(v0 * k0, 34) + v1; + v1 ^= crotate_right(v1 * k3, 34) + v0; + hash += v1; + } + + if ((end - ptr) >= 8) + { + 
hash += cread_u64(ptr) * k3; ptr += 8; + hash ^= crotate_right(hash, 36) * k1; + } + + if ((end - ptr) >= 4) + { + hash += cread_u32(ptr) * k3; ptr += 4; + hash ^= crotate_right(hash, 15) * k1; + } + + if ((end - ptr) >= 2) + { + hash += cread_u16(ptr) * k3; ptr += 2; + hash ^= crotate_right(hash, 15) * k1; + } + + if ((end - ptr) >= 1) + { + hash += cread_u8 (ptr) * k3; + hash ^= crotate_right(hash, 23) * k1; + } + + hash ^= crotate_right(hash, 28); + hash *= k0; + hash ^= crotate_right(hash, 29); + + memcpy(out, &hash, 8); +} + + diff --git a/external/hash/hash.h b/external/hash/hash.h new file mode 100644 index 0000000..c5a6fc6 --- /dev/null +++ b/external/hash/hash.h @@ -0,0 +1,115 @@ +#ifndef HASH_H +#define HASH_H + +/* Misc. hash functions that do not comply to a specific interface. */ + +#include <stdlib.h> + +#ifdef _MSC_VER +/* `inline` only advisory anyway. */ +#pragma warning(disable: 4710) /* function not inlined */ +#endif + +static inline uint32_t hash_fnv1a32_update(uint32_t seed, uint8_t *buf, size_t len) +{ + uint8_t *p = buf; +#ifndef FNV1A_NOMUL + const uint64_t prime = UINT32_C(0x1000193); +#endif + uint64_t hash = seed; + + while (len--) { + hash ^= (uint64_t)*p++; +#ifndef FNV1A_NOMUL + hash *= prime; +#else + hash += (hash << 1) + (hash << 4) + (hash << 7) + + (hash << 8) + (hash << 24); +#endif + } + return hash; +} + +static inline uint32_t hash_fnv1a32(uint8_t *buf, size_t len) +{ + return hash_fnv1a32_update(UINT32_C(0x811c9dc5), buf, len); +} + +static inline uint64_t hash_fnv1a64_update(uint64_t v, uint8_t *buf, size_t len) +{ + uint8_t *p = buf; +#ifndef FNV1A_NOMUL + const uint64_t prime = UINT64_C(0x100000001b3); +#endif + uint64_t hash = v; + + while (len--) { + hash ^= (uint64_t)*p++; +#ifndef FNV1A_NOMUL + hash *= prime; +#else + hash += (hash << 1) + (hash << 4) + (hash << 5) + + (hash << 7) + (hash << 8) + (hash << 40); +#endif + } + return hash; +} + +static inline uint64_t hash_fnv1a64(uint8_t *buf, size_t len) +{ + return 
hash_fnv1a64_update(UINT64_C(0xcbf29ce484222325), buf, len); +} + +/* + * MurmurHash 3 final mix with seed to handle 0. + * + * Width is number of bits of the value to return. + * http://stackoverflow.com/a/12996028 + */ +static inline uint32_t hash_bucket32(uint32_t v, size_t width) +{ + uint32_t x = v + UINT32_C(0x2f693b52); + + x = ((x >> 16) ^ x) * UINT32_C(0x45d9f3b); + x = ((x >> 16) ^ x) * UINT32_C(0x45d9f3b); + x = ((x >> 16) ^ x); + return x >> (32 - width); +} + +/* + * SplitMix64 - can be used to disperse fnv1a hash, to hash + * an integer, or as a simple non-cryptographic prng. + * + * Width is number of bits of the value to return. + * http://stackoverflow.com/a/12996028 + */ +static inline uint64_t hash_bucket64(uint64_t v, size_t width) +{ + uint64_t x = v + UINT64_C(0x9e3779b97f4a7c15); + + x = (x ^ (x >> 30)) * UINT64_C(0xbf58476d1ce4e5b9); + x = (x ^ (x >> 27)) * UINT64_C(0x94d049bb133111eb); + x = x ^ (x >> 31); + return x >> (64 - width); +} + +static inline uint64_t hash_random64(uint64_t *state) +{ + uint64_t x; + + x = hash_bucket64(*state, 64); + *state = x; + return x; +} + +/* + * Faster, less random hash bucket compared to hash_bucket32, but works + * for smaller integers. + */ +static inline uint32_t hash_mult32(uint32_t v, size_t width) +{ + /* Knuth's multiplicative hash. */ + return (v * UINT32_C(2654435761)) >> (32 - width); +} + +#endif /* HASH_H */ diff --git a/external/hash/hash_table.h b/external/hash/hash_table.h new file mode 100644 index 0000000..5c3e9cd --- /dev/null +++ b/external/hash/hash_table.h @@ -0,0 +1,266 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015 Mikkel F. 
Jørgensen, dvide.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef HASH_TABLE_H +#define HASH_TABLE_H + +#include "ht_portable.h" +#include <stddef.h> + +/* + * Define HT_PRIVATE to make all name wrapping interface functions static + * inline when including hash implementation directly in user code. This + * can increase performance significantly (3x) on small hash tables with + * fast hash functions because the compiler can better optimize static + * functions. Some compiler optimizations will get the same speed + * with external linkage (clang 4.2 -O4 but not -O3). + * + * Can also be used to simple hide the interface from global + * linkage to avoid name clashes. + */ +#ifndef HT_PRIVATE +#define HT_PRIV +#else +#define HT_PRIV static inline +#endif + +/* + * Generic hash table type. 
This makes it possible to use hash tables + * in datastructures and header files that do not have access to + * the specific hash table implementation. Call to init is optional + * if the structure is zeroed. + * + * Offsets are only used with Robin Hood hashing to segment each chain. + * + * Keys and values are both stored in the same item pointer. There are + * downsides to this over a key / value represention, but since we also + * use less space we can afford lower the load factor and we can have a + * more complex key representations. The smaller bucket size also helps + * when ordering Robin Hood hash chains. + */ +typedef struct hash_table hash_table_t; +struct hash_table { + void *table; + char *offsets; + size_t count; + /* May be stored as a direct count, or log2. */ + size_t buckets; +}; + +enum hash_table_insert_mode { + ht_replace = 0, + ht_keep = 1, + ht_unique = 2, + ht_multi = 3, +}; + +/* + * This macro defines the prototypes of the hash table that user code + * needs for linkage. + * + * See also "hash_table_def.h" which builds wrapper functions to a + * generic hash table implementation so each specialization gets its own + * set of named functions. + * + * The HT_ITEM is normally a pointer to and the hash table does not + * store any signficant information internally. Customizations map + * the item value to a key. Certain values can be reserved, for + * example 0 indicates missing value, and sometimes 1 is reserved for + * internal tombstones and 2 may be used to return allocation failure. + */ +#define DECLARE_HASH_TABLE(HT_NAME, HT_ITEM) \ + \ +typedef hash_table_t HT_NAME##_t; \ +typedef HT_ITEM HT_NAME##_item_t; \ + \ +/* Prototype for user supplied callback when visiting all elements. 
*/ \ +typedef void HT_NAME##_visitor_f(void *context, HT_ITEM item); \ + \ +extern const HT_NAME##_item_t HT_NAME##_missing; \ +extern const HT_NAME##_item_t HT_NAME##_nomem; \ +extern const HT_NAME##_item_t HT_NAME##_deleted; \ + \ +static inline int HT_NAME##_is_valid(HT_ITEM item) \ +{ \ + return \ + item != HT_NAME##_missing && \ + item != HT_NAME##_nomem && \ + item != HT_NAME##_deleted; \ +} \ + \ +static inline int HT_NAME##_is_missing(HT_ITEM item) \ +{ \ + return item == HT_NAME##_missing; \ +} \ + \ +static inline int HT_NAME##_is_nomem(HT_ITEM item) \ +{ \ + return item == HT_NAME##_nomem; \ +} \ + \ +static inline int HT_NAME##_is_deleted(HT_ITEM item) \ +{ \ + return item == HT_NAME##_deleted; \ +} \ + \ +/* \ + * Allocates enough buckets to represent count elements without resizing. \ + * The actual number of allocated buckets depends on the load factor \ + * given as a macro argument in the implementation. The bucket number \ + * rounds up to the nearest power of 2. \ + * \ + * `ht` should not be initialized beforehand, otherwise use resize. \ + * Alternatively, it is also valid to zero initialize the table by \ + * other means - this will postpone allocation until needed. \ + * \ + * The load factor (template argument) should be positive and at most \ + * 100%, otherwise insertion and resize cannot succeed. The recommended \ + * load factor is between 25% and 75%. \ + * \ + * Returns 0 on success, -1 on allocation failure or invalid load factor. \ + */ \ +HT_PRIV int HT_NAME##_init(HT_NAME##_t *ht, size_t count); \ + \ +/* \ + * Clears the allocated memory. Optionally takes a destructor \ + * that will visit all items. \ + * The table struct may be reused after being destroyed. \ + * May also be called on a zero initialised hash table. \ + * \ + * Can be called in place of clear for more control. 
\ + */ \ +HT_PRIV void HT_NAME##_destroy(HT_NAME##_t *ht, \ + HT_NAME##_visitor_f *destructor, void *context); \ + \ +/* \ + * Clears the allocated memory, but does manage memory or state of any \ + * stored items. It is a simpler version of destroy. \ + */ \ +HT_PRIV void HT_NAME##_clear(HT_NAME##_t *ht); \ + \ +/* \ + * Resizes the hash table to hold at least `count` elements. \ + * The actual number of allocated buckets is a strictly larger power of \ + * two. If `count` is smaller than the current number of elements, \ + * that number is used instead of count. Thus, resize(ht, 0) may be \ + * used to reduce the table size after a spike. \ + * The function is called automatically as elements are inserted, \ + * but shrinking the table should be done manually. \ + * \ + * If resizing to same size, table is still reallocated but will then \ + * clean up old tombstones from excessive deletion. \ + * \ + * Returns 0 on success, -1 on allocation failure. \ + */ \ +HT_PRIV int HT_NAME##_resize(HT_NAME##_t *ht, size_t count); \ + \ +/* \ + * Inserts an item pointer in one of the following modes: \ + * \ + * ht_keep: \ + * If the key exists, the stored item is kept and returned, \ + * otherwise it is inserted and null is returned. \ + * \ + * ht_replace: \ + * If the key exists, the stored item is replaced and the old \ + * item is returned, otherwise the item is inserted and null \ + * is returned. \ + * \ + * ht_unique: \ + * Inserts an item without checking if a key exists. Always return \ + * null. This is faster when it is known that the key does not exists. \ + * \ + * ht_multi: \ + * Same as ht_unique but with the intention that a duplicate key \ + * might exist. This should not be abused because not all hash table \ + * implementions work well with too many collissions. Robin Hood \ + * hashing might reallocate aggressively to keep the chain length \ + * down. Linear and Quadratic probing do handle this, albeit slow. 
\ + * \ + * The inserted item cannot have the value HT_MISSING and depending on \ + * implementation also not HT_DELETED and HT_NOMEM, but the \ + * definitions are type specific. \ + */ \ +HT_PRIV HT_ITEM HT_NAME##_insert(HT_NAME##_t *ht, \ + const void *key, size_t len, HT_ITEM item, int mode); \ + \ +/* Similar to insert, but derives key from item. */ \ +HT_PRIV HT_ITEM HT_NAME##_insert_item(HT_NAME##_t *ht, \ + HT_ITEM item, int mode); \ + \ +/* \ + * Finds the first matching item if any, or returns null. \ + * If there are duplicate keys, the first inserted is returned. \ + */ \ +HT_PRIV HT_ITEM HT_NAME##_find(HT_NAME##_t *ht, \ + const void *key, size_t len); \ + \ +/* \ + * Removes first inserted item that match the given key, if any. \ + * Returns the removed item if any, otherwise null. \ + */ \ +HT_PRIV HT_ITEM HT_NAME##_remove(HT_NAME##_t *ht, \ + const void *key, size_t len); \ + \ +/* \ + * Finds an item that compares the same as the given item but it is \ + * not necessarily the same item if it either isn't stored, or if \ + * there are duplicates in the table. \ + */ \ +HT_PRIV HT_ITEM HT_NAME##_find_item(HT_NAME##_t *ht, HT_ITEM item); \ + \ +/* \ + * This removes the first item that matches the given item, not \ + * necessarily the item itself, and the item need not be present \ + * in the table. Even if the item is in fact removed, it may still \ + * be present if stored multiple times through abuse use of the \ + * insert_unique function. \ + */ \ +HT_PRIV HT_ITEM HT_NAME##_remove_item(HT_NAME##_t *ht, HT_ITEM item); \ + \ +/* \ + * Calls a function for every item in the hash table. This may be \ + * used for destructing items, provided the table is not accessed \ + * subsequently. In fact, the hash_table_clear function takes an \ + * optional visitor that does exactly that. 
\ + * \ + * The function is linear of the allocated hash table size, so will be \ + * inefficient if the hash table was resized much larger than the number \ + * of stored items. In that case it is better to store links in the \ + * items. For the default resizing, the function is reasonably fast \ + * because for cache reasons it is very fast to exclude empty elements. \ + */ \ +HT_PRIV void HT_NAME##_visit(HT_NAME##_t *ht, \ + HT_NAME##_visitor_f *visitor, void *context); \ + \ +/* \ + * Returns number of elements in the table. (Not necessarily the number of \ + * unique keys. \ + */ \ +static inline size_t HT_NAME##_count(HT_NAME##_t *ht) \ +{ \ + return ht->count; \ +} \ + +#endif /* HASH_TABLE_H */ diff --git a/external/hash/hash_table_def.h b/external/hash/hash_table_def.h new file mode 100644 index 0000000..5362d47 --- /dev/null +++ b/external/hash/hash_table_def.h @@ -0,0 +1,154 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015 Mikkel F. Jørgensen, dvide.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef HASH_TABLE_DEF_H +#define HASH_TABLE_DEF_H + +#include "ht_hash_function.h" +#ifndef HT_HASH_FUNCTION +/* + * If the default hash function is used, make sure to link with the + * appropriate hash implementation file. + */ +#define HT_HASH_FUNCTION ht_default_hash_function +#endif + +#ifndef HT_LOAD_FACTOR +#define HT_LOAD_FACTOR 0.7 +#endif + +#define HT_LOAD_FACTOR_FRAC ((size_t)((float)(HT_LOAD_FACTOR)*256)) + +#ifndef HT_PANIC +#include <stdio.h> +#define HT_PANIC(s) { fprintf(stderr, "aborting on panic: %s\n", s); exit(1); } +#endif + +#ifndef HT_MISSING +#define HT_MISSING ((ht_item_t)0) +#endif + +#ifndef HT_NOMEM +#define HT_NOMEM ((ht_item_t)1) +#endif + +#ifndef HT_DELETED +#define HT_DELETED ((ht_item_t)2) +#endif + +#define DEFINE_HASH_TABLE(HT_NAME) \ + \ +typedef HT_NAME##_item_t ht_item_t; \ +typedef HT_NAME##_visitor_f ht_visitor_f; \ + \ +/* User supplied. */ \ +static inline int ht_match(const void *key, size_t len, ht_item_t item); \ +static inline const void *ht_key(ht_item_t item); \ +static inline size_t ht_key_len(ht_item_t item); \ + \ +/* Implementation supplied. 
*/ \ +static ht_item_t ht_insert(hash_table_t *ht, \ + const void *key, size_t len, ht_item_t new_item, int mode); \ +static ht_item_t ht_find(hash_table_t *ht, const void *key, size_t len); \ +static ht_item_t ht_remove(hash_table_t *ht, const void *key, size_t len); \ +static int ht_init(hash_table_t *ht, size_t count); \ +static int ht_resize(hash_table_t *ht, size_t count); \ +static void ht_clear(hash_table_t *ht); \ +static void ht_visit(hash_table_t *ht, \ + ht_visitor_f *visitor, void *context); \ + \ +const ht_item_t HT_NAME##_missing = HT_MISSING; \ +const ht_item_t HT_NAME##_nomem = HT_NOMEM; \ +const ht_item_t HT_NAME##_deleted = HT_DELETED; \ + \ +HT_PRIV void HT_NAME##_clear(HT_NAME##_t *ht) \ +{ \ + ht_clear(ht); \ +} \ + \ +HT_PRIV void HT_NAME##_destroy(HT_NAME##_t *ht, \ + HT_NAME##_visitor_f *destructor, void *context) \ +{ \ + if (destructor) { \ + ht_visit(ht, destructor, context); \ + } \ + ht_clear(ht); \ +} \ + \ +HT_PRIV int HT_NAME##_init(HT_NAME##_t *ht, size_t count) \ +{ \ + return ht_init(ht, count); \ +} \ + \ +HT_PRIV int HT_NAME##_resize(HT_NAME##_t *ht, size_t count) \ +{ \ + return ht_resize(ht, count); \ +} \ + \ +HT_PRIV ht_item_t HT_NAME##_insert(HT_NAME##_t *ht, \ + const void *key, size_t len, ht_item_t new_item, int mode) \ +{ \ + return ht_insert(ht, key, len, new_item, mode); \ +} \ + \ +HT_PRIV ht_item_t HT_NAME##_insert_item(HT_NAME##_t *ht, \ + ht_item_t item, int mode) \ +{ \ + return ht_insert(ht, \ + ht_key(item), \ + ht_key_len(item), \ + item, mode); \ +} \ + \ +HT_PRIV ht_item_t HT_NAME##_find(HT_NAME##_t *ht, \ + const void *key, size_t len) \ +{ \ + return ht_find(ht, key, len); \ +} \ + \ +HT_PRIV ht_item_t HT_NAME##_find_item(HT_NAME##_t *ht, ht_item_t item) \ +{ \ + return ht_find(ht, \ + ht_key(item), \ + ht_key_len(item)); \ +} \ + \ +HT_PRIV ht_item_t HT_NAME##_remove(HT_NAME##_t *ht, \ + const void *key, size_t len) \ +{ \ + return ht_remove(ht, key, len); \ +} \ + \ +HT_PRIV ht_item_t 
HT_NAME##_remove_item(HT_NAME##_t *ht, ht_item_t item) \ +{ \ + return ht_remove(ht, ht_key(item), ht_key_len(item)); \ +} \ + \ +HT_PRIV void HT_NAME##_visit(HT_NAME##_t *ht, \ + HT_NAME##_visitor_f *visitor, void *context) \ +{ \ + ht_visit(ht, visitor, context); \ +} \ + +#endif /* HASH_TABLE_DEF_H */ diff --git a/external/hash/hash_table_impl.h b/external/hash/hash_table_impl.h new file mode 100644 index 0000000..94fc9b8 --- /dev/null +++ b/external/hash/hash_table_impl.h @@ -0,0 +1,233 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015 Mikkel F. Jørgensen, dvide.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +/* + * This file implements a generic hash interface such that different + * instances have the same name, but hidden from each other. + * The interface maps the local names to a public specific type. + * + * This implementations implements a hash table with linear or quadratic + * probing. 
+ */ + +#ifdef HASH_TABLE_IMPL +#error "cannot have multiple implementations in same compilation unit" +#endif +#define HASH_TABLE_IMPL +/* Open Addressing */ +#define HT_OA + +#if defined(_MSC_VER) +#pragma warning(disable: 4127) /* conditional expression is constant */ +#endif + +#include <stdlib.h> +#include <assert.h> + +#ifndef HT_PROBE +#ifdef HT_PROBE_QUADRATIC +#define HT_PROBE(k, i, N) ((k + (i + i * i) / 2) & N) +#else +#define HT_PROBE(k, i, N) ((k + i) & N) +#endif +#endif + +static int ht_init(hash_table_t *ht, size_t count) +{ + size_t buckets = 4; + + if ((HT_LOAD_FACTOR_FRAC) > 256 || (HT_LOAD_FACTOR_FRAC) < 1) { + /* + * 100% is bad but still the users choice. + * 101% will never terminate insertion. + */ + HT_PANIC("hash table failed with impossible load factor"); + return -1; + } + while (count > buckets * (HT_LOAD_FACTOR_FRAC) / 256) { + buckets *= 2; + } + ht->table = calloc(buckets, sizeof(ht_item_t)); + if (ht->table == 0) { + return -1; + } + ht->offsets = 0; + ht->buckets = buckets; + ht->count = 0; + return 0; +} + +static int ht_resize(hash_table_t *ht, size_t count) +{ + size_t i; + hash_table_t ht2; + ht_item_t *T = ht->table; + void *item; + + if (count < ht->count) { + count = ht->count; + } + if (ht_init(&ht2, count)) { + return -1; + } + for (i = 0; i < ht->buckets; ++i) { + item = T[i]; + if ((item && item != HT_DELETED)) { + ht_insert(&ht2, ht_key(item), ht_key_len(item), item, ht_multi); + } + } + ht_clear(ht); + memcpy(ht, &ht2, sizeof(*ht)); + return 0; +} + +static ht_item_t ht_insert(hash_table_t *ht, + const void *key, size_t len, ht_item_t new_item, int mode) +{ + ht_item_t *T; + size_t N, i, j, k; + ht_item_t item, *vacant = 0; + + assert(new_item != HT_MISSING); + assert(new_item != HT_DELETED); + assert(new_item != HT_NOMEM); + + if (ht->count >= ht->buckets * (HT_LOAD_FACTOR_FRAC) / 256) { + if (ht_resize(ht, ht->count * 2)) { + HT_PANIC("hash table failed to allocate memory during resize"); + return HT_NOMEM; + } + } + 
T = ht->table; + N = ht->buckets - 1; + k = HT_HASH_FUNCTION(key, len); + i = 0; + j = HT_PROBE(k, i, N); + if (mode == ht_unique || mode == ht_multi) { + ++ht->count; + while (T[j] && T[j] != HT_DELETED) { + ++i; + j = HT_PROBE(k, i, N); + } + T[j] = new_item; + return 0; + } + while ((item = T[j])) { + if (item == HT_DELETED) { + if (vacant == 0) { + /* + * If a tombstone was found, use the first available, + * but continue search for possible match. + */ + vacant = &T[j]; + } + } else if (ht_match(key, len, item)) { + if (mode == ht_replace) { + T[j] = new_item; + } + return item; + } + ++i; + j = HT_PROBE(k, i, N); + } + if (vacant == 0) { + vacant = &T[j]; + } + ++ht->count; + *vacant = new_item; + return 0; +} + +static ht_item_t ht_find(hash_table_t *ht, const void *key, size_t len) +{ + ht_item_t *T = ht->table; + size_t N, i, j, k; + ht_item_t item; + + if (T == 0) { + return 0; + } + N = ht->buckets - 1; + k = HT_HASH_FUNCTION(key, len); + i = 0; + j = HT_PROBE(k, i, N); + while ((item = T[j])) { + if ((item != HT_DELETED) && + ht_match(key, len, item)) { + return item; + } + ++i; + j = HT_PROBE(k, i, N); + } + return 0; +} + +static ht_item_t ht_remove(hash_table_t *ht, const void *key, size_t len) +{ + ht_item_t *T = ht->table; + size_t N, i, j, k; + ht_item_t item; + + if (T == 0) { + return 0; + } + N = ht->buckets - 1; + k = HT_HASH_FUNCTION(key, len); + i = 0; + j = HT_PROBE(k, i, N); + while ((item = T[j])) { + if (item != HT_DELETED && + ht_match(key, len, item)) { + T[j] = HT_DELETED; + --ht->count; + return item; + } + ++i; + j = HT_PROBE(k, i, N); + } + return 0; +} + +static void ht_visit(hash_table_t *ht, ht_visitor_f *visitor, void *context) +{ + size_t i; + ht_item_t *T = ht->table; + ht_item_t item; + + for (i = 0; i < ht->buckets; ++i) { + item = T[i]; + if (item && item != HT_DELETED) { + visitor(context, item); + } + } +} + +static void ht_clear(hash_table_t *ht) +{ + if (ht->table) { + free(ht->table); + } + memset(ht, 0, sizeof(*ht)); 
+} diff --git a/external/hash/hash_table_impl_rh.h b/external/hash/hash_table_impl_rh.h new file mode 100644 index 0000000..b4cabae --- /dev/null +++ b/external/hash/hash_table_impl_rh.h @@ -0,0 +1,360 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015 Mikkel F. Jørgensen, dvide.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* We use the same define for all implementations */ +#ifdef HASH_TABLE_IMPL +#error "cannot have multiple implementations in same compilation unit" +#endif +#define HASH_TABLE_IMPL +/* Robin Hood (with offset table) */ +#define HT_RH + +#if defined(_MSC_VER) +#pragma warning(disable: 4127) /* conditional expression is constant */ +#endif + +#include <stdlib.h> +#include <assert.h> + +/* + * A variation of Robin Hashing: + * We do not calcute distance from buckets, nor do we cache + * hash keys. Instead we maintain a 7-bit offset that points + * to where the first entry of a bucket is stored. 
In Robin Hood hashing + * all entries conceptually chained to the same bucket are stored + * immediately after each other in order of insertion. The offset of + * the next bucket is naturally the end of the previous bucket, off by + * one. This breaks down when the bucket offset is 0 and the bucket is + * empty because it suggests there is an element. We cannot distinguish + * between a single used and unused entry, except by looking at the + * content or otherwise tagging the information on. This is not a problem, + * just a special case to deal with. + * + * The offsets are stored separately which might lead to more cache line + * traffic, but the alternative is not very elegant - either wasting + * space or trying to pack offsets on a per cache line basis. We only + * need 8 bits for offsets. If the offset overflows, bit 7 will be set + * which we can easily detect. During insertion, offsets are incremented + * on all affected buckets, and likewise decremented on remove. In + * principle we can use bit parallel increments to update most offsets + * in a single operation, but it is hardly worthwhile due to setup + * cost. The approach bears some resemblance to hopscotch hashing which + * uses local offsets for chaining, but we prefer the simpler Robin + * Hood approach. + * + * If the offset overflows, the table is resized. We expect the packed + * chains to behave like a special case of a hopscotch layout and + * consequently have the same bounds, meaning we are unlikely to have + * either long offsets or long chains if we resize below very full + * so resizing on an offset of 128 should be ok. + * + * Our main motivation for this hashing is actually to get rid of + * tombstones in quadratic and linear probing. Avoiding tombstones + * is much simpler when sorting chains Robin Hood style, and we avoid + * checking for tombstones.
We lose this benefit by having to inspect + * offsets, but then also avoid checking keys before the chain, and + * after because we can zero in on exactly the entries belonging to + * the bucket. + * + * Unlike traditional Robin Hood, we can find a missing key very quickly + * without any heuristics: we only need to inspect exactly the number + * of entries in the bucket (or at most 1 if the bucket is empty). + * + * Find operations start exactly at an entry with a matching hash key + * unlike normal Robin Hood which must scan past a few earlier entries + * on average, or guesstimate where to start and seek both ways. + * + * We can also very quickly insert a key that is known to be unique + * because we can add it directly to the end (but possibly requiring + * a shift of later entries Robin Hood style). + * + * Whether these benefits outweigh the cost of a separate offset + * lookup is unclear, but the reduced memory consumption certainly + * allows for a lower load factor, which also helps a lot. + * + * Traditional Robin Hood Hashing actually permits a chain to become + * very long. We do not permit this, in line with hopscotch hashing. + * This is a drawback from a security perspective because worst case + * this can trigger resizing ad infinitum iff the hash function can + * be hacked or massive duplicate key insertion can be triggered. By + * using the provided hash functions and seeding them randomly at + * startup, and avoiding the multi key feature, it is very unlikely to + * be a problem with what is known about hash table attacks so far. + * + * Values and keys are not stored, only item pointers. Custom macros + * or inline functions provide access to key data from the item. We + * could add a separate value array and treat the item strictly as a + * key, but we can have a smaller load factor instead, and can more + * easily avoid copying complex key structures, such as start end + * pointers to token data for a parser.
+ * + * A typical hash table has: key pointer or key value, value pointer + * or value, a cached hash key or bitmap (for Robin Hood or Hopscotch) + * which on 64 bit platforms easily amounts to 20 bytes or more per + * bucket. We use 9 bytes on 64 bit platforms and 5 bytes on 32 bit. + * This gets us down to a max load of 0.5 and on average about 0.37. + * This should make it very likely that the first bucket inspected is + * a direct hit negating the benefit of caching hash keys. In addition, + * when it is not a direct hit, we get pointers loaded in a cache line + * to inspect, all known to have the same hash key. + */ + +int ht_init(hash_table_t *ht, size_t count) +{ + size_t buckets = 4; + + if ((HT_LOAD_FACTOR_FRAC) > 256 || (HT_LOAD_FACTOR_FRAC) < 1) { + /* + * 101% will never terminate insertion. + * 0% will never terminate resize. + */ + HT_PANIC("robin hood hash table failed with impossible load factor"); + return -1; + } + while (count > buckets * (HT_LOAD_FACTOR_FRAC) / 256) { + buckets *= 2; + } + ht->table = calloc(buckets, sizeof(ht_item_t)); + if (ht->table == 0) { + return -1; + } + ht->offsets = calloc(buckets, sizeof(char)); + if (ht->offsets == 0) { + free(ht->table); + ht->table = 0; + return -1; + } + ht->buckets = buckets; + ht->count = 0; + return 0; +} + +int ht_resize(hash_table_t *ht, size_t count) +{ + size_t i; + hash_table_t ht2; + ht_item_t *T = ht->table; + ht_item_t item; + + if (count < ht->count) { + count = ht->count; + } + if (ht_init(&ht2, count)) { + return -1; + } + for (i = 0; i < ht->buckets; ++i) { + item = T[i]; + if (item > (ht_item_t)1) { + ht_insert(&ht2, ht_key(item), ht_key_len(item), item, ht_multi); + } + } + ht_clear(ht); + memcpy(ht, &ht2, sizeof(*ht)); + return 0; +} + +ht_item_t ht_insert(hash_table_t *ht, + const void *key, size_t len, ht_item_t item, int mode) +{ + ht_item_t *T; + size_t N, n, j, k, offset; + ht_item_t new_item; + char overflow = 0; + + new_item = item; + if (ht->count >= ht->buckets * 
(HT_LOAD_FACTOR_FRAC) / 256) { + if (ht_resize(ht, ht->count * 2)) { + HT_PANIC("robin hood hash table failed to allocate memory during resize"); + return HT_NOMEM; + } + } + T = ht->table; + N = ht->buckets - 1; + k = HT_HASH_FUNCTION(key, len) & N; + offset = ht->offsets[k]; + j = (k + offset) & N; + /* + * T[j] == 0 is a special case because we cannot count + * zero probe length, and because we should not increment + * the offset at insertion point in this case. + * + * T[j] == 0 implies offset == 0, but this way we avoid + * hitting memory that we don't need. + */ + if (offset == 0 && T[j] == 0) { + ++ht->count; + T[j] = new_item; + return 0; + } + n = ht->offsets[(k + 1) & N] - offset + 1; + if (mode == ht_multi) { + /* Don't search for match before inserting. */ + j = (j + n) & N; + n = 0; + } + while (n--) { + item = T[j]; + if (ht_match(key, len, item)) { + if (mode == ht_replace) { + T[j] = new_item; + } + return item; + } + j = (j + 1) & N; + } + ++ht->count; + while (k != j) { + /* Only increment buckets after own bucket. */ + k = (k + 1) & N; + overflow |= ++ht->offsets[k]; + } + while ((item = T[j])) { + T[j] = new_item; + new_item = item; + j = (j + 1) & N; + overflow |= ++ht->offsets[j]; + } + T[j] = new_item; + + if (overflow < 0) { + /* + * At least one offset overflowed, so we need to + * resize the table. + */ + if (ht->count * 10 < ht->buckets) { + HT_PANIC("FATAL: hash table resize on low utilization would explode\n"\ + " possible collision DoS or bad hash function"); + return HT_NOMEM; + } + if (ht_resize(ht, ht->count * 2)) { + HT_PANIC("FATAL: hash table resize failed and left hash table inconsistent");\ + /* + * This renders the hash table in a bad state + * because we have updated to an inconsistent + * state. 
+ */ + return HT_NOMEM; + } + } + return item; +} + +ht_item_t ht_find(hash_table_t *ht, const void *key, size_t len) +{ + ht_item_t *T = ht->table; + size_t N, n, j, k, offset; + ht_item_t item; + + if (T == 0) { + return 0; + } + N = ht->buckets - 1; + k = HT_HASH_FUNCTION(key, len) & N; + offset = ht->offsets[k]; + j = (k + offset) & N; + if (offset == 0 && T[j] == 0) { + /* Special case because we cannot count zero probe length. */ + return 0; + } + n = ht->offsets[(k + 1) & N] - offset + 1; + while (n--) { + item = T[j]; + if (ht_match(key, len, item)) { + return item; + } + j = (j + 1) & N; + } + return 0; +} + +ht_item_t ht_remove(hash_table_t *ht, const void *key, size_t len) +{ + ht_item_t *T = ht->table; + size_t N, n, j, k, offset; + ht_item_t item, *next_item; + + if (T == 0) { + return 0; + } + N = ht->buckets - 1; + k = HT_HASH_FUNCTION(key, len) & N; + offset = ht->offsets[k]; + j = (k + offset) & N; + if (offset == 0 && T[j] == 0) { + return 0; + } + n = ht->offsets[(k + 1) & N] - offset + 1; + while (n) { + item = T[j]; + if (ht_match(key, len, item)) { + break; + } + j = (j + 1) & N; + --n; + } + if (n == 0) { + return 0; + } + --ht->count; + while (k != j) { + /* Do not update the offset of the bucket that we own. 
*/ + k = (k + 1) & N; + --ht->offsets[k]; + } + for (;;) { + j = (j + 1) & N; + if (ht->offsets[j] == 0) { + T[k] = 0; + return item; + } + --ht->offsets[j]; + T[k] = T[j]; + k = j; + } +} + +void ht_visit(hash_table_t *ht, ht_visitor_f *visitor, void *context) +{ + size_t i; + ht_item_t *T = ht->table; + ht_item_t item; + + for (i = 0; i < ht->buckets; ++i) { + item = T[i]; + if (item > (ht_item_t)1) { + visitor(context, item); + } + } +} + +void ht_clear(hash_table_t *ht) +{ + if (ht->table) { + free(ht->table); + } + if (ht->offsets) { + free(ht->offsets); + } + memset(ht, 0, sizeof(*ht)); +} diff --git a/external/hash/hash_test.c b/external/hash/hash_test.c new file mode 100644 index 0000000..d54cc07 --- /dev/null +++ b/external/hash/hash_test.c @@ -0,0 +1,419 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015 Mikkel F. Jørgensen, dvide.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
/*
 * Reports the failing source location and terminates the test program
 * when `x` evaluates to false.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement; this keeps `if (c) test_assert(x); else ...`
 * well-formed and avoids capturing a following `else`.
 */
#define test_assert(x)                                                  \
    do {                                                                \
        if (!(x)) {                                                     \
            printf("Test failed at %s:%d\n", __FILE__, __LINE__);       \
            assert(0);                                                  \
            exit(1);                                                    \
        }                                                               \
    } while (0)
s = str_set_find(&S, "foobar", 6); + test_assert(s == s1); + s = str_set_find(&S, "foobarbaz", 9); + test_assert(s == 0); + str_set_destroy(&S, free_key, 0); + s = str_set_find(&S, "foobar", 6); + test_assert(s == 0); + for (i = 0; i < n; ++i) { + s = keys[i]; + s = str_set_find(&S, s, strlen(s)); + test_assert(s == 0); + } +} + +void test_str_set2() +{ + int i; + char *s, *s1; + unsigned int n = sizeof(keys)/sizeof(keys[0]); + + for (i = 0; i < n; ++i) { + s = keys[i]; + str_set_insert(&S, s, strlen(s), s, ht_unique); + } + test_assert(str_set_count(&S) == n); + for (i = 0; i < n; ++i) { + s = keys[i]; + /* + * Unique and multi are the same logically, but different + * intentionally. + */ + str_set_insert(&S, s, strlen(s), s, ht_multi); + } + test_assert(str_set_count(&S) == 2 * n); + ht_trace_buckets(&S, "after double insert", 0, 8); + for (i = 0; i < n; ++i) { + s = keys[i]; + s1 = str_set_find(&S, s, strlen(s)); + test_assert(strcmp(s, s1) == 0); + } + for (i = 0; i < n; ++i) { + s = keys[i]; + s1 = str_set_remove(&S, s, strlen(s)); + test_assert(strcmp(s, s1) == 0); + test_assert(str_set_count(&S) == 2 * n - i - 1); + ht_trace_buckets(&S, "after single", 8, 8); + } + ht_trace_buckets(&S, "after first remove", 0, 8); + for (i = 0; i < n; ++i) { + s = keys[i]; + s1 = str_set_remove(&S, s, strlen(s)); + test_assert(strcmp(s, s1) == 0); + test_assert(str_set_count(&S) == n - i - 1); + } + ht_trace_buckets(&S, "efter second remove", 0, 8); + for (i = 0; i < n; ++i) { + s = keys[i]; + s1 = str_set_remove(&S, s, strlen(s)); + test_assert(s1 == 0); + test_assert(str_set_count(&S) == 0); + } + str_set_clear(&S); +} + +void test_str_set3() +{ + int i; + char *s, *s1; + unsigned int n = sizeof(keys)/sizeof(keys[0]); + + for (i = 0; i < n; ++i) { + s = keys[i]; + str_set_insert_item(&S, s, ht_unique); + } + test_assert(str_set_count(&S) == n); + for (i = 0; i < n; ++i) { + s = keys[i]; + str_set_insert_item(&S, s, ht_keep); + } + test_assert(str_set_count(&S) == n); + for 
(i = 0; i < n; ++i) { + s = keys[i]; + s1 = str_set_find_item(&S, s); + test_assert(strcmp(s, s1) == 0); + } + s = keys[1]; + s1 = str_set_remove_item(&S, s); + /* + * This doesn't always hold, but here we + * are sure because of how we inserted data. + */ + test_assert(s == s1); + s1 = str_set_find_item(&S, s); + test_assert(s1 == 0); + str_set_clear(&S); +} + +void test_str_set4() +{ + char *s, *s1; + + s = "dumble"; + str_set_insert_item(&S, "dumble", ht_keep); + s1 = str_set_find_item(&S, s); + /* TMnsert without replace. */ + str_set_insert_item(&S, "2dumble" + 1, ht_keep); + test_assert(s == s1); + s1 = str_set_find_item(&S, s); + test_assert(s == s1); + /* TMnsert with replace. */ + s1 = str_set_insert_item(&S, "2dumble" + 1, ht_replace); + /* Old value still returned. */ + test_assert(s == s1); + s1 = str_set_find_item(&S, s); + test_assert(s != s1); + /* New item returned. */ + test_assert(strcmp(s1 - 1, "2dumble") == 0); + str_set_clear(&S); +} + +void visit_item_set(void *context, token_map_item_t item) +{ + int *count = context; + ++*count; +} + +void test_token_map() +{ + int i, count; + token_map_item_t item; + unsigned int n = sizeof(keys)/sizeof(keys[0]); + + test_assert(sizeof(tokens)/sizeof(item[0]) == n); + + for (i = 0; i < n; ++i) { + tokens[i].token = keys[i]; + tokens[i].len = strlen(keys[i]); + } + for (i = 0; i < n; ++i) { + item = &tokens[i]; + token_map_insert(&TM, item->token, item->len, item, ht_unique); + } + count = 0; + token_map_visit(&TM, visit_item_set, &count); + test_assert(count == n); + + for (i = 0; i < n; ++i) { + item = token_map_find(&TM, keys[i], strlen(keys[i])); + test_assert(item->type == 0); + item->type = 1; + } + for (i = 0; i < n; ++i) { + item = token_map_find_item(&TM, &tokens[i]); + test_assert(item->type == 1); + item->type = 2; + } +} + +void test_ht32() +{ + uint32_t keys[100]; + int i, j; + ht32_t ht; + uint32_t *x, *y; + + ht32_init(&ht, 10); + for (i = 0; i < 100; ++i) { + keys[i] = i + 3398; + } + for (i 
= 0; i < 100; ++i) { + x = ht32_insert_item(&ht, &keys[i], ht_unique); + } + for (i = 0; i < 100; ++i) { + x = ht32_find_item(&ht, &keys[i]); + test_assert(x != 0); + test_assert(*x == i + 3398); + } + for (i = 0; i < 100; ++i) { + y = ht32_remove_item(&ht, &keys[i]); + test_assert(y != ht32_missing); + for (j = 0; j < 100; ++j) { + x = ht32_find_item(&ht, &keys[j]); + if (j > i) { + test_assert(x != ht32_missing); + test_assert(*x == j + 3398); + } else { + test_assert(x == ht32_missing); + } + } + } + ht32_clear(&ht); +} + +void test_ht64() +{ + uint64_t keys[100]; + int i, j; + ht64_t ht; + uint64_t *x, *y; + + ht64_init(&ht, 10); + for (i = 0; i < 100; ++i) { + keys[i] = i + 3398; + } + for (i = 0; i < 100; ++i) { + x = ht64_insert_item(&ht, &keys[i], ht_unique); + } + for (i = 0; i < 100; ++i) { + x = ht64_find_item(&ht, &keys[i]); + test_assert(x != 0); + test_assert(*x == i + 3398); + } + for (i = 0; i < 100; ++i) { + y = ht64_remove_item(&ht, &keys[i]); + test_assert(y != ht64_missing); + for (j = 0; j < 100; ++j) { + x = ht64_find_item(&ht, &keys[j]); + if (j > i) { + test_assert(x != ht64_missing); + test_assert(*x == j + 3398); + } else { + test_assert(x == ht64_missing); + } + } + } + ht64_clear(&ht); +} + +void test_ht32rh() +{ + uint32_t keys[100]; + int i, j; + ht32rh_t ht; + uint32_t *x, *y; + + ht32rh_init(&ht, 10); + for (i = 0; i < 100; ++i) { + keys[i] = i + 3398; + } + for (i = 0; i < 100; ++i) { + x = ht32rh_insert_item(&ht, &keys[i], ht_unique); + } + for (i = 0; i < 100; ++i) { + x = ht32rh_find_item(&ht, &keys[i]); + test_assert(x != 0); + test_assert(*x == i + 3398); + } + for (i = 0; i < 100; ++i) { + y = ht32rh_remove_item(&ht, &keys[i]); + test_assert(y != ht32rh_missing); + for (j = 0; j < 100; ++j) { + x = ht32rh_find_item(&ht, &keys[j]); + if (j > i) { + test_assert(x != ht32rh_missing); + test_assert(*x == j + 3398); + } else { + test_assert(x == ht32rh_missing); + } + } + } + ht32rh_clear(&ht); +} + +void test_ht64rh() +{ + 
uint64_t keys[100]; + int i, j; + ht64rh_t ht; + uint64_t *x, *y; + + ht64rh_init(&ht, 10); + for (i = 0; i < 100; ++i) { + keys[i] = i + 3398; + } + for (i = 0; i < 100; ++i) { + x = ht64rh_insert_item(&ht, &keys[i], ht_unique); + } + for (i = 0; i < 100; ++i) { + x = ht64rh_find_item(&ht, &keys[i]); + test_assert(x != 0); + test_assert(*x == i + 3398); + } + for (i = 0; i < 100; ++i) { + y = ht64rh_remove_item(&ht, &keys[i]); + test_assert(y != ht64rh_missing); + for (j = 0; j < 100; ++j) { + x = ht64rh_find_item(&ht, &keys[j]); + if (j > i) { + test_assert(x != ht64rh_missing); + test_assert(*x == j + 3398); + } else { + test_assert(x == ht64rh_missing); + } + } + } + ht64rh_clear(&ht); +} + +int main(int argc, char *argv[]) +{ + test_str_set(); + test_str_set2(); + test_str_set3(); + test_str_set4(); + test_token_map(); + test_ht32(); + test_ht64(); + test_ht32rh(); + test_ht64rh(); + + printf("all tests passed\n"); + + return 0; +} diff --git a/external/hash/ht32.c b/external/hash/ht32.c new file mode 100644 index 0000000..9954bde --- /dev/null +++ b/external/hash/ht32.c @@ -0,0 +1,47 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2017 Mikkel F. Jørgensen, dvide.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ht32.h" +#define HT_HASH_FUNCTION ht_uint32_hash_function + +#include "hash_table_def.h" +DEFINE_HASH_TABLE(ht32) + +#include "hash_table_impl.h" + + +static inline int ht_match(const void *key, size_t len, const ht32_item_t item) +{ + return *(const ht32_item_t)key == *item; +} + +static inline const void *ht_key(const ht32_item_t item) +{ + return (const void *)item; +} + +static inline size_t ht_key_len(const ht32_item_t item) +{ + return sizeof(*item); +} diff --git a/external/hash/ht32.h b/external/hash/ht32.h new file mode 100644 index 0000000..dab9ffb --- /dev/null +++ b/external/hash/ht32.h @@ -0,0 +1,36 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2017 Mikkel F. Jørgensen, dvide.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef HT32_H +#define HT32_H + +#ifndef UINT8_MAX +#include <stdint.h> +#endif + +#include "hash_table.h" + +DECLARE_HASH_TABLE(ht32, uint32_t *) + +#endif /* HT32_H */ diff --git a/external/hash/ht32rh.c b/external/hash/ht32rh.c new file mode 100644 index 0000000..de6dae2 --- /dev/null +++ b/external/hash/ht32rh.c @@ -0,0 +1,47 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2017 Mikkel F. Jørgensen, dvide.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ht32rh.h" +#define HT_HASH_FUNCTION ht_uint32_hash_function + +#include "hash_table_def.h" +DEFINE_HASH_TABLE(ht32rh) + +#include "hash_table_impl_rh.h" + + +static inline int ht_match(const void *key, size_t len, const ht32rh_item_t item) +{ + return *(const ht32rh_item_t)key == *item; +} + +static inline const void *ht_key(const ht32rh_item_t item) +{ + return (const void *)item; +} + +static inline size_t ht_key_len(const ht32rh_item_t item) +{ + return sizeof(*item); +} diff --git a/external/hash/ht32rh.h b/external/hash/ht32rh.h new file mode 100644 index 0000000..061328e --- /dev/null +++ b/external/hash/ht32rh.h @@ -0,0 +1,36 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2017 Mikkel F. Jørgensen, dvide.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef HT32RH_H +#define HT32RH_H + +#ifndef UINT8_MAX +#include <stdint.h> +#endif + +#include "hash_table.h" + +DECLARE_HASH_TABLE(ht32rh, uint32_t *) + +#endif /* HT32RH_H */ diff --git a/external/hash/ht64.c b/external/hash/ht64.c new file mode 100644 index 0000000..eaebbc5 --- /dev/null +++ b/external/hash/ht64.c @@ -0,0 +1,47 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2017 Mikkel F. Jørgensen, dvide.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "ht64.h" +#define HT_HASH_FUNCTION ht_uint64_hash_function + +#include "hash_table_def.h" +DEFINE_HASH_TABLE(ht64) + +#include "hash_table_impl.h" + + +static inline int ht_match(const void *key, size_t len, const ht64_item_t item) +{ + return *(const ht64_item_t)key == *item; +} + +static inline const void *ht_key(const ht64_item_t item) +{ + return (const void *)item; +} + +static inline size_t ht_key_len(const ht64_item_t item) +{ + return sizeof(*item); +} diff --git a/external/hash/ht64.h b/external/hash/ht64.h new file mode 100644 index 0000000..b9f9fbe --- /dev/null +++ b/external/hash/ht64.h @@ -0,0 +1,36 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2017 Mikkel F. Jørgensen, dvide.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef HT64_H +#define HT64_H + +#ifndef UINT8_MAX +#include <stdint.h> +#endif + +#include "hash_table.h" + +DECLARE_HASH_TABLE(ht64, uint64_t *) + +#endif /* HT64_H */ diff --git a/external/hash/ht64rh.c b/external/hash/ht64rh.c new file mode 100644 index 0000000..bfde550 --- /dev/null +++ b/external/hash/ht64rh.c @@ -0,0 +1,47 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2017 Mikkel F. Jørgensen, dvide.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "ht64rh.h" +#define HT_HASH_FUNCTION ht_uint64_hash_function + +#include "hash_table_def.h" +DEFINE_HASH_TABLE(ht64rh) + +#include "hash_table_impl_rh.h" + + +static inline int ht_match(const void *key, size_t len, const ht64rh_item_t item) +{ + return *(const ht64rh_item_t)key == *item; +} + +static inline const void *ht_key(const ht64rh_item_t item) +{ + return (const void *)item; +} + +static inline size_t ht_key_len(const ht64rh_item_t item) +{ + return sizeof(*item); +} diff --git a/external/hash/ht64rh.h b/external/hash/ht64rh.h new file mode 100644 index 0000000..5b3d454 --- /dev/null +++ b/external/hash/ht64rh.h @@ -0,0 +1,36 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2017 Mikkel F. Jørgensen, dvide.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef HT64RH_H +#define HT64RH_H + +#ifndef UINT8_MAX +#include <stdint.h> +#endif + +#include "hash_table.h" + +DECLARE_HASH_TABLE(ht64rh, uint64_t *) + +#endif /* HT64RH_H */ diff --git a/external/hash/ht_hash_function.h b/external/hash/ht_hash_function.h new file mode 100644 index 0000000..1f65ee5 --- /dev/null +++ b/external/hash/ht_hash_function.h @@ -0,0 +1,258 @@ +#ifndef HT_HASH_FUNCTION_H +#define HT_HASH_FUNCTION_H + +#include <stddef.h> + +#ifdef _MSC_VER +/* `inline` only advisory anyway. */ +#pragma warning(disable: 4710) /* function not inlined */ +#endif + +/* Avoid 0 special case in hash functions and allow for configuration with unguessable seed. */ +#ifndef HT_HASH_SEED +#define HT_HASH_SEED UINT32_C(0x2f693b52) +#endif + +#ifndef HT_HASH_32 + +#include "cmetrohash.h" + +static inline size_t ht_default_hash_function(const void *key, size_t len) +{ + uint64_t out; + + cmetrohash64_1((const uint8_t *)key, len, HT_HASH_SEED, (uint8_t *)&out); + return (unsigned int)out; +} + +/* When using the pointer directly as a hash key. */ +static inline size_t ht_ptr_hash_function(const void *key, size_t len) +{ + /* MurmurHash3 64-bit finalizer */ + uint64_t x; + + (void)len; + + x = ((uint64_t)(size_t)key) ^ (HT_HASH_SEED); + + x ^= x >> 33; + x *= 0xff51afd7ed558ccdULL; + x ^= x >> 33; + x *= 0xc4ceb9fe1a85ec53ULL; + x ^= x >> 33; + return (size_t)x; +} + +#else + +#include "PMurHash.h" + +static inline size_t ht_default_hash_function(const void *key, size_t len) +{ + return (size_t)PMurHash32((HT_HASH_SEED), key, (int)len); +} + +/* When using the pointer directly as a hash key. 
/*
 * Identity hash for 32-bit keys.
 *
 * Assumes `key` points to a 32-bit aligned random value that is its own
 * hash. `len` is ignored. Returns the pointed-to value widened to size_t.
 */
static inline size_t ht_uint32_identity_hash_function(const void *key, size_t len)
{
    (void)len;
    /* The key is only read, so keep the const qualifier in the cast. */
    return (size_t)*(const uint32_t *)key;
}
+ * + * NOTE: The multiplicative hash function by Knuth requires the modulo + * to table size be done by shifting the upper bits down, since this is + * where the quality bits reside. This yields significantly fewer + * collisions which is important for e.g. chained hashing. However, our + * interface does not provide the required information. + * + * When used in open hashing with load factors below 0.7 where the + * stored pointer is also the key, collision checking is very cheap and + * this pays off in a large range of table sizes where a more + * complicated hash simply doesn't pay off. + * + * When used with a pointer set where the pointer is also the key, it is + * not likely to work as well because the pointer acts as a large + * integer which works against the design of the hash function. Here a + * better mix function is probably worthwhile - therefore we also have + * ht_ptr_hash_function. + */ +static inline size_t ht_int_hash_function(const void *key, size_t len) +{ + (void)len; + return ((size_t)key ^ (HT_HASH_SEED)) * 2654435761UL; +} + +/* Bernsteins hash function, assumes string is zero terminated, len is ignored. */ +static inline size_t ht_str_hash_function(const void *key, size_t len) +{ + const unsigned char *str = key; + size_t hash = 5381 ^ (HT_HASH_SEED); + size_t c; + + (void)len; + + while ((c = (size_t)*str++)) + hash = ((hash << 5) + hash) ^ c; /* (hash * 33) xor c */ + + return hash; +} + +/* Hashes at most len characters or until zero termination. 
*/ +static inline size_t ht_strn_hash_function(const void *key, size_t len) +{ + const unsigned char *str = key; + size_t hash = 5381 ^ (HT_HASH_SEED); + size_t c; + + while (--len && (c = (size_t)*str++)) + hash = ((hash << 5) + hash) ^ c; /* (hash * 33) xor c */ + + return hash; +} + +static inline uint32_t ht_fnv1a32_hash_function(const void *key, size_t len) +{ +#ifndef FNV1A_NOMUL + const uint32_t prime = UINT32_C(0x1000193); +#endif + uint32_t hash = UINT32_C(0x811c9dc5); + const uint8_t *p = key; + + while (len--) { + hash ^= (uint64_t)*p++; +#ifndef FNV1A_NOMUL + hash *= prime; +#else + hash += (hash << 1) + (hash << 4) + (hash << 7) + + (hash << 8) + (hash << 24); +#endif + } + return hash; +} + +static inline uint64_t ht_fnv1a64_hash_function(const void *key, size_t len) +{ +#ifndef FNV1A_NOMUL + const uint64_t prime = UINT64_C(0x100000001b3); +#endif + uint64_t hash = UINT64_C(0xcbf29ce484222325); + const uint8_t *p = key; + + while (len--) { + hash ^= (uint64_t)*p++; +#ifndef FNV1A_NOMUL + hash *= prime; +#else + hash += (hash << 1) + (hash << 4) + (hash << 5) + + (hash << 7) + (hash << 8) + (hash << 40); +#endif + } + return hash; +} + +/* Hashes until string termination and ignores length argument. */ +static inline uint32_t ht_fnv1a32_str_hash_function(const void *key, size_t len) +{ +#ifndef FNV1A_NOMUL + const uint32_t prime = UINT32_C(0x1000193); +#endif + uint32_t hash = UINT32_C(0x811c9dc5); + const uint8_t *p = key; + + (void)len; + + while (*p) { + hash ^= (uint64_t)*p++; +#ifndef FNV1A_NOMUL + hash *= prime; +#else + hash += (hash << 1) + (hash << 4) + (hash << 7) + + (hash << 8) + (hash << 24); +#endif + } + return hash; +} + +/* Hashes until string termination and ignores length argument. 
/*
 * 64-bit FNV-1a over a zero terminated string.
 *
 * Hashes every byte up to, but not including, the terminator; `len` is
 * ignored. Define FNV1A_NOMUL to replace the multiplication by the
 * equivalent shift-and-add sequence.
 */
static inline uint64_t ht_fnv1a64_str_hash_function(const void *key, size_t len)
{
    const uint8_t *cursor;
    uint64_t h = UINT64_C(0xcbf29ce484222325);

    (void)len;

    for (cursor = key; *cursor != 0; ++cursor) {
        h ^= (uint64_t)*cursor;
#ifndef FNV1A_NOMUL
        h *= UINT64_C(0x100000001b3);
#else
        h += (h << 1) + (h << 4) + (h << 5) +
            (h << 7) + (h << 8) + (h << 40);
#endif
    }
    return h;
}
/*
 * The values 0, 1, and 2 are reserved, so integers are shifted by a
 * fixed offset before being cast to void *. With this offset the
 * integers INT_MAX - 1, INT_MAX and INT_MIN land on the reserved values
 * and therefore cannot be stored.
 *
 * This is specific to the implementation of ptr_set, so if it changes,
 * these macros may have to change as well.
 *
 * All arithmetic is done on unsigned int: shifting 1 into the sign bit
 * of an int and overflowing a signed subtraction or addition are both
 * undefined, whereas unsigned arithmetic wraps. The resulting values
 * are the same as the two's complement wrap-around would give.
 */
#define HT_INT_SET_OFFSET ((int)((1u << (8 * sizeof(int) - 1)) - 2u))
#define HT_INT_TO_PTR(x) \
    ((void *)(size_t)(int)((unsigned int)(x) - (unsigned int)HT_INT_SET_OFFSET))
#define HT_PTR_TO_INT(x) \
    ((int)((unsigned int)(size_t)(x) + (unsigned int)HT_INT_SET_OFFSET))
*/ +#define INT_SET_IS_MISSING(x) (HT_PTR_SET_MISSING(HT_INT_TO_PTR(x))) +#define INT_SET_IS_ERROR(x) (HT_PTR_SET_IS_ERROR(HT_INT_TO_PTR(x))) +#define INT_SET_IS_VALID(x) (HT_PTR_SET_IS_VALID(HT_INT_TO_PTR(x))) + +typedef ptr_set_t int_set_t; + +/* Returns 1 if already present, 0 otherwise. */ +static inline int int_set_add(int_set_t *S, int x) +{ + return ptr_set_insert_item(S, HT_INT_TO_PTR(x), ht_keep) != 0; +} + +/* Returns 1 if removed, 0 otherwise. */ +static inline int int_set_remove(int_set_t *S, int x) +{ + return ptr_set_remove_item(S, HT_INT_TO_PTR(x)) != 0; +} + +static inline int int_set_count(int_set_t *S) +{ + return ptr_set_count(S); +} + +/* Returns 1 if present, 0 otherwise. */ +static inline int int_set_exists(int_set_t *S, int x) +{ + return ptr_set_exists(S, HT_INT_TO_PTR(x)); +} + +#endif /* INT_SET_H */ diff --git a/external/hash/load_test.c b/external/hash/load_test.c new file mode 100644 index 0000000..1c3d0e7 --- /dev/null +++ b/external/hash/load_test.c @@ -0,0 +1,86 @@ +#include <assert.h> +#include <sys/time.h> +#include <stdio.h> + +//#define INT_SET_PRIVATE +#ifdef INT_SET_PRIVATE +/* Make all hash functions private to this module for better + * performance. This may not be necessary depending on compiler + * optimizations. clang 4.2 -O3 benefits while -O4 figures it and get + * same speed with external linkage. */ +#define HT_PRIVATE +#include "int_set.h" +#include "ptr_set.c" +#undef HT_PRIVATE +#else +/* Use external linkage. Link with ptr_set.c which int_set depends upon. 
/*
 * Returns end - start as a timeval whose tv_usec lies in [0, 1000000)
 * when both inputs are normalized: a negative microsecond difference
 * borrows one second.
 */
struct timeval time_diff(struct timeval start, struct timeval end)
{
    struct timeval delta;

    delta.tv_sec = end.tv_sec - start.tv_sec;
    delta.tv_usec = end.tv_usec - start.tv_usec;
    if (delta.tv_usec < 0) {
        /* Borrow a second to bring the microseconds back in range. */
        delta.tv_sec -= 1;
        delta.tv_usec += 1000000;
    }
    return delta;
}

/* Converts a timeval duration to milliseconds, keeping the fraction. */
double elapsed_ms(struct timeval td)
{
    const double whole_ms = (double)td.tv_sec * 1000;
    const double frac_ms = (double)td.tv_usec / 1000;

    return whole_ms + frac_ms;
}
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************** + * + * Version 0.1.15.2 + * + * The ANSI C standard committee, for the C99 standard, specified the + * inclusion of a new standard include file called stdint.h. This is + * a very useful and long desired include file which contains several + * very precise definitions for integer scalar types that is + * critically important for making portable several classes of + * applications including cryptography, hashing, variable length + * integer libraries and so on. 
But for most developers it's likely + * useful just for programming sanity. + * + * The problem is that some compiler vendors chose to ignore the C99 + * standard and some older compilers have no opportunity to be updated. + * Because of this situation, simply including stdint.h in your code + * makes it unportable. + * + * So that's what this file is all about. It's an attempt to build a + * single universal include file that works on as many platforms as + * possible to deliver what stdint.h is supposed to. Even compilers + * that already come with stdint.h can use this file instead without + * any loss of functionality. A few things that should be noted about + * this file: + * + * 1) It is not guaranteed to be portable and/or present an identical + * interface on all platforms. The extreme variability of the + * ANSI C standard makes this an impossibility right from the + * very get go. It's really only meant to be useful for the vast + * majority of platforms that possess the capability of + * implementing usefully and precisely defined, standard sized + * integer scalars. Systems which are not intrinsically 2s + * complement may produce invalid constants. + * + * 2) There is an unavoidable use of non-reserved symbols. + * + * 3) Other standard include files are invoked. + * + * 4) This file may come in conflict with future platforms that do + * include stdint.h. The hope is that one or the other can be + * used with no real difference. + * + * 5) In the current version, if your platform can't represent + * int32_t, int16_t and int8_t, it just dumps out with a compiler + * error. + * + * 6) 64 bit integers may or may not be defined. Test for their + * presence with the test: #ifdef INT64_MAX or #ifdef UINT64_MAX. + * Note that this is different from the C99 specification which + * requires the existence of 64 bit support in the compiler. 
If + * this is not defined for your platform, yet it is capable of + * dealing with 64 bits then it is because this file has not yet + * been extended to cover all of your system's capabilities. + * + * 7) (u)intptr_t may or may not be defined. Test for its presence + * with the test: #ifdef PTRDIFF_MAX. If this is not defined + * for your platform, then it is because this file has not yet + * been extended to cover all of your system's capabilities, not + * because it's optional. + * + * 8) The following might not have been defined even if your platform is + * capable of defining it: + * + * WCHAR_MIN + * WCHAR_MAX + * (u)int64_t + * PTRDIFF_MIN + * PTRDIFF_MAX + * (u)intptr_t + * + * 9) The following have not been defined: + * + * WINT_MIN + * WINT_MAX + * + * 10) The criteria for defining (u)int_least(*)_t isn't clear, + * except for systems which don't have a type that precisely + * defined 8, 16, or 32 bit types (which this include file does + * not support anyways). Default definitions have been given. + * + * 11) The criteria for defining (u)int_fast(*)_t isn't something I + * would trust to any particular compiler vendor or the ANSI C + * committee. It is well known that "compatible systems" are + * commonly created that have very different performance + * characteristics from the systems they are compatible with, + * especially those whose vendors make both the compiler and the + * system. Default definitions have been given, but it's strongly + * recommended that users never use these definitions for any + * reason (they do *NOT* deliver any serious guarantee of + * improved performance -- not in this file, nor any vendor's + * stdint.h). 
+ * + * 12) The following macros: + * + * PRINTF_INTMAX_MODIFIER + * PRINTF_INT64_MODIFIER + * PRINTF_INT32_MODIFIER + * PRINTF_INT16_MODIFIER + * PRINTF_LEAST64_MODIFIER + * PRINTF_LEAST32_MODIFIER + * PRINTF_LEAST16_MODIFIER + * PRINTF_INTPTR_MODIFIER + * + * are strings which have been defined as the modifiers required + * for the "d", "u" and "x" printf formats to correctly output + * (u)intmax_t, (u)int64_t, (u)int32_t, (u)int16_t, (u)least64_t, + * (u)least32_t, (u)least16_t and (u)intptr_t types respectively. + * PRINTF_INTPTR_MODIFIER is not defined for some systems which + * provide their own stdint.h. PRINTF_INT64_MODIFIER is not + * defined if INT64_MAX is not defined. These are an extension + * beyond what C99 specifies must be in stdint.h. + * + * In addition, the following macros are defined: + * + * PRINTF_INTMAX_HEX_WIDTH + * PRINTF_INT64_HEX_WIDTH + * PRINTF_INT32_HEX_WIDTH + * PRINTF_INT16_HEX_WIDTH + * PRINTF_INT8_HEX_WIDTH + * PRINTF_INTMAX_DEC_WIDTH + * PRINTF_INT64_DEC_WIDTH + * PRINTF_INT32_DEC_WIDTH + * PRINTF_INT16_DEC_WIDTH + * PRINTF_INT8_DEC_WIDTH + * PRINTF_UINTMAX_DEC_WIDTH + * PRINTF_UINT64_DEC_WIDTH + * PRINTF_UINT32_DEC_WIDTH + * PRINTF_UINT16_DEC_WIDTH + * PRINTF_UINT8_DEC_WIDTH + * + * which specify the maximum number of characters required to + * print the number of that type in either hexadecimal or decimal. + * These are an extension beyond what C99 specifies must be in + * stdint.h. + * + * Compilers tested (all with 0 warnings at their highest respective + * settings): Borland Turbo C 2.0, WATCOM C/C++ 11.0 (16 bits and 32 + * bits), Microsoft Visual C++ 6.0 (32 bit), Microsoft Visual Studio + * .net (VC7), Intel C++ 4.0, GNU gcc v3.3.3 + * + * This file should be considered a work in progress. Suggestions for + * improvements, especially those which increase coverage are strongly + * encouraged. 
+ * + * Acknowledgements + * + * The following people have made significant contributions to the + * development and testing of this file: + * + * Chris Howie + * John Steele Scott + * Dave Thorup + * John Dill + * Florian Wobbe + * Christopher Sean Morrison + * Mikkel Fahnoe Jorgensen + * + */ + +#include <stddef.h> +#include <limits.h> +#include <signal.h> + +/* + * For gcc with _STDINT_H, fill in the PRINTF_INT*_MODIFIER macros, and + * do nothing else. On the Mac OS X version of gcc this is _STDINT_H_. + */ + +#if ((defined(_MSC_VER) && _MSC_VER >= 1600) || (defined(__STDC__) && __STDC__ && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || (defined (__WATCOMC__) && (defined (_STDINT_H_INCLUDED) || __WATCOMC__ >= 1250)) || (defined(__GNUC__) && (__GNUC__ > 3 || defined(_STDINT_H) || defined(_STDINT_H_) || defined (__UINT_FAST64_TYPE__)) )) && !defined (_PSTDINT_H_INCLUDED) +#include <stdint.h> +#define _PSTDINT_H_INCLUDED +# if defined(__GNUC__) && (defined(__x86_64__) || defined(__ppc64__)) && !(defined(__APPLE__) && defined(__MACH__)) +# ifndef PRINTF_INT64_MODIFIER +# define PRINTF_INT64_MODIFIER "l" +# endif +# ifndef PRINTF_INT32_MODIFIER +# define PRINTF_INT32_MODIFIER "" +# endif +# else +# ifndef PRINTF_INT64_MODIFIER +# define PRINTF_INT64_MODIFIER "ll" +# endif +# ifndef PRINTF_INT32_MODIFIER +# if (UINT_MAX == UINT32_MAX) +# define PRINTF_INT32_MODIFIER "" +# else +# define PRINTF_INT32_MODIFIER "l" +# endif +# endif +# endif +# ifndef PRINTF_INT16_MODIFIER +# define PRINTF_INT16_MODIFIER "h" +# endif +# ifndef PRINTF_INTMAX_MODIFIER +# define PRINTF_INTMAX_MODIFIER PRINTF_INT64_MODIFIER +# endif +# ifndef PRINTF_INT64_HEX_WIDTH +# define PRINTF_INT64_HEX_WIDTH "16" +# endif +# ifndef PRINTF_UINT64_HEX_WIDTH +# define PRINTF_UINT64_HEX_WIDTH "16" +# endif +# ifndef PRINTF_INT32_HEX_WIDTH +# define PRINTF_INT32_HEX_WIDTH "8" +# endif +# ifndef PRINTF_UINT32_HEX_WIDTH +# define PRINTF_UINT32_HEX_WIDTH "8" +# endif +# ifndef 
PRINTF_INT16_HEX_WIDTH +# define PRINTF_INT16_HEX_WIDTH "4" +# endif +# ifndef PRINTF_UINT16_HEX_WIDTH +# define PRINTF_UINT16_HEX_WIDTH "4" +# endif +# ifndef PRINTF_INT8_HEX_WIDTH +# define PRINTF_INT8_HEX_WIDTH "2" +# endif +# ifndef PRINTF_UINT8_HEX_WIDTH +# define PRINTF_UINT8_HEX_WIDTH "2" +# endif +# ifndef PRINTF_INT64_DEC_WIDTH +# define PRINTF_INT64_DEC_WIDTH "19" +# endif +# ifndef PRINTF_UINT64_DEC_WIDTH +# define PRINTF_UINT64_DEC_WIDTH "20" +# endif +# ifndef PRINTF_INT32_DEC_WIDTH +# define PRINTF_INT32_DEC_WIDTH "10" +# endif +# ifndef PRINTF_UINT32_DEC_WIDTH +# define PRINTF_UINT32_DEC_WIDTH "10" +# endif +# ifndef PRINTF_INT16_DEC_WIDTH +# define PRINTF_INT16_DEC_WIDTH "5" +# endif +# ifndef PRINTF_UINT16_DEC_WIDTH +# define PRINTF_UINT16_DEC_WIDTH "5" +# endif +# ifndef PRINTF_INT8_DEC_WIDTH +# define PRINTF_INT8_DEC_WIDTH "3" +# endif +# ifndef PRINTF_UINT8_DEC_WIDTH +# define PRINTF_UINT8_DEC_WIDTH "3" +# endif +# ifndef PRINTF_INTMAX_HEX_WIDTH +# define PRINTF_INTMAX_HEX_WIDTH PRINTF_UINT64_HEX_WIDTH +# endif +# ifndef PRINTF_UINTMAX_HEX_WIDTH +# define PRINTF_UINTMAX_HEX_WIDTH PRINTF_UINT64_HEX_WIDTH +# endif +# ifndef PRINTF_INTMAX_DEC_WIDTH +# define PRINTF_INTMAX_DEC_WIDTH PRINTF_UINT64_DEC_WIDTH +# endif +# ifndef PRINTF_UINTMAX_DEC_WIDTH +# define PRINTF_UINTMAX_DEC_WIDTH PRINTF_UINT64_DEC_WIDTH +# endif + +/* + * Something really weird is going on with Open Watcom. Just pull some of + * these duplicated definitions from Open Watcom's stdint.h file for now. 
+ */ + +# if defined (__WATCOMC__) && __WATCOMC__ >= 1250 +# if !defined (INT64_C) +# define INT64_C(x) (x + (INT64_MAX - INT64_MAX)) +# endif +# if !defined (UINT64_C) +# define UINT64_C(x) (x + (UINT64_MAX - UINT64_MAX)) +# endif +# if !defined (INT32_C) +# define INT32_C(x) (x + (INT32_MAX - INT32_MAX)) +# endif +# if !defined (UINT32_C) +# define UINT32_C(x) (x + (UINT32_MAX - UINT32_MAX)) +# endif +# if !defined (INT16_C) +# define INT16_C(x) (x) +# endif +# if !defined (UINT16_C) +# define UINT16_C(x) (x) +# endif +# if !defined (INT8_C) +# define INT8_C(x) (x) +# endif +# if !defined (UINT8_C) +# define UINT8_C(x) (x) +# endif +# if !defined (UINT64_MAX) +# define UINT64_MAX 18446744073709551615ULL +# endif +# if !defined (INT64_MAX) +# define INT64_MAX 9223372036854775807LL +# endif +# if !defined (UINT32_MAX) +# define UINT32_MAX 4294967295UL +# endif +# if !defined (INT32_MAX) +# define INT32_MAX 2147483647L +# endif +# if !defined (INTMAX_MAX) +# define INTMAX_MAX INT64_MAX +# endif +# if !defined (INTMAX_MIN) +# define INTMAX_MIN INT64_MIN +# endif +# endif +#endif + +#ifndef _PSTDINT_H_INCLUDED +#define _PSTDINT_H_INCLUDED + +#ifndef SIZE_MAX +# define SIZE_MAX (~(size_t)0) +#endif + +/* + * Deduce the type assignments from limits.h under the assumption that + * integer sizes in bits are powers of 2, and follow the ANSI + * definitions. 
+ */ + +#ifndef UINT8_MAX +# define UINT8_MAX 0xff +#endif +#if !defined(uint8_t) && !defined(_UINT8_T) +# if (UCHAR_MAX == UINT8_MAX) || defined (S_SPLINT_S) + typedef unsigned char uint8_t; +# define UINT8_C(v) ((uint8_t) v) +# else +# error "Platform not supported" +# endif +#endif + +#ifndef INT8_MAX +# define INT8_MAX 0x7f +#endif +#ifndef INT8_MIN +# define INT8_MIN INT8_C(0x80) +#endif +#if !defined(int8_t) && !defined(_INT8_T) +# if (SCHAR_MAX == INT8_MAX) || defined (S_SPLINT_S) + typedef signed char int8_t; +# define INT8_C(v) ((int8_t) v) +# else +# error "Platform not supported" +# endif +#endif + +#ifndef UINT16_MAX +# define UINT16_MAX 0xffff +#endif +#if !defined(uint16_t) && !defined(_UINT16_T) +#if (UINT_MAX == UINT16_MAX) || defined (S_SPLINT_S) + typedef unsigned int uint16_t; +# ifndef PRINTF_INT16_MODIFIER +# define PRINTF_INT16_MODIFIER "" +# endif +# define UINT16_C(v) ((uint16_t) (v)) +#elif (USHRT_MAX == UINT16_MAX) + typedef unsigned short uint16_t; +# define UINT16_C(v) ((uint16_t) (v)) +# ifndef PRINTF_INT16_MODIFIER +# define PRINTF_INT16_MODIFIER "h" +# endif +#else +#error "Platform not supported" +#endif +#endif + +#ifndef INT16_MAX +# define INT16_MAX 0x7fff +#endif +#ifndef INT16_MIN +# define INT16_MIN INT16_C(0x8000) +#endif +#if !defined(int16_t) && !defined(_INT16_T) +#if (INT_MAX == INT16_MAX) || defined (S_SPLINT_S) + typedef signed int int16_t; +# define INT16_C(v) ((int16_t) (v)) +# ifndef PRINTF_INT16_MODIFIER +# define PRINTF_INT16_MODIFIER "" +# endif +#elif (SHRT_MAX == INT16_MAX) + typedef signed short int16_t; +# define INT16_C(v) ((int16_t) (v)) +# ifndef PRINTF_INT16_MODIFIER +# define PRINTF_INT16_MODIFIER "h" +# endif +#else +#error "Platform not supported" +#endif +#endif + +#ifndef UINT32_MAX +# define UINT32_MAX (0xffffffffUL) +#endif +#if !defined(uint32_t) && !defined(_UINT32_T) +#if (ULONG_MAX == UINT32_MAX) || defined (S_SPLINT_S) + typedef unsigned long uint32_t; +# define UINT32_C(v) v ## UL +# ifndef 
PRINTF_INT32_MODIFIER +# define PRINTF_INT32_MODIFIER "l" +# endif +#elif (UINT_MAX == UINT32_MAX) + typedef unsigned int uint32_t; +# ifndef PRINTF_INT32_MODIFIER +# define PRINTF_INT32_MODIFIER "" +# endif +# define UINT32_C(v) v ## U +#elif (USHRT_MAX == UINT32_MAX) + typedef unsigned short uint32_t; +# define UINT32_C(v) ((unsigned short) (v)) +# ifndef PRINTF_INT32_MODIFIER +# define PRINTF_INT32_MODIFIER "" +# endif +#else +#error "Platform not supported" +#endif +#endif + +#ifndef INT32_MAX +# define INT32_MAX (0x7fffffffL) +#endif +#ifndef INT32_MIN +# define INT32_MIN INT32_C(0x80000000) +#endif +#if !defined(int32_t) && !defined(_INT32_T) +#if (LONG_MAX == INT32_MAX) || defined (S_SPLINT_S) + typedef signed long int32_t; +# define INT32_C(v) v ## L +# ifndef PRINTF_INT32_MODIFIER +# define PRINTF_INT32_MODIFIER "l" +# endif +#elif (INT_MAX == INT32_MAX) + typedef signed int int32_t; +# define INT32_C(v) v +# ifndef PRINTF_INT32_MODIFIER +# define PRINTF_INT32_MODIFIER "" +# endif +#elif (SHRT_MAX == INT32_MAX) + typedef signed short int32_t; +# define INT32_C(v) ((short) (v)) +# ifndef PRINTF_INT32_MODIFIER +# define PRINTF_INT32_MODIFIER "" +# endif +#else +#error "Platform not supported" +#endif +#endif + +/* + * The macro stdint_int64_defined is temporarily used to record + * whether or not 64 integer support is available. It must be + * defined for any 64 integer extensions for new platforms that are + * added. 
+ */ + +#undef stdint_int64_defined +#if (defined(__STDC__) && defined(__STDC_VERSION__)) || defined (S_SPLINT_S) +# if (__STDC__ && __STDC_VERSION__ >= 199901L) || defined (S_SPLINT_S) +# define stdint_int64_defined + typedef long long int64_t; + typedef unsigned long long uint64_t; +# define UINT64_C(v) v ## ULL +# define INT64_C(v) v ## LL +# ifndef PRINTF_INT64_MODIFIER +# define PRINTF_INT64_MODIFIER "ll" +# endif +# endif +#endif + +#if !defined (stdint_int64_defined) +# if defined(__GNUC__) +# define stdint_int64_defined + __extension__ typedef long long int64_t; + __extension__ typedef unsigned long long uint64_t; +# define UINT64_C(v) v ## ULL +# define INT64_C(v) v ## LL +# ifndef PRINTF_INT64_MODIFIER +# define PRINTF_INT64_MODIFIER "ll" +# endif +# elif defined(__MWERKS__) || defined (__SUNPRO_C) || defined (__SUNPRO_CC) || defined (__APPLE_CC__) || defined (_LONG_LONG) || defined (_CRAYC) || defined (S_SPLINT_S) +# define stdint_int64_defined + typedef long long int64_t; + typedef unsigned long long uint64_t; +# define UINT64_C(v) v ## ULL +# define INT64_C(v) v ## LL +# ifndef PRINTF_INT64_MODIFIER +# define PRINTF_INT64_MODIFIER "ll" +# endif +# elif (defined(__WATCOMC__) && defined(__WATCOM_INT64__)) || (defined(_MSC_VER) && _INTEGRAL_MAX_BITS >= 64) || (defined (__BORLANDC__) && __BORLANDC__ > 0x460) || defined (__alpha) || defined (__DECC) +# define stdint_int64_defined + typedef __int64 int64_t; + typedef unsigned __int64 uint64_t; +# define UINT64_C(v) v ## UI64 +# define INT64_C(v) v ## I64 +# ifndef PRINTF_INT64_MODIFIER +# define PRINTF_INT64_MODIFIER "I64" +# endif +# endif +#endif + +#if !defined (LONG_LONG_MAX) && defined (INT64_C) +# define LONG_LONG_MAX INT64_C (9223372036854775807) +#endif +#ifndef ULONG_LONG_MAX +# define ULONG_LONG_MAX UINT64_C (18446744073709551615) +#endif + +#if !defined (INT64_MAX) && defined (INT64_C) +# define INT64_MAX INT64_C (9223372036854775807) +#endif +#if !defined (INT64_MIN) && defined (INT64_C) +# 
define INT64_MIN INT64_C (-9223372036854775808) +#endif +#if !defined (UINT64_MAX) && defined (INT64_C) +# define UINT64_MAX UINT64_C (18446744073709551615) +#endif + +/* + * Width of hexadecimal for number field. + */ + +#ifndef PRINTF_INT64_HEX_WIDTH +# define PRINTF_INT64_HEX_WIDTH "16" +#endif +#ifndef PRINTF_INT32_HEX_WIDTH +# define PRINTF_INT32_HEX_WIDTH "8" +#endif +#ifndef PRINTF_INT16_HEX_WIDTH +# define PRINTF_INT16_HEX_WIDTH "4" +#endif +#ifndef PRINTF_INT8_HEX_WIDTH +# define PRINTF_INT8_HEX_WIDTH "2" +#endif +#ifndef PRINTF_INT64_DEC_WIDTH +# define PRINTF_INT64_DEC_WIDTH "19" +#endif +#ifndef PRINTF_INT32_DEC_WIDTH +# define PRINTF_INT32_DEC_WIDTH "10" +#endif +#ifndef PRINTF_INT16_DEC_WIDTH +# define PRINTF_INT16_DEC_WIDTH "5" +#endif +#ifndef PRINTF_INT8_DEC_WIDTH +# define PRINTF_INT8_DEC_WIDTH "3" +#endif +#ifndef PRINTF_UINT64_DEC_WIDTH +# define PRINTF_UINT64_DEC_WIDTH "20" +#endif +#ifndef PRINTF_UINT32_DEC_WIDTH +# define PRINTF_UINT32_DEC_WIDTH "10" +#endif +#ifndef PRINTF_UINT16_DEC_WIDTH +# define PRINTF_UINT16_DEC_WIDTH "5" +#endif +#ifndef PRINTF_UINT8_DEC_WIDTH +# define PRINTF_UINT8_DEC_WIDTH "3" +#endif + +/* + * Ok, lets not worry about 128 bit integers for now. Moore's law says + * we don't need to worry about that until about 2040 at which point + * we'll have bigger things to worry about. 
+ */ + +#ifdef stdint_int64_defined + typedef int64_t intmax_t; + typedef uint64_t uintmax_t; +# define INTMAX_MAX INT64_MAX +# define INTMAX_MIN INT64_MIN +# define UINTMAX_MAX UINT64_MAX +# define UINTMAX_C(v) UINT64_C(v) +# define INTMAX_C(v) INT64_C(v) +# ifndef PRINTF_INTMAX_MODIFIER +# define PRINTF_INTMAX_MODIFIER PRINTF_INT64_MODIFIER +# endif +# ifndef PRINTF_INTMAX_HEX_WIDTH +# define PRINTF_INTMAX_HEX_WIDTH PRINTF_INT64_HEX_WIDTH +# endif +# ifndef PRINTF_INTMAX_DEC_WIDTH +# define PRINTF_INTMAX_DEC_WIDTH PRINTF_INT64_DEC_WIDTH +# endif +#else + typedef int32_t intmax_t; + typedef uint32_t uintmax_t; +# define INTMAX_MAX INT32_MAX +# define UINTMAX_MAX UINT32_MAX +# define UINTMAX_C(v) UINT32_C(v) +# define INTMAX_C(v) INT32_C(v) +# ifndef PRINTF_INTMAX_MODIFIER +# define PRINTF_INTMAX_MODIFIER PRINTF_INT32_MODIFIER +# endif +# ifndef PRINTF_INTMAX_HEX_WIDTH +# define PRINTF_INTMAX_HEX_WIDTH PRINTF_INT32_HEX_WIDTH +# endif +# ifndef PRINTF_INTMAX_DEC_WIDTH +# define PRINTF_INTMAX_DEC_WIDTH PRINTF_INT32_DEC_WIDTH +# endif +#endif + +/* + * Because this file currently only supports platforms which have + * precise powers of 2 as bit sizes for the default integers, the + * least definitions are all trivial. Its possible that a future + * version of this file could have different definitions. 
+ */ + +#ifndef stdint_least_defined + typedef int8_t int_least8_t; + typedef uint8_t uint_least8_t; + typedef int16_t int_least16_t; + typedef uint16_t uint_least16_t; + typedef int32_t int_least32_t; + typedef uint32_t uint_least32_t; +# define PRINTF_LEAST32_MODIFIER PRINTF_INT32_MODIFIER +# define PRINTF_LEAST16_MODIFIER PRINTF_INT16_MODIFIER +# define UINT_LEAST8_MAX UINT8_MAX +# define INT_LEAST8_MAX INT8_MAX +# define UINT_LEAST16_MAX UINT16_MAX +# define INT_LEAST16_MAX INT16_MAX +# define UINT_LEAST32_MAX UINT32_MAX +# define INT_LEAST32_MAX INT32_MAX +# define INT_LEAST8_MIN INT8_MIN +# define INT_LEAST16_MIN INT16_MIN +# define INT_LEAST32_MIN INT32_MIN +# ifdef stdint_int64_defined + typedef int64_t int_least64_t; + typedef uint64_t uint_least64_t; +# define PRINTF_LEAST64_MODIFIER PRINTF_INT64_MODIFIER +# define UINT_LEAST64_MAX UINT64_MAX +# define INT_LEAST64_MAX INT64_MAX +# define INT_LEAST64_MIN INT64_MIN +# endif +#endif +#undef stdint_least_defined + +/* + * The ANSI C committee pretending to know or specify anything about + * performance is the epitome of misguided arrogance. The mandate of + * this file is to *ONLY* ever support that absolute minimum + * definition of the fast integer types, for compatibility purposes. + * No extensions, and no attempt to suggest what may or may not be a + * faster integer type will ever be made in this file. Developers are + * warned to stay away from these types when using this or any other + * stdint.h. 
+ */ + +typedef int_least8_t int_fast8_t; +typedef uint_least8_t uint_fast8_t; +typedef int_least16_t int_fast16_t; +typedef uint_least16_t uint_fast16_t; +typedef int_least32_t int_fast32_t; +typedef uint_least32_t uint_fast32_t; +#define UINT_FAST8_MAX UINT_LEAST8_MAX +#define INT_FAST8_MAX INT_LEAST8_MAX +#define UINT_FAST16_MAX UINT_LEAST16_MAX +#define INT_FAST16_MAX INT_LEAST16_MAX +#define UINT_FAST32_MAX UINT_LEAST32_MAX +#define INT_FAST32_MAX INT_LEAST32_MAX +#define INT_FAST8_MIN INT_LEAST8_MIN +#define INT_FAST16_MIN INT_LEAST16_MIN +#define INT_FAST32_MIN INT_LEAST32_MIN +#ifdef stdint_int64_defined + typedef int_least64_t int_fast64_t; + typedef uint_least64_t uint_fast64_t; +# define UINT_FAST64_MAX UINT_LEAST64_MAX +# define INT_FAST64_MAX INT_LEAST64_MAX +# define INT_FAST64_MIN INT_LEAST64_MIN +#endif + +#undef stdint_int64_defined + +/* + * Whatever piecemeal, per compiler thing we can do about the wchar_t + * type limits. + */ + +#if defined(__WATCOMC__) || defined(_MSC_VER) || defined (__GNUC__) +# include <wchar.h> +# ifndef WCHAR_MIN +# define WCHAR_MIN 0 +# endif +# ifndef WCHAR_MAX +# define WCHAR_MAX ((wchar_t)-1) +# endif +#endif + +/* + * Whatever piecemeal, per compiler/platform thing we can do about the + * (u)intptr_t types and limits. 
+ */ + +#if (defined (_MSC_VER) && defined (_UINTPTR_T_DEFINED)) || defined (_UINTPTR_T) +# define STDINT_H_UINTPTR_T_DEFINED +#endif + +#ifndef STDINT_H_UINTPTR_T_DEFINED +# if defined (__alpha__) || defined (__ia64__) || defined (__x86_64__) || defined (_WIN64) || defined (__ppc64__) +# define stdint_intptr_bits 64 +# elif defined (__WATCOMC__) || defined (__TURBOC__) +# if defined(__TINY__) || defined(__SMALL__) || defined(__MEDIUM__) +# define stdint_intptr_bits 16 +# else +# define stdint_intptr_bits 32 +# endif +# elif defined (__i386__) || defined (_WIN32) || defined (WIN32) || defined (__ppc64__) +# define stdint_intptr_bits 32 +# elif defined (__INTEL_COMPILER) +/* TODO -- what did Intel do about x86-64? */ +# else +/* #error "This platform might not be supported yet" */ +# endif + +# ifdef stdint_intptr_bits +# define stdint_intptr_glue3_i(a,b,c) a##b##c +# define stdint_intptr_glue3(a,b,c) stdint_intptr_glue3_i(a,b,c) +# ifndef PRINTF_INTPTR_MODIFIER +# define PRINTF_INTPTR_MODIFIER stdint_intptr_glue3(PRINTF_INT,stdint_intptr_bits,_MODIFIER) +# endif +# ifndef PTRDIFF_MAX +# define PTRDIFF_MAX stdint_intptr_glue3(INT,stdint_intptr_bits,_MAX) +# endif +# ifndef PTRDIFF_MIN +# define PTRDIFF_MIN stdint_intptr_glue3(INT,stdint_intptr_bits,_MIN) +# endif +# ifndef UINTPTR_MAX +# define UINTPTR_MAX stdint_intptr_glue3(UINT,stdint_intptr_bits,_MAX) +# endif +# ifndef INTPTR_MAX +# define INTPTR_MAX stdint_intptr_glue3(INT,stdint_intptr_bits,_MAX) +# endif +# ifndef INTPTR_MIN +# define INTPTR_MIN stdint_intptr_glue3(INT,stdint_intptr_bits,_MIN) +# endif +# ifndef INTPTR_C +# define INTPTR_C(x) stdint_intptr_glue3(INT,stdint_intptr_bits,_C)(x) +# endif +# ifndef UINTPTR_C +# define UINTPTR_C(x) stdint_intptr_glue3(UINT,stdint_intptr_bits,_C)(x) +# endif + typedef stdint_intptr_glue3(uint,stdint_intptr_bits,_t) uintptr_t; + typedef stdint_intptr_glue3( int,stdint_intptr_bits,_t) intptr_t; +# else +/* TODO -- This following is likely wrong for some platforms, 
and does + nothing for the definition of uintptr_t. */ + typedef ptrdiff_t intptr_t; +# endif +# define STDINT_H_UINTPTR_T_DEFINED +#endif + +/* + * Assumes sig_atomic_t is signed and we have a 2s complement machine. + */ + +#ifndef SIG_ATOMIC_MAX +# define SIG_ATOMIC_MAX ((((sig_atomic_t) 1) << (sizeof (sig_atomic_t)*CHAR_BIT-1)) - 1) +#endif + +#endif + +#if defined (__TEST_PSTDINT_FOR_CORRECTNESS) + +/* + * Please compile with the maximum warning settings to make sure macros are + * not defined more than once. + */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +#define glue3_aux(x,y,z) x ## y ## z +#define glue3(x,y,z) glue3_aux(x,y,z) + +#define DECLU(bits) glue3(uint,bits,_t) glue3(u,bits,) = glue3(UINT,bits,_C) (0); +#define DECLI(bits) glue3(int,bits,_t) glue3(i,bits,) = glue3(INT,bits,_C) (0); + +#define DECL(us,bits) glue3(DECL,us,) (bits) + +#define TESTUMAX(bits) glue3(u,bits,) = ~glue3(u,bits,); if (glue3(UINT,bits,_MAX) != glue3(u,bits,)) printf ("Something wrong with UINT%d_MAX\n", bits) + +#define REPORTERROR(msg) { err_n++; if (err_first <= 0) err_first = __LINE__; printf msg; } + +int main () { + int err_n = 0; + int err_first = 0; + DECL(I,8) + DECL(U,8) + DECL(I,16) + DECL(U,16) + DECL(I,32) + DECL(U,32) +#ifdef INT64_MAX + DECL(I,64) + DECL(U,64) +#endif + intmax_t imax = INTMAX_C(0); + uintmax_t umax = UINTMAX_C(0); + char str0[256], str1[256]; + + sprintf (str0, "%" PRINTF_INT32_MODIFIER "d", INT32_C(2147483647)); + if (0 != strcmp (str0, "2147483647")) REPORTERROR (("Something wrong with PRINTF_INT32_MODIFIER : %s\n", str0)); + if (atoi(PRINTF_INT32_DEC_WIDTH) != (int) strlen(str0)) REPORTERROR (("Something wrong with PRINTF_INT32_DEC_WIDTH : %s\n", PRINTF_INT32_DEC_WIDTH)); + sprintf (str0, "%" PRINTF_INT32_MODIFIER "u", UINT32_C(4294967295)); + if (0 != strcmp (str0, "4294967295")) REPORTERROR (("Something wrong with PRINTF_INT32_MODIFIER : %s\n", str0)); + if (atoi(PRINTF_UINT32_DEC_WIDTH) != (int) strlen(str0)) 
REPORTERROR (("Something wrong with PRINTF_UINT32_DEC_WIDTH : %s\n", PRINTF_UINT32_DEC_WIDTH)); +#ifdef INT64_MAX + sprintf (str1, "%" PRINTF_INT64_MODIFIER "d", INT64_C(9223372036854775807)); + if (0 != strcmp (str1, "9223372036854775807")) REPORTERROR (("Something wrong with PRINTF_INT32_MODIFIER : %s\n", str1)); + if (atoi(PRINTF_INT64_DEC_WIDTH) != (int) strlen(str1)) REPORTERROR (("Something wrong with PRINTF_INT64_DEC_WIDTH : %s, %d\n", PRINTF_INT64_DEC_WIDTH, (int) strlen(str1))); + sprintf (str1, "%" PRINTF_INT64_MODIFIER "u", UINT64_C(18446744073709550591)); + if (0 != strcmp (str1, "18446744073709550591")) REPORTERROR (("Something wrong with PRINTF_INT32_MODIFIER : %s\n", str1)); + if (atoi(PRINTF_UINT64_DEC_WIDTH) != (int) strlen(str1)) REPORTERROR (("Something wrong with PRINTF_UINT64_DEC_WIDTH : %s, %d\n", PRINTF_UINT64_DEC_WIDTH, (int) strlen(str1))); +#endif + + sprintf (str0, "%d %x\n", 0, ~0); + + sprintf (str1, "%d %x\n", i8, ~0); + if (0 != strcmp (str0, str1)) REPORTERROR (("Something wrong with i8 : %s\n", str1)); + sprintf (str1, "%u %x\n", u8, ~0); + if (0 != strcmp (str0, str1)) REPORTERROR (("Something wrong with u8 : %s\n", str1)); + sprintf (str1, "%d %x\n", i16, ~0); + if (0 != strcmp (str0, str1)) REPORTERROR (("Something wrong with i16 : %s\n", str1)); + sprintf (str1, "%u %x\n", u16, ~0); + if (0 != strcmp (str0, str1)) REPORTERROR (("Something wrong with u16 : %s\n", str1)); + sprintf (str1, "%" PRINTF_INT32_MODIFIER "d %x\n", i32, ~0); + if (0 != strcmp (str0, str1)) REPORTERROR (("Something wrong with i32 : %s\n", str1)); + sprintf (str1, "%" PRINTF_INT32_MODIFIER "u %x\n", u32, ~0); + if (0 != strcmp (str0, str1)) REPORTERROR (("Something wrong with u32 : %s\n", str1)); +#ifdef INT64_MAX + sprintf (str1, "%" PRINTF_INT64_MODIFIER "d %x\n", i64, ~0); + if (0 != strcmp (str0, str1)) REPORTERROR (("Something wrong with i64 : %s\n", str1)); +#endif + sprintf (str1, "%" PRINTF_INTMAX_MODIFIER "d %x\n", imax, ~0); + if (0 != strcmp 
(str0, str1)) REPORTERROR (("Something wrong with imax : %s\n", str1)); + sprintf (str1, "%" PRINTF_INTMAX_MODIFIER "u %x\n", umax, ~0); + if (0 != strcmp (str0, str1)) REPORTERROR (("Something wrong with umax : %s\n", str1)); + + TESTUMAX(8); + TESTUMAX(16); + TESTUMAX(32); +#ifdef INT64_MAX + TESTUMAX(64); +#endif + +#define STR(v) #v +#define Q(v) printf ("sizeof " STR(v) " = %u\n", (unsigned) sizeof (v)); + if (err_n) { + printf ("pstdint.h is not correct. Please use sizes below to correct it:\n"); + } + + Q(int) + Q(unsigned) + Q(long int) + Q(short int) + Q(int8_t) + Q(int16_t) + Q(int32_t) +#ifdef INT64_MAX + Q(int64_t) +#endif + + return EXIT_SUCCESS; +} + +#endif diff --git a/external/hash/ptr_set.c b/external/hash/ptr_set.c new file mode 100644 index 0000000..ab12ddf --- /dev/null +++ b/external/hash/ptr_set.c @@ -0,0 +1,60 @@ +/* + * Creates a set of stored pointers by using the pointer itself as key. + * + * (void *)0 (HT_MISSING) cannot be stored. + * (void *)1 (HT_DELETED) also cannot be stored. + * + * ht_item, ht_key, ht_key_len, and ht_match are required. + * + * In this case HT_HASH_FUNCTION is also required because + * we do not read the content of the key but use the pointer + * itself as a key. The default behavior would crash. + * + * Only one hash table can be defined in a single compilation unit + * because of static function names in the generic implementation. 
+ */ + +#include "ptr_set.h" + +static inline size_t ptr_set_hash_function(const void *s, size_t len); +#define HT_HASH_FUNCTION ptr_set_hash_function + +#define HT_LOAD_FACTOR 0.7 +#include "hash_table_def.h" +DEFINE_HASH_TABLE(ptr_set) + +#if defined(PTR_SET_RH) +#include "hash_table_impl_rh.h" +#else +#include "hash_table_impl.h" +#endif + +static inline const void *ht_key(ht_item_t x) +{ + return (const void *)x; +} + +static inline size_t ht_key_len(ht_item_t x) +{ + return sizeof(x); +} + +static inline int ht_match(const void *key, size_t len, ht_item_t x) +{ + (void)len; + return (size_t)key == (size_t)x; +} + +static inline size_t ptr_set_hash_function(const void *s, size_t len) +{ +#if defined (PTR_SET_PTR_HASH) + /* Murmur hash like finalization step. */ + return ht_ptr_hash_function(s, len); +#elif defined (PTR_SET_INT_HASH) + /* Knuths multiplication. */ + return ht_int_hash_function(s, len); +#else + (void)len; + return ht_default_hash_function(&s, sizeof(char *)); +#endif +} diff --git a/external/hash/ptr_set.h b/external/hash/ptr_set.h new file mode 100644 index 0000000..f66e70e --- /dev/null +++ b/external/hash/ptr_set.h @@ -0,0 +1,19 @@ +#ifndef HT_PTR_SET_H +#define HT_PTR_SET_H + +#include "hash_table.h" + +DECLARE_HASH_TABLE(ptr_set, void *) + +/* Return value helpers - these are specific to the implementation. */ +#define PTR_SET_IS_MISSING(x) ((void *)x == (void *)0) +#define PTR_SET_IS_ERROR(x) ((void *)x == (void *)2) +#define PTR_SET_IS_VALID(x) ((void *)x > (void *)2) + +/* Extensions to std. interface. */ +static inline int ptr_set_exists(ptr_set_t *S, void *p) +{ + return ptr_set_find_item(S, p) != (void *)0; +} + +#endif /* HT_PTR_SET_H */ diff --git a/external/hash/str_set.c b/external/hash/str_set.c new file mode 100644 index 0000000..87a3766 --- /dev/null +++ b/external/hash/str_set.c @@ -0,0 +1,61 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015 Mikkel F. 
Jørgensen, dvide.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <string.h> + +#include "str_set.h" +#include "hash_table_def.h" +DEFINE_HASH_TABLE(str_set) +#if defined(STR_SET_RH) +#include "hash_table_impl_rh.h" +#else +#include "hash_table_impl.h" +#endif + +/* + * Simple default implementation of a hash set. The stored items are + * zero-terminated strings. The hash table does not manage the + * allocation of the strings, like it doesn't manage any stored items. + * However, if items are created with, say, strndup, a destructor can be + * provided to free each item when clearing the table. The remove + * operation also returns the removed item so it can be deallocated by + * the caller. + * + * In general, the key and the item are different, but here they are the + * same. Normally the key would be referenced by the item.
+ */ +static inline int ht_match(const void *key, size_t len, str_set_item_t item) +{ + return strncmp(key, item, len) == 0; +} + +static inline const void *ht_key(str_set_item_t item) +{ + return (const void *)item; +} + +static inline size_t ht_key_len(str_set_item_t item) +{ + return strlen(item); +} diff --git a/external/hash/str_set.h b/external/hash/str_set.h new file mode 100644 index 0000000..df5d1c7 --- /dev/null +++ b/external/hash/str_set.h @@ -0,0 +1,32 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015 Mikkel F. Jørgensen, dvide.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef STR_SET_H +#define STR_SET_H + +#include "hash_table.h" + +DECLARE_HASH_TABLE(str_set, char *) + +#endif /* STR_SET_H */ diff --git a/external/hash/token_map.c b/external/hash/token_map.c new file mode 100644 index 0000000..9bf85df --- /dev/null +++ b/external/hash/token_map.c @@ -0,0 +1,54 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015 Mikkel F. Jørgensen, dvide.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <string.h> + +/* These are just example settings. */ + +#include "token_map.h" +#define HT_LOAD_FACTOR 0.85 +/* Quadratic probing is ignored with Robin Hood hashing. 
*/ +#define HT_PROBE_QUADRATIC +#include "hash_table_def.h" +DEFINE_HASH_TABLE(token_map) +#if defined(TOKEN_MAP_RH) +#include "hash_table_impl_rh.h" +#else +#include "hash_table_impl.h" +#endif + +static inline const void *ht_key(ht_item_t item) +{ + return item->token; +} + +static inline size_t ht_key_len(ht_item_t item) +{ + return item->len; +} + +static inline int ht_match(const void *key, size_t len, ht_item_t item) +{ + return len == item->len && memcmp(key, item->token, len) == 0; +} diff --git a/external/hash/token_map.h b/external/hash/token_map.h new file mode 100644 index 0000000..700c60e --- /dev/null +++ b/external/hash/token_map.h @@ -0,0 +1,39 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015 Mikkel F. Jørgensen, dvide.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef TOKEN_MAP_H +#define TOKEN_MAP_H + +#include "hash_table.h" + +struct token { + char *token; + size_t len; + int type; + void *data; +}; + +DECLARE_HASH_TABLE(token_map, struct token *) + +#endif /* TOKEN_MAP_H */ diff --git a/external/hash/unaligned.h b/external/hash/unaligned.h new file mode 100644 index 0000000..0431f96 --- /dev/null +++ b/external/hash/unaligned.h @@ -0,0 +1,42 @@ +#ifndef UNALIGNED_H +#define UNALIGNED_H + +/* + * This is a simplified version of portable/punaligned.h that does not depend on + * endian detection, but which assumes x86 is always little endian. + * Include the portable version for better precision. + */ + +#ifndef unaligned_read_le16toh + +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) + +#define unaligned_read_le16toh(p) (*(uint16_t*)(p)) +#define unaligned_read_le32toh(p) (*(uint32_t*)(p)) +#define unaligned_read_le64toh(p) (*(uint64_t*)(p)) + +#else + +#define unaligned_read_le16toh(p) ( \ + (((uint16_t)(((uint8_t *)(p))[0])) << 0) | \ + (((uint16_t)(((uint8_t *)(p))[1])) << 8)) + +#define unaligned_read_le32toh(p) ( \ + (((uint32_t)(((uint8_t *)(p))[0])) << 0) | \ + (((uint32_t)(((uint8_t *)(p))[1])) << 8) | \ + (((uint32_t)(((uint8_t *)(p))[2])) << 16) | \ + (((uint32_t)(((uint8_t *)(p))[3])) << 24)) + +#define unaligned_read_le64toh(p) ( \ + (((uint64_t)(((uint8_t *)(p))[0])) << 0) | \ + (((uint64_t)(((uint8_t *)(p))[1])) << 8) | \ + (((uint64_t)(((uint8_t *)(p))[2])) << 16) | \ + (((uint64_t)(((uint8_t *)(p))[3])) << 24) | \ + (((uint64_t)(((uint8_t *)(p))[4])) << 32) | \ + (((uint64_t)(((uint8_t *)(p))[5])) << 40) | \ + (((uint64_t)(((uint8_t *)(p))[6])) << 48) | \ + (((uint64_t)(((uint8_t *)(p))[7])) << 56)) +#endif +#endif + +#endif /* UNALIGNED_H */ diff --git a/external/lex/LICENSE b/external/lex/LICENSE new file mode 100644 index 0000000..8e84a48 --- /dev/null +++ b/external/lex/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015 Mikkel F. 
Jørgensen, dvide.com + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/external/lex/README.md b/external/lex/README.md new file mode 100644 index 0000000..3144091 --- /dev/null +++ b/external/lex/README.md @@ -0,0 +1,3 @@ +Essential files extracted from the luthor scanner - a generic scanner +similar to a handwritten scanner, but covering many common cases by +default. diff --git a/external/lex/luthor.c b/external/lex/luthor.c new file mode 100644 index 0000000..fc81985 --- /dev/null +++ b/external/lex/luthor.c @@ -0,0 +1,1509 @@ +/* + * Designed to be included in other C files which define emitter + * operations. The same source may thus be used to parse different + * grammars. + * + * The operators cover the most common operators in the C family. Each + * operator does not have a name, it is represented by a long token code + * with up to 4 ASCII characters embedded literally. This avoids any + * semantic meaning at the lexer level. Emitter macros can redefine + * this behavior.
+ * + * No real harm is done in accepting a superset, but the source is + * intended to be modified, have things flagged or removed, other things + * added. The real complicity is in numbers, identifiers, and comments, + * which should be fairly complete with flagging as is. + * + * Keyword handling is done at macroes, and described elsewhere, but for + * identifier compatible keywords, this is quite efficient to handle on + * a per language basis without modifying this source. + * + * The Lisp language family is somewhat different and not directly + * suited for this lexer, although it can easily be modified to suit. + * The main reason is ';' for comments, and operators used as part of + * the identifier symbol set, and no need for operator classification, + * and different handling of single character symbols. + * + * So overall, we more or less have one efficient unified lexer that can + * manage many languages - this is good, because it is a pain to write a + * new lexer by hand, and lexer tools are what they are. + */ + +#include "luthor.h" + +#ifdef LEX_C99_NUMERIC +#define LEX_C_NUMERIC +#define LEX_HEX_FLOAT_NUMERIC +#define LEX_BINARY_NUMERIC +#endif + +#ifdef LEX_C_NUMERIC +#define LEX_C_OCTAL_NUMERIC +#define LEX_HEX_NUMERIC +#endif + +#ifdef LEX_JULIA_NUMERIC +#ifdef LEX_C_OCTAL_NUMERIC +/* + * LEX_JULIA_OCTAL_NUMERIC and LEX_C_OCTAL_NUMERIC can technically + * coexist, but leading zeroes give C style leading zero numbers + * which can lead to incorrect values depending on expectations. + * Therefore the full LEX_JULIA_NUMERIC flag is designed to not allow this. + */ +#error "LEX_C_OCTAL_NUMERIC conflicts with LEX_JULIA_NUMERIC leading zero integers" +#endif + +/* + * Julia v0.3 insists on lower case, and has a different meaning for + * upper case. 
+ */ +#define LEX_LOWER_CASE_NUMERIC_PREFIX +#define LEX_JULIA_OCTAL_NUMERIC +#define LEX_HEX_FLOAT_NUMERIC +#define LEX_BINARY_NUMERIC + +#endif + +#ifdef LEX_HEX_FLOAT_NUMERIC +#define LEX_HEX_NUMERIC +#endif + +/* + * Numeric and string constants do not accept prefixes such as u, l, L, + * U, ll, LL, f, or F in C, or various others in Julia strings. Use the + * parser to detect juxtaposition between identifier and constant. In + * Julia numeric suffix means multiplication, in C it is a type + * qualifier. Sign, such as defined in JSON, are also not accepted - + * they must be operators. See source for various flag to enable + * different token types. + */ + +/* + * Includes '_' in identifers by default. Defines follow characters in + * identifiers but not the lead character - it must be defined in switch + * cases. If the identifier allows for dash '-', it is probably better + * to handle it as an operator and flag surrounding space in the parser. + */ +#ifndef lex_isalnum + +/* + * NOTE: isalnum, isalpha, is locale dependent. We only want to + * to consider that ASCII-7 subset and treat everything else as utf-8. + * This table is not for leading identifiers, as it contains 0..9. + * + * For more correct handling of UTF-8, see: + * https://theantlrguy.atlassian.net/wiki/display/ANTLR4/Grammar+Lexicon + * based on Java Ident = NameStartChar NameChar* + * + * While the following is UTF-16, it can be adapted to UTF-8 easily. 
+ + + fragment + NameChar + : NameStartChar + | '0'..'9' + | '_' + | '\u00B7' + | '\u0300'..'\u036F' + | '\u203F'..'\u2040' + ; + fragment + NameStartChar + : 'A'..'Z' | 'a'..'z' + | '\u00C0'..'\u00D6' + | '\u00D8'..'\u00F6' + | '\u00F8'..'\u02FF' + | '\u0370'..'\u037D' + | '\u037F'..'\u1FFF' + | '\u200C'..'\u200D' + | '\u2070'..'\u218F' + | '\u2C00'..'\u2FEF' + | '\u3001'..'\uD7FF' + | '\uF900'..'\uFDCF' + | '\uFDF0'..'\uFFFD' + ; + */ + +static const char lex_alnum[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0..9 */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, + /* A..O */ + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* P..Z, _ */ +#ifdef LEX_ID_WITHOUT_UNDERSCORE + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, +#else + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, +#endif + /* a..o */ + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + /* p..z */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, +#ifdef LEX_ID_WITH_UTF8 + /* utf-8 */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +#else + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +#endif +}; + +#define lex_isalnum(c) (lex_alnum[(unsigned char)(c)]) +#endif + +#ifndef lex_isbindigit 
+#define lex_isbindigit(c) ((c) == '0' || (c) == '1') +#endif + +#ifndef lex_isoctdigit +#define lex_isoctdigit(c) ((unsigned)((c) - '0') < 8) +#endif + +#ifndef lex_isdigit +#define lex_isdigit(c) ((c) >= '0' && (c) <= '9') +#endif + +#ifndef lex_ishexdigit +#define lex_ishexdigit(c) (((c) >= '0' && (c) <= '9') || ((c | 0x20) >= 'a' && (c | 0x20) <= 'f')) +#endif + +#ifndef lex_isctrl +#include <ctype.h> +#define lex_isctrl(c) ((c) < 0x20 || (c) == 0x7f) +#endif + +#ifndef lex_isblank +#define lex_isblank(c) ((c) == ' ' || (c) == '\t') +#endif + +#ifndef lex_iszterm +#define lex_iszterm(c) ((c) == '\0') +#endif + +/* + * If ZTERM is disabled, zero will be a LEX_CTRL token + * and allowed to be embedded in comments and strings, or + * elsewhere, as long as the parser accepts the token. + */ +#ifdef LEX_DISABLE_ZTERM +#undef lex_iszterm +#define lex_iszterm(c) (0) +#endif + +/* + * The mode is normally LEX_MODE_NORMAL = 0 initially, or the returned + * mode from a previous call, unless LEX_MODE_INVALID = 1 was returned. + * If a buffer stopped in the middle of a string or a comment, the mode + * will reflect that. In all cases some amount of recovery is needed + * before starting a new buffer - see detailed comments in header file. + * If only a single buffer is used, special handling is still needed if + * the last line contains a single line comment because it will not be + * terminated, but it amounts to replace the emitted unterminated + * comment token with an end of comment token. + * + * Instead of 0, the mode can initially also be LEX_MODE_BOM - it will + * an strip optional BOM before moving to normal mode. Currently only + * UTF-8 BOM is supported, and this is unlikely to change. + * + * The context variable is user-defined and available to emitter macros. + * It may be null if unused. 
+ * + */ +static int lex(const char *buf, size_t len, int mode, void *context) +{ + const char *p, *q, *s, *d; +#if 0 + /* TODO: old, remove this */ + , *z, *f; +#endif + + p = buf; /* next char */ + q = p + len; /* end of buffer */ + s = p; /* start of token */ + d = p; /* end of integer part */ + +#if 0 + /* TODO: old, remove this */ + + /* Used for float and leading zero detection in numerics. */ + z = p; + f = p; +#endif + + /* + * Handle mid string and mid comment for reentering across + * buffer boundaries. Strip embedded counter from mode. + */ + switch(mode & (LEX_MODE_COUNT_BASE - 1)) { + + case LEX_MODE_NORMAL: + goto lex_mode_normal; + + case LEX_MODE_BOM: + goto lex_mode_bom; + +#ifdef LEX_C_STRING + case LEX_MODE_C_STRING: + goto lex_mode_c_string; +#endif +#ifdef LEX_PYTHON_BLOCK_STRING + case LEX_MODE_PYTHON_BLOCK_STRING: + goto lex_mode_python_block_string; +#endif +#ifdef LEX_C_STRING_SQ + case LEX_MODE_C_STRING_SQ: + goto lex_mode_c_string_sq; +#endif +#ifdef LEX_PYTHON_BLOCK_STRING_SQ + case LEX_MODE_PYTHON_BLOCK_STRING_SQ: + goto lex_mode_python_block_string_sq; +#endif +#ifdef LEX_C_BLOCK_COMMENT + case LEX_MODE_C_BLOCK_COMMENT: + goto lex_mode_c_block_comment; +#endif +#if defined(LEX_SHELL_LINE_COMMENT) || defined(LEX_C99_LINE_COMMENT) + case LEX_MODE_LINE_COMMENT: + goto lex_mode_line_comment; +#endif +#ifdef LEX_JULIA_NESTED_COMMENT + case LEX_MODE_JULIA_NESTED_COMMENT: + goto lex_mode_julia_nested_comment; +#endif + + default: + /* + * This is mostly to kill unused label warning when comments + * are disabled. + */ + goto lex_mode_exit; + } + +lex_mode_bom: + + mode = LEX_MODE_BOM; + + /* + * Special entry mode to consume utf-8 bom if present. We don't + * support other boms, but we would use the same token if we did. + * + * We generally expect no bom present, but it is here if needed + * without requiring ugly hacks elsewhere. 
+ */ + if (p + 3 < q && p[0] == '\xef' && p[1] == '\xbb' && p[2] == '\xbf') { + p += 3; + lex_emit_bom(s, p); + } + goto lex_mode_normal; + +/* If source is updated, also update LEX_C_STRING_SQ accordingly. */ +#ifdef LEX_C_STRING +lex_mode_c_string: + + mode = LEX_MODE_C_STRING; + + for (;;) { + --p; + /* We do not allow blanks that are also control characters, such as \t. */ + while (++p != q && *p != '\\' && *p != '\"' && !lex_isctrl(*p)) { + } + if (s != p) { + lex_emit_string_part(s, p); + s = p; + } + if (*p == '\"') { + ++p; + lex_emit_string_end(s, p); + goto lex_mode_normal; + } + if (p == q || lex_iszterm(*p)) { + lex_emit_string_unterminated(p); + goto lex_mode_normal; + } + if (*p == '\\') { + ++p; + /* Escape is only itself, whatever is escped follows separately. */ + lex_emit_string_escape(s, p); + s = p; + if (p == q || lex_iszterm(*p)) { + lex_emit_string_unterminated(p); + goto lex_mode_normal; + } + if (*p == '\\' || *p == '\"') { + ++p; + continue; + } + /* + * Flag only relevant for single line strings, as it + * controls whether we fail on unterminated string at line + * ending with '\'. + * + * Julia does not support line continuation in strings + * (or elsewhere). C, Python, and Javascript do. + */ +#ifndef LEX_DISABLE_STRING_CONT + if (*p == '\n') { + if (++p != q && *p == '\r') { + ++p; + } + lex_emit_string_newline(s, p); + s = p; + continue; + } + if (*p == '\r') { + if (++p != q && *p == '\n') { + ++p; + } + lex_emit_string_newline(s, p); + s = p; + continue; + } +#endif + } + if (*p == '\n' || *p == '\r') { + lex_emit_string_unterminated(p); + goto lex_mode_normal; + } + ++p; + lex_emit_string_ctrl(s); + s = p; + } +#endif + +/* + * This is a copy if LEX_C_STRING with single quote. It's not DRY, but + * no reason to parameterized inner loops, just because. Recopy of + * changes are to the above. 
+ * + * Even if single quote is only used for CHAR types, it makes sense to + * parse as a full string since there can be all sorts of unicocde + * escapes and line continuations, newlines to report and unexpected + * control characters to deal with. + */ +#ifdef LEX_C_STRING_SQ +lex_mode_c_string_sq: + + mode = LEX_MODE_C_STRING_SQ; + + for (;;) { + --p; + while (++p != q && *p != '\\' && *p != '\'' && !lex_isctrl(*p)) { + } + if (s != p) { + lex_emit_string_part(s, p); + s = p; + } + if (*p == '\'') { + ++p; + lex_emit_string_end(s, p); + goto lex_mode_normal; + } + if (p == q || lex_iszterm(*p)) { + lex_emit_string_unterminated(p); + goto lex_mode_normal; + } + if (*p == '\\') { + ++p; + /* Escape is only itself, whatever is escped follows separately. */ + lex_emit_string_escape(s, p); + s = p; + if (p == q || lex_iszterm(*p)) { + lex_emit_string_unterminated(p); + goto lex_mode_normal; + } + if (*p == '\\' || *p == '\'') { + ++p; + continue; + } + /* + * Flag only relevant for single line strings, as it + * controls whether we fail on unterminated string at line + * ending with '\'. + * + * Julia does not support line continuation in strings + * (or elsewhere). C, Python, and Javascript do. + */ +#ifndef LEX_DISABLE_STRING_CONT + if (*p == '\n') { + if (++p != q && *p == '\r') { + ++p; + } + lex_emit_string_newline(s, p); + s = p; + continue; + } + if (*p == '\r') { + if (++p != q && *p == '\n') { + ++p; + } + lex_emit_string_newline(s, p); + s = p; + continue; + } +#endif + } + if (*p == '\n' || *p == '\r') { + lex_emit_string_unterminated(p); + goto lex_mode_normal; + } + ++p; + lex_emit_string_ctrl(s); + s = p; + } +#endif + +/* + * """ Triple quoted Python block strings. """ + * Single quoted version (''') is a direct copy, update both places + * if a changed is needed. + * + * Note: there is no point in disabling line continuation + * for block strings, since it only affects unterminated + * string errors at newline. 
It all comes down to how + * escaped newline is interpreted by the parser. + */ +#ifdef LEX_PYTHON_BLOCK_STRING +lex_mode_python_block_string: + + mode = LEX_MODE_PYTHON_BLOCK_STRING; + + for (;;) { + --p; + while (++p != q && *p != '\\' && !lex_isctrl(*p)) { + if (*p == '\"' && p + 2 < q && p[1] == '\"' && p[2] == '\"') { + break; + } + } + if (s != p) { + lex_emit_string_part(s, p); + s = p; + } + if (p == q || lex_iszterm(*p)) { + lex_emit_string_unterminated(p); + goto lex_mode_normal; + } + if (*p == '\"') { + p += 3; + lex_emit_string_end(s, p); + goto lex_mode_normal; + } + if (*p == '\\') { + /* Escape is only itself, allowing parser to interpret and validate. */ + ++p; + lex_emit_string_escape(s, p); + s = p; + if (p + 1 != q && (*p == '\\' || *p == '\"')) { + ++p; + } + continue; + } + if (*p == '\n') { + if (++p != q && *p == '\r') { + ++p; + } + lex_emit_string_newline(s, p); + s = p; + continue; + } + if (*p == '\r') { + if (++p != q && *p == '\n') { + ++p; + } + lex_emit_string_newline(s, p); + s = p; + continue; + } + ++p; + lex_emit_string_ctrl(s); + s = p; + } +#endif + +/* + * Python ''' style strings. + * Direct copy of """ quote version, update both if changed. + */ +#ifdef LEX_PYTHON_BLOCK_STRING_SQ +lex_mode_python_block_string_sq: + + mode = LEX_MODE_PYTHON_BLOCK_STRING_SQ; + + for (;;) { + --p; + while (++p != q && *p != '\\' && !lex_isctrl(*p)) { + if (*p == '\'' && p + 2 < q && p[1] == '\'' && p[2] == '\'') { + break; + } + } + if (s != p) { + lex_emit_string_part(s, p); + s = p; + } + if (p == q || lex_iszterm(*p)) { + lex_emit_string_unterminated(p); + goto lex_mode_normal; + } + if (*p == '\'') { + p += 3; + lex_emit_string_end(s, p); + goto lex_mode_normal; + } + if (*p == '\\') { + /* Escape is only itself, allowing parser to interpret and validate. 
*/ + ++p; + lex_emit_string_escape(s, p); + s = p; + if (p + 1 != q && (*p == '\\' || *p == '\'')) { + ++p; + } + continue; + } + if (*p == '\n') { + if (++p != q && *p == '\r') { + ++p; + } + lex_emit_string_newline(s, p); + s = p; + continue; + } + if (*p == '\r') { + if (++p != q && *p == '\n') { + ++p; + } + lex_emit_string_newline(s, p); + s = p; + continue; + } + ++p; + lex_emit_string_ctrl(s); + s = p; + } +#endif + +/* + * We don't really care if it is a shell style comment or a C99, + * or any other line oriented commment, as the termination is + * the same. + */ +#if defined(LEX_SHELL_LINE_COMMENT) || defined(LEX_C99_LINE_COMMENT) +lex_mode_line_comment: + + mode = LEX_MODE_LINE_COMMENT; + + for (;;) { + --p; + while (++p != q && (!lex_isctrl(*p))) { + } + if (s != p) { + lex_emit_comment_part(s, p); + s = p; + } + if (p == q || lex_iszterm(*p)) { + /* + * Unterminated comment here is not necessarily true, + * not even likely, nor possible, but we do this to + * handle buffer switch consistently: any non-normal + * mode exit will have an unterminated token to fix up. + * Here it would be conversion to end of comment, which + * we cannot know yet, since the line might continue in + * the next buffer. This is a zero length token. 
+ */ + lex_emit_comment_unterminated(p); + goto lex_mode_exit; + } + if (*p == '\n' || *p == '\r') { + lex_emit_comment_end(s, p); + goto lex_mode_normal; + } + ++p; + lex_emit_comment_ctrl(s); + s = p; + } +#endif + +#ifdef LEX_C_BLOCK_COMMENT +lex_mode_c_block_comment: + + mode = LEX_MODE_C_BLOCK_COMMENT; + + for (;;) { + --p; + while (++p != q && (!lex_isctrl(*p))) { + if (*p == '/' && p[-1] == '*') { + --p; + break; + } + } + if (s != p) { + lex_emit_comment_part(s, p); + s = p; + } + if (p == q || lex_iszterm(*p)) { + lex_emit_comment_unterminated(p); + goto lex_mode_exit; + } + if (*p == '\n') { + if (++p != q && *p == '\r') { + ++p; + } + lex_emit_newline(s, p); + s = p; + continue; + } + if (*p == '\r') { + if (++p != q && *p == '\n') { + ++p; + } + lex_emit_newline(s, p); + s = p; + continue; + } + if (lex_isctrl(*p)) { + ++p; + lex_emit_comment_ctrl(s); + s = p; + continue; + } + p += 2; + lex_emit_comment_end(s, p); + s = p; + goto lex_mode_normal; + } +#endif + + /* Julia nests block comments as #= ... #= ...=# ... =# across multiple lines. */ +#ifdef LEX_JULIA_NESTED_COMMENT +lex_mode_julia_nested_comment: + + /* Preserve nesting level on re-entrance. */ + if ((mode & (LEX_MODE_COUNT_BASE - 1)) != LEX_MODE_JULIA_NESTED_COMMENT) { + mode = LEX_MODE_JULIA_NESTED_COMMENT; + } + /* We have already entered. 
*/ + mode += LEX_MODE_COUNT_BASE; + + for (;;) { + --p; + while (++p != q && !lex_isctrl(*p)) { + if (*p == '#') { + if (p[-1] == '=') { + --p; + break; + } + if (p + 1 != q && p[1] == '=') { + break; + } + } + } + if (s != p) { + lex_emit_comment_part(s, p); + s = p; + } + if (p == q || lex_iszterm(*p)) { + lex_emit_comment_unterminated(p); + goto lex_mode_exit; + } + if (*p == '\n') { + if (++p != q && *p == '\r') { + ++p; + } + lex_emit_newline(s, p); + s = p; + continue; + } + if (*p == '\r') { + if (++p != q && *p == '\n') { + ++p; + } + lex_emit_newline(s, p); + s = p; + continue; + } + if (lex_isctrl(*p)) { + ++p; + lex_emit_comment_ctrl(s); + s = p; + continue; + } + if (*p == '=') { + p += 2; + lex_emit_comment_end(s, p); + s = p; + mode -= LEX_MODE_COUNT_BASE; + if (mode / LEX_MODE_COUNT_BASE > 0) { + continue; + } + goto lex_mode_normal; + } + /* The upper bits are used as counter. */ + mode += LEX_MODE_COUNT_BASE; + p += 2; + lex_emit_comment_begin(s, p, 0); + s = p; + if (mode / LEX_MODE_COUNT_BASE > LEX_MAX_NESTING_LEVELS) { + /* Prevent malicious input from overflowing counter. */ + lex_emit_comment_deeply_nested(p); + lex_emit_abort(p); + return mode; + } + } +#endif + +/* Unlike other modes, we can always jump here without updating token start `s` first. */ +lex_mode_normal: + + mode = LEX_MODE_NORMAL; + + while (p != q) { + s = p; + + switch(*p) { + +#ifndef LEX_DISABLE_ZTERM + case '\0': + lex_emit_eos(s, p); + return mode; +#endif + + /* \v, \f etc. are covered by the CTRL token, don't put it here. */ + case '\t': case ' ': + while (++p != q && lex_isblank(*p)) { + } + lex_emit_blank(s, p); + continue; + + /* + * Newline should be emitter in all constructs, also comments + * and strings which have their own newline handling. + * Only one line is emitted at a time permitting simple line + * counting. 
+ */ + case '\n': + if (++p != q && *p == '\r') { + ++p; + } + lex_emit_newline(s, p); + continue; + + case '\r': + if (++p != q && *p == '\n') { + ++p; + } + lex_emit_newline(s, p); + continue; + + /* + * C-style string, and Python style triple double quote + * delimited multi-line string. Prefix and suffix symbols + * should be parsed separately, e.g. L"hello" are two + * tokens. + */ +#if defined(LEX_C_STRING) || defined(LEX_PYTHON_BLOCK_STRING) + case '\"': +#ifdef LEX_PYTHON_BLOCK_STRING + if (p + 2 < q && p[1] == '\"' && p[2] == '\"') { + p += 3; + lex_emit_string_begin(s, p); + s = p; + goto lex_mode_python_block_string; + } +#endif +#ifdef LEX_C_STRING + ++p; + lex_emit_string_begin(s, p); + s = p; + goto lex_mode_c_string; +#endif +#endif + + /* + * Single quoted version of strings, otherwise identical + * behavior. Can also be used for char constants if checked + * by parser subsequently. + */ +#if defined(LEX_C_STRING_SQ) || defined(LEX_PYTHON_BLOCK_STRING_SQ) + case '\'': +#ifdef LEX_PYTHON_BLOCK_STRING_SQ + if (p + 2 < q && p[1] == '\'' && p[2] == '\'') { + p += 3; + lex_emit_string_begin(s, p); + s = p; + goto lex_mode_python_block_string_sq; + } +#endif +#ifdef LEX_C_STRING_SQ + ++p; + lex_emit_string_begin(s, p); + s = p; + goto lex_mode_c_string_sq; +#endif +#endif + +#if defined(LEX_SHELL_LINE_COMMENT) || defined(LEX_JULIA_NESTED_COMMENT) + /* + * Line comment excluding terminal line break. + * + * See also C99 line comment `//`. + * + * Julia uses `#=` and `=#` for nested block comments. + * (According to Julia developers, '#=` is motivated by `=` + * not being likely to start anything that you would put a + * comment around, unlike `#{`, `}#` or `#(`, `)#`)). + * + * Some known doc comment formats are identified and + * included in the comment_begin token. 
+ */ + case '#': + ++p; +#ifdef LEX_JULIA_NESTED_COMMENT + if (p != q && *p == '=') { + ++p; + lex_emit_comment_begin(s, p, 0); + s = p; + goto lex_mode_julia_nested_comment; + } +#endif + lex_emit_comment_begin(s, p, 0); + s = p; + goto lex_mode_line_comment; +#endif + + case '/': + ++p; + if (p != q) { + switch (*p) { +#ifdef LEX_C99_LINE_COMMENT + case '/': + ++p; + p += p != q && (*p == '/' || *p == '!'); + lex_emit_comment_begin(s, p, (p - s == 3)); + s = p; + goto lex_mode_line_comment; +#endif +#ifdef LEX_C_BLOCK_COMMENT + case '*': + ++p; + p += p != q && (*p == '*' || *p == '!'); + lex_emit_comment_begin(s, p, (p - s == 3)); + s = p; + goto lex_mode_c_block_comment; +#endif + case '=': + ++p; + lex_emit_compound_op('/', '=', s, p); + continue; + default: + break; + } + } + lex_emit_op('/', s, p); + continue; + + case '(': case ')': case '[': case ']': case '{': case '}': + case ',': case ';': case '\\': case '?': + ++p; + lex_emit_op(*s, s, p); + continue; + + case '%': case '!': case '~': case '^': + ++p; + if (p != q && *p == '=') { + ++p; + lex_emit_compound_op(*s, '=', s, p); + continue; + } + lex_emit_op(*s, s, p); + continue; + + case '|': + ++p; + if (p != q) { + switch (*p) { + case '=': + ++p; + lex_emit_compound_op('|', '=', s, p); + continue; + case '|': + ++p; + lex_emit_compound_op('|', '|', s, p); + break; + default: + break; + } + } + lex_emit_op('|', s, p); + continue; + + case '&': + ++p; + if (p != q) { + switch (*p) { + case '=': + ++p; + lex_emit_compound_op('&', '=', s, p); + continue; + case '&': + ++p; + lex_emit_compound_op('&', '&', s, p); + break; + default: + break; + } + } + lex_emit_op('&', s, p); + continue; + + case '=': + ++p; + if (p != q) { + switch (*p) { + case '>': + ++p; + lex_emit_compound_op('=', '>', s, p); + continue; + case '=': + ++p; + if (p != q && *p == '=') { + ++p; + lex_emit_tricompound_op('=', '=', '=', s, p); + continue; + } + lex_emit_compound_op('=', '=', s, p); + break; + default: + break; + } + } + 
lex_emit_op('=', s, p); + continue; + + case ':': + ++p; + if (p != q) { + switch (*p) { + case '=': + ++p; + lex_emit_compound_op(':', '=', s, p); + continue; + case ':': + ++p; + if (p != q && *p == '=') { + ++p; + lex_emit_tricompound_op(':', ':', '=', s, p); + continue; + } + lex_emit_compound_op(':', ':', s, p); + continue; + default: + break; + } + } + lex_emit_op(':', s, p); + continue; + + case '*': + ++p; + if (p != q) { + switch (*p) { + case '=': + lex_emit_compound_op('*', '=', s, p); + continue; + case '*': + /* **= hardly used anywhere? */ + lex_emit_compound_op('*', '*', s, p); + continue; + default: + break; + } + } + lex_emit_op('*', s, p); + continue; + + case '<': + ++p; + if (p != q) { + switch (*p) { + case '-': + ++p; + lex_emit_compound_op('<', '-', s, p); + continue; + case '=': + ++p; + lex_emit_compound_op('<', '=', s, p); + continue; + case '<': + ++p; + if (p != q) { + switch (*p) { + case '=': + ++p; + lex_emit_tricompound_op('<', '<', '=', s, p); + continue; + case '<': + ++p; + if (p != q && *p == '=') { + ++p; + lex_emit_quadcompound_op('<', '<', '<', '=', s, p); + continue; + } + lex_emit_tricompound_op('<', '<', '<', s, p); + continue; + default: + break; + } + } + lex_emit_compound_op('<', '<', s, p); + continue; + default: + break; + } + } + lex_emit_op('<', s, p); + continue; + + case '>': + ++p; + if (p != q) { + switch (*p) { + case '=': + ++p; + lex_emit_compound_op('>', '=', s, p); + continue; + case '>': + ++p; + if (p != q) { + switch (*p) { + case '=': + ++p; + lex_emit_tricompound_op('>', '>', '=', s, p); + continue; + case '>': + ++p; + if (p != q && *p == '=') { + ++p; + lex_emit_quadcompound_op('>', '>', '>', '=', s, p); + continue; + } + lex_emit_tricompound_op('>', '>', '>', s, p); + continue; + default: + break; + } + } + lex_emit_compound_op('>', '>', s, p); + continue; + default: + break; + } + } + lex_emit_op('>', s, p); + continue; + + case '-': + ++p; + if (p != q) { + switch (*p) { + case '=': + ++p; + 
lex_emit_compound_op('-', '=', s, p); + continue; + case '-': + ++p; + lex_emit_compound_op('-', '-', s, p); + continue; + case '>': + ++p; + lex_emit_compound_op('-', '>', s, p); + continue; + default: + break; + } + } + lex_emit_op('-', s, p); + continue; + + case '+': + ++p; + if (p != q) { + switch (*p) { + case '=': + ++p; + lex_emit_compound_op('+', '=', s, p); + continue; + + case '+': + ++p; + lex_emit_compound_op('+', '+', s, p); + continue; + default: + break; + } + } + lex_emit_op('+', s, p); + continue; + + case '.': + ++p; + if (p != q) { + switch (*p) { + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + d = s; + goto lex_dot_to_fraction_part; + case '.': + ++p; + if (p != q && *p == '.') { + ++p; + lex_emit_tricompound_op('.', '.', '.', s, p); + continue; + } + lex_emit_compound_op('.', '.', s, p); + continue; + default: + break; + } + } + lex_emit_op('.', s, p); + continue; + + case '0': + if (++p != q) { + switch (*p) { +#ifdef LEX_C_OCTAL_NUMERIC + + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + while (++p != q && lex_isoctdigit(*p)) { + } + d = p; + if (p != q) { + /* + * Leading zeroes like 00.10 are valid C + * floating point constants. + */ + if (*p == '.') { + goto lex_c_octal_to_fraction_part; + } + if (*p == 'e' || *p == 'E') { + goto lex_c_octal_to_exponent_part; + } + } + lex_emit_octal(s, p); + /* + * If we have a number like 0079, it becomes + * 007(octal), 9(decimal). The parser should + * deal with this. + * + * To add to confusion i64 is a C integer suffix + * like in 007i64, but 2+2i is a Go complex + * constant. (Not specific to octals). + * + * This can all be handled by having the parser inspect + * following identifier or numeric, parser + * here meaning a lexer post processing step, not + * necessarily the parser itself. + */ + + continue; +#else + /* + * All integers reach default and enter + * integer part. 
As a result, leading zeroes are + * mapped to floats and integers which matches + * Julia behavior. Other languages should decide + * if leading zero is valid or not. JSON + * disallows leading zero. + */ +#endif + +#ifdef LEX_JULIA_OCTAL_NUMERIC + /* + * This is the style of octal, not 100% Julia + * compatible. Also define Julia numeric to enforce + * lower case. + */ +#ifndef LEX_LOWER_CASE_NUMERIC_PREFIX + /* See also hex 0X. Julia v.0.3 uses lower case only here. */ + case 'O': +#endif + /* + * Julia accepts 0o700 as octal and 0b100 as + * binary, and 0xa00 as hex, and 0100 as + * integer, and 1e2 as 64 bit float and 1f2 as + * 32 bit float. Julia 0.3 does not support + * octal and binary fractions. + */ + case 'o': + while (++p != q && lex_isoctdigit(*p)) { + } + lex_emit_octal(s, p); + /* Avoid hitting int fall through. */ + continue; +#endif +#ifdef LEX_BINARY_NUMERIC + /* Binary in C++14. */ + case 'b': +#ifndef LEX_LOWER_CASE_NUMERIC_PREFIX + /* See also hex 0X. Julia v.0.3 uses lower case only here. */ + case 'B': +#endif + while (++p != q && lex_isbindigit(*p)) { + } + lex_emit_binary(s, p); + /* Avoid hitting int fall through. */ + continue; +#endif +#ifdef LEX_HEX_NUMERIC + case 'x': +#ifndef LEX_LOWER_CASE_NUMERIC_PREFIX + /* + * Julia v0.3 does not allow this, it thinks 0X1 is + * 0 * X1, X1 being an identifier. + * while 0x1 is a hex value due to precedence. + * + * TODO: This might change. + */ + + case 'X': +#endif + while (++p != q && lex_ishexdigit(*p)) { + } +#ifdef LEX_HEX_FLOAT_NUMERIC + /* + * Most hexadecimal floating poing conversion + * functions, including Pythons + * float.fromhex("0x1.0"), Julias parse + * function, and and C strtod on + * supporting platforms, will parse without + * exponent. The same languages do not support + * literal constants without the p exponent. + * First it is named p because e is a hex digit, + * second, the float suffix f is also a hex + * digit: 0x1.f is ambigious in C without that + * rule. 
Conversions have no such ambiguity. + * In Julia, juxtaposition means that 0x1.f + * could mean 0x1p0 * f or 0x1.fp0. + * + * Since we are not doing conversion here but + * lexing a stream, we opt to require the p + * suffix because making it optional could end + * up consuming parts of the next token. + * + * But, we also make a flag to make the exponent + * optional, anyway. It could be used for better + * error reporting than just consuming the hex + * part since we likely should accept the ambigous + * syntax either way. + */ + d = p; + if (p != q && *p == '.') { + while (++p != q && lex_ishexdigit(*p)) { + } + } + if (p != q && (*p == 'p' || *p == 'P')) { + if (++p != q && *p != '+' && *p != '-') { + --p; + } + /* The exponent is a decimal power of 2. */ + while (++p != q && lex_isdigit(*p)) { + } + lex_emit_hex_float(s, p); + continue; + } +#ifdef LEX_HEX_FLOAT_OPTIONAL_EXPONENT + if (d != p) { + lex_emit_hex_float(s, p); + continue; + } +#else + /* + * Backtrack to decimal point. We require p to + * be present because we could otherwise consume + * part of the next token. + */ + p = d; +#endif +#endif /* LEX_HEX_FLOAT_NUMERIC */ + lex_emit_hex(s, p); + continue; +#endif /* LEX_HEX_NUMERIC */ + + default: + /* + * This means leading zeroes like 001 or 001.0 are + * treated like like int and float respectively, + * iff C octals are flaggged out. Otherwise they + * become 001(octal), and 001(octal),.0(float) + * which should be treated as an error because + * future extensions might allow octal floats. + * (Not likely, but interpretion is ambigious). + */ + break; + } /* Switch under '0' case. */ + + /* + * Pure single digit '0' is an octal number in the C + * spec. We have the option to treat it as an integer, + * or as an octal. For strict C behavior, this can be + * flagged in, but is disabled by default. It only + * applies to single digit 0. Thus, with C octal + * enabled, leading zeroes always go octal. 
+ */ + } /* If condition around switch under '0' case. */ + --p; + goto lex_fallthrough_1; /* silence warning */ + + lex_fallthrough_1: + /* Leading integer digit in C integers. */ + case '1': case '2': case '3': case '4': case '5': + case '6': case '7': case '8': case '9': + while (++p && lex_isdigit(*p)) { + } + d = p; + if (*p == '.') { +/* Silence unused label warnings when features are disabled. */ +#ifdef LEX_C_OCTAL_NUMERIC +lex_c_octal_to_fraction_part: +#endif +lex_dot_to_fraction_part: + while (++p != q && lex_isdigit(*p)) { + } + } + if (p != q && (*p == 'e' || *p == 'E')) { +/* Silence unused label warnings when features are disabled. */ +#ifdef LEX_C_OCTAL_NUMERIC +lex_c_octal_to_exponent_part: +#endif + if (++p != q && *p != '+' && *p != '-') { + --p; + } + while (++p != q && lex_isdigit(*p)) { + } + } + if (d != p) { + lex_emit_float(s, p); + } else { +#ifdef LEX_C_OCTAL_NUMERIC + if (*s == '0') { + lex_emit_octal(s, p); + continue; + } +#endif + lex_emit_int(s, p); + } + continue; + +#ifndef LEX_ID_WITHOUT_UNDERSCORE + case '_': +#endif + case 'A': case 'B': case 'C': case 'D': case 'E': + case 'F': case 'G': case 'H': case 'I': case 'J': + case 'K': case 'L': case 'M': case 'N': case 'O': + case 'P': case 'Q': case 'R': case 'S': case 'T': + case 'U': case 'V': case 'W': case 'X': case 'Y': + case 'Z': + case 'a': case 'b': case 'c': case 'd': case 'e': + case 'f': case 'g': case 'h': case 'i': case 'j': + case 'k': case 'l': case 'm': case 'n': case 'o': + case 'p': case 'q': case 'r': case 's': case 't': + case 'u': case 'v': case 'w': case 'x': case 'y': + case 'z': + + /* + * We do not try to ensure utf-8 is terminated correctly nor + * that any unicode character above ASCII is a character + * suitable for identifiers. + * + * tag is calculated for keyword lookup, and we assume these + * are always ASCII-7bit. It has the form: length, first + * char, second, char, last char in lsb to msb order. 
If the + * second char is missing, it becomes '\0'. The tag is not + * entirely unique, but suitable for fast lookup. + * + * If utf-8 appears in tag, the tag is undefined except the + * length is valid or overflows (meaning longer than any + * keyword and thus safe to compare against if tag matches). + * + * If the grammar is case insensitive, the tag be can + * downcased trivially by or'ring with 0x20202000 which + * preserves the length field (clever design by ASCII + * designers). After tag matching, a case insentive + * compare is obviously also needed against the full lexeme. + */ + + { + unsigned long tag; + + tag = (unsigned long)*p << 8; + if (++p != q && lex_isalnum(*p)) { + tag |= (unsigned long)*p << 16; + while (++p != q && lex_isalnum(*p)) { + } + } + tag |= (unsigned long)p[-1] << 24; + tag |= (unsigned char)(p - s) + (unsigned long)'0'; + lex_emit_id(s, p, tag); + continue; + } + + default: + +#ifdef LEX_ID_WITH_UTF8 + /* + * Identifier again, in case it starts with a utf-8 lead + * character. This time we can ignore the tag, except the + * length char must be valid to avoid buffer overruns + * on potential kw check upstream. + */ + if (*p & '\x80') { + unsigned long tag; + + while (++p != q && lex_isalnum(*p)) { + } + tag = (unsigned char)(p - s) + '0'; + lex_emit_id(s, p, tag); + continue; + } +#endif + ++p; + /* normally 0x7f DEL and 0x00..0x1f incl. */ + if (lex_isctrl(*s) && !lex_isblank(*s)) { + lex_emit_ctrl(s); + } else { + lex_emit_symbol(*s, s, p); + } + continue; + } /* Main switch in normal mode. */ + } /* Main while loop in normal mode. 
*/ + +lex_mode_exit: + if (mode == LEX_MODE_INVALID) { + return mode; + } + +#ifndef LEX_DISABLE_ZTERM + if (p != q && lex_iszterm(*p)) { + lex_emit_eos(s, p); + return mode; + } +#endif + lex_emit_eob(p); + return mode; +} + diff --git a/external/lex/luthor.h b/external/lex/luthor.h new file mode 100644 index 0000000..6ca373d --- /dev/null +++ b/external/lex/luthor.h @@ -0,0 +1,472 @@ +/* + * Mostly generic lexer that can be hacked to suit specific syntax. See + * more detailed comments further down in this file. + * + * Normally include luthor.c instead of luthor.h so emitter functions + * can be custom defined, and optionally also fast keyword definitions. + * + * At the very minimum, define lex_emit which other emitters default to. + * + * Create a wrapper function to drive the lex function in said file. + * + * Use this header in separate parser logic to access the token values + * if relevant. + */ + +#ifndef LUTHOR_H +#define LUTHOR_H + +#ifdef LEX_KEYWORDS +#include <string.h> /* memcmp for kw match */ +#endif + +#include "tokens.h" + +#ifndef lex_emit +#define lex_emit(token, first, last) ((void)0) +#endif + +/* + * Default for comments, bom, and other things that are not necessarily + * of interest to the parser, but may be to buffer wrap handling, + * debugging, and pretty printers. + */ +#ifndef lex_emit_other +#define lex_emit_other(token, first, last) ((void)0) +#endif + +#ifndef lex_emit_eof +#define lex_emit_eof(pos) lex_emit(LEX_TOK_EOF, pos, pos) +#endif + +#ifndef lex_emit_abort +#define lex_emit_abort(pos) lex_emit(LEX_TOK_ABORT, pos, pos) +#endif + +#ifndef lex_emit_eob +#define lex_emit_eob(pos) lex_emit(LEX_TOK_EOB, pos, pos) +#endif + +#ifndef lex_emit_eos +#define lex_emit_eos(first, last) lex_emit(LEX_TOK_EOS, first, last) +#endif + +#ifndef lex_emit_bom +#define lex_emit_bom(first, last) lex_emit_other(LEX_TOK_BOM, first, last) +#endif + +#ifndef lex_emit_id +#ifdef LEX_KEYWORDS +/* LEX_KW_TABLE_BEGIN .. 
LEX_KW_TABLE_END defines lex_match_kw. */ +#define lex_emit_id(first, last, tag) lex_emit(lex_match_kw(tag, first), first, last) +#else +#define lex_emit_id(first, last, tag) lex_emit(LEX_TOK_ID, first, last) +#endif +#endif + +/* + * This is a default for unknown symbols. It may be treated as an error, + * or it can be processed further by the parser instead of customizing + * the lexer. It ensures that there is always a token for every part of + * the input stream. + */ +#ifndef lex_emit_symbol +#define lex_emit_symbol(token, first, last) lex_emit(LEX_TOK_SYMBOL, first, last) +#endif + +/* + * Control characters 0x01 .. 0x1f, 0x7f(DEL), excluding \0\r\n\t which have + * separate tokens. + * + * Control characters in strings and comments are passed on as body + * elements, except \0\r\n which break the string up. + */ +#ifndef lex_emit_ctrl +#define lex_emit_ctrl(pos) lex_emit(LEX_TOK_CTRL, pos, pos + 1) +#endif + +#ifndef lex_emit_string_ctrl +#define lex_emit_string_ctrl(pos) lex_emit(LEX_TOK_STRING_CTRL, pos, pos + 1) +#endif + +#ifndef lex_emit_comment_ctrl +#define lex_emit_comment_ctrl(pos) lex_emit_other(LEX_TOK_COMMENT_CTRL, pos, pos + 1) +#endif + +/* + * This enables the user to both count lines and to calculate character + * offsets for subsequent lexemes. The newline token spans the line + * break itself, which has length 2 for a \r\n or \n\r pair, and + * 1 otherwise.
+ */ +#ifndef lex_emit_newline +#define lex_emit_newline(first, last) lex_emit(LEX_TOK_NEWLINE, first, last) +#endif + +#ifndef lex_emit_string_newline +#define lex_emit_string_newline(first, last) lex_emit(LEX_TOK_STRING_NEWLINE, first, last) +#endif + +#ifndef lex_emit_int +#define lex_emit_int(first, last) lex_emit(LEX_TOK_INT, first, last) +#endif + +#ifndef lex_emit_float +#define lex_emit_float(first, last) lex_emit(LEX_TOK_FLOAT, first, last) +#endif + +#ifndef lex_emit_int_suffix +#define lex_emit_int_suffix(first, last) lex_emit(LEX_TOK_INT_SUFFIX, first, last) +#endif + +/* The guard and the defined name must agree, otherwise the default is never usable under its intended name. */ +#ifndef lex_emit_float_suffix +#define lex_emit_float_suffix(first, last) lex_emit(LEX_TOK_FLOAT_SUFFIX, first, last) +#endif + +/* Previously misspelled name, kept as an alias for compatibility. */ +#ifndef lex_emit_floatint_suffix +#define lex_emit_floatint_suffix(first, last) lex_emit_float_suffix(first, last) +#endif + +#ifndef lex_emit_binary +#define lex_emit_binary(first, last) lex_emit(LEX_TOK_BINARY, first, last) +#endif + +#ifndef lex_emit_octal +#define lex_emit_octal(first, last) lex_emit(LEX_TOK_OCTAL, first, last) +#endif + +#ifndef lex_emit_hex +#define lex_emit_hex(first, last) lex_emit(LEX_TOK_HEX, first, last) +#endif + +#ifndef lex_emit_hex_float +#define lex_emit_hex_float(first, last) lex_emit(LEX_TOK_HEX_FLOAT, first, last) +#endif + +/* + * The comment token can be used to aid backtracking during buffer + * switch.
+ */ +#ifndef lex_emit_comment_begin +#define lex_emit_comment_begin(first, last, is_doc) \ + lex_emit_other(LEX_TOK_COMMENT_BEGIN, first, last) +#endif + +#ifndef lex_emit_comment_part +#define lex_emit_comment_part(first, last) lex_emit_other(LEX_TOK_COMMENT_PART, first, last) +#endif + +#ifndef lex_emit_comment_end +#define lex_emit_comment_end(first, last) lex_emit_other(LEX_TOK_COMMENT_END, first, last) +#endif + +#ifndef lex_emit_comment_unterminated +#define lex_emit_comment_unterminated(pos) \ + lex_emit_other(LEX_TOK_COMMENT_UNTERMINATED, pos, pos) +#endif + +#ifndef lex_emit_comment_deeply_nested +#define lex_emit_comment_deeply_nested(pos) \ + lex_emit_other(LEX_TOK_COMMENT_DEEPLY_NESTED, pos, pos) +#endif + +#ifndef lex_emit_string_begin +#define lex_emit_string_begin(first, last) lex_emit(LEX_TOK_STRING_BEGIN, first, last) +#endif + +#ifndef lex_emit_string_part +#define lex_emit_string_part(first, last) lex_emit(LEX_TOK_STRING_PART, first, last) +#endif + +#ifndef lex_emit_string_end +#define lex_emit_string_end(first, last) lex_emit(LEX_TOK_STRING_END, first, last) +#endif + +#ifndef lex_emit_string_escape +#define lex_emit_string_escape(first, last) lex_emit(LEX_TOK_STRING_ESCAPE, first, last) +#endif + +#ifndef lex_emit_string_unterminated +#define lex_emit_string_unterminated(pos) \ + lex_emit(LEX_TOK_STRING_UNTERMINATED, pos, pos) +#endif + +#ifndef lex_emit_blank +#define lex_emit_blank(first, last) \ + lex_emit_other(LEX_TOK_BLANK, first, last) +#endif + +#ifndef lex_emit_op +#define lex_emit_op(op, first, last) lex_emit((long)(op), first, last) +#endif + +#ifndef lex_emit_compound_op +#define lex_emit_compound_op(op1, op2, first, last) \ + lex_emit(((long)(op1) | ((long)(op2) << 8)), first, last) +#endif + +#ifndef lex_emit_tricompound_op +#define lex_emit_tricompound_op(op1, op2, op3, first, last) \ + lex_emit(((long)(op1) | ((long)(op2) << 8)) | \ + ((long)(op3)<<16), first, last) +#endif + +#ifndef lex_emit_quadcompound_op +#define
lex_emit_quadcompound_op(op1, op2, op3, op4, first, last) \ + lex_emit(((long)(op1) | ((long)(op2) << 8)) | \ + ((long)(op3) << 16) | ((long)(op4) << 24), first, last) +#endif + +/* + * Used to limit the number of nested comment levels; the lexer emits + * the deeply nested comment token and aborts when it is exceeded. + */ +#ifndef LEX_MAX_NESTING_LEVELS +#define LEX_MAX_NESTING_LEVELS 100 +#endif + + +/* Keyword handling macros, see `keywords.c` for an example usage. */ +#ifdef LEX_KEYWORDS + +/* + * This implements a switch statement branching on the 4 character + * keyword tag (unsigned long value) which is produced by the lexer's id + * recognizer. A final check is needed to ensure an exact + * match with a given id. Two keywords rarely conflict, but it is + * possible, and therefore kw_begin kw_match kw_match ... kw_end is used + * to cover this. + * + * See example usage elsewhere for details. + * + * The first element x0 is the length '0'..'9' and ensures comparisons will + * not overrun the buffer where the lexeme is stored during string + * comparison, iff the keywords report the length correctly. + * + * The next elements in the tag are the first, second, and last + * character of lexeme / keyword, replacing second character with '\0' + * on single length keywords, so keyword 'e' is tagged '1', 'e', '\0', 'e', + * and 'while' is tagged '5' 'w', 'h', 'e', where the length is lsb + * and last character is msb. + * + * An enum with tok_kw_<name> elements is expected to provide return + * values on match. These should start at LEX_TOK_KW_BASE and are + * negative.
+ * + */ +#define lex_kw_begin(x0, x1, x2, x3) \ + case \ + ((unsigned long)(x0) | \ + ((unsigned long)(x1) << 8) | \ + ((unsigned long)(x2) << 16) | \ + ((unsigned long)(x3) << 24)) : + +/* + * Compares only the keyword's own length; the length stored in the tag + * is what keeps this from reading past the lexeme. + */ +#define lex_kw_match(kw) \ + if (memcmp(#kw, lexeme, sizeof(#kw) - 1) == 0) \ + return tok_kw_##kw; + +#define lex_kw_end() \ + break; + +#define lex_kw(kw, x0, x1, x2, x3) \ + lex_kw_begin(x0, x1, x2, x3) \ + lex_kw_match(kw) \ + lex_kw_end() + +static long lex_match_kw(unsigned long tag, const char *lexeme); + +/* Static so multiple grammars are possible in a single program. */ +#define LEX_KW_TABLE_BEGIN \ +static long lex_match_kw(unsigned long tag, const char *lexeme) \ +{ \ + switch (tag) { \ + +#define LEX_KW_TABLE_END \ + default: \ + break; \ + } \ + return LEX_TOK_KW_NOT_FOUND; \ +} + +#else + +/* + * Allow flagging in and out without unused warning or missing macros. + * The table macros are provided under the LEX_KW_TABLE_* names used + * above; the LEX_KEYWORD_TABLE_* spellings are kept for compatibility. + */ +#define lex_kw_begin(x0, x1, x2, x3) +#define lex_kw_match(kw) +#define lex_kw_end() +#define lex_kw(kw, x0, x1, x2, x3) +#define LEX_KW_TABLE_BEGIN +#define LEX_KW_TABLE_END +#define LEX_KEYWORD_TABLE_BEGIN +#define LEX_KEYWORD_TABLE_END + +#endif /* LEX_KEYWORDS */ + + + +/* + * Modes used for recovery when switching to a new buffer and handling + * internal state changes for strings and comments. + */ +enum { + /* Always 0, is initial lexer state. */ + LEX_MODE_NORMAL = 0, + + /* Returned if lex is given unsupported mode. */ + LEX_MODE_INVALID = 1, + + /* + * Can be used in place of normal mode to consume optional bom + * marker at buffer start. Only utf-8 bom is supported. + */ + LEX_MODE_BOM, + + /* + * Returned at end of buffer if mid string or mid comment, may also + * be larger for nested comments as nesting level is encoded. + */ + LEX_MODE_C_STRING, + LEX_MODE_C_STRING_SQ, + LEX_MODE_PYTHON_BLOCK_STRING, + LEX_MODE_PYTHON_BLOCK_STRING_SQ, + LEX_MODE_C_BLOCK_COMMENT, + LEX_MODE_LINE_COMMENT, + LEX_MODE_JULIA_NESTED_COMMENT, + + + /* Counter embedded in mode: nesting levels are counted in multiples of this base, above the low mode bits.
*/ + LEX_MODE_COUNT_BASE = 16, +}; + + + +/* ON CALLING AND USING LEX FUNCTION + * + * If utf-8 BOM possible, detect this before calling the lexer and + * advance the buffer. JSON explicitly disallows BOM, but recommends + * consuming it if present. If some other Unicode BOM is found, convert + * the buffer first. The lexer assumes ALL non-ascii characters are + * valid trailing identifier characters which mostly works well. Strings + * with broken utf-8 are passed on as is. Identifiers starting with a + * utf-8 lead character must be enabled with #define LEX_ID_WITH_UTF8 + * + * If required, postprocess identifiers and strings for valid utf-8. It + * is assumed that all keywords are at most 9 characters long and always + * ASCII. Otherwise post process them in a hash table on identifier + * event. This enables a fast compiled trie lookup of keywords. + * + * Newline and control characters are always emitted, also inside + * strings and comments. The exception is \r, \n, \t, \0 which are + * handled specially, or if the lexer is adapted to handle certain + * control characters specially. + * + * Each token is not guaranteed correct, only to be delimited correctly, + * if it is indeed correct. Only very few tokens can be zero length, for + * example, the parser can rely on string part token not being empty + * which is important in dealing with line continuation. The end of + * buffer token is empty, and so is the unterminated string token, and + * also the comment end token for single line comments, but not the + * multi-line version. There is a token for every part of the input + * stream, but the parser can easily define some to be ignored and have + * them optimized out. + * + * Strings have start token, and optionally sequences of control, + * escape, and newline tokens, followed by either string end token or + * string unterminated token. String delimiters can be one + * (single-line) or three double quotes (multi-line, like Python), but + * cannot be single quotes, unlike Python, unless LEX_C_STRING_SQ or + * LEX_PYTHON_BLOCK_STRING_SQ is defined.
Python, C and Javascript + * string continuation is handled by having the parser observe a string + * escape followed by newline token. Escape is always a single + * character '\' token, and the parser is responsible for consuming the + * following content. If string syntax with double delimiter is used to + * define escaped delimiter, this will occur as two separate strings + * with no space between. The parser can handle this on its own; if, in + * such strings, '\"' does not mean escaped delimiter, the string will + * not terminate correctly, and the lexer must be adapted. Unterminated + * string may happen at end of buffer, also for single-line strings. + * This is because the string might continue in a new buffer. The parser + * should deal with this. + * + * Comments always start with a start token, followed by zero or more + * comment part tokens interleaved with control and newline tokens, + * terminated by either comment end token, or unterminated comment + * token. If the comment is single-line, the unterminated comment token may + * appear at the last line instead of the expected end of comment token + * because the comment might continue in a new buffer. The parser + * should deal with this. Escapes and line continuations have no effect + * in comments, unlike strings. + * + * The lexer only carries one state variable: the mode. The mode can be + * normal (default and equals zero), or single or multi string or + * comment modes. These modes are used to recover after switching + * buffers as discussed below. + * + * The lexer can run to completion without involving the parser and + * could be used to pipeline tokens into another thread for concurrent + * parsing which is safe since the input buffer is considered read-only. + * + * + * KEYWORDS + * + * Keywords are treated as identifiers by default. By including a + * keyword table the `lex_emit_id` macro will check if the id is a + * keyword and translate the token if it is.
Using the provided keyword + * table macros is just one way to do it. This is better explained by + * looking at an example. Keyword lookup based on the precomputed keyword + * tag provided to the lookup function is limited to 9 characters, but a + * custom lookup function need not use it and then the tag precomputation + * will be optimized out. + * + * Keywords are defined by the lookup function and should be negative + * starting at LEX_TOK_KW_BASE to avoid conflicts with other token types. + * + * + * WRAPPING MULTIPLE BUFFERS + * + * The user may need to deal with multiple buffers because data may + * arrive asynchronously over the network, and may have many concurrent + * lexing jobs. The emitter part is not difficult since a ring buffer + * can grow, or the parser can be called directly (except queuing a few + * tokens for backtracking as we shall see). + * + * If the lexer were an explicit state machine as in Flex, we could get + * a yywrap event to fill buffers, but our state is on the stack and in + * registers for optimization. We may use co-routines, but it doesn't + * cover all issues, and, as it turns out, is not necessary with the + * following restrictions on syntax: + * + * All variable length tokens such as numerics and identifiers are + * limited in length. Strings and comments are not, but are broken into + * zero, one, or several body tokens per line. ANSI-C limits line length + * to 509 characters (allowing for continuation and two byte linebreaks + * in a 512 byte buffer). But JSON has no line continuation for strings + * and may (and often does) store everything on a single line. Whitespace + * can also extend beyond given limit. + * + * If we ignore whitespace, strings and comments, we can discard the + * last token (or last two in case there are paired tokens, such as + * leading zero followed by numeric). Parsing can then resume in a new + * buffer where the first 512 bytes (or similar) are duplicated from the + * previous buffer.
The lexer is then restarted at the last token (pair) + * start which may turn out to change the length or even introduce a + * different result such as introducing a leading zero. The lexer needs no + * specific state to do this. + * + * For strings and comments, we need a flag to allow entering the lexer + * mid string or mid comment. The newline and line continuation tokens + * need to be dropped, and the last body may need to be truncated as it + * can embed a partial delimiter. The simplest way to deal with this is + * to backtrack tokens until the last token begins at a safe position, + * about 3-6 characters earlier, and truncating body segments that span + * this barrier. Whitespace can also be truncated. + * + * We can generalize this further by going at least K bytes back in an N + * overlap buffer region and require non-strings (and non-comments) to + * not exceed N-K bytes, where K and N are specific to the syntax and + * the I/O topology. + * + * We can add flags to tokens that can help decide how to enter + * backtracking mode without covering every possible scanner loop - i.e. + * are we mid string, mid comment, single-line or multi-line. + * + * All the lexer needs to do then, is to receive the backtracking mode + * flags. A wrapping driver can deal with backtrack logic, which is + * specific to how tokens are emitted. Whitespace needs no recovery mode + * but perhaps new whitespace should extend existing to simplify + * parsing. + */ + + +#endif /* LUTHOR_H */ + diff --git a/external/lex/tokens.h b/external/lex/tokens.h new file mode 100644 index 0000000..2bdbd7c --- /dev/null +++ b/external/lex/tokens.h @@ -0,0 +1,554 @@ +#ifndef LEX_TOKENS_H +#define LEX_TOKENS_H + +/* Define LEX_DEBUG to enable token printing and describing functions. */ + + +enum { + + /* + * EOF is not emitted by lexer, but may be used by driver after + * last buffer is processed.
+ */ + LEX_TOK_EOF = 0, + + /* + * Either EOB or EOS is emitted as the last token before exit, + * or also ABORT in some lexers. Unterminated string or comment + * will be emitted immediately before one of these when relevant. + * + * It may be useful to redefine lex_emit_eos and lex_emit_eob to + * produce LEX_TOK_EOF or error directly for simple string lexing. + */ + LEX_TOK_EOB = 1, + LEX_TOK_EOS = 2, + + /* + * ABORT can be used for early exit by some lexers while other + * lexers may choose to run to buffer end regardless of input (with + * the exception of deeply nested comments). + */ + LEX_TOK_ABORT = 3, + + /* + * Byte order marker. Only happens if lexer was started in bom mode + * and the input stream contains a leading bom marker. + * The token can only be the first token in the stream. Utf-8 is the + * only supported bom, but the lexeme may be checked in case other + * boms are added later. Normally it is routed to lex_emit_other + * along with comments so it just ignores the bom if present. It is + * generally recommended to consume utf-8 bom for interoperability, + * but also to not store it for the same reason. + */ + LEX_TOK_BOM, + + /* + * Any control character that is not newline or blank will be + * emitted as single character token here. This token is discussed + * in several comments below. For strings and comments, also + * blank control characters will be emitted since they are usually + * unexpected and not desired. + */ + LEX_TOK_CTRL, + LEX_TOK_STRING_CTRL, + LEX_TOK_COMMENT_CTRL, + + /* + * Any printable ASCII character that is not otherwise consumed will + * be issued as a single length symbol token. Further discussion + * below. The symbol and CTRL tokens ensure that the entire input + * stream is covered by tokens. If utf-8 identifiers have not been + * flagged, utf-8 leading characters may also end up here, and so + * may utf-8 characters in general, that are not viewed as valid + * identifiers (depending on configuration).
+ */ + LEX_TOK_SYMBOL, + + /* + * Variable length identifier starting with (_A-Za-z) by default and + * followed by zero or more (_A-Za-z0-9) characters. (_) can be + * flagged out. utf-8 can be flagged in. Be default any non-ASCII + * character (0x80 and above), is treated as part of an identifier + * for simplicity and speed, but this may be redefined. Any broken + * utf-8 is not sanitized, thus 0x80 would be a valid identifier + * token with utf-8 identifiers enabled, and otherwise it would be a + * symbol token. + * + * The ID does a magic trick: It maps the lexeme to a very simple + * and fast 32 bit hash code called a tag. The tag is emitted with + * the id token and can be used for fast keyword lookup. The + * hash tag is: + * + * (length)(first char)(second char)(last char) + * + * where length is ASCII '0' .. '9' where any length overflow is an + * arbitrary value, but such that the length is never longer than + * the lexeme. The last char is the last char regardless of length. + * For short identifiers, the second char may be the first char + * duplicated, and the last char may be first char. + * + * This code is very simple to write by hand: "5whe" means while, + * and can be used in a case switch before a strcmp with "while". + * Conflicts are possible, but then several keywords are tested like + * any other hash conflict. This keyword lookup is user driven, but + * can follow example code quite straightforward. + * + * The lex_emit_id macro can be implemented to provide the above + * lookup and inject a keyword token instead. By convention such + * tokens have negative values to avoid conflicts with lexer + * generated tokens. + * + * The ID also has a special role in prefixes and suffixes: C string + * literals like (L"hello") and numeric literals like (42f) are + * lexed as two tokens, one of which is an ID. The parser must + * process this and observe absence of whitespace where such syntax + * is relevant. 
+ * + * While not specific to ID, the emitter macros can be designed to + * keep track of start of lines and end of whitespace and attach + * state flags to each token (at line start, after whitespace). The + * whitespace tokens can then be dropped. This might help parsing + * things like suffixes efficiently. + */ + LEX_TOK_ID, + + /* + * C-int ::= pos-dec-digit dec-digit* + * Julia-int ::= dec-digit+ + * + * pos-dec-digit ::= '1'..'9' + * dec-digit ::= '0'..'9' + * + * Floating point numbers take precedence when possible so 00.10 is + * always a decimal floating point value when decimal floats are + * enabled. + * + * The C-int is automatically enabled if C-octals are enabled, and + * disabled otherwise. There is no specific Julia-int type - we just + * use the terminology to represent integers with leading zeroes. + * + * Julia style integers accept leading zeroes. C style integers with + * leading zeroes are consumed as C style octal numbers, so 0019 is + * parsed as either 0019(Julia-int), or 001(C-octal), 9(C-int). + * + * Single digit '0' maps to octal when C-octals are enabled and to + * Julia-int otherwise. (Yes, integers are not that simple, it + * seems). + * + * Both C and Julia octal numbers (see octal token) can be active + * simultaneously. This can be used to control leading zero + * behavior, even if C-octal numbers are not part of the grammar + * being parsed. For example, a language might use 0o777 octal + * numbers and disallow 0777 integers. Enabling C-octals makes this + * easy to detect (but should accept octal 0). + * + * There is no distinction between the styles in the int token, but + * leading zeroes are easily detected in the lexeme. + * + * Constant suffixes like 1L are treated as 1(INT), and L(ID). The + * same goes for other numeric values. + * + * Parser should check for leading zeroes and decide if it is valid, + * a warning, or an error (it is in JSON). This also goes for float.
+ * + * Numericals, not limited to INT, may appear shorter than they are + * due to buffer splits. Special recovery is required, but will only + * happen just before EOS or EOB tokens (i.e. buffer split events). + */ + LEX_TOK_INT, + + /* + * float ::= (int ['.' dec-digits*] dec-exponent) + * | ([int] '.' dec-digits* [dec-exponent]) + * dec-exponents ::= ('e' | 'E') ['+' | '-'] dec-digits* + * dec-digits ::= '0'..'9' + * int ::= dec-digits* + * + * Consumes a superset of C float representation without suffix. + * Some invalid tokens such as 0.E are accepted. Valid tokens such + * as 00.10 take precedence over octal numbers even if it is a + * prefix, and the same is obviously true with respect to decimal + * integers. + * + * JSON does not allow leading zeroes, and also not leading '.'. + * This can easily be checked in the lexeme. + * + * The octal notation affecting integer leading zeroes is not + * relevant to floats because floats take precedence over octal and + * decimal int when containing '.', 'e' or 'E'. + */ + LEX_TOK_FLOAT, + + /* + * binary ::= (0b | 0B) ('0' | '1')* + * + * 0b100 or just 0b, parser must check that digits are present, + * otherwise it may be interpreted as zero, just like octal zero + * in C. + * + * Like 0X hex, 0B can be flagged out because Julia v0.3 does not + * support uppercase 0B. + */ + LEX_TOK_BINARY, + + /* + * C-octal ::= 0 octal-digit* + * octal-digits ::= '0'..'7' + * + * Julia-octal ::= 0o octal-digits* + * octal-digits ::= '0'..'7' + * + * 0777 for C style octal numbers, or 0o777 for Julia syntax. Julia + * v.0.3 does not allow uppercase 0O777, it would mean 0 * O777. + * + * When enabled, decimal floating points take precedence: 00.10 is + * parsed as 00.10(decimal float), as per C standard. + * + * NOTE: It is possible for both styles to be active simultaneously. + * This may be relevant in order to control handling of leading + * zeroes in decimal integers. 
+ * + * If C-octal numbers are flagged out, leading zeroes are mapped to + * integers and the numerical value may change. Julia behaves this + * way. Nothing prevents support of both C and Julia octal numbers, + * but leading zeroes will then be interpreted the C way - it is not + * recommended to do this. + */ + LEX_TOK_OCTAL, + + /* + * hex ::= hex-int + * hex-digit ::= 'a'..'f' | 'A'..'F' | '0'..'9' + * hex-int ::= (0x | 0X) hex-digit* + * + * where hex digits are customizable (e.g. all lower case), and hex + * prefix 0x can be flagged to be lower case only (as in Julia). + * + * If hex floats are enabled, they take precedence: + * 0x1.0(hex-float), if not, 0x1.0 will parse as: 0x1(hex) followed + * by .0(decimal float). + * + * The lead prefix 0x may be flagged to be lower case only because + * this is required by Julia v0.3 where 0X means 0 * X. Julia + * accepts uppercase in the remaining hex digits (and exponent for + * floats). This could possibly change in future versions. + * + * The zero length sequence (0x | 0X) is accepted and left to the + * parser since the lexer emits a token for everything it sees. + * Conceptually it may be interpreted as zero, equivalent to 0 being + * both octal prefix and numeric 0 in C style octal representation. + * Or it may be an error. + */ + LEX_TOK_HEX, + + /* + * hex-float ::= hex-int ['.' hex-digit*] hex-exponent + * hex-exponent ::= ('p' | 'P') ['+' | '-'] decimal-digit* + * decimal-digit ::= '0'..'9' + * + * A superset of IEEE-754-2008 Hexadecimal Floating Point notation. + * + * We require the exponent to be present, but do not ensure the + * value is otherwise complete, e.g. 0x1p+ would be accepted. The p + * is needed because otherwise 0x1.f could be accepted, and f is a + * float suffix in C, and juxtaposition factor (0x1. * f) in Julia, + * at least, that is one possible interpretation.
+ * + * The exponent can be flagged optional in which case 0x1.f will be + * consumed as a single hex float token. + * This may either simply be accepted in some grammars, or used to + * provide an error message. If the exponent is required, 0x1.f will + * be lexed as three tokens: + * + * <'0x1'(hex int), '.'(op), 'f'(id)>. + * + * Thus it may be a good idea to allow the exponent to be optional + * anyway and issue an error message or warning if the p is absent + * later in the parsing stage. + * + * Note that, as per IEEE-754, the exponent is a decimal power of + * two. In other words, the number of bits to shift the + * (hexa)decimal point. Also note that it is p and not e because e + * is a hex digit. + */ + LEX_TOK_HEX_FLOAT, + + /* + * blank ::= ('\t' | '\x20')+ + * + * Longest run in buffer holding only '\t' and '\x20' (space). + * + * Buffer splits may generate adjacent blanks depending on recovery + * processing. (The same goes for other line oriented runs such as + * string parts and comment parts). + */ + LEX_TOK_BLANK, + + /* newline ::= '\r' | '\n' | '\r\n' | '\n\r' + * + * Will always appear, also inside strings and comments. Can be used + * to track line starts and counts reliably as only one newline is + * issued at a time, and it is issued everywhere, also in strings + * and comments. + * + * May be preceded by string escape token inside strings. This can + * be interpreted as line continuation within strings specifically, + * as is the case in Python and Javascript (and in C via + * pre-processor). + * + * The LEX_TOK_STRING_NEWLINE is emitted inside strings so the ordinary + * newline may be ignored in comments and other non-string content. + */ + LEX_TOK_NEWLINE, + LEX_TOK_STRING_NEWLINE, + + /* + * string ::= string_start + * (string_part | string_escape | + * string_ctrl | string_newline)* + * (string_end | string_unterminated) + * + * There are several optional string styles. They all start with + * this token.
The length and content provide details. Python strings + * may start with """ or ''' and this token will then have length + * 3 and three quotes as lexeme content. If the lexer exits before + * string end token, the returned lexer mode will remember the + * state and can be used for reentry - this also goes for comments. + * + * Strings can only contain part, escape, newline, and control + * tokens, and either string unterminated or string end token + * at last. + */ + LEX_TOK_STRING_BEGIN, + + /* Longest run without control characters, without (\), without + * newline, and without the relevant end delimiter. The run may be + * shortened due to buffer splits. The part may, as an exception, + * begin with an end delimiter character or a (\) if it was + * preceded by a string escape token. The escape character is + * always (\). Strings that use "" or '' as escape will be treated + * as start and end of separate strings. Strings that do not support + * (\) should just treat escape as a part of the string. + */ + LEX_TOK_STRING_PART, + + /* + * This is always a single character token (\) and only happens + * inside strings. See also string part token. + */ + LEX_TOK_STRING_ESCAPE, + + /* This token is similar to string start. It may be absent at buffer + * splits, but then an unterminated string token will be used + * just before the split event token. + * + * */ + LEX_TOK_STRING_END, + + /* + * This is emitted before the buffer ends, or before unescaped + * newlines for line oriented string types (the usual strings). + * At buffer splits, recovery should clean it up. The returned + * mode allows parsing to continue in a new buffer with a slight + * content overlap. + * + * If a string like ("hello, world!") in C, reaches end of line, it + * may be continued: ("hello, \)newline(world!"). If this line + * continuation is flagged out, this will lead to string + * unterminated, even if not at end of buffer.
For block strings + * like """hello""", this only happens at end of buffer. + */ + LEX_TOK_STRING_UNTERMINATED, + + /* + * + * comment ::= comment_start + * (comment_part | ctrl | newline)* + * (comment_end | comment_unterminated) + * + * + * Comments work like strings in most respects. They emit parts, and + * control characters, but not escape characters, and cannot be + * continued at end of line. Block comments are like python block + * strings ('''). + * + * Julia supports nested comments (#= ... #= =# =#). In this case + * a new start token can be emitted before an end token. If the + * parser exits due to buffer split, the mode has the nesting level + * encoded so it can resumed in a new buffer. + * + * Line comments will have their end token just before newline, or + * unterminated comment just before buffer split token (EOB or EOS). + * (\) characters are consumed by the comment part tokens and do not + * affect the end of any comment. + * + * Comment begin may include extra characters when a doc comment is + * recognized. The emitter flags this. End comments are unaffected. + */ + LEX_TOK_COMMENT_BEGIN, + LEX_TOK_COMMENT_PART, + LEX_TOK_COMMENT_END, + LEX_TOK_COMMENT_UNTERMINATED, + + /* + * Issued before ABORT token if nesting level is above a predefined + * level. This is to protect against malicious and misguided + * content, otherwise the nesting level counter could wrap and + * generate a different interpretation, which could be bad. The + * parser would probably do similar things with nested tokens. + */ + LEX_TOK_COMMENT_DEEPLY_NESTED, + + + /* Operators are all recognized single character symbols, or up to + * four characters. The token value is the ASCII codes shifted 8 + * bits per extra character, by default, but the emitter macros + * can redefine this. Values below 32 are reserved token types as + * discussed above. + * + * What exactly represents an operator depends on what the lexer has + * enabled. 
+ * + * Printable ASCII symbols that are NOT recognized, are emitted as + * the SYMBOL token and is always length 1. The value can be derived + * from the lexeme, but not the token itself. This may be perfectly + * fine for the parser, or it may be used to indicate an error. + * There are no illegal characters per se. + * + * Non-printable ASCII characters that are not covered by newline or + * blank, are emitted as CTRL tokens. These act the same as the + * symbol token and may be used to indicate error, or to handle form + * feed and other whitespace not handled by default. Unlike symbol, + * however, CTRL also appear in strings and comments since they are + * generally not allowed and this makes it easy to capture (there is + * virtually no performance overhead in providing this service + * unless attempting to parse a binary format). + */ + + /* Don't bleed into this range. */ + LEX_TOK_OPERATOR_BASE = 32, + + + /* + * Operators use ASCII range. + * Compound operators use range 0x80 to 0x7fff + * and possibly above for triple sequences. + * Custom keywords are normally negative but can be mapped + * to any other. + * + * The layout is designed for efficient table lookup. + * Compound operators might benefit from remapping down to a smaller + * range for compact lookup tables, but it depends on the parser. + */ +}; + +/* + * Custom keyword token range is negative, and well below -99..0 where + * special codes are reserved. 
+ */ +#ifndef LEX_TOK_KW_BASE +#define LEX_TOK_KW_BASE -1000 +#endif + +#ifndef LEX_TOK_KW_NOT_FOUND +#define LEX_TOK_KW_NOT_FOUND LEX_TOK_ID +#endif + + +#ifdef LEX_DEBUG + +#include <stdio.h> +#include <string.h> + +static const char *lex_describe_token(long token) +{ + switch(token) { + case LEX_TOK_BOM: return "BOM marker"; + case LEX_TOK_EOF: return "EOF"; + case LEX_TOK_EOS: return "buffer zero terminated"; + case LEX_TOK_EOB: return "buffer exhausted"; + case LEX_TOK_ABORT: return "abort"; + case LEX_TOK_CTRL: return "control"; + case LEX_TOK_STRING_CTRL: return "string control"; + case LEX_TOK_COMMENT_CTRL: return "comment control"; + case LEX_TOK_SYMBOL: return "symbol"; + case LEX_TOK_ID: return "identifier"; + case LEX_TOK_INT: return "integer"; + case LEX_TOK_FLOAT: return "float"; + case LEX_TOK_BINARY: return "binary"; + case LEX_TOK_OCTAL: return "octal"; + case LEX_TOK_HEX: return "hex"; + case LEX_TOK_HEX_FLOAT: return "hex float"; + case LEX_TOK_BLANK: return "blank"; + case LEX_TOK_NEWLINE: return "newline"; + case LEX_TOK_STRING_NEWLINE: return "string newline"; + case LEX_TOK_STRING_BEGIN: return "string begin"; + case LEX_TOK_STRING_PART: return "string part"; + case LEX_TOK_STRING_END: return "string end"; + case LEX_TOK_STRING_ESCAPE: return "string escape"; + case LEX_TOK_STRING_UNTERMINATED: return "unterminated string"; + case LEX_TOK_COMMENT_BEGIN: return "comment begin"; + case LEX_TOK_COMMENT_PART: return "comment part"; + case LEX_TOK_COMMENT_END: return "comment end"; + case LEX_TOK_COMMENT_UNTERMINATED: return "unterminated comment"; + case LEX_TOK_COMMENT_DEEPLY_NESTED: return "deeply nested comment"; + + default: + if (token < LEX_TOK_EOF) { + return "keyword"; + } + if (token < 32) { + return "undefined"; + } + if (token < 0x100L) { + return "operator"; + } + if (token < 0x10000L) { + return "compound operator"; + } + if (token < 0x1000000L) { + return "tricompound operator"; + } + if (token < 0x7f0000000L) { + return 
"quadcompound operator"; + } + return "reserved"; + } +} + +static void lex_fprint_token(FILE *fp, + long token, + const char *first, const char *last, + int line, int pos) +{ + char buf[10]; + const char *lexeme = first; + int len = (int)(last - first); + switch (token) { + case LEX_TOK_EOS: + case LEX_TOK_CTRL: + sprintf(buf, "^%02x", (int)*first); + lexeme = buf; + len = strlen(buf); + break; + default: + break; + } + fprintf(fp, "%04d:%03d %s (0x%lx): `%.*s`\n", + line, pos, lex_describe_token(token), token, len, lexeme); +} + +#define lex_print_token(token, first, last, line, pos) \ + lex_fprint_token(stdout, token, first, last, line, pos) + +#else /* LEX_DEBUG */ + +#define lex_describe_token(token) "debug not available" +#define lex_fprint_token(fp, token, first, last, line, pos) ((void)0) +#define lex_print_token(token, first, last, line, pos) ((void)0) + +#endif /* LEX_DEBUG */ + + +#endif /* LEX_TOKENS_H */ + |