aboutsummaryrefslogtreecommitdiff
path: root/external/hash/hash_table_impl_rh.h
diff options
context:
space:
mode:
Diffstat (limited to 'external/hash/hash_table_impl_rh.h')
-rw-r--r--external/hash/hash_table_impl_rh.h360
1 files changed, 360 insertions, 0 deletions
diff --git a/external/hash/hash_table_impl_rh.h b/external/hash/hash_table_impl_rh.h
new file mode 100644
index 0000000..b4cabae
--- /dev/null
+++ b/external/hash/hash_table_impl_rh.h
@@ -0,0 +1,360 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015 Mikkel F. Jørgensen, dvide.com
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* We use the same define for all implementations */
+#ifdef HASH_TABLE_IMPL
+#error "cannot have multiple implementations in same compilation unit"
+#endif
+#define HASH_TABLE_IMPL
+/* Robin Hood (with offset table) */
+#define HT_RH
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4127) /* conditional expression is constant */
+#endif
+
+#include <stdlib.h>
+#include <assert.h>
+
+/*
+ * A variation of Robin Hashing:
+ * We do not calcute distance from buckets, nor do we cache
+ * hash keys. Instead we maintain a 7-bit offset that points
+ * to where the first entry of a bucket is stored. In Robin Hood hashing
+ * all entries conceptually chained to the same bucket are stored
+ * immediately after each other in order of insertion. The offset of
+ * the next bucket is naturally the end of the previous bucket, off by
+ * one. This breaks down when the bucket offset is 0 and the bucket is
+ * empty because it suggests there is an element. We cannot distinguish
+ * between a single used and unused entry, except by looking at the
+ * content or otherwise tag the information on. This is not a problem,
+ * just a special case to deal with.
+ *
+ * The offsets are stored separately which might lead to more cache line
+ * traffic, but the alternative is not very elegant - either wasting
+ * space or trying to pack offsets on a per cache line basis. We only
+ * need 8 bits for offsets. If the offset overflows, bit 7 will be set
+ * which we can easily detect. During insertion, offsets are increated
+ * on all affected buckets, and likewise decrement on remove. In
+ * principle we can use bit parallel increments to update most offsets
+ * in a single operation, but it is hardly worthwhile due to setup
+ * cost. The approach bears some resemblance to hopscotch hashing which
+ * uses local offsets for chaining, but we prefer the simpler Robin
+ * Hood approach.
+ *
+ * If the offset overflows, the table is resized. We expect the packed
+ * chains to behave like a special case of a hopscotch layout and
+ * consequently have the same bounds, meaning we are unlikely to have
+ * neither long offsets nor long chains if we resize below very full
+ * so resizing on an offset of 128 should be ok.
+ *
+ * Our main motivation for this hashing is actually to get rid of
+ * tombstones in quadruatic and linear probing. Avoiding tombstones
+ * is much simpler when sorting chains Robin Hood style, and we avoid
+ * checking for tombstones. We loose this benefit by having to inspect
+ * offsets, but then also avoid checking keys before the chain, and
+ * after because we can zero in on exactly the entries belonging to
+ * bucket.
+ *
+ * Unlike traditional Robin Hood, we can find a missing key very quickly
+ * without any heuristics: we only need to inspect exactly the number
+ * of entries in the bucket (or at most 1 if the bucket is empty).
+ *
+ * Find operations start exactly at an entry with a matching hash key
+ * unlike normal Robin Hood which must scan past a few earlier entries
+ * on average, or guestimate where to start and seek both ways.
+ *
+ * We can also very quickly insert a key that is known to be unique
+ * because we can add it directly to the end (but possibly requiring
+ * a shift of later entries Robin Hood style).
+ *
+ * Whether these benefits outweighs the cost of a separate offset
+ * lookup is unclear, but the reduced memory consumption certainly
+ * allows for a lower load factor, which also helps a lot.
+ *
+ * Traditional Robin Hood Hashing actually permits a chain to become
+ * very long. We do not permit this, in line with hopscotch hashing.
+ * This is a drawback from a security perspective because worst case
+ * this can trigger resizing ad infinitum iff the hash function can
+ * be hacked or massive duplicate key insertion can be triggered. By
+ * used the provided hash functions and seeding them randomly at
+ * startup, and avoiding the multi key feature, it is very unlikely to
+ * be a problem with what is known about hash table attacks so far.
+ *
+ * Values and keys are not stored, only item pointers. Custom macroes
+ * or inline functions provide access to key data from the item. We
+ * could add a separate value array and treat the item strictly as a
+ * key, but we can have a smaller load factor instead, and can more
+ * easily avoid copying complex key structures, such as start end
+ * pointers to token data for parser.
+ *
+ * A typical hash table has: key pointer or key value, value pointer
+ * or value, a cached hash key or bitmap (for Robin Hood or Hopscotch)
+ * which on 64 bit platforms easily amounts to 20 bytes or more per
+ * bucket. We use 9 bytes on 64 bit platforms and 5 bytes on 32 bit.
+ * This gets us down to a max load of 0.5 and on average about 0.37.
+ * This should make it very likely that the first bucket inspected is
+ * a direct hit negating the benefit of caching hash keys. In addition,
+ * when it is not a direct hit, we get pointers loaded in a cache line
+ * to inspect, all known to have the same hash key.
+ */
+
+int ht_init(hash_table_t *ht, size_t count)
+{
+ size_t buckets = 4;
+
+ if ((HT_LOAD_FACTOR_FRAC) > 256 || (HT_LOAD_FACTOR_FRAC) < 1) {
+ /*
+ * 101% will never terminate insertion.
+ * 0% will never terminate resize.
+ */
+ HT_PANIC("robin hood hash table failed with impossible load factor");
+ return -1;
+ }
+ while (count > buckets * (HT_LOAD_FACTOR_FRAC) / 256) {
+ buckets *= 2;
+ }
+ ht->table = calloc(buckets, sizeof(ht_item_t));
+ if (ht->table == 0) {
+ return -1;
+ }
+ ht->offsets = calloc(buckets, sizeof(char));
+ if (ht->offsets == 0) {
+ free(ht->table);
+ ht->table = 0;
+ return -1;
+ }
+ ht->buckets = buckets;
+ ht->count = 0;
+ return 0;
+}
+
+int ht_resize(hash_table_t *ht, size_t count)
+{
+ size_t i;
+ hash_table_t ht2;
+ ht_item_t *T = ht->table;
+ ht_item_t item;
+
+ if (count < ht->count) {
+ count = ht->count;
+ }
+ if (ht_init(&ht2, count)) {
+ return -1;
+ }
+ for (i = 0; i < ht->buckets; ++i) {
+ item = T[i];
+ if (item > (ht_item_t)1) {
+ ht_insert(&ht2, ht_key(item), ht_key_len(item), item, ht_multi);
+ }
+ }
+ ht_clear(ht);
+ memcpy(ht, &ht2, sizeof(*ht));
+ return 0;
+}
+
+ht_item_t ht_insert(hash_table_t *ht,
+ const void *key, size_t len, ht_item_t item, int mode)
+{
+ ht_item_t *T;
+ size_t N, n, j, k, offset;
+ ht_item_t new_item;
+ char overflow = 0;
+
+ new_item = item;
+ if (ht->count >= ht->buckets * (HT_LOAD_FACTOR_FRAC) / 256) {
+ if (ht_resize(ht, ht->count * 2)) {
+ HT_PANIC("robin hood hash table failed to allocate memory during resize");
+ return HT_NOMEM;
+ }
+ }
+ T = ht->table;
+ N = ht->buckets - 1;
+ k = HT_HASH_FUNCTION(key, len) & N;
+ offset = ht->offsets[k];
+ j = (k + offset) & N;
+ /*
+ * T[j] == 0 is a special case because we cannot count
+ * zero probe length, and because we should not increment
+ * the offset at insertion point in this case.
+ *
+ * T[j] == 0 implies offset == 0, but this way we avoid
+ * hitting memory that we don't need.
+ */
+ if (offset == 0 && T[j] == 0) {
+ ++ht->count;
+ T[j] = new_item;
+ return 0;
+ }
+ n = ht->offsets[(k + 1) & N] - offset + 1;
+ if (mode == ht_multi) {
+ /* Don't search for match before inserting. */
+ j = (j + n) & N;
+ n = 0;
+ }
+ while (n--) {
+ item = T[j];
+ if (ht_match(key, len, item)) {
+ if (mode == ht_replace) {
+ T[j] = new_item;
+ }
+ return item;
+ }
+ j = (j + 1) & N;
+ }
+ ++ht->count;
+ while (k != j) {
+ /* Only increment buckets after own bucket. */
+ k = (k + 1) & N;
+ overflow |= ++ht->offsets[k];
+ }
+ while ((item = T[j])) {
+ T[j] = new_item;
+ new_item = item;
+ j = (j + 1) & N;
+ overflow |= ++ht->offsets[j];
+ }
+ T[j] = new_item;
+
+ if (overflow < 0) {
+ /*
+ * At least one offset overflowed, so we need to
+ * resize the table.
+ */
+ if (ht->count * 10 < ht->buckets) {
+ HT_PANIC("FATAL: hash table resize on low utilization would explode\n"\
+ " possible collision DoS or bad hash function");
+ return HT_NOMEM;
+ }
+ if (ht_resize(ht, ht->count * 2)) {
+ HT_PANIC("FATAL: hash table resize failed and left hash table inconsistent");\
+ /*
+ * This renders the hash table in a bad state
+ * because we have updated to an inconsistent
+ * state.
+ */
+ return HT_NOMEM;
+ }
+ }
+ return item;
+}
+
+ht_item_t ht_find(hash_table_t *ht, const void *key, size_t len)
+{
+ ht_item_t *T = ht->table;
+ size_t N, n, j, k, offset;
+ ht_item_t item;
+
+ if (T == 0) {
+ return 0;
+ }
+ N = ht->buckets - 1;
+ k = HT_HASH_FUNCTION(key, len) & N;
+ offset = ht->offsets[k];
+ j = (k + offset) & N;
+ if (offset == 0 && T[j] == 0) {
+ /* Special case because we cannot count zero probe length. */
+ return 0;
+ }
+ n = ht->offsets[(k + 1) & N] - offset + 1;
+ while (n--) {
+ item = T[j];
+ if (ht_match(key, len, item)) {
+ return item;
+ }
+ j = (j + 1) & N;
+ }
+ return 0;
+}
+
+ht_item_t ht_remove(hash_table_t *ht, const void *key, size_t len)
+{
+ ht_item_t *T = ht->table;
+ size_t N, n, j, k, offset;
+ ht_item_t item, *next_item;
+
+ if (T == 0) {
+ return 0;
+ }
+ N = ht->buckets - 1;
+ k = HT_HASH_FUNCTION(key, len) & N;
+ offset = ht->offsets[k];
+ j = (k + offset) & N;
+ if (offset == 0 && T[j] == 0) {
+ return 0;
+ }
+ n = ht->offsets[(k + 1) & N] - offset + 1;
+ while (n) {
+ item = T[j];
+ if (ht_match(key, len, item)) {
+ break;
+ }
+ j = (j + 1) & N;
+ --n;
+ }
+ if (n == 0) {
+ return 0;
+ }
+ --ht->count;
+ while (k != j) {
+ /* Do not update the offset of the bucket that we own. */
+ k = (k + 1) & N;
+ --ht->offsets[k];
+ }
+ for (;;) {
+ j = (j + 1) & N;
+ if (ht->offsets[j] == 0) {
+ T[k] = 0;
+ return item;
+ }
+ --ht->offsets[j];
+ T[k] = T[j];
+ k = j;
+ }
+}
+
+void ht_visit(hash_table_t *ht, ht_visitor_f *visitor, void *context)
+{
+ size_t i;
+ ht_item_t *T = ht->table;
+ ht_item_t item;
+
+ for (i = 0; i < ht->buckets; ++i) {
+ item = T[i];
+ if (item > (ht_item_t)1) {
+ visitor(context, item);
+ }
+ }
+}
+
+void ht_clear(hash_table_t *ht)
+{
+ if (ht->table) {
+ free(ht->table);
+ }
+ if (ht->offsets) {
+ free(ht->offsets);
+ }
+ memset(ht, 0, sizeof(*ht));
+}