diff options
author | Luca Deri <deri@ntop.org> | 2020-06-10 23:43:35 +0200 |
---|---|---|
committer | Luca Deri <deri@ntop.org> | 2020-06-10 23:43:35 +0200 |
commit | 60aaa80570b48b15c14c2a5133d9b73f7578b21a (patch) | |
tree | 9615961ce54a59adb3ed993f58d5f125a58380b4 | |
parent | 0b9e46e65a8be9b30de96ec2e9f6061e2b4034b8 (diff) |
Added HyperLogLog cardinality estimator API calls
/* Memory lifecycle */
int ndpi_hll_init(struct ndpi_hll *hll, u_int8_t bits);
void ndpi_hll_destroy(struct ndpi_hll *hll);
/* Add values */
void ndpi_hll_add(struct ndpi_hll *hll, const char *data, size_t data_len);
void ndpi_hll_add_number(struct ndpi_hll *hll, u_int32_t value) ;
/* Get cardinality estimation */
double ndpi_hll_count(struct ndpi_hll *hll);
-rw-r--r-- | example/ndpiReader.c | 18 | ||||
-rw-r--r-- | src/include/ndpi_api.h.in | 16 | ||||
-rw-r--r-- | src/include/ndpi_define.h.in | 2 | ||||
-rw-r--r-- | src/include/ndpi_typedefs.h | 8 | ||||
-rw-r--r-- | src/lib/ndpi_analyze.c | 28 | ||||
-rw-r--r-- | src/lib/third_party/include/MurmurHash3.h | 8 | ||||
-rw-r--r-- | src/lib/third_party/include/hll.h | 27 | ||||
-rw-r--r-- | src/lib/third_party/src/hll/MurmurHash3.c | 61 | ||||
-rw-r--r-- | src/lib/third_party/src/hll/hll.c | 128 |
9 files changed, 296 insertions, 0 deletions
diff --git a/example/ndpiReader.c b/example/ndpiReader.c index 3409500ef..fd55c3290 100644 --- a/example/ndpiReader.c +++ b/example/ndpiReader.c @@ -3048,6 +3048,23 @@ void test_lib() { /* *********************************************** */ +static void hllUnitTest() { + struct ndpi_hll h; + u_int8_t bits = 8; /* >= 4, <= 16 */ + u_int32_t i; + + assert(ndpi_hll_init(&h, bits) == 0); + + for(i=0; i<21320; i++) + ndpi_hll_add_number(&h, i); + + /* printf("Count estimate: %f\n", ndpi_hll_count(&h)); */ + + ndpi_hll_destroy(&h); +} + +/* *********************************************** */ + static void bitmapUnitTest() { u_int32_t val, i, j; @@ -3341,6 +3358,7 @@ int orginal_main(int argc, char **argv) { } /* Internal checks */ + hllUnitTest(); bitmapUnitTest(); automataUnitTest(); serializerUnitTest(); diff --git a/src/include/ndpi_api.h.in b/src/include/ndpi_api.h.in index 5f8604f8e..87429c6de 100644 --- a/src/include/ndpi_api.h.in +++ b/src/include/ndpi_api.h.in @@ -1043,6 +1043,22 @@ extern "C" { void ndpi_serialize_risk(ndpi_serializer *serializer, struct ndpi_flow_struct *flow); const char* ndpi_risk2str(ndpi_risk_enum risk); + + /* ******************************* */ + + /* HyperLogLog cardinality estimator */ + + /* Memory lifecycle */ + int ndpi_hll_init(struct ndpi_hll *hll, u_int8_t bits); + void ndpi_hll_destroy(struct ndpi_hll *hll); + + /* Add values */ + void ndpi_hll_add(struct ndpi_hll *hll, const char *data, size_t data_len); + void ndpi_hll_add_number(struct ndpi_hll *hll, u_int32_t value) ; + + /* Get cardinality estimation */ + double ndpi_hll_count(struct ndpi_hll *hll); + #ifdef __cplusplus } #endif diff --git a/src/include/ndpi_define.h.in b/src/include/ndpi_define.h.in index fd0575b03..be7c21175 100644 --- a/src/include/ndpi_define.h.in +++ b/src/include/ndpi_define.h.in @@ -349,6 +349,8 @@ #define NDPI_CIPHER_WEAK 1 #define NDPI_CIPHER_INSECURE 2 +#define NDPI_OPTIMAL_HLL_NUM_BUCKETS 16 + #ifdef __APPLE__ #include <libkern/OSByteOrder.h> diff --git a/src/include/ndpi_typedefs.h b/src/include/ndpi_typedefs.h index 3ae88d0b3..29c9ed364 100644 --- a/src/include/ndpi_typedefs.h +++ b/src/include/ndpi_typedefs.h @@ -1489,4 +1489,12 @@ struct ndpi_analyze_struct { typedef struct ndpi_ptree ndpi_ptree_t; +/* **************************************** */ + +struct ndpi_hll { + u_int8_t bits; + size_t size; + u_int8_t *registers; +}; + #endif /* __NDPI_TYPEDEFS_H__ */ diff --git a/src/lib/ndpi_analyze.c b/src/lib/ndpi_analyze.c index f648e92ae..2d7e11abc 100644 --- a/src/lib/ndpi_analyze.c +++ b/src/lib/ndpi_analyze.c @@ -200,8 +200,36 @@ float ndpi_data_ratio(u_int32_t sent, u_int32_t rcvd) { return((s == 0) ? 0 : (d/s)); } +/* ********************************************************************************* */ + const char* ndpi_data_ratio2str(float ratio) { if(ratio < -0.2) return("Download"); else if(ratio > 0.2) return("Upload"); else return("Mixed"); } + +/* ********************************************************************************* */ +/* ********************************************************************************* */ + +#include "third_party/src/hll/hll.c" +#include "third_party/src/hll/MurmurHash3.c" + +int ndpi_hll_init(struct ndpi_hll *hll, u_int8_t bits) { + return(hll_init(hll, bits)); +} + +void ndpi_hll_destroy(struct ndpi_hll *hll) { + hll_destroy(hll); +} + +void ndpi_hll_add(struct ndpi_hll *hll, const char *data, size_t data_len) { + hll_add(hll, (const void *)data, data_len); +} + +void ndpi_hll_add_number(struct ndpi_hll *hll, u_int32_t value) { + hll_add(hll, (const void *)&value, sizeof(value)); +} + +double ndpi_hll_count(struct ndpi_hll *hll) { + return(hll_count(hll)); +} diff --git a/src/lib/third_party/include/MurmurHash3.h b/src/lib/third_party/include/MurmurHash3.h new file mode 100644 index 000000000..a048eb37d --- /dev/null +++ b/src/lib/third_party/include/MurmurHash3.h @@ -0,0 +1,8 @@ +#ifndef _MURMURHASH3_H_ +#define _MURMURHASH3_H_ + +#include <stdint.h> + +uint32_t MurmurHash3_x86_32(const void * key, uint32_t len, uint32_t seed); + +#endif diff --git a/src/lib/third_party/include/hll.h b/src/lib/third_party/include/hll.h new file mode 100644 index 000000000..00975ec36 --- /dev/null +++ b/src/lib/third_party/include/hll.h @@ -0,0 +1,27 @@ +/* + Code taken from https://github.com/avz/hll + + Copyright (c) 2015 Artem Zaytsev <arepo@nologin.ru> + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + OTHER DEALINGS IN THE SOFTWARE. + */ + + +extern int hll_init(struct ndpi_hll *hll, u_int8_t bits); +extern void hll_destroy(struct ndpi_hll *hll); +extern void hll_add(struct ndpi_hll *hll, const void *buf, size_t size); +extern double hll_count(const struct ndpi_hll *hll); diff --git a/src/lib/third_party/src/hll/MurmurHash3.c b/src/lib/third_party/src/hll/MurmurHash3.c new file mode 100644 index 000000000..47fb9da14 --- /dev/null +++ b/src/lib/third_party/src/hll/MurmurHash3.c @@ -0,0 +1,61 @@ +/*----------------------------------------------------------------------------- + MurmurHash3 was written by Austin Appleby, and is placed in the public + domain. The author hereby disclaims copyright to this source code. +*/ + +#include "MurmurHash3.h" + +#define ROTL32(x, r) ((x) << (r)) | ((x) >> (32 - (r))) + +u_int32_t MurmurHash3_x86_32(const void *key, u_int32_t len, u_int32_t seed) { + const u_int8_t *data = (const u_int8_t *)key; + const int32_t nblocks = (int32_t)len / 4; + + u_int32_t h1 = seed; + int i; + + const u_int32_t c1 = 0xcc9e2d51; + const u_int32_t c2 = 0x1b873593; + const u_int32_t *blocks = (const u_int32_t *)(data + nblocks * 4); + + for(i = -nblocks; i; i++) + { + u_int32_t k1 = blocks[i]; + + k1 *= c1; + k1 = ROTL32(k1, 15); + k1 *= c2; + + h1 ^= k1; + h1 = ROTL32(h1, 13); + h1 = h1 * 5 + 0xe6546b64; + } + + const u_int8_t * tail = (const u_int8_t *)(data + nblocks * 4); + + u_int32_t k1 = 0; + + switch(len & 3) + { + case 3: + k1 ^= (u_int32_t)tail[2] << 16; + case 2: + k1 ^= (u_int32_t)tail[1] << 8; + case 1: + k1 ^= tail[0]; + k1 *= c1; + k1 = ROTL32(k1, 15); + k1 *= c2; + h1 ^= k1; + }; + + h1 ^= len; + + h1 ^= h1 >> 16; + h1 *= 0x85ebca6b; + h1 ^= h1 >> 13; + h1 *= 0xc2b2ae35; + h1 ^= h1 >> 16; + + return h1; +} diff --git a/src/lib/third_party/src/hll/hll.c b/src/lib/third_party/src/hll/hll.c new file mode 100644 index 000000000..07d33f6dd --- /dev/null +++ b/src/lib/third_party/src/hll/hll.c @@ -0,0 +1,128 @@ +/* + Code taken from https://github.com/avz/hll + + Copyright (c) 2015 Artem Zaytsev <arepo@nologin.ru> + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <stdlib.h> +#include <errno.h> +#include <math.h> +#include <string.h> + +#include <stdio.h> + +#include "../include/MurmurHash3.h" +#include "../include/hll.h" + +u_int32_t _hll_hash(const struct ndpi_hll *hll) { + return MurmurHash3_x86_32(hll->registers, (u_int32_t)hll->size, 0); +} + +static __inline u_int8_t _hll_rank(u_int32_t hash, u_int8_t bits) { + u_int8_t i; + + for(i = 1; i <= 32 - bits; i++) { + if(hash & 1) + break; + + hash >>= 1; + } + + return i; +} + +int hll_init(struct ndpi_hll *hll, u_int8_t bits) { + if(bits < 4 || bits > 20) { + errno = ERANGE; + return -1; + } + + hll->bits = bits; + hll->size = (size_t)1 << bits; + hll->registers = ndpi_calloc(hll->size, 1); + + /* printf("%lu bytes\n", hll->size); */ + return 0; +} + +void hll_destroy(struct ndpi_hll *hll) { + ndpi_free(hll->registers); + + hll->registers = NULL; +} + +static __inline void _hll_add_hash(struct ndpi_hll *hll, u_int32_t hash) { + u_int32_t index = hash >> (32 - hll->bits); + u_int8_t rank = _hll_rank(hash, hll->bits); + + if(rank > hll->registers[index]) { + hll->registers[index] = rank; + } +} + +void hll_add(struct ndpi_hll *hll, const void *buf, size_t size) { + u_int32_t hash = MurmurHash3_x86_32((const char *)buf, (u_int32_t)size, 0x5f61767a); + + _hll_add_hash(hll, hash); +} + +double hll_count(const struct ndpi_hll *hll) { + double alpha_mm; + u_int32_t i; + + switch (hll->bits) { + case 4: + alpha_mm = 0.673; + break; + case 5: + alpha_mm = 0.697; + break; + case 6: + alpha_mm = 0.709; + break; + default: + alpha_mm = 0.7213 / (1.0 + 1.079 / (double)hll->size); + break; + } + + alpha_mm *= ((double)hll->size * (double)hll->size); + + double sum = 0; + for(i = 0; i < hll->size; i++) { + sum += 1.0 / (1 << hll->registers[i]); + } + + double estimate = alpha_mm / sum; + + if (estimate <= 5.0 / 2.0 * (double)hll->size) { + int zeros = 0; + + for(i = 0; i < hll->size; i++) + zeros += (hll->registers[i] == 0); + + if(zeros) + estimate = (double)hll->size * log((double)hll->size / zeros); + + } else if (estimate > (1.0 / 30.0) * 4294967296.0) { + estimate = -4294967296.0 * log(1.0 - (estimate / 4294967296.0)); + } + + return estimate; +} + |