aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLuca Deri <deri@ntop.org>2020-06-10 23:43:35 +0200
committerLuca Deri <deri@ntop.org>2020-06-10 23:43:35 +0200
commit60aaa80570b48b15c14c2a5133d9b73f7578b21a (patch)
tree9615961ce54a59adb3ed993f58d5f125a58380b4
parent0b9e46e65a8be9b30de96ec2e9f6061e2b4034b8 (diff)
Added HyperLogLog cardinality estimator API calls
/* Memory lifecycle */ int ndpi_hll_init(struct ndpi_hll *hll, u_int8_t bits); void ndpi_hll_destroy(struct ndpi_hll *hll); /* Add values */ void ndpi_hll_add(struct ndpi_hll *hll, const char *data, size_t data_len); void ndpi_hll_add_number(struct ndpi_hll *hll, u_int32_t value) ; /* Get cardinality estimation */ double ndpi_hll_count(struct ndpi_hll *hll);
-rw-r--r--example/ndpiReader.c18
-rw-r--r--src/include/ndpi_api.h.in16
-rw-r--r--src/include/ndpi_define.h.in2
-rw-r--r--src/include/ndpi_typedefs.h8
-rw-r--r--src/lib/ndpi_analyze.c28
-rw-r--r--src/lib/third_party/include/MurmurHash3.h8
-rw-r--r--src/lib/third_party/include/hll.h27
-rw-r--r--src/lib/third_party/src/hll/MurmurHash3.c61
-rw-r--r--src/lib/third_party/src/hll/hll.c128
9 files changed, 296 insertions, 0 deletions
diff --git a/example/ndpiReader.c b/example/ndpiReader.c
index 3409500ef..fd55c3290 100644
--- a/example/ndpiReader.c
+++ b/example/ndpiReader.c
@@ -3048,6 +3048,23 @@ void test_lib() {
/* *********************************************** */
+static void hllUnitTest() {
+ struct ndpi_hll h;
+ u_int8_t bits = 8; /* >= 4, <= 16 */
+ u_int32_t i;
+
+ assert(ndpi_hll_init(&h, bits) == 0);
+
+ for(i=0; i<21320; i++)
+ ndpi_hll_add_number(&h, i);
+
+ /* printf("Count estimate: %f\n", ndpi_hll_count(&h)); */
+
+ ndpi_hll_destroy(&h);
+}
+
+/* *********************************************** */
+
static void bitmapUnitTest() {
u_int32_t val, i, j;
@@ -3341,6 +3358,7 @@ int orginal_main(int argc, char **argv) {
}
/* Internal checks */
+ hllUnitTest();
bitmapUnitTest();
automataUnitTest();
serializerUnitTest();
diff --git a/src/include/ndpi_api.h.in b/src/include/ndpi_api.h.in
index 5f8604f8e..87429c6de 100644
--- a/src/include/ndpi_api.h.in
+++ b/src/include/ndpi_api.h.in
@@ -1043,6 +1043,22 @@ extern "C" {
void ndpi_serialize_risk(ndpi_serializer *serializer, struct ndpi_flow_struct *flow);
const char* ndpi_risk2str(ndpi_risk_enum risk);
+
+ /* ******************************* */
+
+ /* HyperLogLog cardinality estimator */
+
+ /* Memory lifecycle */
+ int ndpi_hll_init(struct ndpi_hll *hll, u_int8_t bits);
+ void ndpi_hll_destroy(struct ndpi_hll *hll);
+
+ /* Add values */
+ void ndpi_hll_add(struct ndpi_hll *hll, const char *data, size_t data_len);
+ void ndpi_hll_add_number(struct ndpi_hll *hll, u_int32_t value) ;
+
+ /* Get cardinality estimation */
+ double ndpi_hll_count(struct ndpi_hll *hll);
+
#ifdef __cplusplus
}
#endif
diff --git a/src/include/ndpi_define.h.in b/src/include/ndpi_define.h.in
index fd0575b03..be7c21175 100644
--- a/src/include/ndpi_define.h.in
+++ b/src/include/ndpi_define.h.in
@@ -349,6 +349,8 @@
#define NDPI_CIPHER_WEAK 1
#define NDPI_CIPHER_INSECURE 2
+#define NDPI_OPTIMAL_HLL_NUM_BUCKETS 16
+
#ifdef __APPLE__
#include <libkern/OSByteOrder.h>
diff --git a/src/include/ndpi_typedefs.h b/src/include/ndpi_typedefs.h
index 3ae88d0b3..29c9ed364 100644
--- a/src/include/ndpi_typedefs.h
+++ b/src/include/ndpi_typedefs.h
@@ -1489,4 +1489,12 @@ struct ndpi_analyze_struct {
typedef struct ndpi_ptree ndpi_ptree_t;
+/* **************************************** */
+
+struct ndpi_hll {
+ u_int8_t bits;
+ size_t size;
+ u_int8_t *registers;
+};
+
#endif /* __NDPI_TYPEDEFS_H__ */
diff --git a/src/lib/ndpi_analyze.c b/src/lib/ndpi_analyze.c
index f648e92ae..2d7e11abc 100644
--- a/src/lib/ndpi_analyze.c
+++ b/src/lib/ndpi_analyze.c
@@ -200,8 +200,36 @@ float ndpi_data_ratio(u_int32_t sent, u_int32_t rcvd) {
return((s == 0) ? 0 : (d/s));
}
+/* ********************************************************************************* */
+
const char* ndpi_data_ratio2str(float ratio) {
if(ratio < -0.2) return("Download");
else if(ratio > 0.2) return("Upload");
else return("Mixed");
}
+
+/* ********************************************************************************* */
+/* ********************************************************************************* */
+
+#include "third_party/src/hll/hll.c"
+#include "third_party/src/hll/MurmurHash3.c"
+
+int ndpi_hll_init(struct ndpi_hll *hll, u_int8_t bits) {
+ return(hll_init(hll, bits));
+}
+
+void ndpi_hll_destroy(struct ndpi_hll *hll) {
+ hll_destroy(hll);
+}
+
+void ndpi_hll_add(struct ndpi_hll *hll, const char *data, size_t data_len) {
+ hll_add(hll, (const void *)data, data_len);
+}
+
+void ndpi_hll_add_number(struct ndpi_hll *hll, u_int32_t value) {
+ hll_add(hll, (const void *)&value, sizeof(value));
+}
+
+double ndpi_hll_count(struct ndpi_hll *hll) {
+ return(hll_count(hll));
+}
diff --git a/src/lib/third_party/include/MurmurHash3.h b/src/lib/third_party/include/MurmurHash3.h
new file mode 100644
index 000000000..a048eb37d
--- /dev/null
+++ b/src/lib/third_party/include/MurmurHash3.h
@@ -0,0 +1,8 @@
+#ifndef _MURMURHASH3_H_
+#define _MURMURHASH3_H_
+
+#include <stdint.h>
+
+uint32_t MurmurHash3_x86_32(const void * key, uint32_t len, uint32_t seed);
+
+#endif
diff --git a/src/lib/third_party/include/hll.h b/src/lib/third_party/include/hll.h
new file mode 100644
index 000000000..00975ec36
--- /dev/null
+++ b/src/lib/third_party/include/hll.h
@@ -0,0 +1,27 @@
+/*
+ Code taken from https://github.com/avz/hll
+
+ Copyright (c) 2015 Artem Zaytsev <arepo@nologin.ru>
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+ The above copyright notice and this permission notice shall be included
+ in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+extern int hll_init(struct ndpi_hll *hll, u_int8_t bits);
+extern void hll_destroy(struct ndpi_hll *hll);
+extern void hll_add(struct ndpi_hll *hll, const void *buf, size_t size);
+extern double hll_count(const struct ndpi_hll *hll);
diff --git a/src/lib/third_party/src/hll/MurmurHash3.c b/src/lib/third_party/src/hll/MurmurHash3.c
new file mode 100644
index 000000000..47fb9da14
--- /dev/null
+++ b/src/lib/third_party/src/hll/MurmurHash3.c
@@ -0,0 +1,61 @@
+/*-----------------------------------------------------------------------------
+ MurmurHash3 was written by Austin Appleby, and is placed in the public
+ domain. The author hereby disclaims copyright to this source code.
+*/
+
+#include "MurmurHash3.h"
+
+#define ROTL32(x, r) ((x) << (r)) | ((x) >> (32 - (r)))
+
+u_int32_t MurmurHash3_x86_32(const void *key, u_int32_t len, u_int32_t seed) {
+ const u_int8_t *data = (const u_int8_t *)key;
+ const int32_t nblocks = (int32_t)len / 4;
+
+ u_int32_t h1 = seed;
+ int i;
+
+ const u_int32_t c1 = 0xcc9e2d51;
+ const u_int32_t c2 = 0x1b873593;
+ const u_int32_t *blocks = (const u_int32_t *)(data + nblocks * 4);
+
+ for(i = -nblocks; i; i++)
+ {
+ u_int32_t k1 = blocks[i];
+
+ k1 *= c1;
+ k1 = ROTL32(k1, 15);
+ k1 *= c2;
+
+ h1 ^= k1;
+ h1 = ROTL32(h1, 13);
+ h1 = h1 * 5 + 0xe6546b64;
+ }
+
+ const u_int8_t * tail = (const u_int8_t *)(data + nblocks * 4);
+
+ u_int32_t k1 = 0;
+
+ switch(len & 3)
+ {
+ case 3:
+ k1 ^= (u_int32_t)tail[2] << 16;
+ case 2:
+ k1 ^= (u_int32_t)tail[1] << 8;
+ case 1:
+ k1 ^= tail[0];
+ k1 *= c1;
+ k1 = ROTL32(k1, 15);
+ k1 *= c2;
+ h1 ^= k1;
+ };
+
+ h1 ^= len;
+
+ h1 ^= h1 >> 16;
+ h1 *= 0x85ebca6b;
+ h1 ^= h1 >> 13;
+ h1 *= 0xc2b2ae35;
+ h1 ^= h1 >> 16;
+
+ return h1;
+}
diff --git a/src/lib/third_party/src/hll/hll.c b/src/lib/third_party/src/hll/hll.c
new file mode 100644
index 000000000..07d33f6dd
--- /dev/null
+++ b/src/lib/third_party/src/hll/hll.c
@@ -0,0 +1,128 @@
+/*
+ Code taken from https://github.com/avz/hll
+
+ Copyright (c) 2015 Artem Zaytsev <arepo@nologin.ru>
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+ The above copyright notice and this permission notice shall be included
+ in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <errno.h>
+#include <math.h>
+#include <string.h>
+
+#include <stdio.h>
+
+#include "../include/MurmurHash3.h"
+#include "../include/hll.h"
+
+u_int32_t _hll_hash(const struct ndpi_hll *hll) {
+ return MurmurHash3_x86_32(hll->registers, (u_int32_t)hll->size, 0);
+}
+
+static __inline u_int8_t _hll_rank(u_int32_t hash, u_int8_t bits) {
+ u_int8_t i;
+
+ for(i = 1; i <= 32 - bits; i++) {
+ if(hash & 1)
+ break;
+
+ hash >>= 1;
+ }
+
+ return i;
+}
+
+int hll_init(struct ndpi_hll *hll, u_int8_t bits) {
+ if(bits < 4 || bits > 20) {
+ errno = ERANGE;
+ return -1;
+ }
+
+ hll->bits = bits;
+ hll->size = (size_t)1 << bits;
+ hll->registers = ndpi_calloc(hll->size, 1);
+
+ /* printf("%lu bytes\n", hll->size); */
+ return 0;
+}
+
+void hll_destroy(struct ndpi_hll *hll) {
+ ndpi_free(hll->registers);
+
+ hll->registers = NULL;
+}
+
+static __inline void _hll_add_hash(struct ndpi_hll *hll, u_int32_t hash) {
+ u_int32_t index = hash >> (32 - hll->bits);
+ u_int8_t rank = _hll_rank(hash, hll->bits);
+
+ if(rank > hll->registers[index]) {
+ hll->registers[index] = rank;
+ }
+}
+
+void hll_add(struct ndpi_hll *hll, const void *buf, size_t size) {
+ u_int32_t hash = MurmurHash3_x86_32((const char *)buf, (u_int32_t)size, 0x5f61767a);
+
+ _hll_add_hash(hll, hash);
+}
+
+double hll_count(const struct ndpi_hll *hll) {
+ double alpha_mm;
+ u_int32_t i;
+
+ switch (hll->bits) {
+ case 4:
+ alpha_mm = 0.673;
+ break;
+ case 5:
+ alpha_mm = 0.697;
+ break;
+ case 6:
+ alpha_mm = 0.709;
+ break;
+ default:
+ alpha_mm = 0.7213 / (1.0 + 1.079 / (double)hll->size);
+ break;
+ }
+
+ alpha_mm *= ((double)hll->size * (double)hll->size);
+
+ double sum = 0;
+ for(i = 0; i < hll->size; i++) {
+ sum += 1.0 / (1 << hll->registers[i]);
+ }
+
+ double estimate = alpha_mm / sum;
+
+ if (estimate <= 5.0 / 2.0 * (double)hll->size) {
+ int zeros = 0;
+
+ for(i = 0; i < hll->size; i++)
+ zeros += (hll->registers[i] == 0);
+
+ if(zeros)
+ estimate = (double)hll->size * log((double)hll->size / zeros);
+
+ } else if (estimate > (1.0 / 30.0) * 4294967296.0) {
+ estimate = -4294967296.0 * log(1.0 - (estimate / 4294967296.0));
+ }
+
+ return estimate;
+}
+