From 1c60c22893e465da75b825ce4bab80ca018e9104 Mon Sep 17 00:00:00 2001 From: Luca Deri Date: Tue, 7 Jul 2020 15:10:15 +0200 Subject: Added ndpi_cluster_bins() for clustering bins and ancillary functions for bins manipulation --- src/include/ndpi_api.h.in | 12 ++-- src/lib/ndpi_analyze.c | 175 ++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 177 insertions(+), 10 deletions(-) (limited to 'src') diff --git a/src/include/ndpi_api.h.in b/src/include/ndpi_api.h.in index 865ddc8dd..94f5f54fe 100644 --- a/src/include/ndpi_api.h.in +++ b/src/include/ndpi_api.h.in @@ -931,7 +931,7 @@ extern "C" { int ndpi_check_dga_name(struct ndpi_detection_module_struct *ndpi_str, struct ndpi_flow_struct *flow, char *name); - + /* Serializer */ int ndpi_init_serializer_ll(ndpi_serializer *serializer, ndpi_serialization_format fmt, u_int32_t buffer_size); @@ -1067,14 +1067,18 @@ extern "C" { double ndpi_hll_count(struct ndpi_hll *hll); /* ******************************* */ - + int ndpi_init_bin(struct ndpi_bin *b, enum ndpi_bin_family f, u_int8_t num_bins); void ndpi_free_bin(struct ndpi_bin *b); - void ndpi_inc_bin(struct ndpi_bin *b, u_int8_t slot_id); + void ndpi_inc_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t val); + void ndpi_set_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t value); + u_int32_t ndpi_get_bin_value(struct ndpi_bin *b, u_int8_t slot_id); + void ndpi_reset_bin(struct ndpi_bin *b); void ndpi_normalize_bin(struct ndpi_bin *b); char* ndpi_print_bin(struct ndpi_bin *b, u_int8_t normalize_first, char *out_buf, u_int out_buf_len); float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2, u_int8_t normalize_first); - + int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins, + u_int8_t num_clusters, u_int16_t *cluster_ids); #ifdef __cplusplus } diff --git a/src/lib/ndpi_analyze.c b/src/lib/ndpi_analyze.c index 4ca3ac25a..74adbb2c9 100644 --- a/src/lib/ndpi_analyze.c +++ b/src/lib/ndpi_analyze.c @@ -284,26 +284,81 @@ void ndpi_free_bin(struct ndpi_bin *b) { /* ********************************************************************************* */ -void ndpi_inc_bin(struct ndpi_bin *b, u_int8_t slot_id) { +void ndpi_set_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t val) { if(slot_id >= b->num_bins) slot_id = 0; - b->num_incs += 1; + b->num_incs += val; switch(b->family) { case ndpi_bin_family8: - b->u.bins8[slot_id]++; + b->u.bins8[slot_id] = (u_int8_t)val; break; case ndpi_bin_family16: - b->u.bins16[slot_id]++; + b->u.bins16[slot_id] = (u_int16_t)val; break; case ndpi_bin_family32: - b->u.bins32[slot_id]++; + b->u.bins32[slot_id] = (u_int32_t)val; break; } } /* ********************************************************************************* */ +void ndpi_inc_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t val) { + if(slot_id >= b->num_bins) slot_id = 0; + + b->num_incs += val; + + switch(b->family) { + case ndpi_bin_family8: + b->u.bins8[slot_id] += (u_int8_t)val; + break; + case ndpi_bin_family16: + b->u.bins16[slot_id] += (u_int16_t)val; + break; + case ndpi_bin_family32: + b->u.bins32[slot_id] += (u_int32_t)val; + break; + } +} + +/* ********************************************************************************* */ + +u_int32_t ndpi_get_bin_value(struct ndpi_bin *b, u_int8_t slot_id) { + if(slot_id >= b->num_bins) slot_id = 0; + + switch(b->family) { + case ndpi_bin_family8: + return(b->u.bins8[slot_id]); + break; + case ndpi_bin_family16: + return(b->u.bins16[slot_id]); + break; + case ndpi_bin_family32: + return(b->u.bins32[slot_id]); + break; + } +} + +/* ********************************************************************************* */ + +void ndpi_reset_bin(struct ndpi_bin *b) { + b->num_incs = 0; + + switch(b->family) { + case ndpi_bin_family8: + memset(b->u.bins8, 0, sizeof(u_int8_t)*b->num_bins); + break; + case ndpi_bin_family16: + memset(b->u.bins16, 0, sizeof(u_int16_t)*b->num_bins); + break; + case ndpi_bin_family32: + memset(b->u.bins32, 0, sizeof(u_int32_t)*b->num_bins); + break; + } +} +/* ********************************************************************************* */ + /* Each bin slot is transformed in a % with respect to the value total */ @@ -387,7 +442,8 @@ float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2, u_int8_t nor u_int32_t sumxx = 0, sumxy = 0, sumyy = 0; if((b1->num_incs == 0) || (b2->num_incs == 0) - || (b1->family != b2->family) || (b1->num_bins != b2->num_bins)) + // || (b1->family != b2->family) + || (b1->num_bins != b2->num_bins)) return(0); if(normalize_first) @@ -412,3 +468,110 @@ float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2, u_int8_t nor } /* ********************************************************************************* */ + +/* + Clusters bins into 'num_clusters' + - (in) bins: a vection 'num_bins' long of bins to cluster + - (in) 'num_clusters': number of desired clusters 0...(num_clusters-1) + - (out) 'cluster_ids': a vector 'num_bins' long containing the id's of each clustered bin + + See + - https://en.wikipedia.org/wiki/K-means_clustering + */ +int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins, + u_int8_t num_clusters, u_int16_t *cluster_ids) { + u_int16_t i, j, max_iterations = 100, num_iterations = 0, num_moves; + struct ndpi_bin *centroids; + u_int8_t verbose = 0; + + if(num_clusters > num_bins) return(-1); + + if((centroids = (struct ndpi_bin*)malloc(sizeof(struct ndpi_bin)*num_clusters)) == NULL) + return(-2); + else { + for(i=0; i top_similarity) + cluster_id = j, top_similarity = similarity; + } + + cluster_ids[i] = cluster_id; + } + + /* Now let's try to find a better arrangement */ + while(num_iterations++ < max_iterations) { + /* Find the center of each cluster */ + + if(verbose) printf("Iteration %u\n", num_iterations); + + for(i=0; i top_similarity) + cluster_id = j, top_similarity = similarity; + } + + if((top_similarity > 0) && (cluster_ids[i] != cluster_id)) { + if(verbose) + printf("Moved bin %u from cluster %u -> %u [similarity: %f]\n", + i, cluster_ids[i], cluster_id, top_similarity); + + cluster_ids[i] = cluster_id; + num_moves++; + } + } + + if(num_moves == 0) + break; + } + + for(i=0; i