From 439558f6a3c2bd449fba5675290abe1c1d308494 Mon Sep 17 00:00:00 2001 From: Luca Deri Date: Wed, 22 Jul 2020 23:24:13 +0200 Subject: Improved bin clustering --- src/lib/ndpi_analyze.c | 112 +++++++++++++++++++++++++++++++++++++++---------- src/lib/ndpi_main.c | 4 +- 2 files changed, 91 insertions(+), 25 deletions(-) (limited to 'src') diff --git a/src/lib/ndpi_analyze.c b/src/lib/ndpi_analyze.c index a2fe557ae..035b49a6a 100644 --- a/src/lib/ndpi_analyze.c +++ b/src/lib/ndpi_analyze.c @@ -78,7 +78,7 @@ void ndpi_free_data_analysis(struct ndpi_analyze_struct *d) { void ndpi_reset_data_analysis(struct ndpi_analyze_struct *d) { memset(d, 0, sizeof(struct ndpi_analyze_struct)); memset(d->values, 0, sizeof(u_int32_t)*d->num_values_array_len); - d->num_data_entries = 0; + d->num_data_entries = 0; } /* ********************************************************************************* */ @@ -123,11 +123,11 @@ float ndpi_data_average(struct ndpi_analyze_struct *s) { u_int32_t ndpi_data_last(struct ndpi_analyze_struct *s) { if((s->num_data_entries == 0) || (s->sum_total == 0)) return(0); - + if(s->next_value_insert_index == 0) return(s->values[s->num_values_array_len-1]); else - return(s->values[s->next_value_insert_index-1]); + return(s->values[s->next_value_insert_index-1]); } /* Return min/max on all values */ @@ -164,7 +164,7 @@ float ndpi_data_window_average(struct ndpi_analyze_struct *s) { if(n == 0) return(0); - + for(i=0; ivalues[i]; @@ -183,7 +183,7 @@ float ndpi_data_window_variance(struct ndpi_analyze_struct *s) { if(n == 0) return(0); - + for(i=0; ivalues[i]-avg, 2); @@ -393,7 +393,7 @@ void ndpi_set_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t val) { void ndpi_inc_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t val) { b->is_empty = 0; - + if(slot_id >= b->num_bins) slot_id = 0; switch(b->family) { @@ -456,7 +456,7 @@ void ndpi_normalize_bin(struct ndpi_bin *b) { u_int32_t tot = 0; if(b->is_empty) return; - + switch(b->family) { case ndpi_bin_family8: for(i=0; inum_bins; i++) tot += b->u.bins8[i]; @@ -567,10 +567,10 @@ float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2, u_int8_t nor for(i=0; inum_bins; i++) { u_int32_t a = ndpi_get_bin_value(b1, i); u_int32_t b = ndpi_get_bin_value(b2, i); - + sumxx += a*a, sumyy += b*b, sumxy += a*b; } - + if((sumxx == 0) || (sumyy == 0)) return(0); else @@ -586,7 +586,7 @@ float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2, u_int8_t nor sum += pow(a-b, 2); } - + /* The lower the more similar */ return(sqrt(sum)); } @@ -595,6 +595,8 @@ float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2, u_int8_t nor /* ********************************************************************************* */ +#define MAX_NUM_CLUSTERS 128 + /* Clusters bins into 'num_clusters' - (in) bins: a vection 'num_bins' long of bins to cluster @@ -610,18 +612,25 @@ int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins, u_int16_t i, j, max_iterations = 25, num_iterations, num_moves; u_int8_t verbose = 0, alloc_centroids = 0; char out_buf[256]; - - if(num_clusters > num_bins) num_clusters = num_bins; + float *bin_score; + u_int16_t num_cluster_elems[MAX_NUM_CLUSTERS] = { 0 }; + + if(num_clusters > num_bins) num_clusters = num_bins; + if(num_clusters > MAX_NUM_CLUSTERS) num_clusters = MAX_NUM_CLUSTERS; if(verbose) printf("Distributing %u bins over %u clusters\n", num_bins, num_clusters); + if((bin_score = (float*)ndpi_calloc(num_bins, sizeof(float))) == NULL) + return(-2); + if(centroids == NULL) { alloc_centroids = 1; - if((centroids = (struct ndpi_bin*)ndpi_malloc(sizeof(struct ndpi_bin)*num_clusters)) == NULL) + if((centroids = (struct ndpi_bin*)ndpi_malloc(sizeof(struct ndpi_bin)*num_clusters)) == NULL) { + ndpi_free(bin_score); return(-2); - else { + } else { for(i=0; i best_similarity) + if(similarity > best_similarity) { + cluster_id = j, best_similarity = similarity; + } #else - if(similarity < best_similarity) -#endif + if(similarity < best_similarity) { cluster_id = j, best_similarity = similarity; + } +#endif } + bin_score[i] = best_similarity; + if(/* (best_similarity > 0) && */ (cluster_ids[i] != cluster_id)) { if(verbose) printf("Moved bin %u from cluster %u -> %u [similarity: %f]\n", i, cluster_ids[i], cluster_id, best_similarity); + num_cluster_elems[cluster_ids[i]]--; + num_cluster_elems[cluster_id]++; + cluster_ids[i] = cluster_id; num_moves++; } @@ -739,7 +764,46 @@ int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins, if(num_moves == 0) break; - } + + if(verbose) { + for(j=0; j 1)) + score = bin_score[i], candidate = i; + } +#else + score = 0; + + for(i=0; i score) && (num_cluster_elems[cluster_ids[i]] > 1)) + score = bin_score[i], candidate = i; + } +#endif + + if(verbose) + printf("Rebalance: moving bin %u from cluster %u -> %u [similarity: %f]\n", + candidate, cluster_ids[candidate], j, score); + + num_cluster_elems[cluster_ids[candidate]]--; + num_cluster_elems[j]++; + cluster_ids[candidate] = j; + } + } + } /* while(...) */ if(alloc_centroids) { for(i=0; i