diff options
author | Luca Deri <deri@ntop.org> | 2020-07-22 23:24:13 +0200 |
---|---|---|
committer | Luca Deri <deri@ntop.org> | 2020-07-22 23:56:50 +0200 |
commit | 439558f6a3c2bd449fba5675290abe1c1d308494 (patch) | |
tree | f24f3b173f6314ee5c86f0a690f509c34f3a9cb5 | |
parent | 0956badd11b8a046b30aa62be07c225c6d54f7ff (diff) |
Improved bin clustering
-rw-r--r-- | example/ndpiReader.c | 2 | ||||
-rw-r--r-- | src/lib/ndpi_analyze.c | 112 | ||||
-rw-r--r-- | src/lib/ndpi_main.c | 4 |
3 files changed, 92 insertions, 26 deletions
diff --git a/example/ndpiReader.c b/example/ndpiReader.c index 4f1767846..8071f2fc5 100644 --- a/example/ndpiReader.c +++ b/example/ndpiReader.c @@ -2509,7 +2509,7 @@ static void printFlowsStats() { all_flows[i].flow->detected_protocol, buf, sizeof(buf)), all_flows[i].flow->src_name, ntohs(all_flows[i].flow->src_port), - all_flows[i].flow->src_name, + all_flows[i].flow->dst_name, ntohs(all_flows[i].flow->dst_port)); print_bin(out, NULL, &bins[i]); diff --git a/src/lib/ndpi_analyze.c b/src/lib/ndpi_analyze.c index a2fe557ae..035b49a6a 100644 --- a/src/lib/ndpi_analyze.c +++ b/src/lib/ndpi_analyze.c @@ -78,7 +78,7 @@ void ndpi_free_data_analysis(struct ndpi_analyze_struct *d) { void ndpi_reset_data_analysis(struct ndpi_analyze_struct *d) { memset(d, 0, sizeof(struct ndpi_analyze_struct)); memset(d->values, 0, sizeof(u_int32_t)*d->num_values_array_len); - d->num_data_entries = 0; + d->num_data_entries = 0; } /* ********************************************************************************* */ @@ -123,11 +123,11 @@ float ndpi_data_average(struct ndpi_analyze_struct *s) { u_int32_t ndpi_data_last(struct ndpi_analyze_struct *s) { if((s->num_data_entries == 0) || (s->sum_total == 0)) return(0); - + if(s->next_value_insert_index == 0) return(s->values[s->num_values_array_len-1]); else - return(s->values[s->next_value_insert_index-1]); + return(s->values[s->next_value_insert_index-1]); } /* Return min/max on all values */ @@ -164,7 +164,7 @@ float ndpi_data_window_average(struct ndpi_analyze_struct *s) { if(n == 0) return(0); - + for(i=0; i<n; i++) sum += s->values[i]; @@ -183,7 +183,7 @@ float ndpi_data_window_variance(struct ndpi_analyze_struct *s) { if(n == 0) return(0); - + for(i=0; i<n; i++) sum += pow(s->values[i]-avg, 2); @@ -393,7 +393,7 @@ void ndpi_set_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t val) { void ndpi_inc_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t val) { b->is_empty = 0; - + if(slot_id >= b->num_bins) slot_id = 0; switch(b->family) { @@ -456,7 +456,7 @@ void ndpi_normalize_bin(struct ndpi_bin *b) { u_int32_t tot = 0; if(b->is_empty) return; - + switch(b->family) { case ndpi_bin_family8: for(i=0; i<b->num_bins; i++) tot += b->u.bins8[i]; @@ -567,10 +567,10 @@ float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2, u_int8_t nor for(i=0; i<b1->num_bins; i++) { u_int32_t a = ndpi_get_bin_value(b1, i); u_int32_t b = ndpi_get_bin_value(b2, i); - + sumxx += a*a, sumyy += b*b, sumxy += a*b; } - + if((sumxx == 0) || (sumyy == 0)) return(0); else @@ -586,7 +586,7 @@ float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2, u_int8_t nor sum += pow(a-b, 2); } - + /* The lower the more similar */ return(sqrt(sum)); } @@ -595,6 +595,8 @@ float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2, u_int8_t nor /* ********************************************************************************* */ +#define MAX_NUM_CLUSTERS 128 + /* Clusters bins into 'num_clusters' - (in) bins: a vection 'num_bins' long of bins to cluster @@ -610,18 +612,25 @@ int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins, u_int16_t i, j, max_iterations = 25, num_iterations, num_moves; u_int8_t verbose = 0, alloc_centroids = 0; char out_buf[256]; - - if(num_clusters > num_bins) num_clusters = num_bins; + float *bin_score; + u_int16_t num_cluster_elems[MAX_NUM_CLUSTERS] = { 0 }; + + if(num_clusters > num_bins) num_clusters = num_bins; + if(num_clusters > MAX_NUM_CLUSTERS) num_clusters = MAX_NUM_CLUSTERS; if(verbose) printf("Distributing %u bins over %u clusters\n", num_bins, num_clusters); + if((bin_score = (float*)ndpi_calloc(num_bins, sizeof(float))) == NULL) + return(-2); + if(centroids == NULL) { alloc_centroids = 1; - if((centroids = (struct ndpi_bin*)ndpi_malloc(sizeof(struct ndpi_bin)*num_clusters)) == NULL) + if((centroids = (struct ndpi_bin*)ndpi_malloc(sizeof(struct ndpi_bin)*num_clusters)) == NULL) { + ndpi_free(bin_score); return(-2); - else { + } else { for(i=0; i<num_clusters; i++) ndpi_init_bin(¢roids[i], ndpi_bin_family32 /* Use 32 bit to avoid overlaps */, bins[0].num_bins); } @@ -638,6 +647,7 @@ int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins, printf("Initializing cluster %u: %s\n", i, ndpi_print_bin(&bins[i], 0, out_buf, sizeof(out_buf))); + num_cluster_elems[i]++; } /* Assign the remaining bins to the nearest cluster */ @@ -668,6 +678,7 @@ int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins, ndpi_print_bin(&bins[i], 0, out_buf, sizeof(out_buf)), best_similarity); cluster_ids[i] = cluster_id; + num_cluster_elems[cluster_id]++; } num_iterations = 0; @@ -675,9 +686,15 @@ int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins, /* Now let's try to find a better arrangement */ while(num_iterations++ < max_iterations) { /* Find the center of each cluster */ + memset(bin_score, 0, num_bins*sizeof(float)); - if(verbose) printf("Iteration %u\n", num_iterations); - + if(verbose) { + printf("\nIteration %u\n", num_iterations); + + for(j=0; j<num_clusters; j++) + printf("Cluster %u: %u bins\n", j, num_cluster_elems[j]); + } + for(i=0; i<num_clusters; i++) ndpi_reset_bin(¢roids[i]); @@ -704,34 +721,42 @@ int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins, u_int8_t cluster_id = 0; #ifdef COSINE_SIMILARITY - best_similarity = -1; + best_similarity = -1; #else - best_similarity = 99999999999; + best_similarity = 99999999999; #endif for(j=0; j<num_clusters; j++) { float similarity; if(centroids[j].is_empty) continue; - + similarity = ndpi_bin_similarity(&bins[i], ¢roids[j], 0); if(verbose) printf("Bin %u / centroid %u [similarity: %f]\n", i, j, similarity); #ifdef COSINE_SIMILARITY - if(similarity > best_similarity) + if(similarity > best_similarity) { + cluster_id = j, best_similarity = similarity; + } #else - if(similarity < best_similarity) -#endif + if(similarity < best_similarity) { cluster_id = j, best_similarity = similarity; + } +#endif } + bin_score[i] = best_similarity; + if(/* (best_similarity > 0) && */ (cluster_ids[i] != cluster_id)) { if(verbose) printf("Moved bin %u from cluster %u -> %u [similarity: %f]\n", i, cluster_ids[i], cluster_id, best_similarity); + num_cluster_elems[cluster_ids[i]]--; + num_cluster_elems[cluster_id]++; + cluster_ids[i] = cluster_id; num_moves++; } @@ -739,7 +764,46 @@ int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins, if(num_moves == 0) break; - } + + if(verbose) { + for(j=0; j<num_clusters; j++) + printf("Cluster %u: %u bins\n", j, num_cluster_elems[j]); + } + + for(j=0; j<num_clusters; j++) { + if(num_cluster_elems[j] == 0) { + u_int16_t candidate; + float score; + + if(verbose) + printf("\nCluster %u is empty: need to rebalance\n", j); + +#ifdef COSINE_SIMILARITY + score = 99999999999; + + for(i=0; i<num_bins; i++) { + if((cluster_ids[i] != j) && (bin_score[i] < score) && (num_cluster_elems[cluster_ids[i]] > 1)) + score = bin_score[i], candidate = i; + } +#else + score = 0; + + for(i=0; i<num_bins; i++) { + if((cluster_ids[i] != j) && (bin_score[i] > score) && (num_cluster_elems[cluster_ids[i]] > 1)) + score = bin_score[i], candidate = i; + } +#endif + + if(verbose) + printf("Rebalance: moving bin %u from cluster %u -> %u [similarity: %f]\n", + candidate, cluster_ids[candidate], j, score); + + num_cluster_elems[cluster_ids[candidate]]--; + num_cluster_elems[j]++; + cluster_ids[candidate] = j; + } + } + } /* while(...) */ if(alloc_centroids) { for(i=0; i<num_clusters; i++) @@ -748,6 +812,8 @@ int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins, ndpi_free(centroids); } + ndpi_free(bin_score); + return(0); } diff --git a/src/lib/ndpi_main.c b/src/lib/ndpi_main.c index b2f294c0b..b497bbaf8 100644 --- a/src/lib/ndpi_main.c +++ b/src/lib/ndpi_main.c @@ -1035,8 +1035,8 @@ static void ndpi_init_protocol_defaults(struct ndpi_detection_module_struct *ndp ndpi_build_default_ports(ports_a, 5900, 5901, 5800, 0, 0) /* TCP */, ndpi_build_default_ports(ports_b, 0, 0, 0, 0, 0) /* UDP */); ndpi_set_proto_defaults(ndpi_str, NDPI_PROTOCOL_ACCEPTABLE, NDPI_PROTOCOL_FREE90, 0 /* can_have_a_subprotocol */, - no_master, no_master, "Free90", NDPI_PROTOCOL_CATEGORY_REMOTE_ACCESS, - ndpi_build_default_ports(ports_a, 5900, 5901, 5800, 0, 0) /* TCP */, + no_master, no_master, "FREE_90", NDPI_PROTOCOL_CATEGORY_REMOTE_ACCESS, + ndpi_build_default_ports(ports_a, 0, 0, 0, 0, 0) /* TCP */, ndpi_build_default_ports(ports_b, 0, 0, 0, 0, 0) /* UDP */); ndpi_set_proto_defaults(ndpi_str, NDPI_PROTOCOL_ACCEPTABLE, NDPI_PROTOCOL_ZOOM, 0 /* can_have_a_subprotocol */, no_master, no_master, "Zoom", NDPI_PROTOCOL_CATEGORY_VIDEO, |