aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLuca Deri <deri@ntop.org>2020-07-22 23:24:13 +0200
committerLuca Deri <deri@ntop.org>2020-07-22 23:56:50 +0200
commit439558f6a3c2bd449fba5675290abe1c1d308494 (patch)
treef24f3b173f6314ee5c86f0a690f509c34f3a9cb5
parent0956badd11b8a046b30aa62be07c225c6d54f7ff (diff)
Improved bin clustering
-rw-r--r--example/ndpiReader.c2
-rw-r--r--src/lib/ndpi_analyze.c112
-rw-r--r--src/lib/ndpi_main.c4
3 files changed, 92 insertions, 26 deletions
diff --git a/example/ndpiReader.c b/example/ndpiReader.c
index 4f1767846..8071f2fc5 100644
--- a/example/ndpiReader.c
+++ b/example/ndpiReader.c
@@ -2509,7 +2509,7 @@ static void printFlowsStats() {
all_flows[i].flow->detected_protocol, buf, sizeof(buf)),
all_flows[i].flow->src_name,
ntohs(all_flows[i].flow->src_port),
- all_flows[i].flow->src_name,
+ all_flows[i].flow->dst_name,
ntohs(all_flows[i].flow->dst_port));
print_bin(out, NULL, &bins[i]);
diff --git a/src/lib/ndpi_analyze.c b/src/lib/ndpi_analyze.c
index a2fe557ae..035b49a6a 100644
--- a/src/lib/ndpi_analyze.c
+++ b/src/lib/ndpi_analyze.c
@@ -78,7 +78,7 @@ void ndpi_free_data_analysis(struct ndpi_analyze_struct *d) {
void ndpi_reset_data_analysis(struct ndpi_analyze_struct *d) {
memset(d, 0, sizeof(struct ndpi_analyze_struct));
memset(d->values, 0, sizeof(u_int32_t)*d->num_values_array_len);
- d->num_data_entries = 0;
+ d->num_data_entries = 0;
}
/* ********************************************************************************* */
@@ -123,11 +123,11 @@ float ndpi_data_average(struct ndpi_analyze_struct *s) {
u_int32_t ndpi_data_last(struct ndpi_analyze_struct *s) {
if((s->num_data_entries == 0) || (s->sum_total == 0))
return(0);
-
+
if(s->next_value_insert_index == 0)
return(s->values[s->num_values_array_len-1]);
else
- return(s->values[s->next_value_insert_index-1]);
+ return(s->values[s->next_value_insert_index-1]);
}
/* Return min/max on all values */
@@ -164,7 +164,7 @@ float ndpi_data_window_average(struct ndpi_analyze_struct *s) {
if(n == 0)
return(0);
-
+
for(i=0; i<n; i++)
sum += s->values[i];
@@ -183,7 +183,7 @@ float ndpi_data_window_variance(struct ndpi_analyze_struct *s) {
if(n == 0)
return(0);
-
+
for(i=0; i<n; i++)
sum += pow(s->values[i]-avg, 2);
@@ -393,7 +393,7 @@ void ndpi_set_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t val) {
void ndpi_inc_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t val) {
b->is_empty = 0;
-
+
if(slot_id >= b->num_bins) slot_id = 0;
switch(b->family) {
@@ -456,7 +456,7 @@ void ndpi_normalize_bin(struct ndpi_bin *b) {
u_int32_t tot = 0;
if(b->is_empty) return;
-
+
switch(b->family) {
case ndpi_bin_family8:
for(i=0; i<b->num_bins; i++) tot += b->u.bins8[i];
@@ -567,10 +567,10 @@ float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2, u_int8_t nor
for(i=0; i<b1->num_bins; i++) {
u_int32_t a = ndpi_get_bin_value(b1, i);
u_int32_t b = ndpi_get_bin_value(b2, i);
-
+
sumxx += a*a, sumyy += b*b, sumxy += a*b;
}
-
+
if((sumxx == 0) || (sumyy == 0))
return(0);
else
@@ -586,7 +586,7 @@ float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2, u_int8_t nor
sum += pow(a-b, 2);
}
-
+
/* The lower the more similar */
return(sqrt(sum));
}
@@ -595,6 +595,8 @@ float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2, u_int8_t nor
/* ********************************************************************************* */
+#define MAX_NUM_CLUSTERS 128
+
/*
Clusters bins into 'num_clusters'
- (in) bins: a vection 'num_bins' long of bins to cluster
@@ -610,18 +612,25 @@ int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins,
u_int16_t i, j, max_iterations = 25, num_iterations, num_moves;
u_int8_t verbose = 0, alloc_centroids = 0;
char out_buf[256];
-
- if(num_clusters > num_bins) num_clusters = num_bins;
+ float *bin_score;
+ u_int16_t num_cluster_elems[MAX_NUM_CLUSTERS] = { 0 };
+
+ if(num_clusters > num_bins) num_clusters = num_bins;
+ if(num_clusters > MAX_NUM_CLUSTERS) num_clusters = MAX_NUM_CLUSTERS;
if(verbose)
printf("Distributing %u bins over %u clusters\n", num_bins, num_clusters);
+ if((bin_score = (float*)ndpi_calloc(num_bins, sizeof(float))) == NULL)
+ return(-2);
+
if(centroids == NULL) {
alloc_centroids = 1;
- if((centroids = (struct ndpi_bin*)ndpi_malloc(sizeof(struct ndpi_bin)*num_clusters)) == NULL)
+ if((centroids = (struct ndpi_bin*)ndpi_malloc(sizeof(struct ndpi_bin)*num_clusters)) == NULL) {
+ ndpi_free(bin_score);
return(-2);
- else {
+ } else {
for(i=0; i<num_clusters; i++)
ndpi_init_bin(&centroids[i], ndpi_bin_family32 /* Use 32 bit to avoid overlaps */, bins[0].num_bins);
}
@@ -638,6 +647,7 @@ int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins,
printf("Initializing cluster %u: %s\n", i,
ndpi_print_bin(&bins[i], 0, out_buf, sizeof(out_buf)));
+ num_cluster_elems[i]++;
}
/* Assign the remaining bins to the nearest cluster */
@@ -668,6 +678,7 @@ int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins,
ndpi_print_bin(&bins[i], 0, out_buf, sizeof(out_buf)), best_similarity);
cluster_ids[i] = cluster_id;
+ num_cluster_elems[cluster_id]++;
}
num_iterations = 0;
@@ -675,9 +686,15 @@ int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins,
/* Now let's try to find a better arrangement */
while(num_iterations++ < max_iterations) {
/* Find the center of each cluster */
+ memset(bin_score, 0, num_bins*sizeof(float));
- if(verbose) printf("Iteration %u\n", num_iterations);
-
+ if(verbose) {
+ printf("\nIteration %u\n", num_iterations);
+
+ for(j=0; j<num_clusters; j++)
+ printf("Cluster %u: %u bins\n", j, num_cluster_elems[j]);
+ }
+
for(i=0; i<num_clusters; i++)
ndpi_reset_bin(&centroids[i]);
@@ -704,34 +721,42 @@ int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins,
u_int8_t cluster_id = 0;
#ifdef COSINE_SIMILARITY
- best_similarity = -1;
+ best_similarity = -1;
#else
- best_similarity = 99999999999;
+ best_similarity = 99999999999;
#endif
for(j=0; j<num_clusters; j++) {
float similarity;
if(centroids[j].is_empty) continue;
-
+
similarity = ndpi_bin_similarity(&bins[i], &centroids[j], 0);
if(verbose)
printf("Bin %u / centroid %u [similarity: %f]\n", i, j, similarity);
#ifdef COSINE_SIMILARITY
- if(similarity > best_similarity)
+ if(similarity > best_similarity) {
+ cluster_id = j, best_similarity = similarity;
+ }
#else
- if(similarity < best_similarity)
-#endif
+ if(similarity < best_similarity) {
cluster_id = j, best_similarity = similarity;
+ }
+#endif
}
+ bin_score[i] = best_similarity;
+
if(/* (best_similarity > 0) && */ (cluster_ids[i] != cluster_id)) {
if(verbose)
printf("Moved bin %u from cluster %u -> %u [similarity: %f]\n",
i, cluster_ids[i], cluster_id, best_similarity);
+ num_cluster_elems[cluster_ids[i]]--;
+ num_cluster_elems[cluster_id]++;
+
cluster_ids[i] = cluster_id;
num_moves++;
}
@@ -739,7 +764,46 @@ int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins,
if(num_moves == 0)
break;
- }
+
+ if(verbose) {
+ for(j=0; j<num_clusters; j++)
+ printf("Cluster %u: %u bins\n", j, num_cluster_elems[j]);
+ }
+
+ for(j=0; j<num_clusters; j++) {
+ if(num_cluster_elems[j] == 0) {
+ u_int16_t candidate;
+ float score;
+
+ if(verbose)
+ printf("\nCluster %u is empty: need to rebalance\n", j);
+
+#ifdef COSINE_SIMILARITY
+ score = 99999999999;
+
+ for(i=0; i<num_bins; i++) {
+ if((cluster_ids[i] != j) && (bin_score[i] < score) && (num_cluster_elems[cluster_ids[i]] > 1))
+ score = bin_score[i], candidate = i;
+ }
+#else
+ score = 0;
+
+ for(i=0; i<num_bins; i++) {
+ if((cluster_ids[i] != j) && (bin_score[i] > score) && (num_cluster_elems[cluster_ids[i]] > 1))
+ score = bin_score[i], candidate = i;
+ }
+#endif
+
+ if(verbose)
+ printf("Rebalance: moving bin %u from cluster %u -> %u [similarity: %f]\n",
+ candidate, cluster_ids[candidate], j, score);
+
+ num_cluster_elems[cluster_ids[candidate]]--;
+ num_cluster_elems[j]++;
+ cluster_ids[candidate] = j;
+ }
+ }
+ } /* while(...) */
if(alloc_centroids) {
for(i=0; i<num_clusters; i++)
@@ -748,6 +812,8 @@ int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins,
ndpi_free(centroids);
}
+ ndpi_free(bin_score);
+
return(0);
}
diff --git a/src/lib/ndpi_main.c b/src/lib/ndpi_main.c
index b2f294c0b..b497bbaf8 100644
--- a/src/lib/ndpi_main.c
+++ b/src/lib/ndpi_main.c
@@ -1035,8 +1035,8 @@ static void ndpi_init_protocol_defaults(struct ndpi_detection_module_struct *ndp
ndpi_build_default_ports(ports_a, 5900, 5901, 5800, 0, 0) /* TCP */,
ndpi_build_default_ports(ports_b, 0, 0, 0, 0, 0) /* UDP */);
ndpi_set_proto_defaults(ndpi_str, NDPI_PROTOCOL_ACCEPTABLE, NDPI_PROTOCOL_FREE90, 0 /* can_have_a_subprotocol */,
- no_master, no_master, "Free90", NDPI_PROTOCOL_CATEGORY_REMOTE_ACCESS,
- ndpi_build_default_ports(ports_a, 5900, 5901, 5800, 0, 0) /* TCP */,
+ no_master, no_master, "FREE_90", NDPI_PROTOCOL_CATEGORY_REMOTE_ACCESS,
+ ndpi_build_default_ports(ports_a, 0, 0, 0, 0, 0) /* TCP */,
ndpi_build_default_ports(ports_b, 0, 0, 0, 0, 0) /* UDP */);
ndpi_set_proto_defaults(ndpi_str, NDPI_PROTOCOL_ACCEPTABLE, NDPI_PROTOCOL_ZOOM, 0 /* can_have_a_subprotocol */,
no_master, no_master, "Zoom", NDPI_PROTOCOL_CATEGORY_VIDEO,