From 21e2e576146be8f4451d703c3e54a8208c4e4701 Mon Sep 17 00:00:00 2001 From: Luca Deri Date: Fri, 24 Jul 2020 01:30:58 +0200 Subject: Fixed bin similarity --- src/lib/ndpi_analyze.c | 95 ++++++++++++++++++++++++-------------------------- 1 file changed, 45 insertions(+), 50 deletions(-) (limited to 'src') diff --git a/src/lib/ndpi_analyze.c b/src/lib/ndpi_analyze.c index 035b49a6a..26f2ae041 100644 --- a/src/lib/ndpi_analyze.c +++ b/src/lib/ndpi_analyze.c @@ -583,8 +583,11 @@ float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2, u_int8_t nor for(i=0; inum_bins; i++) { u_int32_t a = ndpi_get_bin_value(b1, i); u_int32_t b = ndpi_get_bin_value(b2, i); + u_int32_t diff = (a > b) ? (a - b) : (b - a); + + if(a != b) sum += pow(diff, 2); - sum += pow(a-b, 2); + // printf("[a: %u][b: %u][sum: %u]\n", a, b, sum); } /* The lower the more similar */ @@ -614,7 +617,9 @@ int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins, char out_buf[256]; float *bin_score; u_int16_t num_cluster_elems[MAX_NUM_CLUSTERS] = { 0 }; - + + srand(time(NULL)); + if(num_clusters > num_bins) num_clusters = num_bins; if(num_clusters > MAX_NUM_CLUSTERS) num_clusters = MAX_NUM_CLUSTERS; @@ -623,7 +628,7 @@ int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins, if((bin_score = (float*)ndpi_calloc(num_bins, sizeof(float))) == NULL) return(-2); - + if(centroids == NULL) { alloc_centroids = 1; @@ -640,44 +645,16 @@ int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins, memset(cluster_ids, 0, sizeof(u_int16_t) * num_bins); /* Randomly pick a cluster id */ - for(i=0; i best_similarity) -#else - if(similarity < best_similarity) -#endif - cluster_id = j, best_similarity = similarity; - } + cluster_ids[i] = cluster_id; if(verbose) - printf("Assigned bin to cluster %u: %s [score: %f]\n", cluster_id, - ndpi_print_bin(&bins[i], 0, out_buf, sizeof(out_buf)), best_similarity); + printf("Initializing cluster %u for bin %u: %s\n", + cluster_id, i, + ndpi_print_bin(&bins[i], 0, out_buf, sizeof(out_buf))); - cluster_ids[i] = cluster_id; num_cluster_elems[cluster_id]++; } @@ -685,16 +662,17 @@ int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins, /* Now let's try to find a better arrangement */ while(num_iterations++ < max_iterations) { - /* Find the center of each cluster */ - memset(bin_score, 0, num_bins*sizeof(float)); + + /* Compute the centroids for each cluster */ + memset(bin_score, 0, num_bins*sizeof(float)); if(verbose) { printf("\nIteration %u\n", num_iterations); - + for(j=0; j 1)) { + /* + In case of identical similarity let's leave things as they are + this unless this is a cluster with only one element + */ + cluster_id = cluster_ids[i]; + } - if(/* (best_similarity > 0) && */ (cluster_ids[i] != cluster_id)) { + bin_score[i] = best_similarity; + + if(cluster_ids[i] != cluster_id) { if(verbose) printf("Moved bin %u from cluster %u -> %u [similarity: %f]\n", i, cluster_ids[i], cluster_id, best_similarity); @@ -770,14 +763,15 @@ int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins, printf("Cluster %u: %u bins\n", j, num_cluster_elems[j]); } +#if 0 for(j=0; j score) && (num_cluster_elems[cluster_ids[i]] > 1)) - score = bin_score[i], candidate = i; + score = bin_score[i], candidate = i; } #endif - + if(verbose) printf("Rebalance: moving bin %u from cluster %u -> %u [similarity: %f]\n", candidate, cluster_ids[candidate], j, score); @@ -803,6 +797,7 @@ int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins, cluster_ids[candidate] = j; } } +#endif } /* while(...) */ if(alloc_centroids) { @@ -813,7 +808,7 @@ int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins, } ndpi_free(bin_score); - + return(0); } -- cgit v1.2.3