aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/include/ndpi_api.h.in12
-rw-r--r--src/lib/ndpi_analyze.c175
2 files changed, 177 insertions, 10 deletions
diff --git a/src/include/ndpi_api.h.in b/src/include/ndpi_api.h.in
index 865ddc8dd..94f5f54fe 100644
--- a/src/include/ndpi_api.h.in
+++ b/src/include/ndpi_api.h.in
@@ -931,7 +931,7 @@ extern "C" {
int ndpi_check_dga_name(struct ndpi_detection_module_struct *ndpi_str,
struct ndpi_flow_struct *flow,
char *name);
-
+
/* Serializer */
int ndpi_init_serializer_ll(ndpi_serializer *serializer, ndpi_serialization_format fmt,
u_int32_t buffer_size);
@@ -1067,14 +1067,18 @@ extern "C" {
double ndpi_hll_count(struct ndpi_hll *hll);
/* ******************************* */
-
+
int ndpi_init_bin(struct ndpi_bin *b, enum ndpi_bin_family f, u_int8_t num_bins);
void ndpi_free_bin(struct ndpi_bin *b);
- void ndpi_inc_bin(struct ndpi_bin *b, u_int8_t slot_id);
+ void ndpi_inc_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t val);
+ void ndpi_set_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t value);
+ u_int32_t ndpi_get_bin_value(struct ndpi_bin *b, u_int8_t slot_id);
+ void ndpi_reset_bin(struct ndpi_bin *b);
void ndpi_normalize_bin(struct ndpi_bin *b);
char* ndpi_print_bin(struct ndpi_bin *b, u_int8_t normalize_first, char *out_buf, u_int out_buf_len);
float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2, u_int8_t normalize_first);
-
+ int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins,
+ u_int8_t num_clusters, u_int16_t *cluster_ids);
#ifdef __cplusplus
}
diff --git a/src/lib/ndpi_analyze.c b/src/lib/ndpi_analyze.c
index 4ca3ac25a..74adbb2c9 100644
--- a/src/lib/ndpi_analyze.c
+++ b/src/lib/ndpi_analyze.c
@@ -284,26 +284,81 @@ void ndpi_free_bin(struct ndpi_bin *b) {
/* ********************************************************************************* */
-void ndpi_inc_bin(struct ndpi_bin *b, u_int8_t slot_id) {
+void ndpi_set_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t val) {
if(slot_id >= b->num_bins) slot_id = 0;
- b->num_incs += 1;
+ b->num_incs += val;
switch(b->family) {
case ndpi_bin_family8:
- b->u.bins8[slot_id]++;
+ b->u.bins8[slot_id] = (u_int8_t)val;
break;
case ndpi_bin_family16:
- b->u.bins16[slot_id]++;
+ b->u.bins16[slot_id] = (u_int16_t)val;
break;
case ndpi_bin_family32:
- b->u.bins32[slot_id]++;
+ b->u.bins32[slot_id] = (u_int32_t)val;
break;
}
}
/* ********************************************************************************* */
+void ndpi_inc_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t val) {
+ if(slot_id >= b->num_bins) slot_id = 0;
+
+ b->num_incs += val;
+
+ switch(b->family) {
+ case ndpi_bin_family8:
+ b->u.bins8[slot_id] += (u_int8_t)val;
+ break;
+ case ndpi_bin_family16:
+ b->u.bins16[slot_id] += (u_int16_t)val;
+ break;
+ case ndpi_bin_family32:
+ b->u.bins32[slot_id] += (u_int32_t)val;
+ break;
+ }
+}
+
+/* ********************************************************************************* */
+
+u_int32_t ndpi_get_bin_value(struct ndpi_bin *b, u_int8_t slot_id) {
+ if(slot_id >= b->num_bins) slot_id = 0;
+
+ switch(b->family) {
+ case ndpi_bin_family8:
+ return(b->u.bins8[slot_id]);
+ break;
+ case ndpi_bin_family16:
+ return(b->u.bins16[slot_id]);
+ break;
+ case ndpi_bin_family32:
+ return(b->u.bins32[slot_id]);
+ break;
+ }
+}
+
+/* ********************************************************************************* */
+
+void ndpi_reset_bin(struct ndpi_bin *b) {
+ b->num_incs = 0;
+
+ switch(b->family) {
+ case ndpi_bin_family8:
+ memset(b->u.bins8, 0, sizeof(u_int8_t)*b->num_bins);
+ break;
+ case ndpi_bin_family16:
+ memset(b->u.bins16, 0, sizeof(u_int16_t)*b->num_bins);
+ break;
+ case ndpi_bin_family32:
+ memset(b->u.bins32, 0, sizeof(u_int32_t)*b->num_bins);
+ break;
+ }
+}
+/* ********************************************************************************* */
+
/*
Each bin slot is transformed in a % with respect to the value total
*/
@@ -387,7 +442,8 @@ float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2, u_int8_t nor
u_int32_t sumxx = 0, sumxy = 0, sumyy = 0;
if((b1->num_incs == 0) || (b2->num_incs == 0)
- || (b1->family != b2->family) || (b1->num_bins != b2->num_bins))
+ // || (b1->family != b2->family)
+ || (b1->num_bins != b2->num_bins))
return(0);
if(normalize_first)
@@ -412,3 +468,110 @@ float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2, u_int8_t nor
}
/* ********************************************************************************* */
+
+/*
+ Clusters bins into 'num_clusters'
+ - (in) bins: a vection 'num_bins' long of bins to cluster
+ - (in) 'num_clusters': number of desired clusters 0...(num_clusters-1)
+ - (out) 'cluster_ids': a vector 'num_bins' long containing the id's of each clustered bin
+
+ See
+ - https://en.wikipedia.org/wiki/K-means_clustering
+ */
+int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins,
+ u_int8_t num_clusters, u_int16_t *cluster_ids) {
+ u_int16_t i, j, max_iterations = 100, num_iterations = 0, num_moves;
+ struct ndpi_bin *centroids;
+ u_int8_t verbose = 0;
+
+ if(num_clusters > num_bins) return(-1);
+
+ if((centroids = (struct ndpi_bin*)malloc(sizeof(struct ndpi_bin)*num_clusters)) == NULL)
+ return(-2);
+ else {
+ for(i=0; i<num_clusters; i++)
+ ndpi_init_bin(&centroids[i], ndpi_bin_family32 /* Use 32 bit to avoid overlaps */, bins[0].num_bins);
+ }
+
+ /* Reset the id's */
+ memset(cluster_ids, 0, sizeof(u_int16_t) * num_bins);
+
+ /* Randomly pick a cluster id */
+ for(i=0; i<num_clusters; i++) cluster_ids[i] = i;
+
+ /* Assign the remaining bins to the nearest cluster */
+ for(i=num_clusters; i<num_bins; i++) {
+ u_int16_t j;
+ float top_similarity = -1;
+ u_int8_t cluster_id;
+
+ for(j=0; j<num_clusters; j++) {
+ float similarity = ndpi_bin_similarity(&bins[i], &bins[j], 0);
+
+ if(similarity > top_similarity)
+ cluster_id = j, top_similarity = similarity;
+ }
+
+ cluster_ids[i] = cluster_id;
+ }
+
+ /* Now let's try to find a better arrangement */
+ while(num_iterations++ < max_iterations) {
+ /* Find the center of each cluster */
+
+ if(verbose) printf("Iteration %u\n", num_iterations);
+
+ for(i=0; i<num_clusters; i++)
+ ndpi_reset_bin(&centroids[i]);
+
+ for(i=0; i<num_bins; i++) {
+ for(j=0; j<bins[i].num_bins; j++) {
+ ndpi_inc_bin(&centroids[cluster_ids[i]], j, ndpi_get_bin_value(&bins[i], j));
+ }
+ }
+
+ for(i=0; i<num_clusters; i++) {
+ char out_buf[256];
+
+ ndpi_normalize_bin(&centroids[i]);
+ if(verbose)
+ printf("Centroid [%u] %s\n", cluster_ids[i],
+ ndpi_print_bin(&centroids[i], 0, out_buf, sizeof(out_buf)));
+ }
+
+ /* Now let's check if there are bins to move across clusters */
+ num_moves = 0;
+
+ for(i=0; i<num_bins; i++) {
+ u_int16_t j;
+ float top_similarity = -1;
+ u_int8_t cluster_id;
+
+ for(j=0; j<num_clusters; j++) {
+ float similarity = ndpi_bin_similarity(&bins[i], &centroids[j], 0);
+
+ if(similarity > top_similarity)
+ cluster_id = j, top_similarity = similarity;
+ }
+
+ if((top_similarity > 0) && (cluster_ids[i] != cluster_id)) {
+ if(verbose)
+ printf("Moved bin %u from cluster %u -> %u [similarity: %f]\n",
+ i, cluster_ids[i], cluster_id, top_similarity);
+
+ cluster_ids[i] = cluster_id;
+ num_moves++;
+ }
+ }
+
+ if(num_moves == 0)
+ break;
+ }
+
+ for(i=0; i<num_clusters; i++)
+ ndpi_free_bin(&centroids[i]);
+
+ return(0);
+}
+
+/* ********************************************************************************* */