/* * ndpi_analyze.c * * Copyright (C) 2019 - ntop.org * * This file is part of nDPI, an open source deep packet inspection * library. * * nDPI is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * nDPI is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with nDPI. If not, see . * */ #ifdef HAVE_CONFIG_H #include "ndpi_config.h" #endif #include #include #include #include #include #include /* FLT_EPSILON */ #include "ndpi_api.h" #include "ndpi_config.h" /* ********************************************************************************* */ void ndpi_init_data_analysis(struct ndpi_analyze_struct *ret, u_int16_t _max_series_len) { u_int32_t len; memset(ret, 0, sizeof(struct ndpi_analyze_struct)); if(_max_series_len > MAX_SERIES_LEN) _max_series_len = MAX_SERIES_LEN; ret->num_values_array_len = _max_series_len; if(ret->num_values_array_len > 0) { len = sizeof(u_int32_t)*ret->num_values_array_len; if((ret->values = ndpi_malloc(len)) == NULL) { ndpi_free(ret); ret = NULL; } else memset(ret->values, 0, len); } else ret->values = NULL; } /* ********************************************************************************* */ struct ndpi_analyze_struct* ndpi_alloc_data_analysis(u_int16_t _max_series_len) { struct ndpi_analyze_struct *ret = ndpi_malloc(sizeof(struct ndpi_analyze_struct)); if(ret != NULL) ndpi_init_data_analysis(ret, _max_series_len); return(ret); } /* ********************************************************************************* */ void ndpi_free_data_analysis(struct ndpi_analyze_struct *d) { if(d->values) ndpi_free(d->values); ndpi_free(d); } /* ********************************************************************************* */ void ndpi_reset_data_analysis(struct ndpi_analyze_struct *d) { memset(d, 0, sizeof(struct ndpi_analyze_struct)); memset(d->values, 0, sizeof(u_int32_t)*d->num_values_array_len); d->num_data_entries = 0; } /* ********************************************************************************* */ /* Add a new point to analyze */ void ndpi_data_add_value(struct ndpi_analyze_struct *s, const u_int32_t value) { if(s->sum_total == 0) s->min_val = s->max_val = value; else { if(value < s->min_val) s->min_val = value; if(value > s->max_val) s->max_val = value; } s->sum_total += value, s->num_data_entries++; if(s->num_values_array_len) { s->values[s->next_value_insert_index] = value; if(++s->next_value_insert_index == s->num_values_array_len) s->next_value_insert_index = 0; } /* Optimized stddev calculation https://www.khanacademy.org/math/probability/data-distributions-a1/summarizing-spread-distributions/a/calculating-standard-deviation-step-by-step https://math.stackexchange.com/questions/683297/how-to-calculate-standard-deviation-without-detailed-historical-data http://mathcentral.uregina.ca/QQ/database/QQ.09.02/carlos1.html */ s->stddev.sum_square_total += value * value; } /* ********************************************************************************* */ /* Compute the average on all values */ float ndpi_data_average(struct ndpi_analyze_struct *s) { return((s->num_data_entries == 0) ? 0 : ((float)s->sum_total / (float)s->num_data_entries)); } /* ********************************************************************************* */ u_int32_t ndpi_data_last(struct ndpi_analyze_struct *s) { if((s->num_data_entries == 0) || (s->sum_total == 0)) return(0); if(s->next_value_insert_index == 0) return(s->values[s->num_values_array_len-1]); else return(s->values[s->next_value_insert_index-1]); } /* Return min/max on all values */ u_int32_t ndpi_data_min(struct ndpi_analyze_struct *s) { return(s->min_val); } u_int32_t ndpi_data_max(struct ndpi_analyze_struct *s) { return(s->max_val); } /* ********************************************************************************* */ /* Compute the variance on all values */ float ndpi_data_variance(struct ndpi_analyze_struct *s) { return(s->num_data_entries ? (float)(s->stddev.sum_square_total - ((s->sum_total * s->sum_total) / (s->num_data_entries))) / (float)s->num_data_entries : 0); } /* ********************************************************************************* */ /* See the link below for "Population and sample standard deviation review" https://www.khanacademy.org/math/statistics-probability/summarizing-quantitative-data/variance-standard-deviation-sample/a/population-and-sample-standard-deviation-review In nDPI we use an approximate stddev calculation to avoid storing all data in memory */ /* Compute the standard deviation on all values */ float ndpi_data_stddev(struct ndpi_analyze_struct *s) { return(sqrt(ndpi_data_variance(s))); } /* ********************************************************************************* */ /* Compute the average only on the sliding window */ float ndpi_data_window_average(struct ndpi_analyze_struct *s) { if(s->num_values_array_len) { float sum = 0.0; u_int16_t i, n = ndpi_min(s->num_data_entries, s->num_values_array_len); if(n == 0) return(0); for(i=0; ivalues[i]; return((float)sum / (float)n); } else return(0); } /* ********************************************************************************* */ /* Compute the variance only on the sliding window */ float ndpi_data_window_variance(struct ndpi_analyze_struct *s) { if(s->num_values_array_len) { float sum = 0.0, avg = ndpi_data_window_average(s); u_int16_t i, n = ndpi_min(s->num_data_entries, s->num_values_array_len); if(n == 0) return(0); for(i=0; ivalues[i]-avg, 2); return((float)sum / (float)n); } else return(0); } /* ********************************************************************************* */ /* Compute the variance only on the sliding window */ float ndpi_data_window_stddev(struct ndpi_analyze_struct *s) { return(sqrt(ndpi_data_window_variance(s))); } /* ********************************************************************************* */ /* Compute entropy on the last sliding window values */ float ndpi_data_entropy(struct ndpi_analyze_struct *s) { if(s->num_values_array_len) { int i; float sum = 0.0, total = 0.0; for(i=0; inum_values_array_len; i++) total += s->values[i]; for (i=0; inum_values_array_len; i++) { float tmp = (float)s->values[i] / (float)total; if(tmp > FLT_EPSILON) sum -= tmp * logf(tmp); } return(sum / logf(2.0)); } else return(0); } /* ********************************************************************************* */ void ndpi_data_print_window_values(struct ndpi_analyze_struct *s) { if(s->num_values_array_len) { u_int16_t i, n = ndpi_min(s->num_data_entries, s->num_values_array_len); for(i=0; ivalues[i]); printf("\n"); } } /* ********************************************************************************* */ /* Upload / download ration -1 Download 0 Mixed 1 Upload */ float ndpi_data_ratio(u_int32_t sent, u_int32_t rcvd) { float s = (float)((int64_t)sent + (int64_t)rcvd); float d = (float)((int64_t)sent - (int64_t)rcvd); return((s == 0) ? 0 : (d/s)); } /* ********************************************************************************* */ const char* ndpi_data_ratio2str(float ratio) { if(ratio < -0.2) return("Download"); else if(ratio > 0.2) return("Upload"); else return("Mixed"); } /* ********************************************************************************* */ /* ********************************************************************************* */ #include "third_party/src/hll/hll.c" #include "third_party/src/hll/MurmurHash3.c" int ndpi_hll_init(struct ndpi_hll *hll, u_int8_t bits) { return(hll_init(hll, bits)); } void ndpi_hll_destroy(struct ndpi_hll *hll) { hll_destroy(hll); } void ndpi_hll_reset(struct ndpi_hll *hll) { hll_reset(hll); } void ndpi_hll_add(struct ndpi_hll *hll, const char *data, size_t data_len) { hll_add(hll, (const void *)data, data_len); } void ndpi_hll_add_number(struct ndpi_hll *hll, u_int32_t value) { hll_add(hll, (const void *)&value, sizeof(value)); } double ndpi_hll_count(struct ndpi_hll *hll) { return(hll_count(hll)); } /* ********************************************************************************* */ /* ********************************************************************************* */ int ndpi_init_bin(struct ndpi_bin *b, enum ndpi_bin_family f, u_int8_t num_bins) { b->num_bins = num_bins, b->family = f, b->is_empty = 1; switch(f) { case ndpi_bin_family8: if((b->u.bins8 = (u_int8_t*)ndpi_calloc(num_bins, sizeof(u_int8_t))) == NULL) return(-1); break; case ndpi_bin_family16: if((b->u.bins16 = (u_int16_t*)ndpi_calloc(num_bins, sizeof(u_int16_t))) == NULL) return(-1); break; case ndpi_bin_family32: if((b->u.bins32 = (u_int32_t*)ndpi_calloc(num_bins, sizeof(u_int32_t))) == NULL) return(-1); break; } return(0); } /* ********************************************************************************* */ void ndpi_free_bin(struct ndpi_bin *b) { switch(b->family) { case ndpi_bin_family8: free(b->u.bins8); break; case ndpi_bin_family16: free(b->u.bins16); break; case ndpi_bin_family32: free(b->u.bins32); break; } } /* ********************************************************************************* */ struct ndpi_bin* ndpi_clone_bin(struct ndpi_bin *b) { struct ndpi_bin *out = (struct ndpi_bin*)ndpi_malloc(sizeof(struct ndpi_bin)); if(!out) return(NULL); out->num_bins = b->num_bins, out->family = b->family, out->is_empty = b->is_empty; switch(out->family) { case ndpi_bin_family8: if((out->u.bins8 = (u_int8_t*)ndpi_calloc(out->num_bins, sizeof(u_int8_t))) == NULL) { free(out); return(NULL); } else memcpy(out->u.bins8, b->u.bins8, out->num_bins*sizeof(u_int8_t)); break; case ndpi_bin_family16: if((out->u.bins16 = (u_int16_t*)ndpi_calloc(out->num_bins, sizeof(u_int16_t))) == NULL) { free(out); return(NULL); } else memcpy(out->u.bins16, b->u.bins16, out->num_bins*sizeof(u_int16_t)); break; case ndpi_bin_family32: if((out->u.bins32 = (u_int32_t*)ndpi_calloc(out->num_bins, sizeof(u_int32_t))) == NULL) { free(out); return(NULL); } else memcpy(out->u.bins32, b->u.bins32, out->num_bins*sizeof(u_int32_t)); break; } return(out); } /* ********************************************************************************* */ void ndpi_set_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t val) { if(slot_id >= b->num_bins) slot_id = 0; switch(b->family) { case ndpi_bin_family8: b->u.bins8[slot_id] = (u_int8_t)val; break; case ndpi_bin_family16: b->u.bins16[slot_id] = (u_int16_t)val; break; case ndpi_bin_family32: b->u.bins32[slot_id] = (u_int32_t)val; break; } } /* ********************************************************************************* */ void ndpi_inc_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t val) { b->is_empty = 0; if(slot_id >= b->num_bins) slot_id = 0; switch(b->family) { case ndpi_bin_family8: b->u.bins8[slot_id] += (u_int8_t)val; break; case ndpi_bin_family16: b->u.bins16[slot_id] += (u_int16_t)val; break; case ndpi_bin_family32: b->u.bins32[slot_id] += (u_int32_t)val; break; } } /* ********************************************************************************* */ u_int32_t ndpi_get_bin_value(struct ndpi_bin *b, u_int8_t slot_id) { if(slot_id >= b->num_bins) slot_id = 0; switch(b->family) { case ndpi_bin_family8: return(b->u.bins8[slot_id]); break; case ndpi_bin_family16: return(b->u.bins16[slot_id]); break; case ndpi_bin_family32: return(b->u.bins32[slot_id]); break; } return(0); } /* ********************************************************************************* */ void ndpi_reset_bin(struct ndpi_bin *b) { b->is_empty = 1; switch(b->family) { case ndpi_bin_family8: memset(b->u.bins8, 0, sizeof(u_int8_t)*b->num_bins); break; case ndpi_bin_family16: memset(b->u.bins16, 0, sizeof(u_int16_t)*b->num_bins); break; case ndpi_bin_family32: memset(b->u.bins32, 0, sizeof(u_int32_t)*b->num_bins); break; } } /* ********************************************************************************* */ /* Each bin slot is transformed in a % with respect to the value total */ void ndpi_normalize_bin(struct ndpi_bin *b) { u_int8_t i; u_int32_t tot = 0; if(b->is_empty) return; switch(b->family) { case ndpi_bin_family8: for(i=0; inum_bins; i++) tot += b->u.bins8[i]; if(tot > 0) { for(i=0; inum_bins; i++) b->u.bins8[i] = (b->u.bins8[i]*100) / tot; } break; case ndpi_bin_family16: for(i=0; inum_bins; i++) tot += b->u.bins16[i]; if(tot > 0) { for(i=0; inum_bins; i++) b->u.bins16[i] = (b->u.bins16[i]*100) / tot; } break; case ndpi_bin_family32: for(i=0; inum_bins; i++) tot += b->u.bins32[i]; if(tot > 0) { for(i=0; inum_bins; i++) b->u.bins32[i] = (b->u.bins32[i]*100) / tot; } break; } } /* ********************************************************************************* */ char* ndpi_print_bin(struct ndpi_bin *b, u_int8_t normalize_first, char *out_buf, u_int out_buf_len) { u_int8_t i; u_int len = 0; if(!out_buf) return(out_buf); else out_buf[0] = '\0'; if(normalize_first) ndpi_normalize_bin(b); switch(b->family) { case ndpi_bin_family8: for(i=0; inum_bins; i++) { int rc = snprintf(&out_buf[len], out_buf_len-len, "%s%u", (i > 0) ? "," : "", b->u.bins8[i]); if(rc < 0) break; len += rc; } break; case ndpi_bin_family16: for(i=0; inum_bins; i++) { int rc = snprintf(&out_buf[len], out_buf_len-len, "%s%u", (i > 0) ? "," : "", b->u.bins16[i]); if(rc < 0) break; len += rc; } break; case ndpi_bin_family32: for(i=0; inum_bins; i++) { int rc = snprintf(&out_buf[len], out_buf_len-len, "%s%u", (i > 0) ? "," : "", b->u.bins32[i]); if(rc < 0) break; len += rc; } break; } return(out_buf); } /* ********************************************************************************* */ // #define COSINE_SIMILARITY /* Determines how similar are two bins Cosine Similiarity 0 = Very differet ... (gray zone) 1 = Alike See https://en.wikipedia.org/wiki/Cosine_similarity for more details --- Euclidean similarity 0 = alike ... the higher the more different */ float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2, u_int8_t normalize_first) { u_int8_t i; if( // (b1->family != b2->family) || (b1->num_bins != b2->num_bins)) return(-1); if(normalize_first) ndpi_normalize_bin(b1), ndpi_normalize_bin(b2); #ifdef COSINE_SIMILARITY { u_int32_t sumxx = 0, sumxy = 0, sumyy = 0; for(i=0; inum_bins; i++) { u_int32_t a = ndpi_get_bin_value(b1, i); u_int32_t b = ndpi_get_bin_value(b2, i); sumxx += a*a, sumyy += b*b, sumxy += a*b; } if((sumxx == 0) || (sumyy == 0)) return(0); else return((float)sumxy / sqrt((float)(sumxx * sumyy))); } #else { u_int32_t sum = 0; for(i=0; inum_bins; i++) { u_int32_t a = ndpi_get_bin_value(b1, i); u_int32_t b = ndpi_get_bin_value(b2, i); u_int32_t diff = (a > b) ? (a - b) : (b - a); if(a != b) sum += pow(diff, 2); // printf("[a: %u][b: %u][sum: %u]\n", a, b, sum); } /* The lower the more similar */ return(sqrt(sum)); } #endif } /* ********************************************************************************* */ #define MAX_NUM_CLUSTERS 128 /* Clusters bins into 'num_clusters' - (in) bins: a vection 'num_bins' long of bins to cluster - (in) 'num_clusters': number of desired clusters 0...(num_clusters-1) - (out) 'cluster_ids': a vector 'num_bins' long containing the id's of each clustered bin - (out) 'centroids': an optional 'num_clusters' long vector of (centroid) bins See - https://en.wikipedia.org/wiki/K-means_clustering */ int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins, u_int8_t num_clusters, u_int16_t *cluster_ids, struct ndpi_bin *centroids) { u_int16_t i, j, max_iterations = 25, num_iterations, num_moves; u_int8_t verbose = 0, alloc_centroids = 0; char out_buf[256]; float *bin_score; u_int16_t num_cluster_elems[MAX_NUM_CLUSTERS] = { 0 }; srand(time(NULL)); if(num_clusters > num_bins) num_clusters = num_bins; if(num_clusters > MAX_NUM_CLUSTERS) num_clusters = MAX_NUM_CLUSTERS; if(verbose) printf("Distributing %u bins over %u clusters\n", num_bins, num_clusters); if((bin_score = (float*)ndpi_calloc(num_bins, sizeof(float))) == NULL) return(-2); if(centroids == NULL) { alloc_centroids = 1; if((centroids = (struct ndpi_bin*)ndpi_malloc(sizeof(struct ndpi_bin)*num_clusters)) == NULL) { ndpi_free(bin_score); return(-2); } else { for(i=0; i best_similarity) { cluster_id = j, best_similarity = similarity; } #else if(similarity < best_similarity) { cluster_id = j, best_similarity = similarity; } #endif } if((best_similarity == current_similarity) && (num_cluster_elems[cluster_ids[i]] > 1)) { /* In case of identical similarity let's leave things as they are this unless this is a cluster with only one element */ cluster_id = cluster_ids[i]; } bin_score[i] = best_similarity; if(cluster_ids[i] != cluster_id) { if(verbose) printf("Moved bin %u from cluster %u -> %u [similarity: %f]\n", i, cluster_ids[i], cluster_id, best_similarity); num_cluster_elems[cluster_ids[i]]--; num_cluster_elems[cluster_id]++; cluster_ids[i] = cluster_id; num_moves++; } } if(num_moves == 0) break; if(verbose) { for(j=0; j 1)) score = bin_score[i], candidate = i; } #else score = 0; for(i=0; i score) && (num_cluster_elems[cluster_ids[i]] > 1)) score = bin_score[i], candidate = i; } #endif if(verbose) printf("Rebalance: moving bin %u from cluster %u -> %u [similarity: %f]\n", candidate, cluster_ids[candidate], j, score); num_cluster_elems[cluster_ids[candidate]]--; num_cluster_elems[j]++; cluster_ids[candidate] = j; } } #endif } /* while(...) */ if(alloc_centroids) { for(i=0; i