diff options
author | Luca Deri <deri@ntop.org> | 2020-07-07 15:10:15 +0200 |
---|---|---|
committer | Luca Deri <deri@ntop.org> | 2020-07-07 15:10:51 +0200 |
commit | 1c60c22893e465da75b825ce4bab80ca018e9104 (patch) | |
tree | b0eea373c19ad19b7285281602f69371faadb743 | |
parent | db707e0829d29f7aed6d2a5848706600ca8ff971 (diff) |
Added ndpi_cluster_bins() for clustering bins and ancillary functions for bins manipulation
-rw-r--r-- | example/ndpiReader.c | 155 | ||||
-rw-r--r-- | example/reader_util.c | 31 | ||||
-rw-r--r-- | src/include/ndpi_api.h.in | 12 | ||||
-rw-r--r-- | src/lib/ndpi_analyze.c | 175 |
4 files changed, 291 insertions, 82 deletions
diff --git a/example/ndpiReader.c b/example/ndpiReader.c index a03c461be..39e36f248 100644 --- a/example/ndpiReader.c +++ b/example/ndpiReader.c @@ -67,9 +67,9 @@ static char *results_path = NULL; static char * bpfFilter = NULL; /**< bpf filter */ static char *_protoFilePath = NULL; /**< Protocol file path */ static char *_customCategoryFilePath= NULL; /**< Custom categories file path */ -static FILE *csv_fp = NULL; /**< for CSV export */ static u_int8_t live_capture = 0; static u_int8_t undetected_flows_deleted = 0; +FILE *csv_fp = NULL; /**< for CSV export */ /** User preferences **/ u_int8_t enable_protocol_guess = 1, enable_payload_analyzer = 0; u_int8_t verbose = 0, enable_joy_stats = 0; @@ -96,7 +96,7 @@ static struct ndpi_detection_module_struct *ndpi_info_mod = NULL; extern u_int32_t max_num_packets_per_flow, max_packet_payload_dissection, max_num_reported_top_payloads; extern u_int16_t min_pattern_len, max_pattern_len; extern void ndpi_self_check_host_match(); /* Self check function */ - + struct flow_info { struct ndpi_flow_info *flow; u_int16_t thread_id; @@ -311,7 +311,7 @@ flowGetBDMeanandVariance(struct ndpi_flow_info* flow) { fprintf(out, "%u]", (unsigned char)array[i]); } #endif - + /* Output the mean */ if(num_bytes != 0) { double entropy = ndpi_flow_get_byte_count_entropy(array, num_bytes); @@ -420,7 +420,7 @@ static void help(u_int long_help) { ndpi_dump_protocols(ndpi_info_mod); } - + exit(!long_help); } @@ -621,14 +621,16 @@ void printCSVHeader() { fprintf(csv_fp, "tls_version,ja3c,tls_client_unsafe,"); fprintf(csv_fp, "ja3s,tls_server_unsafe,"); fprintf(csv_fp, "tls_alpn,tls_supported_versions,"); - fprintf(csv_fp, "tls_issuerDN,tls_subjectDN,"); - fprintf(csv_fp, "ssh_client_hassh,ssh_server_hassh,flow_info"); - +#if 0 + fprintf(csv_fp, "tls_issuerDN,tls_subjectDN,"); +#endif + fprintf(csv_fp, "ssh_client_hassh,ssh_server_hassh,flow_info,plen_bins"); + /* Joy */ if(enable_joy_stats) { - fprintf(csv_fp, 
",byte_dist_mean,byte_dist_std,entropy,total_entropy"); + fprintf(csv_fp, ",byte_dist_mean,byte_dist_std,entropy,total_entropy"); } - + fprintf(csv_fp, "\n"); } @@ -996,32 +998,33 @@ static char* is_unsafe_cipher(ndpi_cipher_weakness c) { /* ********************************** */ -void print_bin(const char *label, struct ndpi_bin *b) { +void print_bin(FILE *fout, const char *label, struct ndpi_bin *b) { if(b->num_incs == 0) return; else { u_int8_t i; FILE *out = results_file ? results_file : stdout; - + const char *sep = label ? "," : ";"; + ndpi_normalize_bin(b); - fprintf(out, "[%s: ", label); - + if(label) fprintf(fout, "[%s: ", label); + for(i=0; i<b->num_bins; i++) { switch(b->family) { case ndpi_bin_family8: - fprintf(out, "%s%u", (i > 0) ? "," : "", b->u.bins8[i]); + fprintf(fout, "%s%u", (i > 0) ? sep : "", b->u.bins8[i]); break; case ndpi_bin_family16: - fprintf(out, "%s%u", (i > 0) ? "," : "", b->u.bins16[i]); + fprintf(fout, "%s%u", (i > 0) ? sep : "", b->u.bins16[i]); break; case ndpi_bin_family32: - fprintf(out, "%s%u", (i > 0) ? "," : "", b->u.bins32[i]); + fprintf(fout, "%s%u", (i > 0) ? sep : "", b->u.bins32[i]); break; } } - fprintf(out, "]"); + if(label) fprintf(fout, "]"); } } @@ -1054,7 +1057,7 @@ static void printFlow(u_int16_t id, struct ndpi_flow_info *flow, u_int16_t threa /* PLEASE KEEP IN SYNC WITH printCSVHeader() */ dos_ge_score = Dos_goldeneye_score(flow); - + dos_slow_score = Dos_slow_score(flow); dos_hulk_score = Dos_hulk_score(flow); ddos_score = Ddos_score(flow); @@ -1065,7 +1068,7 @@ static void printFlow(u_int16_t id, struct ndpi_flow_info *flow, u_int16_t threa ssh_patator_score = Ssh_patator_score(flow); inf_score = Infiltration_score(flow); - + double benign_score = dos_ge_score < 1 && dos_slow_score < 1 && \ dos_hulk_score < 1 && ddos_score < 1 && hearthbleed_score < 1 && \ ftp_patator_score < 1 && ssh_patator_score < 1 && inf_score < 1 ? 
1.1 : 0; @@ -1087,12 +1090,12 @@ static void printFlow(u_int16_t id, struct ndpi_flow_info *flow, u_int16_t threa ndpi_protocol2name(ndpi_thread_info[thread_id].workflow->ndpi_struct, flow->detected_protocol, buf, sizeof(buf)), flow->host_server_name); - + fprintf(csv_fp, "%.4lf,%.4lf,%.4lf,%.4lf,%.4lf,%.4lf,%.4lf,%.4lf,%.4lf,", \ benign_score, dos_slow_score, dos_ge_score, dos_hulk_score, \ ddos_score, hearthbleed_score, ftp_patator_score, \ ssh_patator_score, inf_score); - + fprintf(csv_fp, "%u,%llu,%llu,", flow->src2dst_packets, (long long unsigned int) flow->src2dst_bytes, (long long unsigned int) flow->src2dst_goodput_bytes); fprintf(csv_fp, "%u,%llu,%llu,", flow->dst2src_packets, @@ -1100,7 +1103,7 @@ static void printFlow(u_int16_t id, struct ndpi_flow_info *flow, u_int16_t threa fprintf(csv_fp, "%.3f,%s,", data_ratio, ndpi_data_ratio2str(data_ratio)); fprintf(csv_fp, "%.1f,%.1f,", 100.0*((float)flow->src2dst_goodput_bytes / (float)(flow->src2dst_bytes+1)), 100.0*((float)flow->dst2src_goodput_bytes / (float)(flow->dst2src_bytes+1))); - + /* IAT (Inter Arrival Time) */ fprintf(csv_fp, "%u,%.1f,%u,%.1f,", ndpi_data_min(flow->iat_flow), ndpi_data_average(flow->iat_flow), ndpi_data_max(flow->iat_flow), ndpi_data_stddev(flow->iat_flow)); @@ -1134,21 +1137,27 @@ static void printFlow(u_int16_t id, struct ndpi_flow_info *flow, u_int16_t threa (flow->ssh_tls.ja3_client[0] != '\0') ? is_unsafe_cipher(flow->ssh_tls.client_unsafe_cipher) : "0", (flow->ssh_tls.ja3_server[0] != '\0') ? flow->ssh_tls.ja3_server : "", (flow->ssh_tls.ja3_server[0] != '\0') ? is_unsafe_cipher(flow->ssh_tls.server_unsafe_cipher) : "0"); - + fprintf(csv_fp, "%s,%s,", flow->ssh_tls.tls_alpn ? flow->ssh_tls.tls_alpn : "", - flow->ssh_tls.tls_supported_versions ? flow->ssh_tls.tls_supported_versions : "" + flow->ssh_tls.tls_supported_versions ? flow->ssh_tls.tls_supported_versions : "" ); + +#if 0 fprintf(csv_fp, "%s,%s,", flow->ssh_tls.tls_issuerDN ? 
flow->ssh_tls.tls_issuerDN : "", - flow->ssh_tls.tls_subjectDN ? flow->ssh_tls.tls_subjectDN : "" + flow->ssh_tls.tls_subjectDN ? flow->ssh_tls.tls_subjectDN : "" ); +#endif + fprintf(csv_fp, "%s,%s", (flow->ssh_tls.client_hassh[0] != '\0') ? flow->ssh_tls.client_hassh : "", (flow->ssh_tls.server_hassh[0] != '\0') ? flow->ssh_tls.server_hassh : "" ); - fprintf(csv_fp, ",%s", flow->info); + fprintf(csv_fp, ",%s,", flow->info); + + print_bin(csv_fp, NULL, &flow->payload_len_bin); } if((verbose != 1) && (verbose != 2)) { @@ -1181,16 +1190,16 @@ static void printFlow(u_int16_t id, struct ndpi_flow_info *flow, u_int16_t threa if(flow->vlan_id > 0) fprintf(out, "[VLAN: %u]", flow->vlan_id); if(enable_payload_analyzer) fprintf(out, "[flowId: %u]", flow->flow_id); } - + if(enable_joy_stats) { /* Print entropy values for monitored flows. */ flowGetBDMeanandVariance(flow); fflush(out); fprintf(out, "[score: %.4f]", flow->entropy.score); } - + if(csv_fp) fprintf(csv_fp, "\n"); - + fprintf(out, "[proto: "); if(flow->tunnel_type != ndpi_no_tunnel) fprintf(out, "%s:", ndpi_tunnel2str(flow->tunnel_type)); @@ -1255,7 +1264,7 @@ static void printFlow(u_int16_t id, struct ndpi_flow_info *flow, u_int16_t threa if(risk != NDPI_NO_RISK) NDPI_SET_BIT(flow->risk, risk); - + fprintf(out, "[URL: %s][StatusCode: %u]", flow->http.url, flow->http.response_status_code); @@ -1268,16 +1277,16 @@ static void printFlow(u_int16_t id, struct ndpi_flow_info *flow, u_int16_t threa if(flow->risk) { u_int i; - + fprintf(out, "[Risk: "); for(i=0; i<NDPI_MAX_RISK; i++) if(NDPI_ISSET_BIT(flow->risk, i)) fprintf(out, "** %s **", ndpi_risk2str(i)); - + fprintf(out, "]"); } - + if(flow->ssh_tls.ssl_version != 0) fprintf(out, "[%s]", ndpi_ssl_version2str(flow->ssh_tls.ssl_version, &known_tls)); if(flow->ssh_tls.client_requested_server_name[0] != '\0') fprintf(out, "[Client: %s]", flow->ssh_tls.client_requested_server_name); if(flow->ssh_tls.client_hassh[0] != '\0') fprintf(out, "[HASSH-C: %s]", 
flow->ssh_tls.client_hassh); @@ -1335,12 +1344,12 @@ static void printFlow(u_int16_t id, struct ndpi_flow_info *flow, u_int16_t threa flow->human_readeable_string_buffer); #ifdef DIRECTION_BINS - print_bin("Plen c2s", &flow->payload_len_bin_src2dst); - print_bin("Plen s2c", &flow->payload_len_bin_dst2src); + print_bin(out, "Plen c2s", &flow->payload_len_bin_src2dst); + print_bin(out, "Plen s2c", &flow->payload_len_bin_dst2src); #else - print_bin("Plen Bins", &flow->payload_len_bin); + print_bin(out, "Plen Bins", &flow->payload_len_bin); #endif - + fprintf(out, "\n"); } @@ -2112,7 +2121,7 @@ static void printFlowsStats() { ndpi_ja3_info *tmp2 = NULL; unsigned int num_ja3_client; unsigned int num_ja3_server; - + fprintf(out, "\n"); num_flows = 0; @@ -2590,7 +2599,7 @@ static void printResults(u_int64_t processing_time_usec, u_int64_t setup_time_us float b = (float)(cumulative_stats.total_wire_bytes * 8 *1000000)/(float)processing_time_usec; float traffic_duration; struct tm result; - + if(live_capture) traffic_duration = processing_time_usec; else traffic_duration = (pcap_end.tv_sec*1000000 + pcap_end.tv_usec) - (pcap_start.tv_sec*1000000 + pcap_start.tv_usec); @@ -3116,26 +3125,44 @@ void test_lib() { /* *********************************************** */ static void binUnitTest() { - struct ndpi_bin b1, b2; + struct ndpi_bin *bins; + u_int8_t versbose = 0; u_int8_t num_bins = 32; - u_int32_t i; + u_int8_t num_points = 24; + u_int32_t i, j; + u_int8_t num_clusters = 3; + u_int16_t cluster_ids[256]; char out_buf[128]; - + srand(time(NULL)); - - ndpi_init_bin(&b1, ndpi_bin_family8, num_bins), ndpi_init_bin(&b2, ndpi_bin_family8, num_bins); - - for(i=0; i<32; i++) - ndpi_inc_bin(&b1, rand() % num_bins), ndpi_inc_bin(&b2, rand() % num_bins); -#if 0 - printf("1) %s\n", ndpi_print_bin(&b1, 0, out_buf, sizeof(out_buf))); - printf("2) %s\n", ndpi_print_bin(&b2, 0, out_buf, sizeof(out_buf))); + assert((bins = (struct ndpi_bin*)malloc(sizeof(struct ndpi_bin)*num_bins)) != 
NULL); - printf("Similarity: %f\n\n", ndpi_bin_similarity(&b1, &b2, 1)); -#endif - - ndpi_free_bin(&b1), ndpi_free_bin(&b2); + for(i=0; i<num_bins; i++) { + ndpi_init_bin(&bins[i], ndpi_bin_family8, num_points); + + for(j=0; j<num_points; j++) + ndpi_set_bin(&bins[i], j, rand() % 0xFF); + + ndpi_normalize_bin(&bins[i]); + } + + + ndpi_cluster_bins(bins, num_bins, num_clusters, cluster_ids); + + for(j=0; j<num_clusters; j++) { + if(verbose) printf("\n"); + + for(i=0; i<num_bins; i++) { + if(cluster_ids[i] == j) { + if(verbose) printf("[%u] %s\n", cluster_ids[i], ndpi_print_bin(&bins[i], 0, out_buf, sizeof(out_buf))); + } + } + } + // printf("Similarity: %f\n\n", ndpi_bin_similarity(&b1, &b2, 1)); + + for(i=0; i<num_bins; i++) + ndpi_free_bin(&bins[i]); } /* *********************************************** */ @@ -3149,7 +3176,7 @@ static void dgaUnitTest() { "www.e6r5p57kbafwrxj3plz.com", // "grdawgrcwegpjaoo.eu", "mcfpeqbotiwxfxqu.eu", - "adgxwxhqsegnrsih.eu", + "adgxwxhqsegnrsih.eu", NULL }; @@ -3185,7 +3212,7 @@ static void dgaUnitTest() { "mqtt.facebook.com", NULL }; - int i; + int i; NDPI_PROTOCOL_BITMASK all; struct ndpi_detection_module_struct *ndpi_str = ndpi_init_detection_module(ndpi_no_prefs); @@ -3195,15 +3222,15 @@ static void dgaUnitTest() { ndpi_set_protocol_detection_bitmask2(ndpi_str, &all); ndpi_finalize_initalization(ndpi_str); - + assert(ndpi_str != NULL); for(i=0; dga[i] != NULL; i++) assert(ndpi_check_dga_name(ndpi_str, NULL, (char*)dga[i]) == 1); - + for(i=0; non_dga[i] != NULL; i++) assert(ndpi_check_dga_name(ndpi_str, NULL, (char*)non_dga[i]) == 0); - + ndpi_exit_detection_module(ndpi_str); } @@ -3214,14 +3241,14 @@ static void hllUnitTest() { struct ndpi_hll h; u_int8_t bits = 8; /* >= 4, <= 16 */ u_int32_t i; - + assert(ndpi_hll_init(&h, bits) == 0); for(i=0; i<21320; i++) ndpi_hll_add_number(&h, i); /* printf("Count estimate: %f\n", ndpi_hll_count(&h)); */ - + ndpi_hll_destroy(&h); } @@ -3233,9 +3260,9 @@ static void bitmapUnitTest() { 
for(i=0; i<32; i++) { NDPI_ZERO_BIT(val); NDPI_SET_BIT(val, i); - + assert(NDPI_ISSET_BIT(val, i)); - + for(j=0; j<32; j++) { if(j != i) { assert(!NDPI_ISSET_BIT(val, j)); @@ -3506,7 +3533,7 @@ void bpf_filter_port_array_add(int filter_array[], int size, int port) { void analysisUnitTest() { struct ndpi_analyze_struct *s = ndpi_alloc_data_analysis(32); u_int32_t i; - + for(i=0; i<256; i++) ndpi_data_add_value(s, i); diff --git a/example/reader_util.c b/example/reader_util.c index 43afcd402..c07cadeff 100644 --- a/example/reader_util.c +++ b/example/reader_util.c @@ -89,7 +89,7 @@ extern u_int8_t enable_protocol_guess, enable_joy_stats, enable_payload_analyzer; extern u_int8_t verbose, human_readeable_string_len; extern u_int8_t max_num_udp_dissected_pkts /* 8 */, max_num_tcp_dissected_pkts /* 10 */; - +extern FILE *csv_fp; static u_int32_t flow_id = 0; /* ****************************************************** */ @@ -1176,14 +1176,24 @@ void process_ndpi_collected_info(struct ndpi_workflow * workflow, struct ndpi_fl && flow->ndpi_flow->protos.stun_ssl.ssl.tls_supported_versions) { correct_csv_data_field(flow->ndpi_flow->protos.stun_ssl.ssl.alpn); correct_csv_data_field(flow->ndpi_flow->protos.stun_ssl.ssl.tls_supported_versions); - snprintf(flow->info, sizeof(flow->info), "ALPN: %s][TLS Supported Versions: %s", - flow->ndpi_flow->protos.stun_ssl.ssl.alpn, - flow->ndpi_flow->protos.stun_ssl.ssl.tls_supported_versions); + + if(csv_fp) + snprintf(flow->info, sizeof(flow->info), "%s", + flow->ndpi_flow->protos.stun_ssl.ssl.alpn); + else + snprintf(flow->info, sizeof(flow->info), "ALPN: %s][TLS Supported Versions: %s", + flow->ndpi_flow->protos.stun_ssl.ssl.alpn, + flow->ndpi_flow->protos.stun_ssl.ssl.tls_supported_versions); } else if(flow->ndpi_flow->protos.stun_ssl.ssl.alpn) { correct_csv_data_field(flow->ndpi_flow->protos.stun_ssl.ssl.alpn); - snprintf(flow->info, sizeof(flow->info), "ALPN: %s", - flow->ndpi_flow->protos.stun_ssl.ssl.alpn); + + if(csv_fp) + 
snprintf(flow->info, sizeof(flow->info), "%s,", + flow->ndpi_flow->protos.stun_ssl.ssl.alpn); + else + snprintf(flow->info, sizeof(flow->info), "ALPN: %s", + flow->ndpi_flow->protos.stun_ssl.ssl.alpn); } } @@ -1385,8 +1395,13 @@ static struct ndpi_proto packet_processing(struct ndpi_workflow * workflow, } #ifndef DIRECTION_BINS - if(payload_len && ((flow->src2dst_packets+flow->dst2src_packets) < MAX_NUM_BIN_PKTS)) - ndpi_inc_bin(&flow->payload_len_bin, plen2slot(payload_len)); + if(payload_len && ((flow->src2dst_packets+flow->dst2src_packets) < MAX_NUM_BIN_PKTS)) { +#if 0 + /* Discard packets until the protocol is detected */ + if(flow->detected_protocol.app_protocol != NDPI_PROTOCOL_UNKNOWN) +#endif + ndpi_inc_bin(&flow->payload_len_bin, plen2slot(payload_len), 1); + } #endif if(enable_payload_analyzer && (payload_len > 0)) diff --git a/src/include/ndpi_api.h.in b/src/include/ndpi_api.h.in index 865ddc8dd..94f5f54fe 100644 --- a/src/include/ndpi_api.h.in +++ b/src/include/ndpi_api.h.in @@ -931,7 +931,7 @@ extern "C" { int ndpi_check_dga_name(struct ndpi_detection_module_struct *ndpi_str, struct ndpi_flow_struct *flow, char *name); - + /* Serializer */ int ndpi_init_serializer_ll(ndpi_serializer *serializer, ndpi_serialization_format fmt, u_int32_t buffer_size); @@ -1067,14 +1067,18 @@ extern "C" { double ndpi_hll_count(struct ndpi_hll *hll); /* ******************************* */ - + int ndpi_init_bin(struct ndpi_bin *b, enum ndpi_bin_family f, u_int8_t num_bins); void ndpi_free_bin(struct ndpi_bin *b); - void ndpi_inc_bin(struct ndpi_bin *b, u_int8_t slot_id); + void ndpi_inc_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t val); + void ndpi_set_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t value); + u_int32_t ndpi_get_bin_value(struct ndpi_bin *b, u_int8_t slot_id); + void ndpi_reset_bin(struct ndpi_bin *b); void ndpi_normalize_bin(struct ndpi_bin *b); char* ndpi_print_bin(struct ndpi_bin *b, u_int8_t normalize_first, char *out_buf, u_int out_buf_len); 
float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2, u_int8_t normalize_first); - + int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins, + u_int8_t num_clusters, u_int16_t *cluster_ids); #ifdef __cplusplus } diff --git a/src/lib/ndpi_analyze.c b/src/lib/ndpi_analyze.c index 4ca3ac25a..74adbb2c9 100644 --- a/src/lib/ndpi_analyze.c +++ b/src/lib/ndpi_analyze.c @@ -284,26 +284,81 @@ void ndpi_free_bin(struct ndpi_bin *b) { /* ********************************************************************************* */ -void ndpi_inc_bin(struct ndpi_bin *b, u_int8_t slot_id) { +void ndpi_set_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t val) { if(slot_id >= b->num_bins) slot_id = 0; - b->num_incs += 1; + b->num_incs += val; switch(b->family) { case ndpi_bin_family8: - b->u.bins8[slot_id]++; + b->u.bins8[slot_id] = (u_int8_t)val; break; case ndpi_bin_family16: - b->u.bins16[slot_id]++; + b->u.bins16[slot_id] = (u_int16_t)val; break; case ndpi_bin_family32: - b->u.bins32[slot_id]++; + b->u.bins32[slot_id] = (u_int32_t)val; break; } } /* ********************************************************************************* */ +void ndpi_inc_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t val) { + if(slot_id >= b->num_bins) slot_id = 0; + + b->num_incs += val; + + switch(b->family) { + case ndpi_bin_family8: + b->u.bins8[slot_id] += (u_int8_t)val; + break; + case ndpi_bin_family16: + b->u.bins16[slot_id] += (u_int16_t)val; + break; + case ndpi_bin_family32: + b->u.bins32[slot_id] += (u_int32_t)val; + break; + } +} + +/* ********************************************************************************* */ + +u_int32_t ndpi_get_bin_value(struct ndpi_bin *b, u_int8_t slot_id) { + if(slot_id >= b->num_bins) slot_id = 0; + + switch(b->family) { + case ndpi_bin_family8: + return(b->u.bins8[slot_id]); + break; + case ndpi_bin_family16: + return(b->u.bins16[slot_id]); + break; + case ndpi_bin_family32: + return(b->u.bins32[slot_id]); + break; + } +} + +/* 
********************************************************************************* */ + +void ndpi_reset_bin(struct ndpi_bin *b) { + b->num_incs = 0; + + switch(b->family) { + case ndpi_bin_family8: + memset(b->u.bins8, 0, sizeof(u_int8_t)*b->num_bins); + break; + case ndpi_bin_family16: + memset(b->u.bins16, 0, sizeof(u_int16_t)*b->num_bins); + break; + case ndpi_bin_family32: + memset(b->u.bins32, 0, sizeof(u_int32_t)*b->num_bins); + break; + } +} +/* ********************************************************************************* */ + /* Each bin slot is transformed in a % with respect to the value total */ @@ -387,7 +442,8 @@ float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2, u_int8_t nor u_int32_t sumxx = 0, sumxy = 0, sumyy = 0; if((b1->num_incs == 0) || (b2->num_incs == 0) - || (b1->family != b2->family) || (b1->num_bins != b2->num_bins)) + // || (b1->family != b2->family) + || (b1->num_bins != b2->num_bins)) return(0); if(normalize_first) @@ -412,3 +468,110 @@ float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2, u_int8_t nor } /* ********************************************************************************* */ + +/* + Clusters bins into 'num_clusters' + - (in) bins: a vection 'num_bins' long of bins to cluster + - (in) 'num_clusters': number of desired clusters 0...(num_clusters-1) + - (out) 'cluster_ids': a vector 'num_bins' long containing the id's of each clustered bin + + See + - https://en.wikipedia.org/wiki/K-means_clustering + */ +int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins, + u_int8_t num_clusters, u_int16_t *cluster_ids) { + u_int16_t i, j, max_iterations = 100, num_iterations = 0, num_moves; + struct ndpi_bin *centroids; + u_int8_t verbose = 0; + + if(num_clusters > num_bins) return(-1); + + if((centroids = (struct ndpi_bin*)malloc(sizeof(struct ndpi_bin)*num_clusters)) == NULL) + return(-2); + else { + for(i=0; i<num_clusters; i++) + ndpi_init_bin(&centroids[i], ndpi_bin_family32 /* Use 32 bit to 
avoid overlaps */, bins[0].num_bins); + } + + /* Reset the id's */ + memset(cluster_ids, 0, sizeof(u_int16_t) * num_bins); + + /* Randomly pick a cluster id */ + for(i=0; i<num_clusters; i++) cluster_ids[i] = i; + + /* Assign the remaining bins to the nearest cluster */ + for(i=num_clusters; i<num_bins; i++) { + u_int16_t j; + float top_similarity = -1; + u_int8_t cluster_id; + + for(j=0; j<num_clusters; j++) { + float similarity = ndpi_bin_similarity(&bins[i], &bins[j], 0); + + if(similarity > top_similarity) + cluster_id = j, top_similarity = similarity; + } + + cluster_ids[i] = cluster_id; + } + + /* Now let's try to find a better arrangement */ + while(num_iterations++ < max_iterations) { + /* Find the center of each cluster */ + + if(verbose) printf("Iteration %u\n", num_iterations); + + for(i=0; i<num_clusters; i++) + ndpi_reset_bin(&centroids[i]); + + for(i=0; i<num_bins; i++) { + for(j=0; j<bins[i].num_bins; j++) { + ndpi_inc_bin(&centroids[cluster_ids[i]], j, ndpi_get_bin_value(&bins[i], j)); + } + } + + for(i=0; i<num_clusters; i++) { + char out_buf[256]; + + ndpi_normalize_bin(&centroids[i]); + if(verbose) + printf("Centroid [%u] %s\n", cluster_ids[i], + ndpi_print_bin(&centroids[i], 0, out_buf, sizeof(out_buf))); + } + + /* Now let's check if there are bins to move across clusters */ + num_moves = 0; + + for(i=0; i<num_bins; i++) { + u_int16_t j; + float top_similarity = -1; + u_int8_t cluster_id; + + for(j=0; j<num_clusters; j++) { + float similarity = ndpi_bin_similarity(&bins[i], &centroids[j], 0); + + if(similarity > top_similarity) + cluster_id = j, top_similarity = similarity; + } + + if((top_similarity > 0) && (cluster_ids[i] != cluster_id)) { + if(verbose) + printf("Moved bin %u from cluster %u -> %u [similarity: %f]\n", + i, cluster_ids[i], cluster_id, top_similarity); + + cluster_ids[i] = cluster_id; + num_moves++; + } + } + + if(num_moves == 0) + break; + } + + for(i=0; i<num_clusters; i++) + ndpi_free_bin(&centroids[i]); + + return(0); +} + +/* 
********************************************************************************* */ |