about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- example/ndpiReader.c 155
-rw-r--r-- example/reader_util.c 31
-rw-r--r-- src/include/ndpi_api.h.in 12
-rw-r--r-- src/lib/ndpi_analyze.c 175
4 files changed, 291 insertions, 82 deletions
diff --git a/example/ndpiReader.c b/example/ndpiReader.c
index a03c461be..39e36f248 100644
--- a/example/ndpiReader.c
+++ b/example/ndpiReader.c
@@ -67,9 +67,9 @@ static char *results_path = NULL;
static char * bpfFilter = NULL; /**< bpf filter */
static char *_protoFilePath = NULL; /**< Protocol file path */
static char *_customCategoryFilePath= NULL; /**< Custom categories file path */
-static FILE *csv_fp = NULL; /**< for CSV export */
static u_int8_t live_capture = 0;
static u_int8_t undetected_flows_deleted = 0;
+FILE *csv_fp = NULL; /**< for CSV export */
/** User preferences **/
u_int8_t enable_protocol_guess = 1, enable_payload_analyzer = 0;
u_int8_t verbose = 0, enable_joy_stats = 0;
@@ -96,7 +96,7 @@ static struct ndpi_detection_module_struct *ndpi_info_mod = NULL;
extern u_int32_t max_num_packets_per_flow, max_packet_payload_dissection, max_num_reported_top_payloads;
extern u_int16_t min_pattern_len, max_pattern_len;
extern void ndpi_self_check_host_match(); /* Self check function */
-
+
struct flow_info {
struct ndpi_flow_info *flow;
u_int16_t thread_id;
@@ -311,7 +311,7 @@ flowGetBDMeanandVariance(struct ndpi_flow_info* flow) {
fprintf(out, "%u]", (unsigned char)array[i]);
}
#endif
-
+
/* Output the mean */
if(num_bytes != 0) {
double entropy = ndpi_flow_get_byte_count_entropy(array, num_bytes);
@@ -420,7 +420,7 @@ static void help(u_int long_help) {
ndpi_dump_protocols(ndpi_info_mod);
}
-
+
exit(!long_help);
}
@@ -621,14 +621,16 @@ void printCSVHeader() {
fprintf(csv_fp, "tls_version,ja3c,tls_client_unsafe,");
fprintf(csv_fp, "ja3s,tls_server_unsafe,");
fprintf(csv_fp, "tls_alpn,tls_supported_versions,");
- fprintf(csv_fp, "tls_issuerDN,tls_subjectDN,");
- fprintf(csv_fp, "ssh_client_hassh,ssh_server_hassh,flow_info");
-
+#if 0
+ fprintf(csv_fp, "tls_issuerDN,tls_subjectDN,");
+#endif
+ fprintf(csv_fp, "ssh_client_hassh,ssh_server_hassh,flow_info,plen_bins");
+
/* Joy */
if(enable_joy_stats) {
- fprintf(csv_fp, ",byte_dist_mean,byte_dist_std,entropy,total_entropy");
+ fprintf(csv_fp, ",byte_dist_mean,byte_dist_std,entropy,total_entropy");
}
-
+
fprintf(csv_fp, "\n");
}
@@ -996,32 +998,33 @@ static char* is_unsafe_cipher(ndpi_cipher_weakness c) {
/* ********************************** */
-void print_bin(const char *label, struct ndpi_bin *b) {
+void print_bin(FILE *fout, const char *label, struct ndpi_bin *b) {
if(b->num_incs == 0)
return;
else {
u_int8_t i;
FILE *out = results_file ? results_file : stdout;
-
+ const char *sep = label ? "," : ";";
+
ndpi_normalize_bin(b);
- fprintf(out, "[%s: ", label);
-
+ if(label) fprintf(fout, "[%s: ", label);
+
for(i=0; i<b->num_bins; i++) {
switch(b->family) {
case ndpi_bin_family8:
- fprintf(out, "%s%u", (i > 0) ? "," : "", b->u.bins8[i]);
+ fprintf(fout, "%s%u", (i > 0) ? sep : "", b->u.bins8[i]);
break;
case ndpi_bin_family16:
- fprintf(out, "%s%u", (i > 0) ? "," : "", b->u.bins16[i]);
+ fprintf(fout, "%s%u", (i > 0) ? sep : "", b->u.bins16[i]);
break;
case ndpi_bin_family32:
- fprintf(out, "%s%u", (i > 0) ? "," : "", b->u.bins32[i]);
+ fprintf(fout, "%s%u", (i > 0) ? sep : "", b->u.bins32[i]);
break;
}
}
- fprintf(out, "]");
+ if(label) fprintf(fout, "]");
}
}
@@ -1054,7 +1057,7 @@ static void printFlow(u_int16_t id, struct ndpi_flow_info *flow, u_int16_t threa
/* PLEASE KEEP IN SYNC WITH printCSVHeader() */
dos_ge_score = Dos_goldeneye_score(flow);
-
+
dos_slow_score = Dos_slow_score(flow);
dos_hulk_score = Dos_hulk_score(flow);
ddos_score = Ddos_score(flow);
@@ -1065,7 +1068,7 @@ static void printFlow(u_int16_t id, struct ndpi_flow_info *flow, u_int16_t threa
ssh_patator_score = Ssh_patator_score(flow);
inf_score = Infiltration_score(flow);
-
+
double benign_score = dos_ge_score < 1 && dos_slow_score < 1 && \
dos_hulk_score < 1 && ddos_score < 1 && hearthbleed_score < 1 && \
ftp_patator_score < 1 && ssh_patator_score < 1 && inf_score < 1 ? 1.1 : 0;
@@ -1087,12 +1090,12 @@ static void printFlow(u_int16_t id, struct ndpi_flow_info *flow, u_int16_t threa
ndpi_protocol2name(ndpi_thread_info[thread_id].workflow->ndpi_struct,
flow->detected_protocol, buf, sizeof(buf)),
flow->host_server_name);
-
+
fprintf(csv_fp, "%.4lf,%.4lf,%.4lf,%.4lf,%.4lf,%.4lf,%.4lf,%.4lf,%.4lf,", \
benign_score, dos_slow_score, dos_ge_score, dos_hulk_score, \
ddos_score, hearthbleed_score, ftp_patator_score, \
ssh_patator_score, inf_score);
-
+
fprintf(csv_fp, "%u,%llu,%llu,", flow->src2dst_packets,
(long long unsigned int) flow->src2dst_bytes, (long long unsigned int) flow->src2dst_goodput_bytes);
fprintf(csv_fp, "%u,%llu,%llu,", flow->dst2src_packets,
@@ -1100,7 +1103,7 @@ static void printFlow(u_int16_t id, struct ndpi_flow_info *flow, u_int16_t threa
fprintf(csv_fp, "%.3f,%s,", data_ratio, ndpi_data_ratio2str(data_ratio));
fprintf(csv_fp, "%.1f,%.1f,", 100.0*((float)flow->src2dst_goodput_bytes / (float)(flow->src2dst_bytes+1)),
100.0*((float)flow->dst2src_goodput_bytes / (float)(flow->dst2src_bytes+1)));
-
+
/* IAT (Inter Arrival Time) */
fprintf(csv_fp, "%u,%.1f,%u,%.1f,",
ndpi_data_min(flow->iat_flow), ndpi_data_average(flow->iat_flow), ndpi_data_max(flow->iat_flow), ndpi_data_stddev(flow->iat_flow));
@@ -1134,21 +1137,27 @@ static void printFlow(u_int16_t id, struct ndpi_flow_info *flow, u_int16_t threa
(flow->ssh_tls.ja3_client[0] != '\0') ? is_unsafe_cipher(flow->ssh_tls.client_unsafe_cipher) : "0",
(flow->ssh_tls.ja3_server[0] != '\0') ? flow->ssh_tls.ja3_server : "",
(flow->ssh_tls.ja3_server[0] != '\0') ? is_unsafe_cipher(flow->ssh_tls.server_unsafe_cipher) : "0");
-
+
fprintf(csv_fp, "%s,%s,",
flow->ssh_tls.tls_alpn ? flow->ssh_tls.tls_alpn : "",
- flow->ssh_tls.tls_supported_versions ? flow->ssh_tls.tls_supported_versions : ""
+ flow->ssh_tls.tls_supported_versions ? flow->ssh_tls.tls_supported_versions : ""
);
+
+#if 0
fprintf(csv_fp, "%s,%s,",
flow->ssh_tls.tls_issuerDN ? flow->ssh_tls.tls_issuerDN : "",
- flow->ssh_tls.tls_subjectDN ? flow->ssh_tls.tls_subjectDN : ""
+ flow->ssh_tls.tls_subjectDN ? flow->ssh_tls.tls_subjectDN : ""
);
+#endif
+
fprintf(csv_fp, "%s,%s",
(flow->ssh_tls.client_hassh[0] != '\0') ? flow->ssh_tls.client_hassh : "",
(flow->ssh_tls.server_hassh[0] != '\0') ? flow->ssh_tls.server_hassh : ""
);
- fprintf(csv_fp, ",%s", flow->info);
+ fprintf(csv_fp, ",%s,", flow->info);
+
+ print_bin(csv_fp, NULL, &flow->payload_len_bin);
}
if((verbose != 1) && (verbose != 2)) {
@@ -1181,16 +1190,16 @@ static void printFlow(u_int16_t id, struct ndpi_flow_info *flow, u_int16_t threa
if(flow->vlan_id > 0) fprintf(out, "[VLAN: %u]", flow->vlan_id);
if(enable_payload_analyzer) fprintf(out, "[flowId: %u]", flow->flow_id);
}
-
+
if(enable_joy_stats) {
/* Print entropy values for monitored flows. */
flowGetBDMeanandVariance(flow);
fflush(out);
fprintf(out, "[score: %.4f]", flow->entropy.score);
}
-
+
if(csv_fp) fprintf(csv_fp, "\n");
-
+
fprintf(out, "[proto: ");
if(flow->tunnel_type != ndpi_no_tunnel)
fprintf(out, "%s:", ndpi_tunnel2str(flow->tunnel_type));
@@ -1255,7 +1264,7 @@ static void printFlow(u_int16_t id, struct ndpi_flow_info *flow, u_int16_t threa
if(risk != NDPI_NO_RISK)
NDPI_SET_BIT(flow->risk, risk);
-
+
fprintf(out, "[URL: %s][StatusCode: %u]",
flow->http.url, flow->http.response_status_code);
@@ -1268,16 +1277,16 @@ static void printFlow(u_int16_t id, struct ndpi_flow_info *flow, u_int16_t threa
if(flow->risk) {
u_int i;
-
+
fprintf(out, "[Risk: ");
for(i=0; i<NDPI_MAX_RISK; i++)
if(NDPI_ISSET_BIT(flow->risk, i))
fprintf(out, "** %s **", ndpi_risk2str(i));
-
+
fprintf(out, "]");
}
-
+
if(flow->ssh_tls.ssl_version != 0) fprintf(out, "[%s]", ndpi_ssl_version2str(flow->ssh_tls.ssl_version, &known_tls));
if(flow->ssh_tls.client_requested_server_name[0] != '\0') fprintf(out, "[Client: %s]", flow->ssh_tls.client_requested_server_name);
if(flow->ssh_tls.client_hassh[0] != '\0') fprintf(out, "[HASSH-C: %s]", flow->ssh_tls.client_hassh);
@@ -1335,12 +1344,12 @@ static void printFlow(u_int16_t id, struct ndpi_flow_info *flow, u_int16_t threa
flow->human_readeable_string_buffer);
#ifdef DIRECTION_BINS
- print_bin("Plen c2s", &flow->payload_len_bin_src2dst);
- print_bin("Plen s2c", &flow->payload_len_bin_dst2src);
+ print_bin(out, "Plen c2s", &flow->payload_len_bin_src2dst);
+ print_bin(out, "Plen s2c", &flow->payload_len_bin_dst2src);
#else
- print_bin("Plen Bins", &flow->payload_len_bin);
+ print_bin(out, "Plen Bins", &flow->payload_len_bin);
#endif
-
+
fprintf(out, "\n");
}
@@ -2112,7 +2121,7 @@ static void printFlowsStats() {
ndpi_ja3_info *tmp2 = NULL;
unsigned int num_ja3_client;
unsigned int num_ja3_server;
-
+
fprintf(out, "\n");
num_flows = 0;
@@ -2590,7 +2599,7 @@ static void printResults(u_int64_t processing_time_usec, u_int64_t setup_time_us
float b = (float)(cumulative_stats.total_wire_bytes * 8 *1000000)/(float)processing_time_usec;
float traffic_duration;
struct tm result;
-
+
if(live_capture) traffic_duration = processing_time_usec;
else traffic_duration = (pcap_end.tv_sec*1000000 + pcap_end.tv_usec) - (pcap_start.tv_sec*1000000 + pcap_start.tv_usec);
@@ -3116,26 +3125,44 @@ void test_lib() {
/* *********************************************** */
static void binUnitTest() {
- struct ndpi_bin b1, b2;
+ struct ndpi_bin *bins;
+ u_int8_t versbose = 0;
u_int8_t num_bins = 32;
- u_int32_t i;
+ u_int8_t num_points = 24;
+ u_int32_t i, j;
+ u_int8_t num_clusters = 3;
+ u_int16_t cluster_ids[256];
char out_buf[128];
-
+
srand(time(NULL));
-
- ndpi_init_bin(&b1, ndpi_bin_family8, num_bins), ndpi_init_bin(&b2, ndpi_bin_family8, num_bins);
-
- for(i=0; i<32; i++)
- ndpi_inc_bin(&b1, rand() % num_bins), ndpi_inc_bin(&b2, rand() % num_bins);
-#if 0
- printf("1) %s\n", ndpi_print_bin(&b1, 0, out_buf, sizeof(out_buf)));
- printf("2) %s\n", ndpi_print_bin(&b2, 0, out_buf, sizeof(out_buf)));
+ assert((bins = (struct ndpi_bin*)malloc(sizeof(struct ndpi_bin)*num_bins)) != NULL);
- printf("Similarity: %f\n\n", ndpi_bin_similarity(&b1, &b2, 1));
-#endif
-
- ndpi_free_bin(&b1), ndpi_free_bin(&b2);
+ for(i=0; i<num_bins; i++) {
+ ndpi_init_bin(&bins[i], ndpi_bin_family8, num_points);
+
+ for(j=0; j<num_points; j++)
+ ndpi_set_bin(&bins[i], j, rand() % 0xFF);
+
+ ndpi_normalize_bin(&bins[i]);
+ }
+
+
+ ndpi_cluster_bins(bins, num_bins, num_clusters, cluster_ids);
+
+ for(j=0; j<num_clusters; j++) {
+ if(verbose) printf("\n");
+
+ for(i=0; i<num_bins; i++) {
+ if(cluster_ids[i] == j) {
+ if(verbose) printf("[%u] %s\n", cluster_ids[i], ndpi_print_bin(&bins[i], 0, out_buf, sizeof(out_buf)));
+ }
+ }
+ }
+ // printf("Similarity: %f\n\n", ndpi_bin_similarity(&b1, &b2, 1));
+
+ for(i=0; i<num_bins; i++)
+ ndpi_free_bin(&bins[i]);
}
/* *********************************************** */
@@ -3149,7 +3176,7 @@ static void dgaUnitTest() {
"www.e6r5p57kbafwrxj3plz.com",
// "grdawgrcwegpjaoo.eu",
"mcfpeqbotiwxfxqu.eu",
- "adgxwxhqsegnrsih.eu",
+ "adgxwxhqsegnrsih.eu",
NULL
};
@@ -3185,7 +3212,7 @@ static void dgaUnitTest() {
"mqtt.facebook.com",
NULL
};
- int i;
+ int i;
NDPI_PROTOCOL_BITMASK all;
struct ndpi_detection_module_struct *ndpi_str = ndpi_init_detection_module(ndpi_no_prefs);
@@ -3195,15 +3222,15 @@ static void dgaUnitTest() {
ndpi_set_protocol_detection_bitmask2(ndpi_str, &all);
ndpi_finalize_initalization(ndpi_str);
-
+
assert(ndpi_str != NULL);
for(i=0; dga[i] != NULL; i++)
assert(ndpi_check_dga_name(ndpi_str, NULL, (char*)dga[i]) == 1);
-
+
for(i=0; non_dga[i] != NULL; i++)
assert(ndpi_check_dga_name(ndpi_str, NULL, (char*)non_dga[i]) == 0);
-
+
ndpi_exit_detection_module(ndpi_str);
}
@@ -3214,14 +3241,14 @@ static void hllUnitTest() {
struct ndpi_hll h;
u_int8_t bits = 8; /* >= 4, <= 16 */
u_int32_t i;
-
+
assert(ndpi_hll_init(&h, bits) == 0);
for(i=0; i<21320; i++)
ndpi_hll_add_number(&h, i);
/* printf("Count estimate: %f\n", ndpi_hll_count(&h)); */
-
+
ndpi_hll_destroy(&h);
}
@@ -3233,9 +3260,9 @@ static void bitmapUnitTest() {
for(i=0; i<32; i++) {
NDPI_ZERO_BIT(val);
NDPI_SET_BIT(val, i);
-
+
assert(NDPI_ISSET_BIT(val, i));
-
+
for(j=0; j<32; j++) {
if(j != i) {
assert(!NDPI_ISSET_BIT(val, j));
@@ -3506,7 +3533,7 @@ void bpf_filter_port_array_add(int filter_array[], int size, int port) {
void analysisUnitTest() {
struct ndpi_analyze_struct *s = ndpi_alloc_data_analysis(32);
u_int32_t i;
-
+
for(i=0; i<256; i++)
ndpi_data_add_value(s, i);
diff --git a/example/reader_util.c b/example/reader_util.c
index 43afcd402..c07cadeff 100644
--- a/example/reader_util.c
+++ b/example/reader_util.c
@@ -89,7 +89,7 @@
extern u_int8_t enable_protocol_guess, enable_joy_stats, enable_payload_analyzer;
extern u_int8_t verbose, human_readeable_string_len;
extern u_int8_t max_num_udp_dissected_pkts /* 8 */, max_num_tcp_dissected_pkts /* 10 */;
-
+extern FILE *csv_fp;
static u_int32_t flow_id = 0;
/* ****************************************************** */
@@ -1176,14 +1176,24 @@ void process_ndpi_collected_info(struct ndpi_workflow * workflow, struct ndpi_fl
&& flow->ndpi_flow->protos.stun_ssl.ssl.tls_supported_versions) {
correct_csv_data_field(flow->ndpi_flow->protos.stun_ssl.ssl.alpn);
correct_csv_data_field(flow->ndpi_flow->protos.stun_ssl.ssl.tls_supported_versions);
- snprintf(flow->info, sizeof(flow->info), "ALPN: %s][TLS Supported Versions: %s",
- flow->ndpi_flow->protos.stun_ssl.ssl.alpn,
- flow->ndpi_flow->protos.stun_ssl.ssl.tls_supported_versions);
+
+ if(csv_fp)
+ snprintf(flow->info, sizeof(flow->info), "%s",
+ flow->ndpi_flow->protos.stun_ssl.ssl.alpn);
+ else
+ snprintf(flow->info, sizeof(flow->info), "ALPN: %s][TLS Supported Versions: %s",
+ flow->ndpi_flow->protos.stun_ssl.ssl.alpn,
+ flow->ndpi_flow->protos.stun_ssl.ssl.tls_supported_versions);
}
else if(flow->ndpi_flow->protos.stun_ssl.ssl.alpn) {
correct_csv_data_field(flow->ndpi_flow->protos.stun_ssl.ssl.alpn);
- snprintf(flow->info, sizeof(flow->info), "ALPN: %s",
- flow->ndpi_flow->protos.stun_ssl.ssl.alpn);
+
+ if(csv_fp)
+ snprintf(flow->info, sizeof(flow->info), "%s,",
+ flow->ndpi_flow->protos.stun_ssl.ssl.alpn);
+ else
+ snprintf(flow->info, sizeof(flow->info), "ALPN: %s",
+ flow->ndpi_flow->protos.stun_ssl.ssl.alpn);
}
}
@@ -1385,8 +1395,13 @@ static struct ndpi_proto packet_processing(struct ndpi_workflow * workflow,
}
#ifndef DIRECTION_BINS
- if(payload_len && ((flow->src2dst_packets+flow->dst2src_packets) < MAX_NUM_BIN_PKTS))
- ndpi_inc_bin(&flow->payload_len_bin, plen2slot(payload_len));
+ if(payload_len && ((flow->src2dst_packets+flow->dst2src_packets) < MAX_NUM_BIN_PKTS)) {
+#if 0
+ /* Discard packets until the protocol is detected */
+ if(flow->detected_protocol.app_protocol != NDPI_PROTOCOL_UNKNOWN)
+#endif
+ ndpi_inc_bin(&flow->payload_len_bin, plen2slot(payload_len), 1);
+ }
#endif
if(enable_payload_analyzer && (payload_len > 0))
diff --git a/src/include/ndpi_api.h.in b/src/include/ndpi_api.h.in
index 865ddc8dd..94f5f54fe 100644
--- a/src/include/ndpi_api.h.in
+++ b/src/include/ndpi_api.h.in
@@ -931,7 +931,7 @@ extern "C" {
int ndpi_check_dga_name(struct ndpi_detection_module_struct *ndpi_str,
struct ndpi_flow_struct *flow,
char *name);
-
+
/* Serializer */
int ndpi_init_serializer_ll(ndpi_serializer *serializer, ndpi_serialization_format fmt,
u_int32_t buffer_size);
@@ -1067,14 +1067,18 @@ extern "C" {
double ndpi_hll_count(struct ndpi_hll *hll);
/* ******************************* */
-
+
int ndpi_init_bin(struct ndpi_bin *b, enum ndpi_bin_family f, u_int8_t num_bins);
void ndpi_free_bin(struct ndpi_bin *b);
- void ndpi_inc_bin(struct ndpi_bin *b, u_int8_t slot_id);
+ void ndpi_inc_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t val);
+ void ndpi_set_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t value);
+ u_int32_t ndpi_get_bin_value(struct ndpi_bin *b, u_int8_t slot_id);
+ void ndpi_reset_bin(struct ndpi_bin *b);
void ndpi_normalize_bin(struct ndpi_bin *b);
char* ndpi_print_bin(struct ndpi_bin *b, u_int8_t normalize_first, char *out_buf, u_int out_buf_len);
float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2, u_int8_t normalize_first);
-
+ int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins,
+ u_int8_t num_clusters, u_int16_t *cluster_ids);
#ifdef __cplusplus
}
diff --git a/src/lib/ndpi_analyze.c b/src/lib/ndpi_analyze.c
index 4ca3ac25a..74adbb2c9 100644
--- a/src/lib/ndpi_analyze.c
+++ b/src/lib/ndpi_analyze.c
@@ -284,26 +284,81 @@ void ndpi_free_bin(struct ndpi_bin *b) {
/* ********************************************************************************* */
-void ndpi_inc_bin(struct ndpi_bin *b, u_int8_t slot_id) {
+void ndpi_set_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t val) {
if(slot_id >= b->num_bins) slot_id = 0;
- b->num_incs += 1;
+ b->num_incs += val;
switch(b->family) {
case ndpi_bin_family8:
- b->u.bins8[slot_id]++;
+ b->u.bins8[slot_id] = (u_int8_t)val;
break;
case ndpi_bin_family16:
- b->u.bins16[slot_id]++;
+ b->u.bins16[slot_id] = (u_int16_t)val;
break;
case ndpi_bin_family32:
- b->u.bins32[slot_id]++;
+ b->u.bins32[slot_id] = (u_int32_t)val;
break;
}
}
/* ********************************************************************************* */
+/*
+  Adds 'val' to slot 'slot_id' of bin 'b' and to the bin's running
+  increment counter.
+  - An out-of-range 'slot_id' is redirected to slot 0.
+  - 'val' is narrowed to the slot width of the bin family before
+    being added, so 8/16 bit slots wrap around on overflow.
+  - A family that is none of the known ones only updates the counter.
+ */
+void ndpi_inc_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t val) {
+  const u_int8_t idx = (slot_id < b->num_bins) ? slot_id : 0;
+
+  b->num_incs += val;
+
+  if(b->family == ndpi_bin_family8)
+    b->u.bins8[idx] += (u_int8_t)val;
+  else if(b->family == ndpi_bin_family16)
+    b->u.bins16[idx] += (u_int16_t)val;
+  else if(b->family == ndpi_bin_family32)
+    b->u.bins32[idx] += (u_int32_t)val;
+}
+
+/* ********************************************************************************* */
+
+/*
+  Returns the value stored in slot 'slot_id' of bin 'b', widened to 32 bit.
+  - An out-of-range 'slot_id' is redirected to slot 0.
+  - If the bin family is none of the known ones, 0 is returned.
+ */
+u_int32_t ndpi_get_bin_value(struct ndpi_bin *b, u_int8_t slot_id) {
+  if(slot_id >= b->num_bins) slot_id = 0;
+
+  switch(b->family) {
+  case ndpi_bin_family8:
+    return(b->u.bins8[slot_id]);
+  case ndpi_bin_family16:
+    return(b->u.bins16[slot_id]);
+  case ndpi_bin_family32:
+    return(b->u.bins32[slot_id]);
+  }
+
+  /* Unknown family: never fall off the end of a non-void function */
+  return(0);
+}
+
+/* ********************************************************************************* */
+
+/*
+  Clears bin 'b': the increment counter is zeroed and every slot of the
+  array selected by the bin family is set to 0. The family and the
+  number of slots are left untouched.
+ */
+void ndpi_reset_bin(struct ndpi_bin *b) {
+  void *slots = NULL;
+  size_t slot_len = 0; /* Stays 0 when the family is none of the known ones */
+
+  b->num_incs = 0;
+
+  switch(b->family) {
+  case ndpi_bin_family8:
+    slots = b->u.bins8, slot_len = sizeof(u_int8_t);
+    break;
+  case ndpi_bin_family16:
+    slots = b->u.bins16, slot_len = sizeof(u_int16_t);
+    break;
+  case ndpi_bin_family32:
+    slots = b->u.bins32, slot_len = sizeof(u_int32_t);
+    break;
+  }
+
+  if(slot_len != 0)
+    memset(slots, 0, slot_len * b->num_bins);
+}
+/* ********************************************************************************* */
+
/*
Each bin slot is transformed in a % with respect to the value total
*/
@@ -387,7 +442,8 @@ float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2, u_int8_t nor
u_int32_t sumxx = 0, sumxy = 0, sumyy = 0;
if((b1->num_incs == 0) || (b2->num_incs == 0)
- || (b1->family != b2->family) || (b1->num_bins != b2->num_bins))
+ // || (b1->family != b2->family)
+ || (b1->num_bins != b2->num_bins))
return(0);
if(normalize_first)
@@ -412,3 +468,110 @@ float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2, u_int8_t nor
}
/* ********************************************************************************* */
+
+/*
+  Returns the index (0...num_candidates-1) of the entry of 'candidates'
+  that ndpi_bin_similarity() rates highest against 'b'. The winning
+  similarity is stored in '*best_similarity' (-1 when there are no
+  candidates, in which case index 0 is returned).
+ */
+static u_int8_t ndpi_most_similar_bin(struct ndpi_bin *b, struct ndpi_bin *candidates,
+                                      u_int8_t num_candidates, float *best_similarity) {
+  u_int8_t j, best_id = 0;
+  float top_similarity = -1;
+
+  for(j=0; j<num_candidates; j++) {
+    float similarity = ndpi_bin_similarity(b, &candidates[j], 0);
+
+    if(similarity > top_similarity)
+      best_id = j, top_similarity = similarity;
+  }
+
+  *best_similarity = top_similarity;
+  return(best_id);
+}
+
+/* ********************************************************************************* */
+
+/*
+  Clusters bins into 'num_clusters'
+  - (in) bins: a vector 'num_bins' long of bins to cluster
+  - (in) 'num_clusters': number of desired clusters 0...(num_clusters-1)
+  - (out) 'cluster_ids': a vector 'num_bins' long containing the id's of each clustered bin
+
+  The first 'num_clusters' bins seed one cluster each, the remaining bins
+  join the most similar seed; then, for at most 100 iterations, centroids
+  are recomputed and bins are moved to the most similar centroid until
+  no bin changes cluster.
+
+  Returns
+  -  0 on success
+  - -1 on invalid arguments (NULL vectors, no clusters, or more clusters than bins)
+  - -2 if the centroids cannot be allocated
+
+  See
+  - https://en.wikipedia.org/wiki/K-means_clustering
+ */
+int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins,
+                      u_int8_t num_clusters, u_int16_t *cluster_ids) {
+  u_int16_t i, j, max_iterations = 100, num_iterations = 0, num_moves;
+  struct ndpi_bin *centroids;
+  u_int8_t verbose = 0;
+  float top_similarity;
+
+  if((bins == NULL) || (cluster_ids == NULL)
+     || (num_clusters == 0) || (num_clusters > num_bins))
+    return(-1);
+
+  if((centroids = (struct ndpi_bin*)malloc(sizeof(struct ndpi_bin)*num_clusters)) == NULL)
+    return(-2);
+
+  /* NOTE(review): the value returned by ndpi_init_bin() is not checked here - confirm its failure convention */
+  for(i=0; i<num_clusters; i++)
+    ndpi_init_bin(&centroids[i], ndpi_bin_family32 /* Use 32 bit to avoid overflows */, bins[0].num_bins);
+
+  /* Reset the id's */
+  memset(cluster_ids, 0, sizeof(u_int16_t) * num_bins);
+
+  /* The first 'num_clusters' bins are the initial seed of each cluster */
+  for(i=0; i<num_clusters; i++) cluster_ids[i] = i;
+
+  /* Assign the remaining bins to the nearest cluster */
+  for(i=num_clusters; i<num_bins; i++)
+    cluster_ids[i] = ndpi_most_similar_bin(&bins[i], bins, num_clusters, &top_similarity);
+
+  /* Now let's try to find a better arrangement */
+  while(num_iterations++ < max_iterations) {
+    /* Find the center of each cluster */
+
+    if(verbose) printf("Iteration %u\n", num_iterations);
+
+    for(i=0; i<num_clusters; i++)
+      ndpi_reset_bin(&centroids[i]);
+
+    /* Sum every bin, slot by slot, into the centroid of its current cluster */
+    for(i=0; i<num_bins; i++) {
+      for(j=0; j<bins[i].num_bins; j++) {
+        ndpi_inc_bin(&centroids[cluster_ids[i]], j, ndpi_get_bin_value(&bins[i], j));
+      }
+    }
+
+    for(i=0; i<num_clusters; i++) {
+      char out_buf[256];
+
+      ndpi_normalize_bin(&centroids[i]);
+      if(verbose)
+        printf("Centroid [%u] %s\n", i,
+               ndpi_print_bin(&centroids[i], 0, out_buf, sizeof(out_buf)));
+    }
+
+    /* Now let's check if there are bins to move across clusters */
+    num_moves = 0;
+
+    for(i=0; i<num_bins; i++) {
+      u_int8_t cluster_id = ndpi_most_similar_bin(&bins[i], centroids, num_clusters, &top_similarity);
+
+      /* A bin is moved only if it is actually similar to the new centroid */
+      if((top_similarity > 0) && (cluster_ids[i] != cluster_id)) {
+        if(verbose)
+          printf("Moved bin %u from cluster %u -> %u [similarity: %f]\n",
+                 i, cluster_ids[i], cluster_id, top_similarity);
+
+        cluster_ids[i] = cluster_id;
+        num_moves++;
+      }
+    }
+
+    if(num_moves == 0)
+      break;
+  }
+
+  for(i=0; i<num_clusters; i++)
+    ndpi_free_bin(&centroids[i]);
+
+  /* The centroid vector itself was allocated above and must be released too */
+  free(centroids);
+
+  return(0);
+}
+
+/* ********************************************************************************* */