From 2f657cb8f9001e9944033299758a150d8c0b4a2c Mon Sep 17 00:00:00 2001 From: Luca Date: Thu, 28 Dec 2023 19:59:54 +0100 Subject: Implemented ndpi_is_outlier() for detecting outliers using z-score --- example/ndpiReader.c | 57 +++++++++++++++++++++++++++++++------------------- src/include/ndpi_api.h | 23 ++++++++++++++++++++ src/lib/ndpi_analyze.c | 36 ++++++++++++++++++++++++++----- 3 files changed, 90 insertions(+), 26 deletions(-) diff --git a/example/ndpiReader.c b/example/ndpiReader.c index 91463bb02..ce697026d 100644 --- a/example/ndpiReader.c +++ b/example/ndpiReader.c @@ -1124,7 +1124,7 @@ static void parseOptions(int argc, char **argv) { module_tmp = ndpi_init_detection_module(0); if(!module_tmp) break; - + NDPI_BITMASK_SET_ALL(all); ndpi_set_protocol_detection_bitmask2(module_tmp, &all); ndpi_finalize_initialization(module_tmp); @@ -1883,7 +1883,7 @@ static void printFlow(u_int32_t id, struct ndpi_flow_info *flow, u_int16_t threa if(flow->ssh_tls.ja4_client[0] != '\0') fprintf(out, "[JA4: %s%s]", flow->ssh_tls.ja4_client, print_cipher(flow->ssh_tls.client_unsafe_cipher)); - + if(flow->ssh_tls.server_info[0] != '\0') fprintf(out, "[Server: %s]", flow->ssh_tls.server_info); if(flow->ssh_tls.server_names) fprintf(out, "[ServerNames: %s]", flow->ssh_tls.server_names); @@ -2679,7 +2679,7 @@ static void setupDetection(u_int16_t thread_id, pcap_t * pcap_handle) { exit(-1); } } - + if(_riskyDomainFilePath) ndpi_load_risk_domain_file(ndpi_thread_info[thread_id].workflow->ndpi_struct, _riskyDomainFilePath); @@ -2688,7 +2688,7 @@ static void setupDetection(u_int16_t thread_id, pcap_t * pcap_handle) { if(_maliciousSHA1Path) ndpi_load_malicious_sha1_file(ndpi_thread_info[thread_id].workflow->ndpi_struct, _maliciousSHA1Path); - + if(_customCategoryFilePath) { char *label = strrchr(_customCategoryFilePath, '/'); @@ -3773,10 +3773,10 @@ static void printResults(u_int64_t processing_time_usec, u_int64_t setup_time_us float b = (float)(cumulative_stats.total_wire_bytes * 8 
/*
 * Unit test for ndpi_is_outlier().
 *
 * Feeds the series {1, 2, 3, 4, 5} as past values, probes the value 8 with a
 * threshold multiplier of 1.5 and requires the call to report an outlier.
 * The lower/upper boundaries written by the call are only consumed by the
 * optional debug printout kept below.
 */
void outlierUnitTest() {
  u_int32_t data[] = {1, 2, 3, 4, 5};
  /* Element count derived from the array itself; typed like the
     num_past_values parameter of ndpi_is_outlier() */
  u_int32_t num = sizeof(data) / sizeof(data[0]);
  /* Same type as the value_to_check parameter, avoiding a silent widening */
  u_int32_t value_to_check = 8;
  float threshold = 1.5f;
  /* Initialised so the debug printout never reads indeterminate values
     should the callee leave them untouched */
  float lower = 0.f, upper = 0.f;
  /* ndpi_is_outlier() is declared to return bool: keep the result in a bool
     instead of round-tripping it through a float */
  bool is_outlier = ndpi_is_outlier(data, num, value_to_check,
				    threshold, &lower, &upper);

  /* printf("%.2f < %u < %.2f : %s\n", lower, value_to_check, upper, is_outlier ? "OUTLIER" : "OK"); */
  assert(is_outlier);
}
b/src/lib/ndpi_analyze.c @@ -71,7 +71,7 @@ struct ndpi_analyze_struct* ndpi_alloc_data_analysis_from_series(const u_int32_t struct ndpi_analyze_struct *ret = ndpi_alloc_data_analysis(num_values); if(ret == NULL) return(NULL); - + for(i=0; i *upper)) ? true : false); +} + /* ********************************************************************************* */ /* @@ -1690,7 +1715,7 @@ double ndpi_pearson_correlation(u_int32_t *values_a, u_int32_t *values_b, u_int1 double mean_a, mean_b, variance_a, variance_b, covariance; if(num_values == 0) return(0.0); - + for(i = 0; i < num_values; i++) sum_a += values_a[i], sum_b += values_b[i]; @@ -1703,10 +1728,11 @@ double ndpi_pearson_correlation(u_int32_t *values_a, u_int32_t *values_b, u_int1 variance_a = sum_squared_diff_a / (double)num_values, variance_b = sum_squared_diff_b / (double)num_values; covariance = sum_product_diff / (double)num_values; - + return(covariance / sqrt(variance_a * variance_b)); } +/* ********************************************************************************* */ /* ********************************************************************************* */ static const u_int16_t crc16_ccitt_table[256] = { @@ -1911,7 +1937,7 @@ struct ndpi_cm_sketch *ndpi_cm_sketch_init(u_int16_t num_hashes) { num_hashes = ndpi_nearest_power_of_two(num_hashes); sketch->num_hashes = num_hashes; - sketch->num_hash_buckets = num_hashes * NDPI_COUNT_MIN_SKETCH_NUM_BUCKETS; + sketch->num_hash_buckets = num_hashes * NDPI_COUNT_MIN_SKETCH_NUM_BUCKETS; sketch->num_hash_buckets = ndpi_nearest_power_of_two(sketch->num_hash_buckets)-1, len = num_hashes * NDPI_COUNT_MIN_SKETCH_NUM_BUCKETS * sizeof(u_int32_t); @@ -1966,7 +1992,7 @@ u_int32_t ndpi_cm_sketch_count(struct ndpi_cm_sketch *sketch, u_int32_t element) printf("ndpi_add_sketch_add() [hash: %d][num_hash_buckets: %u][hashval: %d][value: %d]\n", idx, sketch->num_hash_buckets, hashval, sketch->tables[hashval]); #endif - + min_value = ndpi_min(min_value, 
sketch->tables[hashval]); } -- cgit v1.2.3