aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLuca <deri@ntop.org>2023-12-28 19:59:54 +0100
committerLuca <deri@ntop.org>2023-12-28 19:59:54 +0100
commit2f657cb8f9001e9944033299758a150d8c0b4a2c (patch)
tree055f3352bec2ea995538945046d0b242b5b4b8d5
parent1366518bff88210008159cc385f092bd5cfc6252 (diff)
Implemented ndpi_is_outlier() for detecting outliers using z-score
-rw-r--r--example/ndpiReader.c57
-rw-r--r--src/include/ndpi_api.h23
-rw-r--r--src/lib/ndpi_analyze.c36
3 files changed, 90 insertions, 26 deletions
diff --git a/example/ndpiReader.c b/example/ndpiReader.c
index 91463bb02..ce697026d 100644
--- a/example/ndpiReader.c
+++ b/example/ndpiReader.c
@@ -1124,7 +1124,7 @@ static void parseOptions(int argc, char **argv) {
module_tmp = ndpi_init_detection_module(0);
if(!module_tmp)
break;
-
+
NDPI_BITMASK_SET_ALL(all);
ndpi_set_protocol_detection_bitmask2(module_tmp, &all);
ndpi_finalize_initialization(module_tmp);
@@ -1883,7 +1883,7 @@ static void printFlow(u_int32_t id, struct ndpi_flow_info *flow, u_int16_t threa
if(flow->ssh_tls.ja4_client[0] != '\0') fprintf(out, "[JA4: %s%s]", flow->ssh_tls.ja4_client,
print_cipher(flow->ssh_tls.client_unsafe_cipher));
-
+
if(flow->ssh_tls.server_info[0] != '\0') fprintf(out, "[Server: %s]", flow->ssh_tls.server_info);
if(flow->ssh_tls.server_names) fprintf(out, "[ServerNames: %s]", flow->ssh_tls.server_names);
@@ -2679,7 +2679,7 @@ static void setupDetection(u_int16_t thread_id, pcap_t * pcap_handle) {
exit(-1);
}
}
-
+
if(_riskyDomainFilePath)
ndpi_load_risk_domain_file(ndpi_thread_info[thread_id].workflow->ndpi_struct, _riskyDomainFilePath);
@@ -2688,7 +2688,7 @@ static void setupDetection(u_int16_t thread_id, pcap_t * pcap_handle) {
if(_maliciousSHA1Path)
ndpi_load_malicious_sha1_file(ndpi_thread_info[thread_id].workflow->ndpi_struct, _maliciousSHA1Path);
-
+
if(_customCategoryFilePath) {
char *label = strrchr(_customCategoryFilePath, '/');
@@ -3773,10 +3773,10 @@ static void printResults(u_int64_t processing_time_usec, u_int64_t setup_time_us
float b = (float)(cumulative_stats.total_wire_bytes * 8 *1000000)/(float)processing_time_usec;
float traffic_duration;
struct tm result;
-
+
if(live_capture) traffic_duration = processing_time_usec;
else traffic_duration = ((u_int64_t)pcap_end.tv_sec*1000000 + pcap_end.tv_usec) - ((u_int64_t)pcap_start.tv_sec*1000000 + pcap_start.tv_usec);
-
+
printf("\tnDPI throughput: %s pps / %s/sec\n", formatPackets(t, buf), formatTraffic(b, 1, buf1));
if(traffic_duration != 0) {
t = (float)(cumulative_stats.ip_packet_count*1000000)/(float)traffic_duration;
@@ -5380,16 +5380,16 @@ void compressedBitmapUnitTest() {
void filterUnitTest() {
ndpi_filter* f = ndpi_filter_alloc();
u_int32_t v, i;
-
+
assert(f);
srand(time(NULL));
-
+
for(i=0; i<1000; i++)
assert(ndpi_filter_add(f, v = rand()));
assert(ndpi_filter_contains(f, v));
-
+
ndpi_filter_free(f);
}
@@ -5446,7 +5446,7 @@ void sketchUnitTest() {
#endif
sketch = ndpi_cm_sketch_init(32);
-
+
if(sketch) {
u_int32_t i, num_one = 0;
bool do_trace = false;
@@ -5468,7 +5468,7 @@ void sketchUnitTest() {
if(do_trace)
exit(0);
- }
+ }
}
/* *********************************************** */
@@ -5477,7 +5477,7 @@ void binaryBitmapUnitTest() {
ndpi_binary_bitmap *b = ndpi_binary_bitmap_alloc();
u_int64_t hashval = 8149764909040470312;
u_int8_t category = 33;
-
+
ndpi_binary_bitmap_set(b, hashval, category);
ndpi_binary_bitmap_set(b, hashval+1, category);
category = 0;
@@ -5500,13 +5500,27 @@ void pearsonUnitTest() {
/* *********************************************** */
+/* Unit test: 8 is well above mean + 1.5 * stddev of {1, 2, 3, 4, 5}, so it must be reported as an outlier */
+void outlierUnitTest() {
+ u_int32_t data[] = {1, 2, 3, 4, 5};
+ u_int32_t num = sizeof(data) / sizeof(data[0]);
+ u_int32_t value_to_check = 8;
+ float threshold = 1.5, lower, upper;
+ bool is_outlier = ndpi_is_outlier(data, num, value_to_check, threshold, &lower, &upper);
+
+ /* printf("%.2f < %u < %.2f : %s\n", lower, value_to_check, upper, is_outlier ? "OUTLIER" : "OK"); */
+ assert(is_outlier);
+}
+
+/* *********************************************** */
+
void domainSearchUnitTest() {
ndpi_domain_classify *sc = ndpi_domain_classify_alloc();
char *domain = "ntop.org";
u_int8_t class_id;
-
+
assert(sc);
-
+
ndpi_domain_classify_add(sc, NDPI_PROTOCOL_NTOP, ".ntop.org");
ndpi_domain_classify_add(sc, NDPI_PROTOCOL_NTOP, domain);
assert(ndpi_domain_classify_contains(sc, &class_id, domain));
@@ -5518,18 +5532,18 @@ void domainSearchUnitTest() {
/* Subdomain check */
assert(ndpi_domain_classify_contains(sc, &class_id, "blog.ntop.org"));
assert(class_id == NDPI_PROTOCOL_NTOP);
-
+
#ifdef DEBUG_TRACE
struct stat st;
-
+
if(stat(fname, &st) == 0) {
u_int32_t s = ndpi_domain_classify_size(sc);
-
+
printf("Size: %u [%.1f %% of the original filename size]\n",
s, (float)(s * 100) / (float)st.st_size);
}
#endif
-
+
ndpi_domain_classify_free(sc);
}
@@ -5543,7 +5557,7 @@ void domainSearchUnitTest2() {
ndpi_domain_classify_add(c, class_id, "apple.com");
assert(!ndpi_domain_classify_contains(c, &class_id, "ntop.com"));
-
+
ndpi_domain_classify_free(c);
}
@@ -5588,6 +5602,7 @@ int main(int argc, char **argv) {
exit(0);
#endif
+ outlierUnitTest();
pearsonUnitTest();
binaryBitmapUnitTest();
domainSearchUnitTest();
@@ -5656,7 +5671,7 @@ int main(int argc, char **argv) {
}
signal(SIGINT, sigproc);
-
+
for(i=0; i<num_loops; i++)
test_lib();
@@ -5666,7 +5681,7 @@ int main(int argc, char **argv) {
if(extcap_fifo_h) pcap_close(extcap_fifo_h);
if(enable_malloc_bins) ndpi_free_bin(&malloc_bins);
if(csv_fp) fclose(csv_fp);
-
+
ndpi_free(_debug_protocols);
ndpi_free(_disabled_protocols);
diff --git a/src/include/ndpi_api.h b/src/include/ndpi_api.h
index 9200fcbf5..f834223c6 100644
--- a/src/include/ndpi_api.h
+++ b/src/include/ndpi_api.h
@@ -1948,6 +1948,29 @@ extern "C" {
double ndpi_pearson_correlation(u_int32_t *values_a, u_int32_t *values_b, u_int16_t num_values);
/* ******************************* */
+
+ /*
+ * Checks if a specified value is an outlier with respect to past values
+ * using the Z-score.
+ *
+ * @par past_values = List of observed past values (past knowledge)
+ * @par num_past_values = Number of observed past values
+ * @par value_to_check = The value to be checked with respect to past values
+ * @par threshold = Threshold on z-score (values below 1 are treated as 1). Typical values:
+ * t = 1 - The value to check has to be within one stddev of the mean of the past values
+ * t > 1 - The value to check has to be within (t * stddev) boundaries
+ * @par lower - [out] Lower threshold
+ * @par upper - [out] Upper threshold
+ *
+ * @return true if the specified value is an outlier, false otherwise
+ *
+ */
+
+ bool ndpi_is_outlier(u_int32_t *past_values, u_int32_t num_past_values,
+ u_int32_t value_to_check, float threshold,
+ float *lower, float *upper);
+
+ /* ******************************* */
u_int32_t ndpi_quick_16_byte_hash(u_int8_t *in_16_bytes_long);
diff --git a/src/lib/ndpi_analyze.c b/src/lib/ndpi_analyze.c
index 7a2fd495c..06d461561 100644
--- a/src/lib/ndpi_analyze.c
+++ b/src/lib/ndpi_analyze.c
@@ -71,7 +71,7 @@ struct ndpi_analyze_struct* ndpi_alloc_data_analysis_from_series(const u_int32_t
struct ndpi_analyze_struct *ret = ndpi_alloc_data_analysis(num_values);
if(ret == NULL) return(NULL);
-
+
for(i=0; i<num_values; i++)
ndpi_data_add_value(ret, (const u_int64_t)values[i]);
@@ -1648,6 +1648,31 @@ u_int ndpi_find_outliers(u_int32_t *values, bool *outliers, u_int32_t num_values
return(ret);
}
+/* *********************************************************** */
+
+/* Check if value_to_check lies outside mean +/- (threshold * stddev) of past_values; the bounds are returned in *lower / *upper */
+bool ndpi_is_outlier(u_int32_t *past_values, u_int32_t num_past_values,
+ u_int32_t value_to_check, float threshold,
+ float *lower, float *upper) {
+ struct ndpi_analyze_struct *data = ndpi_alloc_data_analysis_from_series(past_values, num_past_values);
+ float mean, stddev, v;
+
+ /* No statistics could be built: report "not an outlier" but still give the [out] bounds a defined value */
+ if(!data) { *lower = *upper = 0; return(false); }
+ mean = ndpi_data_mean(data);
+ stddev = ndpi_data_stddev(data);
+
+ /* The minimum threshold is 1 (i.e. the value of the stddev) */
+ if(threshold < 1.) threshold = 1.;
+
+ v = threshold * stddev;
+ *lower = mean - v, *upper = mean + v;
+
+ ndpi_free_data_analysis(data, 1 /* free memory */);
+
+ return((value_to_check < *lower) || (value_to_check > *upper));
+}
+
/* ********************************************************************************* */
/*
@@ -1690,7 +1715,7 @@ double ndpi_pearson_correlation(u_int32_t *values_a, u_int32_t *values_b, u_int1
double mean_a, mean_b, variance_a, variance_b, covariance;
if(num_values == 0) return(0.0);
-
+
for(i = 0; i < num_values; i++)
sum_a += values_a[i], sum_b += values_b[i];
@@ -1703,11 +1728,12 @@ double ndpi_pearson_correlation(u_int32_t *values_a, u_int32_t *values_b, u_int1
variance_a = sum_squared_diff_a / (double)num_values, variance_b = sum_squared_diff_b / (double)num_values;
covariance = sum_product_diff / (double)num_values;
-
+
return(covariance / sqrt(variance_a * variance_b));
}
/* ********************************************************************************* */
+/* ********************************************************************************* */
static const u_int16_t crc16_ccitt_table[256] = {
0x0000, 0x1189, 0x2312, 0x329B, 0x4624, 0x57AD, 0x6536, 0x74BF,
@@ -1911,7 +1937,7 @@ struct ndpi_cm_sketch *ndpi_cm_sketch_init(u_int16_t num_hashes) {
num_hashes = ndpi_nearest_power_of_two(num_hashes);
sketch->num_hashes = num_hashes;
- sketch->num_hash_buckets = num_hashes * NDPI_COUNT_MIN_SKETCH_NUM_BUCKETS;
+ sketch->num_hash_buckets = num_hashes * NDPI_COUNT_MIN_SKETCH_NUM_BUCKETS;
sketch->num_hash_buckets = ndpi_nearest_power_of_two(sketch->num_hash_buckets)-1,
len = num_hashes * NDPI_COUNT_MIN_SKETCH_NUM_BUCKETS * sizeof(u_int32_t);
@@ -1966,7 +1992,7 @@ u_int32_t ndpi_cm_sketch_count(struct ndpi_cm_sketch *sketch, u_int32_t element)
printf("ndpi_add_sketch_add() [hash: %d][num_hash_buckets: %u][hashval: %d][value: %d]\n",
idx, sketch->num_hash_buckets, hashval, sketch->tables[hashval]);
#endif
-
+
min_value = ndpi_min(min_value, sketch->tables[hashval]);
}