diff options
author | Luca Deri <deri@ntop.org> | 2022-04-04 10:02:45 +0200 |
---|---|---|
committer | Luca Deri <deri@ntop.org> | 2022-04-04 10:02:45 +0200 |
commit | a6e2f4a15a52087ff3f2fa16bb990c998c5215a6 (patch) | |
tree | f8b5470e3cd1f620b5861455285fae5d2a1c4dfe | |
parent | 19019383f0ca4262c43af30b8de9a96413f9534e (diff) |
Added ndpi_find_outliers() API call using Z-Score
-rw-r--r-- | example/ndpiReader.c | 19 | ||||
-rw-r--r-- | src/include/ndpi_api.h.in | 15 | ||||
-rw-r--r-- | src/lib/ndpi_analyze.c | 42 |
3 files changed, 76 insertions, 0 deletions
diff --git a/example/ndpiReader.c b/example/ndpiReader.c index b957989e2..2b02a75db 100644 --- a/example/ndpiReader.c +++ b/example/ndpiReader.c @@ -5398,6 +5398,24 @@ void compressedBitmapUnitTest() { /* *********************************************** */ +void zscoreUnitTest() { + u_int32_t values[] = { 1, 3, 3, 4, 5, 2, 6, 7, 30, 16 }; + u_int32_t i; + u_int32_t num_outliers, num = sizeof(values) / sizeof(u_int32_t); + bool outliers[num], do_trace = false; + + num_outliers = ndpi_find_outliers(values, outliers, num); + + if(do_trace) { + printf("outliers: %u\n", num_outliers); + + for(i=0; i<num; i++) + printf("%u %s\n", values[i], outliers[i] ? "OUTLIER" : "OK"); + } +} + +/* *********************************************** */ + /** @brief MAIN FUNCTION **/ @@ -5441,6 +5459,7 @@ int original_main(int argc, char **argv) { exit(0); #endif + zscoreUnitTest(); sesUnitTest(); desUnitTest(); diff --git a/src/include/ndpi_api.h.in b/src/include/ndpi_api.h.in index abd67655f..38cd7edc1 100644 --- a/src/include/ndpi_api.h.in +++ b/src/include/ndpi_api.h.in @@ -1558,6 +1558,7 @@ extern "C" { float ndpi_data_entropy(struct ndpi_analyze_struct *s); float ndpi_data_variance(struct ndpi_analyze_struct *s); float ndpi_data_stddev(struct ndpi_analyze_struct *s); + float ndpi_data_mean(struct ndpi_analyze_struct *s); u_int32_t ndpi_data_last(struct ndpi_analyze_struct *s); u_int32_t ndpi_data_min(struct ndpi_analyze_struct *s); u_int32_t ndpi_data_max(struct ndpi_analyze_struct *s); @@ -1653,6 +1654,20 @@ extern "C" { /* ******************************* */ + /* + * Finds outliers using Z-score + * Z-Score = (Value - Mean) / StdDev + * + * @par values = pointer to the individual values to be analyzed [in] + * @par outliers = pointer to a list of outliers identified [out] + * @par num_values = length of values and outliers that MUST have the same length [in] + * + * @return The number of outliers found + */ + u_int ndpi_find_outliers(u_int32_t *values, bool *outliers, u_int32_t 
num_values); + + /* ******************************* */ + u_int32_t ndpi_quick_16_byte_hash(u_int8_t *in_16_bytes_long); /* ******************************* */ diff --git a/src/lib/ndpi_analyze.c b/src/lib/ndpi_analyze.c index beb6ca750..ebb5617ef 100644 --- a/src/lib/ndpi_analyze.c +++ b/src/lib/ndpi_analyze.c @@ -161,6 +161,16 @@ float ndpi_data_stddev(struct ndpi_analyze_struct *s) { /* ********************************************************************************* */ +/* + Compute the mean on all values + NOTE: In statistics, there is no difference between the mean and average +*/ +float ndpi_data_mean(struct ndpi_analyze_struct *s) { + return(ndpi_data_average(s)); +} + +/* ********************************************************************************* */ + /* Compute the average only on the sliding window */ float ndpi_data_window_average(struct ndpi_analyze_struct *s) { if(s->num_values_array_len) { @@ -1425,3 +1435,35 @@ void ndpi_des_fitting(double *values, u_int32_t num_values, float *ret_alpha, fl *ret_alpha = best_alpha, *ret_beta = best_beta; } + +/* *********************************************************** */ + +/* Z-Score = (Value - Mean) / StdDev */ +u_int ndpi_find_outliers(u_int32_t *values, bool *outliers, u_int32_t num_values) { + u_int i, ret = 0; + float mean, stddev, low_threshold = -2.5, high_threshold = 2.5; + struct ndpi_analyze_struct a; + + ndpi_init_data_analysis(&a, 3 /* this is the window so we do not need to store values and 3 is enough */); + + /* Add values */ + for(i=0; i<num_values; i++) + ndpi_data_add_value(&a, values[i]); + + mean = ndpi_data_mean(&a); + stddev = ndpi_data_stddev(&a); + + /* Process values */ + for(i=0; i<num_values; i++) { + float z_score = (((float)values[i]) - mean) / stddev; + bool is_outlier = ((z_score < low_threshold) || (z_score > high_threshold)) ? true : false; + + if(is_outlier) ret++; + outliers[i] = is_outlier; + } + + ndpi_free_data_analysis(&a, 0); + + return(ret); +} + |