aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLuca Deri <deri@ntop.org>2022-04-04 10:02:45 +0200
committerLuca Deri <deri@ntop.org>2022-04-04 10:02:45 +0200
commita6e2f4a15a52087ff3f2fa16bb990c998c5215a6 (patch)
treef8b5470e3cd1f620b5861455285fae5d2a1c4dfe
parent19019383f0ca4262c43af30b8de9a96413f9534e (diff)
Added ndpi_find_outliers() API call using Z-Score
-rw-r--r--example/ndpiReader.c19
-rw-r--r--src/include/ndpi_api.h.in15
-rw-r--r--src/lib/ndpi_analyze.c42
3 files changed, 76 insertions, 0 deletions
diff --git a/example/ndpiReader.c b/example/ndpiReader.c
index b957989e2..2b02a75db 100644
--- a/example/ndpiReader.c
+++ b/example/ndpiReader.c
@@ -5398,6 +5398,24 @@ void compressedBitmapUnitTest() {
/* *********************************************** */
+/*
+  Runs ndpi_find_outliers() over a small fixed sample.
+  Results are printed only when the local tracing flag is enabled.
+*/
+void zscoreUnitTest() {
+  u_int32_t samples[] = { 1, 3, 3, 4, 5, 2, 6, 7, 30, 16 };
+  u_int32_t num_samples = sizeof(samples) / sizeof(u_int32_t);
+  bool flagged[num_samples], verbose = false;
+  u_int32_t idx, found;
+
+  found = ndpi_find_outliers(samples, flagged, num_samples);
+
+  /* Nothing else to do unless tracing was requested */
+  if(!verbose)
+    return;
+
+  printf("outliers: %u\n", found);
+
+  idx = 0;
+  while(idx < num_samples) {
+    printf("%u %s\n", samples[idx], flagged[idx] ? "OUTLIER" : "OK");
+    idx++;
+  }
+}
+
+/* *********************************************** */
+
/**
@brief MAIN FUNCTION
**/
@@ -5441,6 +5459,7 @@ int original_main(int argc, char **argv) {
exit(0);
#endif
+ zscoreUnitTest();
sesUnitTest();
desUnitTest();
diff --git a/src/include/ndpi_api.h.in b/src/include/ndpi_api.h.in
index abd67655f..38cd7edc1 100644
--- a/src/include/ndpi_api.h.in
+++ b/src/include/ndpi_api.h.in
@@ -1558,6 +1558,7 @@ extern "C" {
float ndpi_data_entropy(struct ndpi_analyze_struct *s);
float ndpi_data_variance(struct ndpi_analyze_struct *s);
float ndpi_data_stddev(struct ndpi_analyze_struct *s);
+ float ndpi_data_mean(struct ndpi_analyze_struct *s);
u_int32_t ndpi_data_last(struct ndpi_analyze_struct *s);
u_int32_t ndpi_data_min(struct ndpi_analyze_struct *s);
u_int32_t ndpi_data_max(struct ndpi_analyze_struct *s);
@@ -1653,6 +1654,20 @@ extern "C" {
/* ******************************* */
+ /*
+ * Finds outliers using the Z-Score
+ * Z-Score = (Value - Mean) / StdDev
+ * A value is reported as an outlier when its Z-Score is below -2.5 or above 2.5
+ *
+ * @par values = pointer to the individual values to be analyzed [in]
+ * @par outliers = pointer to an array of flags: outliers[i] is set to true when values[i] is an outlier, false otherwise [out]
+ * @par num_values = length of values and outliers, which MUST have the same length [in]
+ *
+ * @return The number of outliers found
+ */
+ u_int ndpi_find_outliers(u_int32_t *values, bool *outliers, u_int32_t num_values);
+
+ /* ******************************* */
+
u_int32_t ndpi_quick_16_byte_hash(u_int8_t *in_16_bytes_long);
/* ******************************* */
diff --git a/src/lib/ndpi_analyze.c b/src/lib/ndpi_analyze.c
index beb6ca750..ebb5617ef 100644
--- a/src/lib/ndpi_analyze.c
+++ b/src/lib/ndpi_analyze.c
@@ -161,6 +161,16 @@ float ndpi_data_stddev(struct ndpi_analyze_struct *s) {
/* ********************************************************************************* */
+/*
+  Compute the mean on all values
+  NOTE: mean and average are the same statistic, so this call
+  simply forwards to ndpi_data_average()
+*/
+float ndpi_data_mean(struct ndpi_analyze_struct *s) {
+  float avg = ndpi_data_average(s);
+
+  return(avg);
+}
+
+/* ********************************************************************************* */
+
/* Compute the average only on the sliding window */
float ndpi_data_window_average(struct ndpi_analyze_struct *s) {
if(s->num_values_array_len) {
@@ -1425,3 +1435,35 @@ void ndpi_des_fitting(double *values, u_int32_t num_values, float *ret_alpha, fl
*ret_alpha = best_alpha, *ret_beta = best_beta;
}
+
+/* *********************************************************** */
+
+/*
+  Flags the outliers of a list of values using the Z-Score
+    Z-Score = (Value - Mean) / StdDev
+  A value is an outlier when its Z-Score is below -2.5 or above 2.5
+
+  values     = values to be analyzed [in]
+  outliers   = outliers[i] is set to true when values[i] is an outlier,
+               false otherwise; must hold num_values entries [out]
+  num_values = number of entries in both values and outliers [in]
+
+  Returns the number of outliers found. Returns 0 when an argument is
+  NULL, when num_values is 0, or when the standard deviation is not
+  positive (the Z-Score is undefined, so no value is flagged).
+*/
+u_int ndpi_find_outliers(u_int32_t *values, bool *outliers, u_int32_t num_values) {
+  const float low_threshold = -2.5f, high_threshold = 2.5f;
+  u_int i, ret = 0;
+  float mean, stddev;
+  struct ndpi_analyze_struct a;
+
+  /* Nothing to analyze (and nothing we can safely write to) */
+  if((!values) || (!outliers) || (num_values == 0))
+    return(0);
+
+  ndpi_init_data_analysis(&a, 3 /* this is the window so we do not need to store values and 3 is enough */);
+
+  /* Add values */
+  for(i=0; i<num_values; i++)
+    ndpi_data_add_value(&a, values[i]);
+
+  mean = ndpi_data_mean(&a);
+  stddev = ndpi_data_stddev(&a);
+
+  /* mean and stddev are local copies: the analyzer is no longer needed */
+  ndpi_free_data_analysis(&a, 0);
+
+  /*
+    Avoid dividing by zero: without a positive deviation the Z-Score
+    would be NaN or +/-Inf. The negated comparison also catches NaN.
+  */
+  if(!(stddev > 0.0f)) {
+    for(i=0; i<num_values; i++)
+      outliers[i] = false;
+
+    return(0);
+  }
+
+  /* Process values */
+  for(i=0; i<num_values; i++) {
+    float z_score = (((float)values[i]) - mean) / stddev;
+    bool is_outlier = (z_score < low_threshold) || (z_score > high_threshold);
+
+    if(is_outlier) ret++;
+    outliers[i] = is_outlier;
+  }
+
+  return(ret);
+}
+