Implemented ndpi_predict_linear() for predicting a timeseries value overtime

author: Luca Deri <deri@ntop.org> 2023-05-19 11:46:03 +0200
committer: Luca Deri <deri@ntop.org> 2023-05-19 11:46:03 +0200
commit: 5ca6f0ac62d6b2c346bc99d2a1b1200fe9df7917 (patch)
tree: d4ba9b547f52c27e8a7f3a65cbb1a0cd1934ca0d
parent: a8b3baf2b8d7f4fffadc7d35a335ddd2c76770c2 (diff)
3 files changed, 90 insertions, 18 deletions
diff --git a/example/ndpiReader.c b/example/ndpiReader.c
index efa5bd666..e43211ddd 100644
--- a/example/ndpiReader.c
+++ b/example/ndpiReader.c
@@ -5150,6 +5150,20 @@ void zscoreUnitTest() {
 
 /* *********************************************** */
 
+void linearUnitTest() {
+  u_int32_t values[] = {15, 27, 38, 49, 68, 72, 90, 150, 175, 203};
+  u_int32_t prediction;
+  u_int32_t const num = NDPI_ARRAY_LENGTH(values);
+  bool do_trace = false;
+  int rc = ndpi_predict_linear(values, num, 2*num, &prediction);
+  
+  if(do_trace) {
+    printf("[rc: %d][predicted value: %u]\n", rc, prediction);
+  }
+}
+
+/* *********************************************** */
+
 /**
    @brief MAIN FUNCTION
 **/
@@ -5189,6 +5203,7 @@ int main(int argc, char **argv) {
     exit(0);
 #endif
 
+    linearUnitTest();
     zscoreUnitTest();
     sesUnitTest();
     desUnitTest();
diff --git a/src/include/ndpi_api.h b/src/include/ndpi_api.h
index 7c5523bd6..ecb6c8271 100644
--- a/src/include/ndpi_api.h
+++ b/src/include/ndpi_api.h
@@ -1833,6 +1833,22 @@ extern "C" {
 
   /* ******************************* */
 
+  /*
+   * Predicts a value using simple linear regression
+   * Z-Score = (Value - Mean) / StdDev
+   *
+   * @par    values          = pointer to the individual values to be analyzed [in]
+   * @par    num_values      = number of 'values' [in]
+   * @par    predict_periods = number of periods for which we want to make the prediction [in]
+   * @par    prediction      = predicted value after 'predict_periods' [out]
+   *
+   * @return The number of outliers found
+  */
+  int ndpi_predict_linear(u_int32_t *values, u_int32_t num_values,
+			  u_int32_t predict_periods, u_int32_t *prediction);
+
+  /* ******************************* */
+
   u_int32_t ndpi_quick_16_byte_hash(u_int8_t *in_16_bytes_long);
 
   /* ******************************* */
diff --git a/src/lib/ndpi_analyze.c b/src/lib/ndpi_analyze.c
index 5cd901c75..1b3b9cc4f 100644
--- a/src/lib/ndpi_analyze.c
+++ b/src/lib/ndpi_analyze.c
@@ -129,7 +129,7 @@ void ndpi_data_add_value(struct ndpi_analyze_struct *s, const u_int32_t value) {
 float ndpi_data_average(struct ndpi_analyze_struct *s) {
   if((!s) || (s->num_data_entries == 0))
     return(0);
-  
+
   return((s->num_data_entries == 0) ? 0 : ((float)s->sum_total / (float)s->num_data_entries));
 }
 
@@ -157,7 +157,7 @@ float ndpi_data_variance(struct ndpi_analyze_struct *s) {
     return(0);
   float v = s->num_data_entries ?
     ((float)s->stddev.sum_square_total - ((float)s->sum_total * (float)s->sum_total / (float)s->num_data_entries)) / (float)s->num_data_entries : 0.0;
-  
+
   return((v < 0  /* rounding problem */) ? 0 : v);
 }
 
@@ -176,8 +176,8 @@ float ndpi_data_stddev(struct ndpi_analyze_struct *s) {
 
 /* ********************************************************************************* */
 
-/* 
-   Compute the mean on all values 
+/*
+   Compute the mean on all values
    NOTE: In statistics, there is no difference between the mean and average
 */
 float ndpi_data_mean(struct ndpi_analyze_struct *s) {
@@ -695,7 +695,7 @@ float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2,
 
       if(threshold && (sum > threshold))
 	return(-2); /* Sorry they are not similar */
-	 
+
       // printf("%u/%u) [a: %u][b: %u][sum: %u]\n", i, b1->num_bins, a, b, sum);
     }
 
@@ -1144,7 +1144,7 @@ int ndpi_hw_add_value(struct ndpi_hw_struct *hw, const u_int64_t _value, double
     double prev_u, prev_v, prev_s, value  = (double)_value;
     double sq, error, sq_error;
     u_int observations;
-    
+
     if(hw->num_values == hw->params.num_season_periods) {
       double avg = ndpi_avg_inline(hw->y, hw->params.num_season_periods);
       u_int i;
@@ -1391,7 +1391,7 @@ void ndpi_ses_fitting(double *values, u_int32_t num_values, float *ret_alpha) {
 
   for(alpha=0.1; alpha<0.99; alpha += 0.05) {
     struct ndpi_ses_struct ses;
-      
+
     ndpi_ses_init(&ses, alpha, 0.05);
 
     if(trace)
@@ -1446,7 +1446,7 @@ int ndpi_des_init(struct ndpi_des_struct *des, double alpha, double beta, float
 
   des->params.alpha = alpha;
   des->params.beta = beta;
-	
+
   if((significance < 0) || (significance > 1)) significance = 0.05;
   des->params.ro         = ndpi_normal_cdf_inverse(1 - (significance / 2.));
 
@@ -1460,7 +1460,7 @@ void ndpi_des_reset(struct ndpi_des_struct *des) {
   des->num_values = 0;
   des->sum_square_error = des->last_forecast = des->last_trend = des->last_value = 0;
 }
-  
+
 /* *********************************************************** */
 
 /*
@@ -1488,15 +1488,15 @@ int ndpi_des_add_value(struct ndpi_des_struct *des, const double _value, double
     *forecast = (des->params.alpha * value) + ((1 - des->params.alpha) * (des->last_forecast + des->last_trend));
     des->last_trend = (des->params.beta * (*forecast - des->last_forecast)) + ((1 - des->params.beta) * des->last_trend);
   }
-  
+
   error  = value - *forecast;
   sq_error =  error * error;
   des->sum_square_error += sq_error, des->prev_error.sum_square_error += sq_error;
-  
+
   if(des->num_values > 0) {
     u_int observations = (des->num_values < MAX_SQUARE_ERROR_ITERATIONS) ? (des->num_values + 1) : ((des->num_values % MAX_SQUARE_ERROR_ITERATIONS) + MAX_SQUARE_ERROR_ITERATIONS + 1);
     double sq = sqrt(des->sum_square_error / observations);
-    
+
     *confidence_band = des->params.ro * sq;
     rc = 1;
   } else
@@ -1508,12 +1508,12 @@ int ndpi_des_add_value(struct ndpi_des_struct *des, const double _value, double
     des->sum_square_error = des->prev_error.sum_square_error;
     des->prev_error.num_values_rollup = 0, des->prev_error.sum_square_error = 0;
   }
-  
+
 #ifdef DES_DEBUG
   printf("[num_values: %u][[error: %.3f][forecast: %.3f][trend: %.3f[sqe: %.3f][sq: %.3f][confidence_band: %.3f]\n",
 	 des->num_values, error, *forecast, des->last_trend, des->sum_square_error, sq, *confidence_band);
 #endif
-  
+
   return(rc);
 }
 
@@ -1539,7 +1539,7 @@ void ndpi_des_fitting(double *values, u_int32_t num_values, float *ret_alpha, fl
   for(beta=0.1; beta<0.99; beta += 0.05) {
     for(alpha=0.1; alpha<0.99; alpha += 0.05) {
       struct ndpi_des_struct des;
-      
+
       ndpi_des_init(&des, alpha, beta, 0.05);
 
       if(trace)
@@ -1594,7 +1594,7 @@ u_int ndpi_find_outliers(u_int32_t *values, bool *outliers, u_int32_t num_values
   ndpi_init_data_analysis(&a, 3 /* this is the window so we do not need to store values and 3 is enough */);
 
   /* Add values */
-  for(i=0; i<num_values; i++) 
+  for(i=0; i<num_values; i++)
     ndpi_data_add_value(&a, values[i]);
 
   mean    = ndpi_data_mean(&a);
@@ -1613,18 +1613,53 @@ u_int ndpi_find_outliers(u_int32_t *values, bool *outliers, u_int32_t num_values
     if(is_outlier) ret++;
     outliers[i] = is_outlier;
   }
-  
+
   ndpi_free_data_analysis(&a, 0);
 
   return(ret);
 }
 
-/* ************************************************************/
+/* ********************************************************************************* */
+
+/*
+  Simple Linear regression [https://en.wikipedia.org/wiki/Simple_linear_regression]
+  https://www.tutorialspoint.com/c-program-to-compute-linear-regression
+*/
+int ndpi_predict_linear(u_int32_t *values, u_int32_t num_values,
+			u_int32_t predict_periods, u_int32_t *prediction) {
+  u_int i;
+  float m, c, d;
+  float sumx = 0, sumx_square = 0, sumy = 0, sumxy = 0;
+
+  for(i = 0; i < num_values; i++) {
+    float y = values[i];
+    float x = i + 1;
+
+    sumx   = sumx+x;
+    sumx_square = sumx_square + (x * x);
+    sumy   = sumy + y;
+    sumxy  = sumxy + (x * y);
+  }
+
+  d = (num_values * sumx_square) - (sumx * sumx);
+
+  if(d == 0) return(-1);
+
+  m = ((num_values * sumxy) - (sumx * sumy))  / d; /* beta  */
+  c = ((sumy * sumx_square) - (sumx * sumxy)) / d; /* alpha */
+
+  *prediction = c + (m * (predict_periods + num_values - 1));
+
+  return(0);
+}
+
+/* ********************************************************************************* */
 
 /* ********************************************************** */
 /*       http://home.thep.lu.se/~bjorn/crc/crc32_fast.c       */
 /* ********************************************************** */
 
+
 static uint32_t crc32_for_byte(uint32_t r) {
   int j;
 
@@ -1633,6 +1668,8 @@ static uint32_t crc32_for_byte(uint32_t r) {
   return r ^ (uint32_t)0xFF000000L;
 }
 
+/* ********************************************************************************* */
+
 /* Any unsigned integer type with at least 32 bits may be used as
  * accumulator type for fast crc32-calulation, but unsigned long is
  * probably the optimal choice for most systems. */
@@ -1651,6 +1688,8 @@ static void init_tables(uint32_t* table, uint32_t* wtable) {
     }
 }
 
+/* ********************************************************************************* */
+
 static void __crc32(const void* data, size_t n_bytes, uint32_t* crc) {
   static uint32_t table[0x100], wtable[0x100*sizeof(accum_t)];
   size_t n_accum = n_bytes/sizeof(accum_t);
@@ -1667,6 +1706,8 @@ static void __crc32(const void* data, size_t n_bytes, uint32_t* crc) {
     *crc = table[(uint8_t)*crc ^ ((uint8_t*)data)[i]] ^ *crc >> 8;
 }
 
+/* ********************************************************************************* */
+
 u_int32_t ndpi_crc32(const void* data, size_t n_bytes) {
   u_int32_t crc = 0;
author	Luca Deri <deri@ntop.org>	2023-05-19 11:46:03 +0200
committer	Luca Deri <deri@ntop.org>	2023-05-19 11:46:03 +0200
commit	5ca6f0ac62d6b2c346bc99d2a1b1200fe9df7917 (patch)
tree	d4ba9b547f52c27e8a7f3a65cbb1a0cd1934ca0d
parent	a8b3baf2b8d7f4fffadc7d35a335ddd2c76770c2 (diff)