Implements ndpi_pearson_correlation for measuring how correlated two series are

author: Luca Deri <deri@ntop.org> 2023-12-27 22:42:37 +0100
committer: Luca Deri <deri@ntop.org> 2023-12-27 22:42:37 +0100
commit: 1366518bff88210008159cc385f092bd5cfc6252 (patch)
tree: 2a4db00c945c178a5636fb39d6d0e6fc0ea30727
parent: 99d48383286fbb865ab58db5e5f768d8ed14f41e (diff)
4 files changed, 108 insertions, 15 deletions
diff --git a/example/ndpiReader.c b/example/ndpiReader.c
index e3070a47f..91463bb02 100644
--- a/example/ndpiReader.c
+++ b/example/ndpiReader.c
@@ -5488,6 +5488,18 @@ void binaryBitmapUnitTest() {
 
 /* *********************************************** */
 
+void pearsonUnitTest() { /* Smoke test: runs ndpi_pearson_correlation() on two fixed 5-element series */
+  u_int32_t data_a[] = {1, 2, 3, 4, 5};
+  u_int32_t data_b[] = {1000, 113, 104, 105, 106};
+  u_int16_t num = sizeof(data_a) / sizeof(u_int32_t); /* element count of data_a (data_b holds the same number of entries) */
+  float pearson = ndpi_pearson_correlation(data_a, data_b, num); /* the function is declared as returning double; the result is narrowed to float here */
+  /* Only checks that the coefficient is non-zero; no specific expected value is verified */
+  assert(pearson != 0.0);
+  // printf("%.8f\n", pearson);
+}
+
+/* *********************************************** */
+
 void domainSearchUnitTest() {
   ndpi_domain_classify *sc = ndpi_domain_classify_alloc();
   char *domain = "ntop.org";
@@ -5576,6 +5588,7 @@ int main(int argc, char **argv) {
     exit(0);
 #endif
 
+    pearsonUnitTest();
     binaryBitmapUnitTest();
     domainSearchUnitTest();
     domainSearchUnitTest2();
diff --git a/src/include/ndpi_api.h b/src/include/ndpi_api.h
index 168014c2e..9200fcbf5 100644
--- a/src/include/ndpi_api.h
+++ b/src/include/ndpi_api.h
@@ -360,7 +360,8 @@ extern "C" {
    * @return the ID of the app protocol detected
    *
    */
-  u_int16_t ndpi_get_flow_appprotocol(struct ndpi_detection_module_struct *ndpi_str, struct ndpi_flow_struct *flow);
+  u_int16_t ndpi_get_flow_appprotocol(struct ndpi_detection_module_struct *ndpi_str,
+				      struct ndpi_flow_struct *flow);
 
   /**
    * Get the category of the passed flows for the detected module
@@ -371,7 +372,8 @@ extern "C" {
    * @return the ID of the category
    *
    */
-  ndpi_protocol_category_t ndpi_get_flow_category(struct ndpi_detection_module_struct *ndpi_str, struct ndpi_flow_struct *flow);
+  ndpi_protocol_category_t ndpi_get_flow_category(struct ndpi_detection_module_struct *ndpi_str,
+						  struct ndpi_flow_struct *flow);
 
   /**
    * Get the ndpi protocol data of the passed flows for the detected module
@@ -382,8 +384,9 @@ extern "C" {
    * @par    ndpi_proto   = the output struct where to store the requested information
    *
    */
-  void ndpi_get_flow_ndpi_proto(struct ndpi_detection_module_struct *ndpi_str, struct ndpi_flow_struct *flow,
-                    struct ndpi_proto * ndpi_proto);
+  void ndpi_get_flow_ndpi_proto(struct ndpi_detection_module_struct *ndpi_str,
+				struct ndpi_flow_struct *flow,
+				struct ndpi_proto * ndpi_proto);
 
   /**
    * API call that is called internally by ndpi_detection_process_packet or by apps
@@ -415,7 +418,8 @@ extern "C" {
    else != 0
    *
    */
-  u_int8_t ndpi_detection_get_l4(const u_int8_t *l3, u_int16_t l3_len, const u_int8_t **l4_return, u_int16_t *l4_len_return,
+  u_int8_t ndpi_detection_get_l4(const u_int8_t *l3, u_int16_t l3_len,
+				 const u_int8_t **l4_return, u_int16_t *l4_len_return,
 				 u_int8_t *l4_protocol_return, u_int32_t flags);
 
   /**
@@ -430,10 +434,8 @@ extern "C" {
    *
    */
   ndpi_protocol ndpi_find_port_based_protocol(struct ndpi_detection_module_struct *ndpi_struct/* , u_int8_t proto */,
-					      u_int32_t shost,
-					      u_int16_t sport,
-					      u_int32_t dhost,
-					      u_int16_t dport);
+					      u_int32_t shost, u_int16_t sport,
+					      u_int32_t dhost, u_int16_t dport);
   /**
    * Search and return the protocol guessed that is undetected
    *
@@ -463,10 +465,8 @@ extern "C" {
   ndpi_protocol ndpi_guess_undetected_protocol_v4(struct ndpi_detection_module_struct *ndpi_struct,
 						  struct ndpi_flow_struct *flow,
 						  u_int8_t proto,
-						  u_int32_t shost,
-						  u_int16_t sport,
-						  u_int32_t dhost,
-						  u_int16_t dport);
+						  u_int32_t shost, u_int16_t sport,
+						  u_int32_t dhost, u_int16_t dport);
   /**
    * Check if the string passed match with a protocol
    *
@@ -1742,6 +1742,7 @@ extern "C" {
 
   /* Data analysis */
   struct ndpi_analyze_struct* ndpi_alloc_data_analysis(u_int16_t _max_series_len);
+  struct ndpi_analyze_struct* ndpi_alloc_data_analysis_from_series(const u_int32_t *values, u_int16_t num_values);
   void ndpi_init_data_analysis(struct ndpi_analyze_struct *s, u_int16_t _max_series_len);
   void ndpi_free_data_analysis(struct ndpi_analyze_struct *d, u_int8_t free_pointer);
   void ndpi_reset_data_analysis(struct ndpi_analyze_struct *d);
@@ -1928,6 +1929,26 @@ extern "C" {
 
   /* ******************************* */
 
+  /*
+   * Checks if the two series are correlated using the
+   * Pearson correlation coefficient, which is a value in the -1..0..+1 range
+   * where:
+   * -1 <= x < 0  Negative correlation (when one series changes, the other changes in the opposite direction)
+   * x = 0        No correlation       (no linear relationship between the series)
+   * 0 < x <= 1   Positive correlation (when one series changes, the other changes in the same direction,
+   * i.e. when a series increases, the other also increases and vice-versa)
+   *
+   * @par    values_a   = First series with num_values values
+   * @par    values_b   = Second series with num_values values
+   * @par    num_values = Number of series entries
+   *
+   * @return Pearson correlation coefficient (0 when num_values is 0)
+   *
+   */
+  double ndpi_pearson_correlation(u_int32_t *values_a, u_int32_t *values_b, u_int16_t num_values);
+  
+  /* ******************************* */
+
   u_int32_t ndpi_quick_16_byte_hash(u_int8_t *in_16_bytes_long);
 
   /* ******************************* */
diff --git a/src/lib/ndpi_analyze.c b/src/lib/ndpi_analyze.c
index bb0b74fd4..7a2fd495c 100644
--- a/src/lib/ndpi_analyze.c
+++ b/src/lib/ndpi_analyze.c
@@ -66,6 +66,20 @@ struct ndpi_analyze_struct* ndpi_alloc_data_analysis(u_int16_t _max_series_len)
 
 /* ********************************************************************************* */
 
+struct ndpi_analyze_struct* ndpi_alloc_data_analysis_from_series(const u_int32_t *values, u_int16_t num_values) { /* Allocates an analysis struct and feeds it the num_values entries of values */
+  u_int16_t i;
+  struct ndpi_analyze_struct *ret = ndpi_alloc_data_analysis(num_values); /* num_values is passed as the maximum series length */
+  /* Allocation failed: nothing to populate, report the failure to the caller */
+  if(ret == NULL) return(NULL);
+  /* Add the input values one by one, in order, each widened to 64 bit */
+  for(i=0; i<num_values; i++)
+    ndpi_data_add_value(ret, (const u_int64_t)values[i]);
+  /* NOTE(review): presumably the caller releases the result with ndpi_free_data_analysis() - confirm */
+  return(ret);
+}
+
+/* ********************************************************************************* */
+
 void ndpi_free_data_analysis(struct ndpi_analyze_struct *d, u_int8_t free_pointer) {
   if(d && d->values) ndpi_free(d->values);
   if(free_pointer) ndpi_free(d);
@@ -1670,6 +1684,31 @@ int ndpi_predict_linear(u_int32_t *values, u_int32_t num_values,
 
 /* ********************************************************************************* */
 
+double ndpi_pearson_correlation(u_int32_t *values_a, u_int32_t *values_b, u_int16_t num_values) { /* Pearson correlation coefficient of two series holding num_values entries each */
+  double sum_a = 0, sum_b = 0, sum_squared_diff_a = 0, sum_squared_diff_b = 0, sum_product_diff = 0;
+  u_int16_t i;
+  double mean_a, mean_b, variance_a, variance_b, covariance;
+  /* An empty input is reported as 0 (no correlation); this also avoids dividing by zero below */
+  if(num_values == 0) return(0.0);
+  /* First pass: sum both series so that their means can be computed */
+  for(i = 0; i < num_values; i++)
+    sum_a += values_a[i], sum_b += values_b[i];
+  /* Arithmetic mean of each series */
+  mean_a = sum_a / num_values, mean_b = sum_b / num_values;
+  /* Second pass: accumulate the squared deviations from the mean and the product of the two deviations */
+  for(i = 0; i < num_values; i++)
+    sum_squared_diff_a += pow(values_a[i] - mean_a, 2),
+      sum_squared_diff_b += pow(values_b[i] - mean_b, 2),
+      sum_product_diff += (values_a[i] - mean_a) * (values_b[i] - mean_b);
+  /* Variances and covariance are obtained dividing by num_values (not by num_values - 1) */
+  variance_a = sum_squared_diff_a / (double)num_values, variance_b = sum_squared_diff_b / (double)num_values;
+  covariance = sum_product_diff / (double)num_values;
+  /* NOTE(review): if either series is constant its variance (and the covariance) is 0, so this computes 0.0/0.0, i.e. NaN with IEEE-754 doubles, rather than 0 */
+  return(covariance / sqrt(variance_a * variance_b));
+}
+
+/* ********************************************************************************* */
+
 static const u_int16_t crc16_ccitt_table[256] = {
 	0x0000, 0x1189, 0x2312, 0x329B, 0x4624, 0x57AD, 0x6536, 0x74BF,
 	0x8C48, 0x9DC1, 0xAF5A, 0xBED3, 0xCA6C, 0xDBE5, 0xE97E, 0xF8F7,
diff --git a/src/lib/ndpi_classify.c b/src/lib/ndpi_classify.c
index a7937ed08..b8bf8213d 100644
--- a/src/lib/ndpi_classify.c
+++ b/src/lib/ndpi_classify.c
@@ -61,6 +61,7 @@
     _a < _b ? _a : _b; })
 #endif
 
+/* **************************************** */
 
 //bias (1) + w (207)
 //const float ndpi_parameters_splt[NUM_PARAMETERS_SPLT_LOGREG] = {
@@ -240,6 +241,8 @@ float ndpi_parameters_bd[NUM_PARAMETERS_BD_LOGREG] = {
 						      0.000000000000000000e+00, 0.000000000000000000e+00, -9.635140703414636576e+00, 2.603288107669730511e+00,
 };
 
+/* **************************************** */
+
 /**
  * \fn void ndpi_merge_splt_arrays (const uint16_t *pkt_len, const pkt_timeval *pkt_time,
  const uint16_t *pkt_len_twin, const pkt_timeval *pkt_time_twin,
@@ -342,6 +345,8 @@ ndpi_merge_splt_arrays (const uint16_t *pkt_len, const pkt_timeval *pkt_time,
     merged_times[0] = ndpi_timeval_to_microseconds(start_m);
 }
 
+/* **************************************** */
+
 /* transform lens array to Markov chain */
 static void
 ndpi_get_mc_rep_lens (uint16_t *lens, float *length_mc, uint16_t num_packets)
@@ -382,6 +387,8 @@ ndpi_get_mc_rep_lens (uint16_t *lens, float *length_mc, uint16_t num_packets)
   }
 }
 
+/* **************************************** */
+
 /* transform times array to Markov chain */
 void
 ndpi_get_mc_rep_times (uint16_t *times, float *time_mc, uint16_t num_packets)
@@ -421,6 +428,8 @@ ndpi_get_mc_rep_times (uint16_t *times, float *time_mc, uint16_t num_packets)
   }
 }
 
+/* **************************************** */
+
 /**
  * \fn float classify (const unsigned short *pkt_len, const pkt_timeval *pkt_time,
  const unsigned short *pkt_len_twin, const pkt_timeval *pkt_time_twin,
@@ -452,8 +461,7 @@ ndpi_classify (const unsigned short *pkt_len, const pkt_timeval *pkt_time,
                const unsigned short *pkt_len_twin, const pkt_timeval *pkt_time_twin,
                pkt_timeval start_time, pkt_timeval start_time_twin, uint32_t max_num_pkt_len,
                uint16_t sp, uint16_t dp, uint32_t op, uint32_t ip, uint32_t np_o, uint32_t np_i,
-               uint32_t ob, uint32_t ib, uint16_t use_bd, const uint32_t *bd, const uint32_t *bd_t)
-{
+               uint32_t ob, uint32_t ib, uint16_t use_bd, const uint32_t *bd, const uint32_t *bd_t) {
 
   float features[NUM_PARAMETERS_BD_LOGREG] = {1.0};
   float mc_lens[MC_BINS_LEN*MC_BINS_LEN];
@@ -539,6 +547,8 @@ ndpi_classify (const unsigned short *pkt_len, const pkt_timeval *pkt_time,
   return 1.0/(1.0+exp(score));
 }
 
+/* **************************************** */
+
 /**
  * \fn void update_params (char *splt_params, char *bd_params)
  * \brief if a user supplies new parameter files, update parameters splt/bd
@@ -592,6 +602,8 @@ ndpi_update_params (classifier_type_codes_t param_type, const char *param_file)
   }
 }
 
+/* **************************************** */
+
 /* *********************************************************************
  * ---------------------------------------------------------------------
  *                      Time functions
@@ -619,6 +631,8 @@ ndpi_timer_eq(const pkt_timeval *a,
   return 0;
 }
 
+/* **************************************** */
+
 unsigned int
 ndpi_timer_lt(const pkt_timeval *a,
               const pkt_timeval *b)
@@ -647,6 +661,8 @@ ndpi_timer_sub(const pkt_timeval *a,
   }
 }
 
+/* **************************************** */
+
 /**
  * \brief Zeroize a timeval.
  * \param a Timeval to zero out
@@ -658,6 +674,8 @@ ndpi_timer_clear(pkt_timeval *a)
   a->tv_sec = a->tv_usec = 0;
 }
 
+/* **************************************** */
+
 /**
  * \brief Calculate the milliseconds representation of a timeval.
  * \param ts Timeval
@@ -671,6 +689,8 @@ ndpi_timeval_to_milliseconds(pkt_timeval ts)
   return usec / 1000 + sec * 1000;
 }
 
+/* **************************************** */
+
 /**
  * \brief Calculate the microseconds representation of a timeval.
  * \param ts Timeval
author	Luca Deri <deri@ntop.org>	2023-12-27 22:42:37 +0100
committer	Luca Deri <deri@ntop.org>	2023-12-27 22:42:37 +0100
commit	1366518bff88210008159cc385f092bd5cfc6252 (patch)
tree	2a4db00c945c178a5636fb39d6d0e6fc0ea30727
parent	99d48383286fbb865ab58db5e5f768d8ed14f41e (diff)