aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLuca Deri <deri@ntop.org>2021-12-04 10:09:01 +0100
committerLuca Deri <deri@ntop.org>2021-12-04 10:09:01 +0100
commitfe2822c6a8fcbf9e0a4bb9ee7558cbd0c310e067 (patch)
treeab0a32b8f38cd5b73d3d688f179e62249c6479b3
parent6ab1367846bfa7aeba578e363e46a1acccb9c477 (diff)
Added example for finding similarities in RRDs using nDPI statistical APIs
-rw-r--r--example/ndpiReader.c4
-rw-r--r--rrdtool/Makefile.in5
-rw-r--r--rrdtool/README.txt3
-rw-r--r--rrdtool/rrd_anomaly.c29
-rw-r--r--rrdtool/rrd_similarity.c319
-rw-r--r--src/include/ndpi_api.h.in11
-rw-r--r--src/include/ndpi_typedefs.h3
-rw-r--r--src/lib/ndpi_analyze.c29
8 files changed, 369 insertions, 34 deletions
diff --git a/example/ndpiReader.c b/example/ndpiReader.c
index cbcbec881..f4f21f99d 100644
--- a/example/ndpiReader.c
+++ b/example/ndpiReader.c
@@ -266,7 +266,7 @@ u_int check_bin_doh_similarity(struct ndpi_bin *bin, float *similarity) {
float lowest_similarity = 9999999999.0f;
for(i=0; i<NUM_DOH_BINS; i++) {
- *similarity = ndpi_bin_similarity(&doh_ndpi_bins[i], bin, 0);
+ *similarity = ndpi_bin_similarity(&doh_ndpi_bins[i], bin, 0, 0);
if(*similarity <= doh_max_distance)
return(1);
@@ -2827,7 +2827,7 @@ static void printFlowsStats() {
print_bin(out, NULL, &bins[i]);
printf("][similarity: %f]",
- (similarity = ndpi_bin_similarity(&centroids[j], &bins[i], 0)));
+ (similarity = ndpi_bin_similarity(&centroids[j], &bins[i], 0, 0)));
if(all_flows[i].flow->host_server_name[0] != '\0')
fprintf(out, "[%s]", all_flows[i].flow->host_server_name);
diff --git a/rrdtool/Makefile.in b/rrdtool/Makefile.in
index d25e0bdac..95d2fd7b3 100644
--- a/rrdtool/Makefile.in
+++ b/rrdtool/Makefile.in
@@ -3,5 +3,10 @@ INC=-I ../src/include
LIBDPI=../src/lib/libndpi.a
LIB=$(LIBDPI) -lrrd -lm @LIBS@ @ADDITIONAL_LIBS@ @LDFLAGS@
+all: rrd_anomaly rrd_similarity
+
rrd_anomaly: rrd_anomaly.c Makefile $(LIBDPI)
$(CC) -g $(INC) rrd_anomaly.c -o rrd_anomaly $(LIB)
+
+rrd_similarity: rrd_similarity.c Makefile $(LIBDPI)
+ $(CC) -g $(INC) rrd_similarity.c -o rrd_similarity $(LIB)
diff --git a/rrdtool/README.txt b/rrdtool/README.txt
index 3d8056186..1ea5cad4c 100644
--- a/rrdtool/README.txt
+++ b/rrdtool/README.txt
@@ -1,4 +1,5 @@
-This directory contains a tool that allows to identify anomalies in RRD files
+
+This directory contains tools that identify anomalies and similarities in RRD files
Prerequisite
diff --git a/rrdtool/rrd_anomaly.c b/rrdtool/rrd_anomaly.c
index e4822d936..fa0edb6a5 100644
--- a/rrdtool/rrd_anomaly.c
+++ b/rrdtool/rrd_anomaly.c
@@ -1,5 +1,5 @@
/*
- * ndpiReader.c
+ * rrd_anomaly.c
*
* Copyright (C) 2011-21 - ntop.org
*
@@ -57,7 +57,7 @@ int main(int argc, char *argv[]) {
struct ndpi_ses_struct ses;
float alpha;
char c;
-
+
/* Defaults */
alpha = DEFAULT_ALPHA;
start_s = DEFAULT_START;
@@ -71,7 +71,7 @@ int main(int argc, char *argv[]) {
case 's':
start_s = optarg;
break;
-
+
case 'e':
end_s = optarg;
break;
@@ -79,7 +79,7 @@ int main(int argc, char *argv[]) {
case 'q':
quick_mode = 1;
break;
-
+
case 'a':
{
float f = atof(optarg);
@@ -90,11 +90,11 @@ int main(int argc, char *argv[]) {
printf("Discarding -a: valid range is >0 .. <1\n");
}
break;
-
+
case 'f':
filename = optarg;
break;
-
+
default:
help();
break;
@@ -103,7 +103,7 @@ int main(int argc, char *argv[]) {
if(filename == NULL)
help();
-
+
ndpi_ses_init(&ses, alpha, 0.05);
if((rrd_parsetime(start_s, &start_tv) != NULL)) {
@@ -124,8 +124,9 @@ int main(int argc, char *argv[]) {
}
p = data;
- for(t=start+1, i=0; t<end; t+=step, i++) {
- for(j=0; j<ds_cnt; j++) {
+ for(t=start+1, i=0; t<end; t+=step, i++) {
+ j = 0; /* Consider only the first DS */
+ /* for(j=0; j<ds_cnt; j++) */ {
rrd_value_t value = *p++;
if(!isnan(value)) {
@@ -146,15 +147,15 @@ int main(int argc, char *argv[]) {
} else {
const time_t _t = t;
struct tm *t_info = localtime((const time_t*)&_t);
-
+
strftime(buf, sizeof(buf), "%d/%b/%Y %H:%M:%S", t_info);
-
+
if(first) {
- first = 0;
+ first = 0;
printf("%s %s\t%s %s %s\t %s [%s]\n",
- "When", "Value", "Prediction", "Lower", "Upper", "Out", "Band");
+ "When", "Value", "Prediction", "Lower", "Upper", "Out", "Band");
}
-
+
printf("%s %12.3f\t%.3f\t%12.3f\t%12.3f\t %s [%.3f]\n",
buf, value/100., prediction/100., lower/100., upper/100., is_anomaly? "ANOMALY" : "OK",
confidence_band/100.);
diff --git a/rrdtool/rrd_similarity.c b/rrdtool/rrd_similarity.c
new file mode 100644
index 000000000..ca25abd02
--- /dev/null
+++ b/rrdtool/rrd_similarity.c
@@ -0,0 +1,319 @@
+/*
+ * rrd_similarity.c
+ *
+ * Copyright (C) 2011-21 - ntop.org
+ *
+ * nDPI is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * nDPI is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with nDPI. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <getopt.h>
+#include <dirent.h>
+#include <sys/stat.h>
+
+
+#include "rrd.h"
+#include "ndpi_api.h"
+
+#define DEFAULT_ALPHA 0.5 /* Initial value of alpha when -a is not specified */
+#define DEFAULT_START "now-1d" /* RRD start time used when -s is not specified */
+#define DEFAULT_END "now" /* RRD end time used when -e is not specified */
+
+#define MAX_NUM_RRDS 8192 /* Number of entries allocated for the rrd_file_stats array */
+
+#ifndef PATH_MAX /* Fallback for when no included header defines PATH_MAX */
+#define PATH_MAX 4096 /* Size of the buffer used to build file paths */
+#endif
+
+/* Per-RRD record: where the file is and what was computed out of its samples */
+typedef struct {
+  char *path;        /* Path of the RRD file (heap allocated string) */
+  float average;     /* Average of the samples fetched from the RRD */
+  float stddev;      /* Standard deviation of the samples fetched from the RRD */
+  struct ndpi_bin b; /* Bin holding one slot per fetched sample */
+} rrd_file_stats;
+
+/* Runtime options set from the command line (-v, -t and -z respectively) */
+u_int verbose = 0;
+u_int similarity_threshold = 100;
+u_int skip_zero = 0;
+
+/* *************************************************** */
+
+/*
+ * Print the command line usage (with the current defaults) on stdout
+ * and terminate the process with exit code 0. This function never returns.
+ */
+static void help() {
+  /* Synopsis */
+  fputs("Usage: rrd_similarity [-v][-a <alpha>][-e <end>][-q][-s <start>]\n"
+        " -f <filename> -d <basedir> [-t <threshold>]\n", stdout);
+
+  /* One entry per option: printf() is used only where a default is shown */
+  printf("-a | Set alpha. Valid range >0 .. <1. Default %.2f\n", DEFAULT_ALPHA);
+  printf("-e <end> | RRD end time. Default %s\n", DEFAULT_END);
+  fputs("-q | Quick output (only anomalies are reported)\n", stdout);
+  printf("-s <start> | RRD start time. Default %s\n", DEFAULT_START);
+  fputs("-d <basedir> | Base directory where RRD filename is searched\n", stdout);
+  fputs("-f <rrd path> | Path of the RRD filename to analyze\n", stdout);
+  printf("-t <threshold> | Similarity threshold. Default %u (0 == alike)\n", similarity_threshold);
+  fputs("-v | Verbose\n", stdout);
+  fputs("-z | Skip zero RRDs during comparison\n", stdout);
+
+  fputs("\n\nExample: rrd_similarity -q -f bytes.rrd -d /var/lib/ntopng/-1/snmpstats\n", stdout);
+  fputs("\n\nGoal: find similar RRDs\n", stdout);
+
+  exit(0);
+}
+
+/* *************************************************** */
+
+/*
+ * Fetch the AVERAGE samples of rrd->path for the start..end time range and
+ * fill in rrd->average, rrd->stddev and the rrd->b bin (one slot per sample).
+ *
+ * - NaN samples are accounted as 0
+ * - Only the first data source (DS) of the RRD is considered: rrd_fetch_r()
+ *   returns one row of ds_cnt values per time step, hence the read pointer
+ *   is advanced by ds_cnt at each iteration
+ *
+ * On failure rrd->average, rrd->stddev and rrd->b are left untouched.
+ * Everything allocated by rrd_fetch_r() (samples and DS names) is released
+ * before returning.
+ */
+void analyze_rrd(rrd_file_stats *rrd, time_t start, time_t end) {
+  unsigned long step = 0, ds_cnt = 0, n;
+  rrd_value_t *data = NULL, *p;
+  char **names = NULL;
+  u_int t, i, num_points;
+  struct ndpi_analyze_struct *s = NULL;
+
+  if(rrd_fetch_r(rrd->path, "AVERAGE", &start, &end, &step, &ds_cnt, &names, &data) != 0) {
+    printf("Unable to extract data from rrd %s\n", rrd->path);
+    return;
+  }
+
+  /* A zero step would lead to a division by zero here below */
+  if((step == 0) || (ds_cnt == 0)) {
+    printf("Unable to extract data from rrd %s\n", rrd->path);
+    goto cleanup;
+  }
+
+  num_points = (end-start)/step;
+
+  if((s = ndpi_alloc_data_analysis(num_points)) == NULL)
+    goto cleanup;
+
+  ndpi_init_bin(&rrd->b, ndpi_bin_family32, num_points);
+
+  /* Step 1 - Compute average and stddev (first DS of each row only) */
+  for(t=start+1, i=0, p=data; t<end; t+=step, i++, p+=ds_cnt) {
+    double value = (double)*p;
+
+    if(isnan(value)) value = 0;
+    ndpi_data_add_value(s, value);
+    ndpi_set_bin(&rrd->b, i, value);
+  }
+
+  rrd->average = ndpi_data_average(s);
+  rrd->stddev  = ndpi_data_stddev(s);
+
+ cleanup:
+  /* Step 2 - Release the analysis and what rrd_fetch_r() has allocated */
+  if(s != NULL)
+    ndpi_free_data_analysis(s, 1);
+
+  if(names != NULL) {
+    for(n=0; n<ds_cnt; n++)
+      rrd_freemem(names[n]);
+
+    rrd_freemem(names);
+  }
+
+  rrd_freemem(data);
+}
+
+/* *************************************************** */
+
+/*
+ * Tell whether two one-dimensional "circles" (centered in x1 and x2, with
+ * radius r1 and r2) touch or overlap.
+ *
+ * Returns 1 when |x1 - x2| <= r1 + r2, 0 otherwise.
+ *
+ * The arithmetic is carried out in 64 bit so that extreme int values
+ * cannot trigger a signed integer overflow (undefined behaviour in C).
+ */
+int circles_touch(int x1, int r1, int x2, int r2) {
+  long long radius_sum = (long long)r1 + (long long)r2;
+  long long x_diff     = (long long)x1 - (long long)x2;
+
+  if(x_diff < 0)
+    x_diff = -x_diff;
+
+  return((radius_sum < x_diff) ? 0 : 1);
+}
+
+
+/* *************************************************** */
+
+/*
+ * Compare each pair of the num_rrds entries of rrd and print a summary.
+ *
+ * - Two RRDs whose average is 0 are counted as "zero alike" (and printed
+ *   unless skip_zero is set)
+ * - Otherwise the pair is a candidate only when the two "circles"
+ *   (center = average, radius = stddev) touch each other; candidates are
+ *   then compared with ndpi_bin_similarity() and counted as similar when
+ *   the result is in the range 0 .. similarity_threshold (excluded).
+ *   Similar pairs are printed only in verbose mode.
+ *
+ * The circle test is performed in floating point: converting average and
+ * stddev to int would drop the fractional part and is undefined for values
+ * that do not fit an int.
+ */
+void find_rrd_similarities(rrd_file_stats *rrd, u_int num_rrds) {
+  u_int i, j, num_similar_rrds = 0, num_potentially_zero_equal = 0;
+  double pct_similar = 0;
+
+  for(i=0; i<num_rrds; i++) {
+    for(j=i+1; j<num_rrds; j++) {
+      /*
+        Average is the circle center, and stddev is the radius
+        if circles touch each other then there is a chance that
+        the two rrds are similar
+      */
+
+      if((rrd[i].average == 0) && (rrd[i].average == rrd[j].average)) {
+        if(!skip_zero)
+          printf("%s [%.1f/%.1f] - %s [%.1f/%.1f] are alike\n",
+                 rrd[i].path, rrd[i].average, rrd[i].stddev,
+                 rrd[j].path, rrd[j].average, rrd[j].stddev);
+
+        num_potentially_zero_equal++;
+      } else if(fabsf(rrd[i].average - rrd[j].average) <= (rrd[i].stddev + rrd[j].stddev)) {
+        float similarity = ndpi_bin_similarity(&rrd[i].b, &rrd[j].b, 0, similarity_threshold);
+
+        /* Negative values mean that the bins could not be compared or are not similar */
+        if((similarity >= 0) && (similarity < similarity_threshold)) {
+          if(verbose)
+            printf("%s [%.1f/%.1f] - %s [%.1f/%.1f] are %s [%.1f]\n",
+                   rrd[i].path, rrd[i].average, rrd[i].stddev,
+                   rrd[j].path, rrd[j].average, rrd[j].stddev,
+                   (similarity == 0) ? "alike" : "similar",
+                   similarity
+                   );
+
+          num_similar_rrds++;
+        }
+      }
+    }
+  }
+
+  /* Avoid 0/0 (printed as nan) when no RRD has been found */
+  if(num_rrds > 0)
+    pct_similar = (num_similar_rrds*100.)/((double)num_rrds*(double)num_rrds);
+
+  printf("Found %u (%.3f %%) similar RRDs / %u zero alike RRDs [num_rrds: %u]\n",
+         num_similar_rrds,
+         pct_similar,
+         num_potentially_zero_equal,
+         num_rrds);
+}
+
+/* *************************************************** */
+
+/*
+ * Recursively walk basedir and store in rrds[] the path of each file whose
+ * name is exactly filename.
+ *
+ * basedir   Directory where the search starts
+ * filename  Name (not path) of the RRD files to look for
+ * rrds      Array of MAX_NUM_RRDS entries: only the path field is set here
+ *           (strdup'ed, to be freed by the caller)
+ * num_rrds  In/out number of rrds[] entries in use
+ *
+ * Entries whose name starts with '.' are skipped, and so are the paths
+ * that do not fit PATH_MAX (a truncated path would designate another file).
+ * Matches found when rrds[] is already full are ignored.
+ */
+void find_rrds(char *basedir, char *filename, rrd_file_stats *rrds, u_int *num_rrds) {
+  struct dirent **namelist;
+  int n = scandir(basedir, &namelist, NULL, NULL);
+
+  if(n < 0)
+    return; /* End of the tree */
+
+  while(n--) {
+    struct dirent *entry = namelist[n];
+
+    if(entry->d_name[0] != '.') {
+      char path[PATH_MAX];
+      struct stat s;
+      int len = snprintf(path, sizeof(path), "%s/%s", basedir, entry->d_name);
+
+      /* Use the path only if it has not been truncated by snprintf() */
+      if((len > 0) && ((size_t)len < sizeof(path)) && (stat(path, &s) == 0)) {
+        if(S_ISDIR(s.st_mode))
+          find_rrds(path, filename, rrds, num_rrds);
+        else if(strcmp(entry->d_name, filename) == 0) {
+          if(*num_rrds < MAX_NUM_RRDS) {
+            rrds[*num_rrds].path = strdup(path);
+
+            /* Account the entry only if the path has been duplicated */
+            if(rrds[*num_rrds].path != NULL)
+              (*num_rrds)++;
+          }
+        }
+      }
+    }
+
+    free(entry);
+  }
+
+  free(namelist);
+}
+
+/* *************************************************** */
+
+/*
+ * Entry point: parse the command line, collect below <basedir> all the RRDs
+ * named <filename>, compute the statistics of each of them for the requested
+ * time range and report those that are similar.
+ *
+ * Returns 0 on success, -1 when the time range is invalid or memory cannot
+ * be allocated. Missing mandatory options (-f, -d) and unknown options
+ * print the usage through help(), which terminates the process.
+ */
+int main(int argc, char *argv[]) {
+  rrd_time_value_t start_tv, end_tv;
+  char *filename = NULL, *start_s, *end_s, *basedir = NULL;
+  u_int quick_mode = 0;
+  float alpha;
+  int c; /* getopt() returns an int: -1 cannot be matched by a char where char is unsigned */
+  time_t start, end;
+  u_int num_rrds = 0, i;
+  rrd_file_stats *rrds;
+
+  /* Defaults */
+  alpha   = DEFAULT_ALPHA;
+  start_s = DEFAULT_START;
+  end_s   = DEFAULT_END;
+
+  /* getopt() returns '?' for unknown options: they are handled by the default case */
+  while((c = getopt(argc, argv, "d:s:e:a:qf:t:vz")) != -1) {
+    switch(c) {
+    case 's':
+      start_s = optarg;
+      break;
+
+    case 'd':
+      basedir = optarg;
+      break;
+
+    case 'e':
+      end_s = optarg;
+      break;
+
+    case 'q':
+      quick_mode = 1;
+      break;
+
+    case 'v':
+      verbose = 1;
+      break;
+
+    case 'a':
+      {
+        float f = atof(optarg);
+
+        if((f > 0) && (f < 1))
+          alpha = f;
+        else
+          printf("Discarding -a: valid range is >0 .. <1\n");
+      }
+      break;
+
+    case 'f':
+      filename = optarg;
+      break;
+
+    case 't':
+      {
+        int threshold = atoi(optarg);
+
+        /* A negative value would wrap to a huge unsigned threshold */
+        if(threshold >= 0)
+          similarity_threshold = (u_int)threshold;
+        else
+          printf("Discarding -t: the threshold cannot be negative\n");
+      }
+      break;
+
+    case 'z':
+      skip_zero = 1;
+      break;
+
+    default:
+      help();
+      break;
+    }
+  }
+
+  /* -a and -q are parsed but they currently have no effect in this tool */
+  (void)alpha;
+  (void)quick_mode;
+
+  if((filename == NULL) || (basedir == NULL))
+    help();
+
+  if((rrd_parsetime(start_s, &start_tv) != NULL)) {
+    printf("Unable to parse start time %s\n", start_s);
+    return(-1);
+  }
+
+  if((rrd_parsetime(end_s, &end_tv) != NULL)) {
+    printf("Unable to parse end time %s\n", end_s);
+    return(-1);
+  }
+
+  /* Without this check start/end would be used uninitialized on failure */
+  if(rrd_proc_start_end(&start_tv, &end_tv, &start, &end) != 0) {
+    printf("Invalid time range %s - %s\n", start_s, end_s);
+    return(-1);
+  }
+
+  if((rrds = ndpi_calloc(sizeof(rrd_file_stats), MAX_NUM_RRDS)) == NULL) {
+    printf("Not enough memory !\n");
+    return(-1);
+  }
+
+  /* Find all rrd's */
+  find_rrds(basedir, filename, rrds, &num_rrds);
+
+  /* Read RRD's data */
+  for(i=0; i<num_rrds; i++)
+    analyze_rrd(&rrds[i], start, end);
+
+  find_rrd_similarities(rrds, num_rrds);
+
+#if 0
+  if(verbose) {
+    for(i=0; i<num_rrds; i++)
+      printf("%s\t%.1f\t%.1f\n", rrds[i].path, rrds[i].average, rrds[i].stddev);
+  }
+#endif
+
+  /* Release the bins and the paths, then the array itself */
+  for(i=0; i<num_rrds; i++) {
+    ndpi_free_bin(&rrds[i].b);
+    free(rrds[i].path);
+  }
+
+  ndpi_free(rrds);
+
+  return(0);
+}
diff --git a/src/include/ndpi_api.h.in b/src/include/ndpi_api.h.in
index 243b16ad8..1cc96e854 100644
--- a/src/include/ndpi_api.h.in
+++ b/src/include/ndpi_api.h.in
@@ -1564,16 +1564,17 @@ extern "C" {
/* ******************************* */
- int ndpi_init_bin(struct ndpi_bin *b, enum ndpi_bin_family f, u_int8_t num_bins);
+ int ndpi_init_bin(struct ndpi_bin *b, enum ndpi_bin_family f, u_int16_t num_bins);
void ndpi_free_bin(struct ndpi_bin *b);
struct ndpi_bin* ndpi_clone_bin(struct ndpi_bin *b);
- void ndpi_inc_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t val);
- void ndpi_set_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t value);
- u_int32_t ndpi_get_bin_value(struct ndpi_bin *b, u_int8_t slot_id);
+ void ndpi_inc_bin(struct ndpi_bin *b, u_int16_t slot_id, u_int32_t val);
+ void ndpi_set_bin(struct ndpi_bin *b, u_int16_t slot_id, u_int32_t value);
+ u_int32_t ndpi_get_bin_value(struct ndpi_bin *b, u_int16_t slot_id);
void ndpi_reset_bin(struct ndpi_bin *b);
void ndpi_normalize_bin(struct ndpi_bin *b);
char* ndpi_print_bin(struct ndpi_bin *b, u_int8_t normalize_first, char *out_buf, u_int out_buf_len);
- float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2, u_int8_t normalize_first);
+ float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2,
+ u_int8_t normalize_first, float similarity_max_threshold);
int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins,
u_int8_t num_clusters, u_int16_t *cluster_ids,
struct ndpi_bin *centroids);
diff --git a/src/include/ndpi_typedefs.h b/src/include/ndpi_typedefs.h
index c6e79a951..357b4db13 100644
--- a/src/include/ndpi_typedefs.h
+++ b/src/include/ndpi_typedefs.h
@@ -1570,7 +1570,8 @@ enum ndpi_bin_family {
};
struct ndpi_bin {
- u_int8_t num_bins, is_empty;
+ u_int8_t is_empty;
+ u_int16_t num_bins;
enum ndpi_bin_family family;
union {
diff --git a/src/lib/ndpi_analyze.c b/src/lib/ndpi_analyze.c
index c8591a36b..947fb8a13 100644
--- a/src/lib/ndpi_analyze.c
+++ b/src/lib/ndpi_analyze.c
@@ -299,7 +299,7 @@ double ndpi_hll_count(struct ndpi_hll *hll) {
/* ********************************************************************************* */
/* ********************************************************************************* */
-int ndpi_init_bin(struct ndpi_bin *b, enum ndpi_bin_family f, u_int8_t num_bins) {
+int ndpi_init_bin(struct ndpi_bin *b, enum ndpi_bin_family f, u_int16_t num_bins) {
b->num_bins = num_bins, b->family = f, b->is_empty = 1;
switch(f) {
@@ -378,7 +378,7 @@ struct ndpi_bin* ndpi_clone_bin(struct ndpi_bin *b) {
/* ********************************************************************************* */
-void ndpi_set_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t val) {
+void ndpi_set_bin(struct ndpi_bin *b, u_int16_t slot_id, u_int32_t val) {
if(slot_id >= b->num_bins) slot_id = 0;
switch(b->family) {
@@ -396,7 +396,7 @@ void ndpi_set_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t val) {
/* ********************************************************************************* */
-void ndpi_inc_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t val) {
+void ndpi_inc_bin(struct ndpi_bin *b, u_int16_t slot_id, u_int32_t val) {
b->is_empty = 0;
if(slot_id >= b->num_bins) slot_id = 0;
@@ -416,7 +416,7 @@ void ndpi_inc_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t val) {
/* ********************************************************************************* */
-u_int32_t ndpi_get_bin_value(struct ndpi_bin *b, u_int8_t slot_id) {
+u_int32_t ndpi_get_bin_value(struct ndpi_bin *b, u_int16_t slot_id) {
if(slot_id >= b->num_bins) slot_id = 0;
switch(b->family) {
@@ -457,7 +457,7 @@ void ndpi_reset_bin(struct ndpi_bin *b) {
Each bin slot is transformed in a % with respect to the value total
*/
void ndpi_normalize_bin(struct ndpi_bin *b) {
- u_int8_t i;
+ u_int16_t i;
u_int32_t tot = 0;
if(b->is_empty) return;
@@ -495,7 +495,7 @@ void ndpi_normalize_bin(struct ndpi_bin *b) {
/* ********************************************************************************* */
char* ndpi_print_bin(struct ndpi_bin *b, u_int8_t normalize_first, char *out_buf, u_int out_buf_len) {
- u_int8_t i;
+ u_int16_t i;
u_int len = 0;
if(!out_buf) return(out_buf); else out_buf[0] = '\0';
@@ -555,10 +555,14 @@ char* ndpi_print_bin(struct ndpi_bin *b, u_int8_t normalize_first, char *out_buf
0 = alike
...
the higher the more different
-*/
-float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2, u_int8_t normalize_first) {
- u_int8_t i;
+ if similarity_max_threshold != 0, -2 is returned as soon as the squared distance exceeds similarity_max_threshold^2 (bins aren't similar)
+*/
+float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2,
+ u_int8_t normalize_first, float similarity_max_threshold) {
+ u_int16_t i;
+ float threshold = similarity_max_threshold*similarity_max_threshold;
+
if(
// (b1->family != b2->family) ||
(b1->num_bins != b2->num_bins))
@@ -594,7 +598,10 @@ float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2, u_int8_t nor
if(a != b) sum += pow(diff, 2);
- // printf("[a: %u][b: %u][sum: %u]\n", a, b, sum);
+ if(threshold && (sum > threshold))
+ return(-2); /* Sorry they are not similar */
+
+ // printf("%u/%u) [a: %u][b: %u][sum: %u]\n", i, b1->num_bins, a, b, sum);
}
/* The lower the more similar */
@@ -720,7 +727,7 @@ int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins,
if(centroids[j].is_empty) continue;
- similarity = ndpi_bin_similarity(&bins[i], &centroids[j], 0);
+ similarity = ndpi_bin_similarity(&bins[i], &centroids[j], 0, 0);
if(j == cluster_ids[i])
current_similarity = similarity;