diff options
author | Luca Deri <deri@ntop.org> | 2021-12-04 10:09:01 +0100 |
---|---|---|
committer | Luca Deri <deri@ntop.org> | 2021-12-04 10:09:01 +0100 |
commit | fe2822c6a8fcbf9e0a4bb9ee7558cbd0c310e067 (patch) | |
tree | ab0a32b8f38cd5b73d3d688f179e62249c6479b3 | |
parent | 6ab1367846bfa7aeba578e363e46a1acccb9c477 (diff) |
Added example for finding similarities in RRDs using nDPI statistical APIs
-rw-r--r-- | example/ndpiReader.c | 4 | ||||
-rw-r--r-- | rrdtool/Makefile.in | 5 | ||||
-rw-r--r-- | rrdtool/README.txt | 3 | ||||
-rw-r--r-- | rrdtool/rrd_anomaly.c | 29 | ||||
-rw-r--r-- | rrdtool/rrd_similarity.c | 319 | ||||
-rw-r--r-- | src/include/ndpi_api.h.in | 11 | ||||
-rw-r--r-- | src/include/ndpi_typedefs.h | 3 | ||||
-rw-r--r-- | src/lib/ndpi_analyze.c | 29 |
8 files changed, 369 insertions, 34 deletions
diff --git a/example/ndpiReader.c b/example/ndpiReader.c index cbcbec881..f4f21f99d 100644 --- a/example/ndpiReader.c +++ b/example/ndpiReader.c @@ -266,7 +266,7 @@ u_int check_bin_doh_similarity(struct ndpi_bin *bin, float *similarity) { float lowest_similarity = 9999999999.0f; for(i=0; i<NUM_DOH_BINS; i++) { - *similarity = ndpi_bin_similarity(&doh_ndpi_bins[i], bin, 0); + *similarity = ndpi_bin_similarity(&doh_ndpi_bins[i], bin, 0, 0); if(*similarity <= doh_max_distance) return(1); @@ -2827,7 +2827,7 @@ static void printFlowsStats() { print_bin(out, NULL, &bins[i]); printf("][similarity: %f]", - (similarity = ndpi_bin_similarity(&centroids[j], &bins[i], 0))); + (similarity = ndpi_bin_similarity(&centroids[j], &bins[i], 0, 0))); if(all_flows[i].flow->host_server_name[0] != '\0') fprintf(out, "[%s]", all_flows[i].flow->host_server_name); diff --git a/rrdtool/Makefile.in b/rrdtool/Makefile.in index d25e0bdac..95d2fd7b3 100644 --- a/rrdtool/Makefile.in +++ b/rrdtool/Makefile.in @@ -3,5 +3,10 @@ INC=-I ../src/include LIBDPI=../src/lib/libndpi.a LIB=$(LIBDPI) -lrrd -lm @LIBS@ @ADDITIONAL_LIBS@ @LDFLAGS@ +all: rrd_anomaly rrd_similarity + rrd_anomaly: rrd_anomaly.c Makefile $(LIBDPI) $(CC) -g $(INC) rrd_anomaly.c -o rrd_anomaly $(LIB) + +rrd_similarity: rrd_similarity.c Makefile $(LIBDPI) + $(CC) -g $(INC) rrd_similarity.c -o rrd_similarity $(LIB) diff --git a/rrdtool/README.txt b/rrdtool/README.txt index 3d8056186..1ea5cad4c 100644 --- a/rrdtool/README.txt +++ b/rrdtool/README.txt @@ -1,4 +1,5 @@ -This directory contains a tool that allows to identify anomalies in RRD files + +This directory contains a tool that allows to identify anomalies and similarities in RRD files Prerequisite diff --git a/rrdtool/rrd_anomaly.c b/rrdtool/rrd_anomaly.c index e4822d936..fa0edb6a5 100644 --- a/rrdtool/rrd_anomaly.c +++ b/rrdtool/rrd_anomaly.c @@ -1,5 +1,5 @@ /* - * ndpiReader.c + * rrd_anomaly.c * * Copyright (C) 2011-21 - ntop.org * @@ -57,7 +57,7 @@ int main(int argc, char *argv[]) { 
struct ndpi_ses_struct ses; float alpha; char c; - + /* Defaults */ alpha = DEFAULT_ALPHA; start_s = DEFAULT_START; @@ -71,7 +71,7 @@ int main(int argc, char *argv[]) { case 's': start_s = optarg; break; - + case 'e': end_s = optarg; break; @@ -79,7 +79,7 @@ int main(int argc, char *argv[]) { case 'q': quick_mode = 1; break; - + case 'a': { float f = atof(optarg); @@ -90,11 +90,11 @@ int main(int argc, char *argv[]) { printf("Discarding -a: valid range is >0 .. <1\n"); } break; - + case 'f': filename = optarg; break; - + default: help(); break; @@ -103,7 +103,7 @@ int main(int argc, char *argv[]) { if(filename == NULL) help(); - + ndpi_ses_init(&ses, alpha, 0.05); if((rrd_parsetime(start_s, &start_tv) != NULL)) { @@ -124,8 +124,9 @@ int main(int argc, char *argv[]) { } p = data; - for(t=start+1, i=0; t<end; t+=step, i++) { - for(j=0; j<ds_cnt; j++) { + for(t=start+1, i=0; t<end; t+=step, i++) { + j = 0; /* Consider only the first DS */ + /* for(j=0; j<ds_cnt; j++) */ { rrd_value_t value = *p++; if(!isnan(value)) { @@ -146,15 +147,15 @@ int main(int argc, char *argv[]) { } else { const time_t _t = t; struct tm *t_info = localtime((const time_t*)&_t); - + strftime(buf, sizeof(buf), "%d/%b/%Y %H:%M:%S", t_info); - + if(first) { - first = 0; + first = 0; printf("%s %s\t%s %s %s\t %s [%s]\n", - "When", "Value", "Prediction", "Lower", "Upper", "Out", "Band"); + "When", "Value", "Prediction", "Lower", "Upper", "Out", "Band"); } - + printf("%s %12.3f\t%.3f\t%12.3f\t%12.3f\t %s [%.3f]\n", buf, value/100., prediction/100., lower/100., upper/100., is_anomaly? 
"ANOMALY" : "OK", confidence_band/100.); diff --git a/rrdtool/rrd_similarity.c b/rrdtool/rrd_similarity.c new file mode 100644 index 000000000..ca25abd02 --- /dev/null +++ b/rrdtool/rrd_similarity.c @@ -0,0 +1,319 @@ +/* + * rrd_similarity.c + * + * Copyright (C) 2011-21 - ntop.org + * + * nDPI is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * nDPI is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with nDPI. If not, see <http://www.gnu.org/licenses/>. + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <getopt.h> +#include <dirent.h> +#include <sys/stat.h> + + +#include "rrd.h" +#include "ndpi_api.h" + +#define DEFAULT_ALPHA 0.5 +#define DEFAULT_START "now-1d" +#define DEFAULT_END "now" + +#define MAX_NUM_RRDS 8192 + +#ifndef PATH_MAX +#define PATH_MAX 4096 +#endif + +typedef struct { + char *path; + float average, stddev; + struct ndpi_bin b; +} rrd_file_stats; + +u_int verbose = 0, similarity_threshold = 100, skip_zero = 0; + +/* *************************************************** */ + +static void help() { + printf("Usage: rrd_similarity [-v][-a <alpha>][-e <end>][-q][-s <start>]\n" + " -f <filename> -d <basedir> [-t <threshold>]\n" + "-a | Set alpha. Valid range >0 .. <1. Default %.2f\n" + "-e <end> | RRD end time. Default %s\n" + "-q | Quick output (only anomalies are reported)\n" + "-s <start> | RRD start time. 
Default %s\n" + + "-d <basedir> | Base directory where RRD filename is searched\n" + "-f <rrd path> | Path of the RRD filename to analyze\n" + "-t <threshold> | Similarity threshold. Default %u (0 == alike)\n" + "-v | Verbose\n" + "-z | Skip zero RRDs during comparison\n" + , + DEFAULT_ALPHA, DEFAULT_END, DEFAULT_START, similarity_threshold); + + printf("\n\nExample: rrd_similarity -q -f bytes.rrd -d /var/lib/ntopng/-1/snmpstats\n"); + + printf("\n\nGoal: find similar RRDs\n"); + exit(0); +} + +/* *************************************************** */ + +void analyze_rrd(rrd_file_stats *rrd, time_t start, time_t end) { + unsigned long step = 0, ds_cnt = 0; + rrd_value_t *data, *p; + char **names; + u_int t, i, j, num_points; + struct ndpi_analyze_struct *s; + + if(rrd_fetch_r(rrd->path, "AVERAGE", &start, &end, &step, &ds_cnt, &names, &data) != 0) { + printf("Unable to extract data from rrd %s\n", rrd->path); + return; + } + + p = data; + num_points = (end-start)/step; + + if((s = ndpi_alloc_data_analysis(num_points)) == NULL) + return; + + ndpi_init_bin(&rrd->b, ndpi_bin_family32, num_points); + + /* Step 1 - Compute average and stddev */ + for(t=start+1, i=0; t<end; t+=step, i++) { + double value = (double)*p++; + + if(isnan(value)) value = 0; + ndpi_data_add_value(s, value); + ndpi_set_bin(&rrd->b, i, value); + } + + rrd->average = ndpi_data_average(s); + rrd->stddev = ndpi_data_stddev(s); + + /* Step 2 - Bin analysis */ + ndpi_free_data_analysis(s, 1); + rrd_freemem(data); +} + +/* *************************************************** */ + +int circles_touch(int x1, int r1, int x2, int r2) { + int radius_sum = r1+r2; + int x_diff = abs(x1 - x2); + + return((radius_sum < x_diff) ? 
0 : 1); +} + + +/* *************************************************** */ + +void find_rrd_similarities(rrd_file_stats *rrd, u_int num_rrds) { + u_int i, j, num_similar_rrds = 0, num_potentially_zero_equal = 0; + + for(i=0; i<num_rrds; i++) { + for(j=i+1; j<num_rrds; j++) { + /* + Average is the circle center, and stddev is the radius + if circles touch each other then there is a chance that + the two rrds are similar + */ + + if((rrd[i].average == 0) && (rrd[i].average == rrd[j].average)) { + if(!skip_zero) + printf("%s [%.1f/%.1f] - %s [%.1f/%.1f] are alike\n", + rrd[i].path, rrd[i].average, rrd[i].stddev, + rrd[j].path, rrd[j].average, rrd[j].stddev); + + num_potentially_zero_equal++; + } else if(circles_touch(rrd[i].average, rrd[i].stddev, rrd[j].average, rrd[j].stddev) + ) { + float similarity = ndpi_bin_similarity(&rrd[i].b, &rrd[j].b, 0, similarity_threshold); + + if((similarity >= 0) && (similarity < similarity_threshold)) { + if(verbose) + printf("%s [%.1f/%.1f] - %s [%.1f/%.1f] are %s [%.1f]\n", + rrd[i].path, rrd[i].average, rrd[i].stddev, + rrd[j].path, rrd[j].average, rrd[j].stddev, + (similarity == 0) ? 
"alike" : "similar", + similarity + ); + + num_similar_rrds++; + } + } + } + } + + printf("Found %u (%.3f %%) similar RRDs / %u zero alike RRDs [num_rrds: %u]\n", + num_similar_rrds, + (num_similar_rrds*100.)/(float)(num_rrds*num_rrds), + num_potentially_zero_equal, + num_rrds); +} + +/* *************************************************** */ + +void find_rrds(char *basedir, char *filename, rrd_file_stats *rrds, u_int *num_rrds) { + struct dirent **namelist; + int n = scandir(basedir, &namelist, 0, NULL); + + if(n < 0) + return; /* End of the tree */ + + while(n--) { + if(namelist[n]->d_name[0] != '.') { + char path[PATH_MAX]; + struct stat s; + + snprintf(path, sizeof(path), "%s/%s", basedir, namelist[n]->d_name); + + if(stat(path, &s) == 0) { + if(S_ISDIR(s.st_mode)) + find_rrds(path, filename, rrds, num_rrds); + else if(strcmp(namelist[n]->d_name, filename) == 0) { + if(*num_rrds < MAX_NUM_RRDS) { + rrds[*num_rrds].path = strdup(path); + if(rrds[*num_rrds].path != NULL) + (*num_rrds)++; + } + } + } + } + + free(namelist[n]); + } + + free(namelist); +} + +/* *************************************************** */ + +int main(int argc, char *argv[]) { + rrd_time_value_t start_tv, end_tv; + char *filename = NULL, *start_s, *end_s, *dirname = NULL, *basedir = NULL; + u_int first = 1, quick_mode = 0; + float alpha; + char c; + time_t start, end; + u_int num_rrds = 0, i; + rrd_file_stats *rrds; + + /* Defaults */ + alpha = DEFAULT_ALPHA; + start_s = DEFAULT_START; + end_s = DEFAULT_END; + + + while((c = getopt(argc, argv, "d:s:e:a:qf:t:vz")) != '?') { + if(c == -1) break; + + switch(c) { + case 's': + start_s = optarg; + break; + + case 'd': + basedir = optarg; + break; + + case 'e': + end_s = optarg; + break; + + case 'q': + quick_mode = 1; + break; + + case 'v': + verbose = 1; + break; + + case 'a': + { + float f = atof(optarg); + + if((f > 0) && (f < 1)) + alpha = f; + else + printf("Discarding -a: valid range is >0 .. 
<1\n"); + } + break; + + case 'f': + filename = optarg; + break; + + case 't': + similarity_threshold = atoi(optarg); + break; + + case 'z': + skip_zero = 1; + break; + + default: + help(); + break; + } + } + + if((filename == NULL) || (basedir == NULL)) + help(); + + if((rrd_parsetime(start_s, &start_tv) != NULL)) { + printf("Unable to parse start time %s\n", start_s); + return(-1); + } + + if((rrd_parsetime(end_s, &end_tv) != NULL)) { + printf("Unable to parse end time %s\n", end_s); + return(-1); + } + + rrd_proc_start_end(&start_tv, &end_tv, &start, &end); + + if((rrds = ndpi_calloc(sizeof(rrd_file_stats), MAX_NUM_RRDS)) == NULL) { + printf("Not enough memory !\n"); + return(-1); + } + + /* Find all rrd's */ + find_rrds(basedir, filename, rrds, &num_rrds); + + /* Read RRD's data */ + for(i=0; i<num_rrds; i++) + analyze_rrd(&rrds[i], start, end); + + find_rrd_similarities(rrds, num_rrds); + +#if 0 + if(verbose) { + for(i=0; i<num_rrds; i++) + printf("%s\t%.1f\t%.1f\n", rrds[i].path, rrds[i].average, rrds[i].stddev); + } +#endif + + for(i=0; i<num_rrds; i++) { + ndpi_free_bin(&rrds[i].b); + free(rrds[i].path); + } + + ndpi_free(rrds); + + return(0); +} diff --git a/src/include/ndpi_api.h.in b/src/include/ndpi_api.h.in index 243b16ad8..1cc96e854 100644 --- a/src/include/ndpi_api.h.in +++ b/src/include/ndpi_api.h.in @@ -1564,16 +1564,17 @@ extern "C" { /* ******************************* */ - int ndpi_init_bin(struct ndpi_bin *b, enum ndpi_bin_family f, u_int8_t num_bins); + int ndpi_init_bin(struct ndpi_bin *b, enum ndpi_bin_family f, u_int16_t num_bins); void ndpi_free_bin(struct ndpi_bin *b); struct ndpi_bin* ndpi_clone_bin(struct ndpi_bin *b); - void ndpi_inc_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t val); - void ndpi_set_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t value); - u_int32_t ndpi_get_bin_value(struct ndpi_bin *b, u_int8_t slot_id); + void ndpi_inc_bin(struct ndpi_bin *b, u_int16_t slot_id, u_int32_t val); + void ndpi_set_bin(struct 
ndpi_bin *b, u_int16_t slot_id, u_int32_t value); + u_int32_t ndpi_get_bin_value(struct ndpi_bin *b, u_int16_t slot_id); void ndpi_reset_bin(struct ndpi_bin *b); void ndpi_normalize_bin(struct ndpi_bin *b); char* ndpi_print_bin(struct ndpi_bin *b, u_int8_t normalize_first, char *out_buf, u_int out_buf_len); - float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2, u_int8_t normalize_first); + float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2, + u_int8_t normalize_first, float similarity_max_threshold); int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins, u_int8_t num_clusters, u_int16_t *cluster_ids, struct ndpi_bin *centroids); diff --git a/src/include/ndpi_typedefs.h b/src/include/ndpi_typedefs.h index c6e79a951..357b4db13 100644 --- a/src/include/ndpi_typedefs.h +++ b/src/include/ndpi_typedefs.h @@ -1570,7 +1570,8 @@ enum ndpi_bin_family { }; struct ndpi_bin { - u_int8_t num_bins, is_empty; + u_int8_t is_empty; + u_int16_t num_bins; enum ndpi_bin_family family; union { diff --git a/src/lib/ndpi_analyze.c b/src/lib/ndpi_analyze.c index c8591a36b..947fb8a13 100644 --- a/src/lib/ndpi_analyze.c +++ b/src/lib/ndpi_analyze.c @@ -299,7 +299,7 @@ double ndpi_hll_count(struct ndpi_hll *hll) { /* ********************************************************************************* */ /* ********************************************************************************* */ -int ndpi_init_bin(struct ndpi_bin *b, enum ndpi_bin_family f, u_int8_t num_bins) { +int ndpi_init_bin(struct ndpi_bin *b, enum ndpi_bin_family f, u_int16_t num_bins) { b->num_bins = num_bins, b->family = f, b->is_empty = 1; switch(f) { @@ -378,7 +378,7 @@ struct ndpi_bin* ndpi_clone_bin(struct ndpi_bin *b) { /* ********************************************************************************* */ -void ndpi_set_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t val) { +void ndpi_set_bin(struct ndpi_bin *b, u_int16_t slot_id, u_int32_t val) { if(slot_id >= b->num_bins) 
slot_id = 0; switch(b->family) { @@ -396,7 +396,7 @@ void ndpi_set_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t val) { /* ********************************************************************************* */ -void ndpi_inc_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t val) { +void ndpi_inc_bin(struct ndpi_bin *b, u_int16_t slot_id, u_int32_t val) { b->is_empty = 0; if(slot_id >= b->num_bins) slot_id = 0; @@ -416,7 +416,7 @@ void ndpi_inc_bin(struct ndpi_bin *b, u_int8_t slot_id, u_int32_t val) { /* ********************************************************************************* */ -u_int32_t ndpi_get_bin_value(struct ndpi_bin *b, u_int8_t slot_id) { +u_int32_t ndpi_get_bin_value(struct ndpi_bin *b, u_int16_t slot_id) { if(slot_id >= b->num_bins) slot_id = 0; switch(b->family) { @@ -457,7 +457,7 @@ void ndpi_reset_bin(struct ndpi_bin *b) { Each bin slot is transformed in a % with respect to the value total */ void ndpi_normalize_bin(struct ndpi_bin *b) { - u_int8_t i; + u_int16_t i; u_int32_t tot = 0; if(b->is_empty) return; @@ -495,7 +495,7 @@ void ndpi_normalize_bin(struct ndpi_bin *b) { /* ********************************************************************************* */ char* ndpi_print_bin(struct ndpi_bin *b, u_int8_t normalize_first, char *out_buf, u_int out_buf_len) { - u_int8_t i; + u_int16_t i; u_int len = 0; if(!out_buf) return(out_buf); else out_buf[0] = '\0'; @@ -555,10 +555,14 @@ char* ndpi_print_bin(struct ndpi_bin *b, u_int8_t normalize_first, char *out_buf 0 = alike ... 
the higher the more different -*/ -float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2, u_int8_t normalize_first) { - u_int8_t i; + if similarity_max_threshold != 0, we assume that bins arent similar +*/ +float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2, + u_int8_t normalize_first, float similarity_max_threshold) { + u_int16_t i; + float threshold = similarity_max_threshold*similarity_max_threshold; + if( // (b1->family != b2->family) || (b1->num_bins != b2->num_bins)) @@ -594,7 +598,10 @@ float ndpi_bin_similarity(struct ndpi_bin *b1, struct ndpi_bin *b2, u_int8_t nor if(a != b) sum += pow(diff, 2); - // printf("[a: %u][b: %u][sum: %u]\n", a, b, sum); + if(threshold && (sum > threshold)) + return(-2); /* Sorry they are not similar */ + + // printf("%u/%u) [a: %u][b: %u][sum: %u]\n", i, b1->num_bins, a, b, sum); } /* The lower the more similar */ @@ -720,7 +727,7 @@ int ndpi_cluster_bins(struct ndpi_bin *bins, u_int16_t num_bins, if(centroids[j].is_empty) continue; - similarity = ndpi_bin_similarity(&bins[i], &centroids[j], 0); + similarity = ndpi_bin_similarity(&bins[i], &centroids[j], 0, 0); if(j == cluster_ids[i]) current_similarity = similarity; |