diff options
author | Luca Deri <deri@ntop.org> | 2023-09-02 19:27:19 +0200 |
---|---|---|
committer | Luca Deri <deri@ntop.org> | 2023-09-02 19:27:19 +0200 |
commit | f0dc3347ec92a55c16b7033e1b7f2890892b3094 (patch) | |
tree | cbecb7c62a364e9dd1501d2837ce46a2d28bc7f8 /src | |
parent | 1d480c18e381f393bd25352c6140e9651f3e3a76 (diff) |
Merged new and old version of ndpi_domain_classify.c code
Diffstat (limited to 'src')
-rw-r--r-- | src/lib/ndpi_domain_classify.c | 394 | ||||
-rw-r--r-- | src/lib/ndpi_main.c | 3 |
2 files changed, 397 insertions, 0 deletions
diff --git a/src/lib/ndpi_domain_classify.c b/src/lib/ndpi_domain_classify.c
index 986ebbf07..f5e6752c1 100644
--- a/src/lib/ndpi_domain_classify.c
+++ b/src/lib/ndpi_domain_classify.c
@@ -30,7 +30,14 @@
 // #define DEBUG_ADD
 // #define DEBUG_CONTAINS
 
+// #define USE_BINARY_BITMAP
+
+#ifdef USE_BINARY_BITMAP
+
 /* ********************************************************** */
+/* ********************************************************** */
+
+/* Faster but it uses more memory */
 
 void ndpi_domain_classify_free(ndpi_domain_classify *search) {
   ndpi_binary_bitmap_free(search->bitmap);
@@ -198,3 +205,472 @@ bool ndpi_domain_classify_contains(ndpi_domain_classify *c,
 
   return(false);
 }
+#else /* ! USE_BINARY_BITMAP */
+
+/* ********************************************************** */
+/* ********************************************************** */
+
+/* Added to the hash of the left-most token of a stored domain to mark where it ends */
+#define END_OF_TOKENS_DELIMITER             0x12345678
+/* Number of per-level bitmaps: levels are counted right to left starting from the TLD */
+#define NUM_DOMAIN_BITMAPS                  8
+/* Last bitmap index: once reached, what is left of the domain is hashed as one token */
+#define NUM_DOMAIN_BITMAPS_THRESHOLD        (NUM_DOMAIN_BITMAPS-1)
+/* Maximum number of distinct class_id values handled by one classifier */
+#define MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS 8
+
+/* Per-level bitmaps holding the hashed tokens of the stored domains */
+typedef struct {
+  ndpi_bitmap *bitmap[NUM_DOMAIN_BITMAPS];
+} ndpi_domain_search;
+
+/* Domains stored for a single class_id */
+typedef struct {
+  u_int16_t class_id;
+  ndpi_domain_search *domains;
+} ndpi_domain_classify_t;
+
+/* Concrete type behind the ndpi_domain_classify handle: slots are filled in order */
+typedef struct {
+  ndpi_domain_classify_t *class[MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS];
+} ndpi_domain_classifications_t;
+
+/* ********************************************************** */
+
+/* Release the bitmaps of a search set and the set itself (NULL is accepted) */
+static void ndpi_domain_search_free(ndpi_domain_search *search) {
+  u_int16_t i;
+
+  if(search == NULL) return;
+
+  for(i=0; i<NUM_DOMAIN_BITMAPS; i++) {
+    /* Bitmaps are allocated in order, so the first NULL marks the end */
+    if(search->bitmap[i] == NULL)
+      break;
+
+    ndpi_bitmap_free(search->bitmap[i]);
+  }
+
+  ndpi_free(search);
+}
+
+/* ********************************************************** */
+
+/* Allocate a search set with all its bitmaps, or return NULL on failure */
+static ndpi_domain_search* ndpi_domain_search_alloc(void) {
+  /* One zeroed structure is enough: the bitmap array is embedded in it */
+  ndpi_domain_search *search = (ndpi_domain_search*)ndpi_calloc(1, sizeof(ndpi_domain_search));
+  u_int16_t i;
+
+  if(!search) return(NULL);
+
+  for(i=0; i<NUM_DOMAIN_BITMAPS; i++) {
+    if((search->bitmap[i] = ndpi_bitmap_alloc()) == NULL)
+      goto toobad;
+  }
+
+  return(search);
+
+ toobad:
+  ndpi_domain_search_free(search);
+  return(NULL);
+}
+
+/* ********************************************************** */
+
+/* Sum of the serialized sizes of all the bitmaps of a search set */
+static u_int32_t ndpi_domain_search_size(ndpi_domain_search *search) {
+  u_int32_t i, total_len = 0;
+
+  if(search == NULL) return(0);
+
+  for(i=0; i<NUM_DOMAIN_BITMAPS; i++) {
+    char *buf = NULL; /* Never free an uninitialized pointer */
+
+    total_len += ndpi_bitmap_serialize(search->bitmap[i], &buf);
+
+    if(buf != NULL)
+      ndpi_free(buf);
+  }
+
+  return(total_len);
+}
+
+/* ********************************************************** */
+
+/*
+  Store a domain. Tokens are hashed right to left (TLD first) and each
+  level is recorded in its own bitmap as the hash of the token plus the
+  sum of the hashes of the tokens on its right.
+
+  Returns true if at least one token was stored: a name that has no dot
+  left after trimming is rejected.
+
+  NOTE: domain will be modified: copy it if necessary
+*/
+static bool ndpi_domain_search_add(ndpi_domain_search *search, char *domain) {
+  char *elem;
+  u_int32_t bitmap_id = 0, len, hsum = 0;
+  bool quit = false;
+
+  if((search == NULL) || (domain == NULL)) return(false);
+  if((len = strlen(domain)) == 0) return(false);
+
+  /* Drop trailing dots and line terminators */
+  len--;
+  while((len > 0)
+        && ((domain[len] == '.')
+            || (domain[len] == '\n')
+            || (domain[len] == '\r'))
+        )
+    domain[len--] = '\0';
+
+  if(domain[0] == '.') ++domain;
+
+  elem = strrchr(domain, '.');
+  while(elem) {
+    u_int32_t h;
+
+    if(elem[0] == '.') elem = &elem[1];
+
+    h = ndpi_hash_string(elem);
+
+    if(elem == domain) {
+      /* We're adding the beginning of the domain, hence the last token before quitting */
+      h += END_OF_TOKENS_DELIMITER;
+    }
+
+    ndpi_bitmap_set(search->bitmap[bitmap_id], h + hsum);
+
+    bitmap_id++, hsum += h;
+
+    if(quit)
+      break;
+
+    if(bitmap_id == NUM_DOMAIN_BITMAPS_THRESHOLD)
+      elem = domain, quit = true; /* Hash the rest of the word */
+    else {
+      elem[-1] = '\0';
+      elem = strrchr(domain, '.');
+
+      if(elem == NULL)
+        elem = domain, quit = true;
+    }
+  }
+
+  return(bitmap_id > 0);
+}
+
+/* ********************************************************** */
+
+/*
+  Look up a domain stored with ndpi_domain_search_add(). Tokens are
+  walked right to left; as soon as a token is not found as an inner
+  token, the result is whether a stored domain ends on it, so that a
+  stored domain also matches its subdomains.
+
+  NOTE(review): when every token (the left-most included) is found as an
+  inner token the function returns true, i.e. a name that is a suffix of
+  a longer stored domain appears to match as well - confirm this is wanted.
+
+  NOTE: domain will be modified: copy it if necessary
+*/
+static bool ndpi_domain_search_contains(ndpi_domain_search *search, char *domain) {
+  char *elem;
+  u_int32_t bitmap_id = 0, hsum = 0;
+  bool quit = false;
+
+  if((search == NULL) || (domain == NULL))
+    return(false);
+
+  if((elem = strrchr(domain, '.')) == NULL)
+    return(false); /* This does not look like a domain */
+
+  while(elem) {
+    u_int32_t h;
+
+    if(elem[0] == '.') elem = &elem[1];
+
+    h = ndpi_hash_string(elem);
+
+    if(!ndpi_bitmap_isset(search->bitmap[bitmap_id], h + hsum)) {
+      /* Exact match does not work, so let's see if a partial match works instead */
+
+      /* The delimiter is what ndpi_domain_search_add() puts on the last token it stores */
+      h += END_OF_TOKENS_DELIMITER;
+
+      return(ndpi_bitmap_isset(search->bitmap[bitmap_id], h + hsum));
+    }
+
+    bitmap_id++, hsum += h;
+
+    if(quit)
+      break;
+
+    if(bitmap_id == NUM_DOMAIN_BITMAPS_THRESHOLD)
+      elem = domain, quit = true; /* Hash the rest of the word */
+    else {
+      elem[-1] = '\0';
+      elem = strrchr(domain, '.');
+
+      if(elem == NULL)
+        elem = domain, quit = true;
+    }
+  }
+
+  return(true);
+}
+
+/* ********************************************************** */
+/* ********************************************************** */
+
+/*
+  Return the index of the slot bound to class_id, creating it in the first
+  free slot if needed. MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS is returned when
+  all slots are taken by other classes or memory allocation fails.
+*/
+static u_int32_t ndpi_domain_classify_get_slot(ndpi_domain_classifications_t *s,
+                                               u_int8_t class_id) {
+  u_int32_t i;
+
+  for(i=0; i<MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS; i++) {
+    if(s->class[i] == NULL) {
+      ndpi_domain_classify_t *c = (ndpi_domain_classify_t*)ndpi_malloc(sizeof(ndpi_domain_classify_t));
+
+      if(c == NULL)
+        break;
+
+      c->class_id = class_id;
+
+      if((c->domains = ndpi_domain_search_alloc()) == NULL) {
+        /* Do not leave behind a class without bitmaps */
+        ndpi_free(c);
+        break;
+      }
+
+      s->class[i] = c;
+      return(i);
+    }
+
+    if(s->class[i]->class_id == class_id)
+      return(i);
+  }
+
+  return(MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS);
+}
+
+/* ********************************************************** */
+
+/* Allocate an empty classifier: NULL is returned if memory allocation fails */
+ndpi_domain_classify* ndpi_domain_classify_alloc() {
+  ndpi_domain_classifications_t *cat = (ndpi_domain_classifications_t*)ndpi_calloc(1, sizeof(ndpi_domain_classifications_t));
+
+  return((ndpi_domain_classify*)cat);
+}
+
+/* ********************************************************** */
+
+/* Release a classifier and all the classes it holds (NULL is accepted) */
+void ndpi_domain_classify_free(ndpi_domain_classify *_s) {
+  u_int32_t i;
+  ndpi_domain_classifications_t *s = (ndpi_domain_classifications_t*)_s;
+
+  if(s == NULL) return;
+
+  for(i=0; i<MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS; i++) {
+    if(s->class[i] != NULL) {
+      ndpi_domain_search_free(s->class[i]->domains);
+      ndpi_free(s->class[i]);
+    } else
+      break;
+  }
+
+  ndpi_free(s);
+}
+
+/* ********************************************************** */
+
+/* Size of the classifier: its structures plus the serialized bitmaps (0 if _s is NULL) */
+u_int32_t ndpi_domain_classify_size(ndpi_domain_classify *_s) {
+  /* The top-level object is a ndpi_domain_classifications_t */
+  u_int32_t i, tot_len = sizeof(ndpi_domain_classifications_t);
+  ndpi_domain_classifications_t *s = (ndpi_domain_classifications_t*)_s;
+
+  if(s == NULL) return(0);
+
+  for(i=0; i<MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS; i++) {
+    if(s->class[i] != NULL) {
+      tot_len += ndpi_domain_search_size(s->class[i]->domains) + sizeof(ndpi_domain_classify_t);
+    } else
+      break;
+  }
+
+  return(tot_len);
+}
+
+/* ********************************************************** */
+
+/*
+  Add a domain to the class identified by class_id. The domain string is
+  not modified: it is copied (truncated to 255 characters) before hashing.
+
+  Returns false for NULL arguments, names with no dot, names ending in
+  .arpa or .local, or when the class cannot be found or created.
+*/
+bool ndpi_domain_classify_add(ndpi_domain_classify *_s,
+                              u_int8_t class_id,
+                              char *domain) {
+  u_int32_t i;
+  ndpi_domain_classifications_t *s = (ndpi_domain_classifications_t*)_s;
+  char buf[256], *dot;
+
+  if((s == NULL) || (domain == NULL)) return(false);
+
+  if((dot = strrchr(domain, '.')) == NULL) return(false);
+  if((!strcmp(dot, ".arpa")) || (!strcmp(dot, ".local")))
+    return(false);
+
+  i = ndpi_domain_classify_get_slot(s, class_id);
+
+  if(i == MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS)
+    return(false);
+
+  snprintf(buf, sizeof(buf), "%s", domain);
+
+#ifdef DEBUG_ADD
+  printf("[add] %s @ %u\n", domain, class_id);
+#endif
+
+  return(ndpi_domain_search_add(s->class[i]->domains, buf));
+}
+
+/* ********************************************************** */
+
+/*
+  Add to the class identified by class_id the domains listed, one per
+  line, in file_path. Lines starting with # and empty lines are skipped.
+
+  Returns the number of domains added: 0 is returned also when the file
+  cannot be opened or the class cannot be found or created.
+*/
+u_int32_t ndpi_domain_classify_add_domains(ndpi_domain_classify *_s,
+                                           u_int8_t class_id,
+                                           char *file_path) {
+  u_int32_t i, num_added = 0;
+  ndpi_domain_classifications_t *s = (ndpi_domain_classifications_t*)_s;
+  char buf[256];
+  FILE *fd;
+  char *line;
+
+  if((s == NULL) || (file_path == NULL)) return(0);
+
+  /* Open the file first so that a bad path does not use up a class slot */
+  fd = fopen(file_path, "r");
+  if(fd == NULL)
+    return(0);
+
+  i = ndpi_domain_classify_get_slot(s, class_id);
+
+  if(i == MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS) {
+    fclose(fd);
+    return(0);
+  }
+
+  while((line = fgets(buf, sizeof(buf), fd)) != NULL) {
+    if(line[0] == '#')
+      continue;
+
+    /* Cut at the line terminator only: the last line may come without one */
+    line[strcspn(line, "\r\n")] = '\0';
+
+    if(line[0] == '\0')
+      continue;
+
+    if(ndpi_domain_search_add(s->class[i]->domains, line))
+      num_added++;
+  }
+
+  fclose(fd);
+
+  return(num_added);
+}
+
+/* ********************************************************** */
+
+/* True for the characters accepted as the first character of a domain name */
+static bool is_valid_domain_char(u_char c) {
+  if(((c >= 'A')&& (c <= 'Z'))
+     || ((c >= 'a')&& (c <= 'z'))
+     || ((c >= '0')&& (c <= '9'))
+     || (c == '_')
+     || (c == '-')
+     || (c == '.'))
+    return(true);
+  else
+    return(false);
+}
+
+/* ********************************************************** */
+
+/*
+  Search domain in all the classes, in slot order. On a match *class_id
+  is set to the class of the first matching slot and true is returned.
+  The domain string is not modified.
+*/
+bool ndpi_domain_classify_contains(ndpi_domain_classify *_s,
+                                   u_int8_t *class_id /* out */,
+                                   char *domain) {
+  u_int32_t i, len;
+  ndpi_domain_classifications_t *s = (ndpi_domain_classifications_t*)_s;
+  char *dot;
+
+  if((s == NULL) || (class_id == NULL)) return(false);
+  if(!domain) return(false);
+  if((len = strlen(domain)) == 0) return(false);
+  if((dot = strrchr(domain, '.')) == NULL) return(false);
+  if((!strcmp(dot, ".arpa")) || (!strcmp(dot, ".local"))) return(false);
+
+  /* This is a number or a numeric IP or similar */
+  /* The cast keeps isdigit() defined for bytes above 127 */
+  if(isdigit((u_char)domain[len-1]) && isdigit((u_char)domain[0])) {
+#ifdef DEBUG_CONTAINS
+    printf("[contains] %s INVALID\n", domain);
+#endif
+
+    return(false);
+  }
+
+  if(!is_valid_domain_char(domain[0])) {
+#ifdef DEBUG_CONTAINS
+    printf("[contains] %s INVALID\n", domain);
+#endif
+
+    return(false);
+  }
+
+  for(i=0; i<MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS; i++) {
+    if(s->class[i] != NULL) {
+      char buf[256];
+
+      /* A fresh copy per class: the lookup modifies the string it is given */
+      snprintf(buf, sizeof(buf), "%s", domain);
+
+      if(ndpi_domain_search_contains(s->class[i]->domains, buf)) {
+#ifdef DEBUG_CONTAINS
+        printf("[contains] %s = %d\n", domain, s->class[i]->class_id);
+#endif
+        *class_id = s->class[i]->class_id;
+        return(true);
+      }
+    }
+  }
+
+#ifdef DEBUG_CONTAINS
+  printf("[contains] %s NOT FOUND\n", domain);
+#endif
+
+  return(false);
+}
+
+
+#endif
diff --git a/src/lib/ndpi_main.c b/src/lib/ndpi_main.c
index c5a085be6..d5e2cecef 100644
--- a/src/lib/ndpi_main.c
+++ b/src/lib/ndpi_main.c
@@ -4206,6 +4206,9 @@ int ndpi_load_category_file(struct ndpi_detection_module_struct *ndpi_str,
     while((line[len] == '\n') || (line[len] == '\r'))
       line[len--] = '\0';
 
+    while((line[0] == '-') || (line[0] == '.'))
+      line++;
+
     if(ndpi_load_category(ndpi_str, line, category_id, NULL) > 0)
       num_loaded++;
   }