diff options
author | Luca Deri <lucaderi@users.noreply.github.com> | 2024-04-18 23:21:40 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-04-18 23:21:40 +0200 |
commit | ad117bfaabd3bc75dc70d0ddbc4ba18c86c40dbd (patch) | |
tree | 3b1fb6016da1e114bca190ed6a868421fd9c32f1 /src | |
parent | 108b8331d5b345e110c9ef110a6aa95a2767a640 (diff) |
Domain Classification Improvements (#2396)
* Added the functions
size_t ndpi_compress_str(const char * in, size_t len, char * out, size_t bufsize);
size_t ndpi_decompress_str(const char * in, size_t len, char * out, size_t bufsize);
which are used to compress short strings such as domain names. This code is based on
https://github.com/Ed-von-Schleck/shoco
* Major code rewrite for ndpi_hash and ndpi_domain_classify
* Improvements to make sure custom categories are loaded and enabled
* Fixed string encoding
* Extended SalesForce/Cloudflare domains list
Diffstat (limited to 'src')
-rw-r--r-- | src/include/ndpi_api.h | 60 | ||||
-rw-r--r-- | src/include/ndpi_private.h | 2 | ||||
-rw-r--r-- | src/include/ndpi_typedefs.h | 11 | ||||
-rw-r--r-- | src/lib/ndpi_content_match.c.inc | 6 | ||||
-rw-r--r-- | src/lib/ndpi_domain_classify.c | 222 | ||||
-rw-r--r-- | src/lib/ndpi_domains.c | 101 | ||||
-rw-r--r-- | src/lib/ndpi_main.c | 93 | ||||
-rw-r--r-- | src/lib/ndpi_utils.c | 270 | ||||
-rw-r--r-- | src/lib/protocols/dns.c | 2 | ||||
-rw-r--r-- | src/lib/protocols/fastcgi.c | 2 | ||||
-rw-r--r-- | src/lib/protocols/http.c | 2 | ||||
-rw-r--r-- | src/lib/protocols/quic.c | 4 | ||||
-rw-r--r-- | src/lib/protocols/tls.c | 21 | ||||
-rw-r--r-- | src/lib/third_party/include/shoco.h | 24 | ||||
-rw-r--r-- | src/lib/third_party/include/shoco_domains_model.h | 172 | ||||
-rw-r--r-- | src/lib/third_party/src/shoco.c | 233 |
16 files changed, 874 insertions, 351 deletions
diff --git a/src/include/ndpi_api.h b/src/include/ndpi_api.h index 34617c535..acc01fb0f 100644 --- a/src/include/ndpi_api.h +++ b/src/include/ndpi_api.h @@ -1931,11 +1931,9 @@ extern "C" { * Free the hashmap. * * @par h = pointer to the hash map [in, out] - * @par cleanup_func = pointer to a optional callback function - * called for each element in the hashmap [in] * */ - void ndpi_hash_free(ndpi_str_hash **h, void (*cleanup_func)(ndpi_str_hash *h)); + void ndpi_hash_free(ndpi_str_hash **h); /** * Search for an entry in the hashmap. @@ -1949,7 +1947,7 @@ extern "C" { * @return 0 if an entry with that key was found, 1 otherwise * */ - int ndpi_hash_find_entry(ndpi_str_hash *h, char *key, u_int key_len, void **value); + int ndpi_hash_find_entry(ndpi_str_hash *h, char *key, u_int key_len, u_int16_t *value); /** * Add an entry to the hashmap. @@ -1957,12 +1955,12 @@ extern "C" { * @par h = pointer to the hash map [in, out] * @par key = character string (no '\0' required) [in] * @par key_len = length of the character string @key [in] - * @par value = pointer to the value to add [in] + * @par value = value to add [in] * * @return 0 if the entry was added, 1 otherwise * */ - int ndpi_hash_add_entry(ndpi_str_hash **h, char *key, u_int8_t key_len, void *value); + int ndpi_hash_add_entry(ndpi_str_hash **h, char *key, u_int8_t key_len, u_int16_t value); /* ******************************* */ @@ -2076,23 +2074,21 @@ extern "C" { for substring domain matching and classification */ - ndpi_domain_classify* ndpi_domain_classify_alloc(void); - void ndpi_domain_classify_free(ndpi_domain_classify *s); - u_int32_t ndpi_domain_classify_size(ndpi_domain_classify *s); - bool ndpi_domain_classify_add(ndpi_domain_classify *s, - u_int8_t class_id, const char *domain); - u_int32_t ndpi_domain_classify_add_domains(ndpi_domain_classify *s, - u_int8_t class_id, - char *file_path); - bool ndpi_domain_classify_finalize(ndpi_domain_classify *s); - const char* 
ndpi_domain_classify_longest_prefix(ndpi_domain_classify *s, - u_int8_t *class_id /* out */, - const char *hostnname, - bool return_subprefix); - bool ndpi_domain_classify_contains(ndpi_domain_classify *s, - u_int8_t *class_id /* out */, - const char *domain); - + ndpi_domain_classify* ndpi_domain_classify_alloc(); + void ndpi_domain_classify_free(ndpi_domain_classify *s); + u_int32_t ndpi_domain_classify_size(ndpi_domain_classify *s); + bool ndpi_domain_classify_add(struct ndpi_detection_module_struct *ndpi_mod, + ndpi_domain_classify *s, + u_int16_t class_id, char *domain); + u_int32_t ndpi_domain_classify_add_domains(struct ndpi_detection_module_struct *ndpi_mod, + ndpi_domain_classify *s, + u_int16_t class_id, + char *file_path); + bool ndpi_domain_classify_hostname(struct ndpi_detection_module_struct *ndpi_mod, + ndpi_domain_classify *s, + u_int16_t *class_id /* out */, + char *hostname); + /* ******************************* */ /* @@ -2160,12 +2156,14 @@ extern "C" { * * @par ndpi_str = the struct created for the protocol detection * @par hostname = the hostname from which the domain name has to be extracted + * @par suffix_id = the id of the returned domain * * @return The host domain name suffic or the host itself if not found. * */ const char* ndpi_get_host_domain_suffix(struct ndpi_detection_module_struct *ndpi_str, - const char *hostname); + const char *hostname, + u_int16_t *suffix_id /* out */); /** * Returns the domain (including the TLS) suffix out of the specified hostname. 
@@ -2217,6 +2215,20 @@ extern "C" { /* ******************************* */ + size_t ndpi_compress_str(const char * in, size_t len, char * out, size_t bufsize); + size_t ndpi_decompress_str(const char * in, size_t len, char * out, size_t bufsize); + + /* ******************************* */ + + /* NOTE + this function works best if yout have loaded in memory domain + suffixes using ndpi_load_domain_suffixes() + */ + u_int ndpi_encode_domain(struct ndpi_detection_module_struct *ndpi_str, + char *domain, char *out, u_int out_len); + + /* ******************************* */ + const char *ndpi_lru_cache_idx_to_name(lru_cache_type idx); /** diff --git a/src/include/ndpi_private.h b/src/include/ndpi_private.h index ee6302626..ece904278 100644 --- a/src/include/ndpi_private.h +++ b/src/include/ndpi_private.h @@ -400,7 +400,7 @@ struct ndpi_detection_module_struct { u_int16_t max_payload_track_len; - ndpi_domain_classify *public_domain_suffixes; + ndpi_str_hash *public_domain_suffixes; }; diff --git a/src/include/ndpi_typedefs.h b/src/include/ndpi_typedefs.h index 9dc3fdc98..86c7df8ab 100644 --- a/src/include/ndpi_typedefs.h +++ b/src/include/ndpi_typedefs.h @@ -1127,11 +1127,7 @@ typedef struct _ndpi_automa { struct ndpi_automa_stats stats; } ndpi_automa; -typedef struct ndpi_str_hash { - unsigned int hash; - void *value; - // u_int8_t private_data[1]; /* Avoid error C2466 and do not initiate private data with 0 */ -} ndpi_str_hash; +typedef void ndpi_str_hash; typedef struct ndpi_proto { /* @@ -1164,10 +1160,7 @@ typedef struct { #define MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS 16 typedef struct { - struct { - u_int16_t class_id; - ndpi_bitmap64_fuse *domains; - } classes[MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS]; + ndpi_str_hash *domains; } ndpi_domain_classify; typedef enum { diff --git a/src/lib/ndpi_content_match.c.inc b/src/lib/ndpi_content_match.c.inc index e37d04b5e..d850581f7 100644 --- a/src/lib/ndpi_content_match.c.inc +++ b/src/lib/ndpi_content_match.c.inc @@ -385,6 +385,7 
@@ static ndpi_protocol_match host_match[] = { "bloombergvault.com", "Bloomberg", NDPI_PROTOCOL_BLOOMBERG, NDPI_PROTOCOL_CATEGORY_CLOUD, NDPI_PROTOCOL_SAFE, NDPI_PROTOCOL_DEFAULT_LEVEL }, { "bloomberg.com", "Bloomberg", NDPI_PROTOCOL_BLOOMBERG, NDPI_PROTOCOL_CATEGORY_CLOUD, NDPI_PROTOCOL_SAFE, NDPI_PROTOCOL_DEFAULT_LEVEL }, { "salesforce.com", "Salesforce", NDPI_PROTOCOL_SALESFORCE, NDPI_PROTOCOL_CATEGORY_CLOUD, NDPI_PROTOCOL_SAFE, NDPI_PROTOCOL_DEFAULT_LEVEL }, + { "force.com", "Salesforce", NDPI_PROTOCOL_SALESFORCE, NDPI_PROTOCOL_CATEGORY_CLOUD, NDPI_PROTOCOL_SAFE, NDPI_PROTOCOL_DEFAULT_LEVEL }, { "salesforceliveagent.com", "Salesforce", NDPI_PROTOCOL_SALESFORCE, NDPI_PROTOCOL_CATEGORY_WEB, NDPI_PROTOCOL_SAFE, NDPI_PROTOCOL_DEFAULT_LEVEL }, { "apple-dns.net", "Apple", NDPI_PROTOCOL_APPLE, NDPI_PROTOCOL_CATEGORY_WEB, NDPI_PROTOCOL_SAFE, NDPI_PROTOCOL_DEFAULT_LEVEL }, { "origin-apple.com.akadns.net", "Apple", NDPI_PROTOCOL_APPLE, NDPI_PROTOCOL_CATEGORY_WEB, NDPI_PROTOCOL_SAFE, NDPI_PROTOCOL_DEFAULT_LEVEL }, @@ -920,8 +921,9 @@ static ndpi_protocol_match host_match[] = { "whiteboard.microsoft.com", "Microsoft365", NDPI_PROTOCOL_MICROSOFT_365, NDPI_PROTOCOL_CATEGORY_COLLABORATIVE, NDPI_PROTOCOL_ACCEPTABLE, NDPI_PROTOCOL_DEFAULT_LEVEL }, { "events.data.microsoft.com", "Microsoft365", NDPI_PROTOCOL_MICROSOFT_365, NDPI_PROTOCOL_CATEGORY_COLLABORATIVE, NDPI_PROTOCOL_ACCEPTABLE, NDPI_PROTOCOL_DEFAULT_LEVEL }, - { "cloudflare.com", "Cloudflare", NDPI_PROTOCOL_CLOUDFLARE, NDPI_PROTOCOL_CATEGORY_WEB, NDPI_PROTOCOL_ACCEPTABLE, NDPI_PROTOCOL_DEFAULT_LEVEL }, - { "cdnjs.cloudflare.com", "Cloudflare", NDPI_PROTOCOL_CLOUDFLARE, NDPI_PROTOCOL_CATEGORY_MEDIA, NDPI_PROTOCOL_ACCEPTABLE, NDPI_PROTOCOL_DEFAULT_LEVEL }, + { "cloudflare.com", "Cloudflare", NDPI_PROTOCOL_CLOUDFLARE, NDPI_PROTOCOL_CATEGORY_WEB, NDPI_PROTOCOL_ACCEPTABLE, NDPI_PROTOCOL_DEFAULT_LEVEL }, + { "cdnjs.cloudflare.com", "Cloudflare", NDPI_PROTOCOL_CLOUDFLARE, NDPI_PROTOCOL_CATEGORY_MEDIA, NDPI_PROTOCOL_ACCEPTABLE, 
NDPI_PROTOCOL_DEFAULT_LEVEL }, + { "cf-ipfs.com", "Cloudflare", NDPI_PROTOCOL_CLOUDFLARE, NDPI_PROTOCOL_CATEGORY_MEDIA, NDPI_PROTOCOL_ACCEPTABLE, NDPI_PROTOCOL_DEFAULT_LEVEL }, { "d295hzzivaok4k.cloudfront.net","OpenDNS", NDPI_PROTOCOL_OPENDNS, NDPI_PROTOCOL_CATEGORY_WEB, NDPI_PROTOCOL_ACCEPTABLE, NDPI_PROTOCOL_DEFAULT_LEVEL }, { "opendns.com", "OpenDNS", NDPI_PROTOCOL_OPENDNS, NDPI_PROTOCOL_CATEGORY_NETWORK, NDPI_PROTOCOL_ACCEPTABLE, NDPI_PROTOCOL_DEFAULT_LEVEL }, diff --git a/src/lib/ndpi_domain_classify.c b/src/lib/ndpi_domain_classify.c index fce10d072..f62800527 100644 --- a/src/lib/ndpi_domain_classify.c +++ b/src/lib/ndpi_domain_classify.c @@ -1,7 +1,7 @@ /* * ndpi_domain_classify.c * - * Copyright (C) 2011-23 - ntop.org and contributors + * Copyright (C) 2011-24 - ntop.org and contributors * * nDPI is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by @@ -27,40 +27,31 @@ #include "ndpi_config.h" #include "ndpi_api.h" -#if 0 -#define DEBUG_ADD -#define DEBUG_CONTAINS -#endif +#define ENCODE_DATA /* ********************************************************** */ ndpi_domain_classify* ndpi_domain_classify_alloc() { - int i; - ndpi_domain_classify *cat = (ndpi_domain_classify*)ndpi_malloc(sizeof(ndpi_domain_classify)); + ndpi_domain_classify *s = (ndpi_domain_classify*)ndpi_malloc(sizeof(ndpi_domain_classify)); - if(!cat) + if(!s) return NULL; - for(i=0; i<MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS; i++) - cat->classes[i].class_id = 0, cat->classes[i].domains = NULL; + if(ndpi_hash_init(&s->domains) != 0) { + ndpi_free(s); + return(NULL); + } - return((ndpi_domain_classify*)cat); + return((ndpi_domain_classify*)s); } /* ********************************************************** */ void ndpi_domain_classify_free(ndpi_domain_classify *s) { - u_int32_t i; - if(!s) return; - for(i=0; i<MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS; i++) { - if(s->classes[i].domains != NULL) { - 
ndpi_bitmap64_fuse_free(s->classes[i].domains); - } else - break; - } + ndpi_hash_free(&s->domains); ndpi_free(s); } @@ -68,28 +59,26 @@ void ndpi_domain_classify_free(ndpi_domain_classify *s) { /* ********************************************************** */ u_int32_t ndpi_domain_classify_size(ndpi_domain_classify *s) { - u_int32_t i, tot_len = sizeof(ndpi_domain_classify); + u_int32_t tot_len = sizeof(ndpi_domain_classify); if(!s) return(0); - for(i=0; i<MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS; i++) { - if(s->classes[i].domains != NULL) { - tot_len += ndpi_bitmap64_fuse_size(s->classes[i].domains); - } else - break; - } + /* TODO */ return(tot_len); } /* ********************************************************** */ -bool ndpi_domain_classify_add(ndpi_domain_classify *s, - u_int8_t class_id, - const char *domain) { - u_int32_t i; - u_int64_t hash; +bool ndpi_domain_classify_add(struct ndpi_detection_module_struct *ndpi_str, + ndpi_domain_classify *s, + u_int16_t class_id, + char *domain) { +#ifdef ENCODE_DATA + u_int32_t out_len; + char out[256]; +#endif if((!s) || (!domain)) return(false); @@ -97,43 +86,27 @@ bool ndpi_domain_classify_add(ndpi_domain_classify *s, /* Skip initial string . 
in domain names */ while(domain[0] == '.') domain++; -#if 0 - char *dot = strrchr(domain, '.'); + //printf("%s\n", domain); + // fprintf(stdout, "."); fflush(stdout); - if(dot) { - if((!strcmp(dot, ".arpa")) || (!strcmp(dot, ".local"))) - return(false); - } -#endif +#ifdef ENCODE_DATA + out_len = ndpi_encode_domain(ndpi_str, domain, out, sizeof(out)); - for(i=0; i<MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS; i++) { - if(s->classes[i].class_id == class_id) { - break; - } else if(s->classes[i].class_id == 0) { - s->classes[i].class_id = class_id; - s->classes[i].domains = ndpi_bitmap64_fuse_alloc(); - - if(!s->classes[i].domains) - s->classes[i].class_id = 0; - - break; - } - } - - if(i == MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS) - return(false); - - hash = ndpi_quick_hash64(domain, strlen(domain)); + ndpi_hash_add_entry(&s->domains, out, out_len, class_id); +#else + ndpi_hash_add_entry(&s->domains, domain, strlen(domain), class_id); +#endif - return(ndpi_bitmap64_fuse_set(s->classes[i].domains, hash)); + return(true); } /* ********************************************************** */ -u_int32_t ndpi_domain_classify_add_domains(ndpi_domain_classify *s, - u_int8_t class_id, +u_int32_t ndpi_domain_classify_add_domains(struct ndpi_detection_module_struct *ndpi_mod, + ndpi_domain_classify *s, + u_int16_t class_id, char *file_path) { - u_int32_t i, num_added = 0; + u_int32_t num_added = 0; char buf[256]; FILE *fd; char *line; @@ -141,30 +114,12 @@ u_int32_t ndpi_domain_classify_add_domains(ndpi_domain_classify *s, if((!s) || (!file_path)) return(false); - for(i=0; i<MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS; i++) { - if(s->classes[i].class_id == class_id) { - break; - } else if(s->classes[i].class_id == 0) { - s->classes[i].class_id = class_id; - s->classes[i].domains = ndpi_bitmap64_fuse_alloc(); - if(!s->classes[i].domains) - s->classes[i].class_id = 0; - break; - } - } - - if(i == MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS) - return(false); - - /* *************************************** */ - 
fd = fopen(file_path, "r"); if(fd == NULL) return(false); while((line = fgets(buf, sizeof(buf), fd)) != NULL) { u_int len; - u_int64_t hash; if((line[0] == '#') || (line[0] == '\0')) continue; @@ -177,9 +132,7 @@ u_int32_t ndpi_domain_classify_add_domains(ndpi_domain_classify *s, line[len] = '\0'; } - hash = ndpi_quick_hash64(line, strlen(line)); - - if(ndpi_bitmap64_fuse_set(s->classes[i].domains, hash)) + if(ndpi_domain_classify_add(ndpi_mod, s, class_id, line)) num_added++; } @@ -191,104 +144,53 @@ u_int32_t ndpi_domain_classify_add_domains(ndpi_domain_classify *s, /* ********************************************************** */ bool ndpi_domain_classify_finalize(ndpi_domain_classify *s) { - u_int32_t i; - if(!s) return(false); - for(i=0; i<MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS; i++) { - if(s->classes[i].class_id != 0) { - ndpi_bitmap64_fuse_compress(s->classes[i].domains); - } - } return(true); } /* ********************************************************** */ -static bool is_valid_domain_char(u_char c) { - if(((c >= 'A') && (c <= 'Z')) - || ((c >= 'a') && (c <= 'z')) - || ((c >= '0') && (c <= '9')) - || (c == '_') - || (c == '-') - || (c == '.')) - return(true); - else - return(false); -} - -/* ********************************************************** */ +bool ndpi_domain_classify_hostname(struct ndpi_detection_module_struct *ndpi_mod, + ndpi_domain_classify *s, + u_int16_t *class_id /* out */, + char *hostname) { + u_int32_t len; + const char *dot; + char *item; -const char* ndpi_domain_classify_longest_prefix(ndpi_domain_classify *s, - u_int8_t *class_id /* out */, - const char *hostname, - bool return_subprefix) { - u_int32_t i, len; - const char *dot, *elem, *prev_elem; + // ndpi_enable_loaded_categories(ndpi_mod); /* Make sure they have been enabled */ *class_id = 0; /* Unknown class_id */ - if(!hostname || !s) return(hostname); - if((len = strlen(hostname)) == 0) return(hostname); - if((dot = strrchr(hostname, '.')) == NULL) return(hostname); - 
if((!strcmp(dot, ".arpa")) || (!strcmp(dot, ".local"))) return(hostname); + if(!hostname || !s) return(false); + if((len = strlen(hostname)) == 0) return(false); + if((dot = strrchr(hostname, '.')) == NULL) return(false); + if((!strcmp(dot, ".arpa")) || (!strcmp(dot, ".local"))) return(false); - /* This is a number or a numeric IP or similar */ - if(ndpi_isdigit(hostname[len-1]) && isdigit(hostname[0])) { -#ifdef DEBUG_CONTAINS - printf("[contains] %s INVALID\n", hostname); -#endif + item = hostname; - return(hostname); - } + while(true) { + char *next; - if(!is_valid_domain_char(hostname[0])) { -#ifdef DEBUG_CONTAINS - printf("[contains] %s INVALID\n", hostname); -#endif + /* This looks like a match so let's check the hash now */ +#ifdef ENCODE_DATA + char out[256]; + u_int32_t out_len = ndpi_encode_domain(ndpi_mod, item, out, sizeof(out)); - return(hostname); - } - - elem = prev_elem = hostname; - - while(elem != NULL) { - u_int64_t hash = ndpi_quick_hash64(elem, strlen(elem)); - - for(i=0; i<MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS; i++) { - if(s->classes[i].class_id != 0) { - if(ndpi_bitmap64_fuse_isset(s->classes[i].domains, hash)) { -#ifdef DEBUG_CONTAINS - printf("[contains] %s = %d [%llu]\n", - hostname, s->classes[i].class_id, hash); + if(ndpi_hash_find_entry(s->domains, out, out_len, class_id) == 0) + return(true); +#else + if(ndpi_hash_find_entry(s->domains, item, strlen(item), class_id) == 0) + return(true); #endif - *class_id = s->classes[i].class_id; - return(return_subprefix ? 
prev_elem : elem); - } - } else - break; - } - - prev_elem = elem; - elem = strchr(elem, '.'); - if(elem == NULL) break; - // if(elem == dot) break; + next = strchr(item, '.'); - elem = &elem[1]; - } /* while */ + if(!next) break; else item = &next[1]; + } /* Not found */ - return(hostname); -} - -/* ********************************************************** */ - -bool ndpi_domain_classify_contains(ndpi_domain_classify *s, - u_int8_t *class_id /* out */, - const char *domain) { - (void)ndpi_domain_classify_longest_prefix(s, class_id, domain, false); /* UNUSED */ - - return((*class_id == 0) ? false : true); + return(false); } diff --git a/src/lib/ndpi_domains.c b/src/lib/ndpi_domains.c index 699beaf1f..c7f131304 100644 --- a/src/lib/ndpi_domains.c +++ b/src/lib/ndpi_domains.c @@ -29,29 +29,29 @@ int ndpi_load_domain_suffixes(struct ndpi_detection_module_struct *ndpi_str, char *public_suffix_list_path) { char buf[256], *line; FILE *fd; - u_int num_domains = 0; - + u_int16_t domain_id = 1; + if(ndpi_str == NULL || public_suffix_list_path == NULL) return(-1); if((fd = fopen(public_suffix_list_path, "r")) == NULL) return(-2); - + if(ndpi_str->public_domain_suffixes != NULL) { /* An existing license was aleady loaded: free it and start over */ - ndpi_domain_classify_free(ndpi_str->public_domain_suffixes); + ndpi_hash_free(&ndpi_str->public_domain_suffixes); } - if((ndpi_str->public_domain_suffixes = ndpi_domain_classify_alloc()) == NULL) + if(ndpi_hash_init(&ndpi_str->public_domain_suffixes) != 0) return(-3); while((line = fgets(buf, sizeof(buf), fd)) != NULL) { u_int offset, len; - + /* Skip private domains */ if(strstr(line, "// ===END ICANN DOMAINS===")) break; - + /* Skip empty lines or comments */ if((line[0] == '\0') || (line[0] == '/') || (line[0] == '\n') || (line[0] == '\r')) continue; @@ -65,54 +65,99 @@ int ndpi_load_domain_suffixes(struct ndpi_detection_module_struct *ndpi_str, while((len > 0) && (line[len] == '\n')) line[len--] = '\0'; - 
if(!ndpi_domain_classify_add(ndpi_str->public_domain_suffixes, - 1 /* dummy */, &line[offset])) { + if(ndpi_hash_add_entry(&ndpi_str->public_domain_suffixes, + &line[offset], strlen(&line[offset]), domain_id) != 0) { + NDPI_LOG_ERR(ndpi_str, "Error while processing domain %s\n", &line[offset]); } else - num_domains++; + domain_id++; } fclose(fd); - - if(!ndpi_domain_classify_finalize(ndpi_str->public_domain_suffixes)) { - NDPI_LOG_ERR(ndpi_str, "Error while finalizing domain processing\n"); - } - if(num_domains > 0) { - NDPI_LOG_DBG(ndpi_str, "Loaded %u domains\n", num_domains); - } - + if(domain_id > 0) + NDPI_LOG_DBG(ndpi_str, "Loaded %u domains\n", domain_id-1); + return(0); } /* ******************************* */ +/* + Example + - www.ntop.org -> org + - www.bbc.co.uk -> co.uk +*/ + const char* ndpi_get_host_domain_suffix(struct ndpi_detection_module_struct *ndpi_str, - const char *hostname) { + const char *hostname, + u_int16_t *domain_id /* out */) { + char *dot, *prev_dot; + + *domain_id = 0; + if(!ndpi_str) return NULL; + if(ndpi_str->public_domain_suffixes == NULL) return(hostname); - else { - u_int8_t class_id; + + prev_dot = dot = strrchr(hostname, '.'); + + while(dot != NULL) { + while((dot != hostname) && (dot[0] != '.')) + dot--; - return(ndpi_domain_classify_longest_prefix(ndpi_str->public_domain_suffixes, - &class_id, hostname, false)); + if((dot == hostname) + || (ndpi_hash_find_entry(ndpi_str->public_domain_suffixes, + &dot[1], strlen(&dot[1]), domain_id) != 0)) { + /* Not found: end of search */ + return(&prev_dot[1]); + } + + prev_dot = dot; + dot--; } + + return(hostname); } /* ******************************* */ +/* + Example + - www.ntop.org -> ntop.org + - www.bbc.co.uk -> bbc.co.uk +*/ const char* ndpi_get_host_domain(struct ndpi_detection_module_struct *ndpi_str, const char *hostname) { + const char *ret; + char *dot; + u_int16_t domain_id; + if(!ndpi_str) return NULL; + if(ndpi_str->public_domain_suffixes == NULL) return(hostname); - else 
{ - u_int8_t class_id; - - return(ndpi_domain_classify_longest_prefix(ndpi_str->public_domain_suffixes, - &class_id, hostname, true)); + + ret = ndpi_get_host_domain_suffix(ndpi_str, hostname, &domain_id); + + if((ret == NULL) || (ret == hostname)) + return(hostname); + + dot = strstr(hostname, ret); + + if(dot == NULL) + return(hostname); + + dot--; + while(dot != hostname) { + dot--; + + if(dot[0] == '.') + return(&dot[1]); } + + return(hostname); } diff --git a/src/lib/ndpi_main.c b/src/lib/ndpi_main.c index 9b04727bb..4c5c67557 100644 --- a/src/lib/ndpi_main.c +++ b/src/lib/ndpi_main.c @@ -954,7 +954,7 @@ static void init_string_based_protocols(struct ndpi_detection_module_struct *ndp /* ************************ */ - ndpi_enable_loaded_categories(ndpi_str); + //ndpi_enable_loaded_categories(ndpi_str); if(!ndpi_xgrams_inited) { ndpi_xgrams_inited = 1; @@ -3454,11 +3454,17 @@ static int is_ip_list_enabled(struct ndpi_detection_module_struct *ndpi_str, int return 1; } +/* *********************************************** */ + int ndpi_finalize_initialization(struct ndpi_detection_module_struct *ndpi_str) { u_int i; if(!ndpi_str) return -1; + + if(!ndpi_str->custom_categories.categories_loaded) + ndpi_enable_loaded_categories(ndpi_str); + if(ndpi_str->finalized) /* Already finalized */ return 0; @@ -4038,15 +4044,18 @@ int ndpi_match_custom_category(struct ndpi_detection_module_struct *ndpi_str, return(id != NDPI_PROTOCOL_UNKNOWN ? 
0 : -1); #else char buf[128]; - u_int8_t class_id; + u_int16_t class_id; u_int max_len = sizeof(buf)-1; + if(!ndpi_str->custom_categories.categories_loaded) + ndpi_enable_loaded_categories(ndpi_str); + if(name_len > max_len) name_len = max_len; memcpy(buf, name, name_len); buf[name_len] = '\0'; - if(ndpi_domain_classify_contains(ndpi_str->custom_categories.sc_hostnames, - &class_id, buf)) { + if(ndpi_domain_classify_hostname(ndpi_str, ndpi_str->custom_categories.sc_hostnames, + &class_id, buf)) { *category = (ndpi_protocol_category_t)class_id; return(0); } else @@ -4066,9 +4075,11 @@ int ndpi_get_custom_category_match(struct ndpi_detection_module_struct *ndpi_str ndpi_patricia_node_t *node; u_int cp_len = ndpi_min(sizeof(ipbuf) - 1, name_len); + *id = 0; + if(!ndpi_str->custom_categories.categories_loaded) - return(-1); - + ndpi_enable_loaded_categories(ndpi_str); + if(cp_len > 0) { memcpy(ipbuf, name_or_ip, cp_len); ipbuf[cp_len] = '\0'; @@ -4085,7 +4096,8 @@ int ndpi_get_custom_category_match(struct ndpi_detection_module_struct *ndpi_str /* Search IPv4 */ /* Make sure all in network byte order otherwise compares wont work */ - ndpi_fill_prefix_v4(&prefix, &pin, 32, ((ndpi_patricia_tree_t *) ndpi_str->custom_categories.ipAddresses)->maxbits); + ndpi_fill_prefix_v4(&prefix, &pin, 32, + ((ndpi_patricia_tree_t *) ndpi_str->custom_categories.ipAddresses)->maxbits); node = ndpi_patricia_search_best(ndpi_str->custom_categories.ipAddresses, &prefix); if(node) { @@ -4095,7 +4107,8 @@ int ndpi_get_custom_category_match(struct ndpi_detection_module_struct *ndpi_str return(-1); } else if(inet_pton(AF_INET6, ipbuf, &pin6) == 1) { /* Search IPv6 */ - ndpi_fill_prefix_v6(&prefix, &pin6, 128, ((ndpi_patricia_tree_t *) ndpi_str->custom_categories.ipAddresses6)->maxbits); + ndpi_fill_prefix_v6(&prefix, &pin6, 128, + ((ndpi_patricia_tree_t *) ndpi_str->custom_categories.ipAddresses6)->maxbits); node = ndpi_patricia_search_best(ndpi_str->custom_categories.ipAddresses6, &prefix); 
if(node) { @@ -4196,10 +4209,10 @@ void ndpi_exit_detection_module(struct ndpi_detection_module_struct *ndpi_str) { ac_automata_release((AC_AUTOMATA_t *) ndpi_str->tls_cert_subject_automa.ac_automa, 0); if(ndpi_str->malicious_ja3_hashmap != NULL) - ndpi_hash_free(&ndpi_str->malicious_ja3_hashmap, NULL); + ndpi_hash_free(&ndpi_str->malicious_ja3_hashmap); if(ndpi_str->malicious_sha1_hashmap != NULL) - ndpi_hash_free(&ndpi_str->malicious_sha1_hashmap, NULL); + ndpi_hash_free(&ndpi_str->malicious_sha1_hashmap); #ifdef USE_LEGACY_AHO_CORASICK if(ndpi_str->custom_categories.hostnames.ac_automa != NULL) @@ -4260,7 +4273,7 @@ void ndpi_exit_detection_module(struct ndpi_detection_module_struct *ndpi_str) { ndpi_free(ndpi_str->callback_buffer_tcp_payload); if(ndpi_str->public_domain_suffixes) - ndpi_domain_classify_free(ndpi_str->public_domain_suffixes); + ndpi_hash_free(&ndpi_str->public_domain_suffixes); ndpi_free(ndpi_str); } @@ -4865,6 +4878,8 @@ int ndpi_load_categories_file(struct ndpi_detection_module_struct *ndpi_str, return rc; } +/* ******************************************************************** */ + int load_categories_file_fd(struct ndpi_detection_module_struct *ndpi_str, FILE *fd, void *user_data) { char buffer[512], *line, *name, *category, *saveptr; @@ -5064,10 +5079,10 @@ int ndpi_load_categories_dir(struct ndpi_detection_module_struct *ndpi_str, if(failed_files) return(-1 * failed_files); + return(num_loaded); } - /* ******************************************************************** */ static int ndpi_load_risky_domain(struct ndpi_detection_module_struct *ndpi_str, @@ -5209,7 +5224,7 @@ int load_malicious_ja3_file_fd(struct ndpi_detection_module_struct *ndpi_str, FI continue; } - if(ndpi_hash_add_entry(&ndpi_str->malicious_ja3_hashmap, line, len, NULL) == 0) + if(ndpi_hash_add_entry(&ndpi_str->malicious_ja3_hashmap, line, len, 0) == 0) num++; } @@ -5287,7 +5302,8 @@ int load_malicious_sha1_file_fd(struct ndpi_detection_module_struct *ndpi_str, F for 
(i = 0; i < 40; ++i) first_comma[i] = toupper(first_comma[i]); - if(ndpi_hash_add_entry(&ndpi_str->malicious_sha1_hashmap, first_comma, second_comma - first_comma, NULL) == 0) + if(ndpi_hash_add_entry(&ndpi_str->malicious_sha1_hashmap, first_comma, + second_comma - first_comma, 0) == 0) num++; } @@ -6225,42 +6241,42 @@ static int ndpi_callback_init(struct ndpi_detection_module_struct *ndpi_str) { /* ******************************************************************** */ static inline int ndpi_proto_cb_tcp_payload(const struct ndpi_detection_module_struct *ndpi_str, uint32_t idx) { - return (ndpi_str->callback_buffer[idx].ndpi_selection_bitmask & - (NDPI_SELECTION_BITMASK_PROTOCOL_INT_TCP | - NDPI_SELECTION_BITMASK_PROTOCOL_INT_TCP_OR_UDP | - NDPI_SELECTION_BITMASK_PROTOCOL_COMPLETE_TRAFFIC)) != 0; + return (ndpi_str->callback_buffer[idx].ndpi_selection_bitmask & + (NDPI_SELECTION_BITMASK_PROTOCOL_INT_TCP | + NDPI_SELECTION_BITMASK_PROTOCOL_INT_TCP_OR_UDP | + NDPI_SELECTION_BITMASK_PROTOCOL_COMPLETE_TRAFFIC)) != 0; } /* ******************************************************************** */ static inline int ndpi_proto_cb_tcp_nopayload(const struct ndpi_detection_module_struct *ndpi_str, uint32_t idx) { - return (ndpi_str->callback_buffer[idx].ndpi_selection_bitmask & - (NDPI_SELECTION_BITMASK_PROTOCOL_INT_TCP | - NDPI_SELECTION_BITMASK_PROTOCOL_INT_TCP_OR_UDP | - NDPI_SELECTION_BITMASK_PROTOCOL_COMPLETE_TRAFFIC)) != 0 - && (ndpi_str->callback_buffer[idx].ndpi_selection_bitmask & - NDPI_SELECTION_BITMASK_PROTOCOL_HAS_PAYLOAD) == 0; + return (ndpi_str->callback_buffer[idx].ndpi_selection_bitmask & + (NDPI_SELECTION_BITMASK_PROTOCOL_INT_TCP | + NDPI_SELECTION_BITMASK_PROTOCOL_INT_TCP_OR_UDP | + NDPI_SELECTION_BITMASK_PROTOCOL_COMPLETE_TRAFFIC)) != 0 + && (ndpi_str->callback_buffer[idx].ndpi_selection_bitmask & + NDPI_SELECTION_BITMASK_PROTOCOL_HAS_PAYLOAD) == 0; } /* ******************************************************************** */ static inline int 
ndpi_proto_cb_udp(const struct ndpi_detection_module_struct *ndpi_str, uint32_t idx) { - return (ndpi_str->callback_buffer[idx].ndpi_selection_bitmask & - (NDPI_SELECTION_BITMASK_PROTOCOL_INT_UDP | - NDPI_SELECTION_BITMASK_PROTOCOL_INT_TCP_OR_UDP | - NDPI_SELECTION_BITMASK_PROTOCOL_COMPLETE_TRAFFIC)) != 0; + return (ndpi_str->callback_buffer[idx].ndpi_selection_bitmask & + (NDPI_SELECTION_BITMASK_PROTOCOL_INT_UDP | + NDPI_SELECTION_BITMASK_PROTOCOL_INT_TCP_OR_UDP | + NDPI_SELECTION_BITMASK_PROTOCOL_COMPLETE_TRAFFIC)) != 0; } /* ******************************************************************** */ static inline int ndpi_proto_cb_other(const struct ndpi_detection_module_struct *ndpi_str, uint32_t idx) { - return (ndpi_str->callback_buffer[idx].ndpi_selection_bitmask & - (NDPI_SELECTION_BITMASK_PROTOCOL_INT_TCP | - NDPI_SELECTION_BITMASK_PROTOCOL_INT_UDP | - NDPI_SELECTION_BITMASK_PROTOCOL_INT_TCP_OR_UDP)) == 0 - || - (ndpi_str->callback_buffer[idx].ndpi_selection_bitmask & - NDPI_SELECTION_BITMASK_PROTOCOL_COMPLETE_TRAFFIC) != 0; + return (ndpi_str->callback_buffer[idx].ndpi_selection_bitmask & + (NDPI_SELECTION_BITMASK_PROTOCOL_INT_TCP | + NDPI_SELECTION_BITMASK_PROTOCOL_INT_UDP | + NDPI_SELECTION_BITMASK_PROTOCOL_INT_TCP_OR_UDP)) == 0 + || + (ndpi_str->callback_buffer[idx].ndpi_selection_bitmask & + NDPI_SELECTION_BITMASK_PROTOCOL_COMPLETE_TRAFFIC) != 0; } /* ******************************************************************** */ @@ -7896,7 +7912,7 @@ int ndpi_load_hostname_category(struct ndpi_detection_module_struct *ndpi_str, if(ndpi_str->custom_categories.sc_hostnames_shadow == NULL) return(-1); - return(ndpi_domain_classify_add(ndpi_str->custom_categories.sc_hostnames_shadow, + return(ndpi_domain_classify_add(ndpi_str, ndpi_str->custom_categories.sc_hostnames_shadow, (u_int16_t)category, (char*)name_to_add) ? 
0 : -1); #endif } @@ -7931,6 +7947,9 @@ int ndpi_enable_loaded_categories(struct ndpi_detection_module_struct *ndpi_str) int i; static char *built_in = "built-in"; + if(ndpi_str->custom_categories.categories_loaded) + return(-1); /* Already loaded */ + /* First add the nDPI known categories matches */ for(i = 0; category_match[i].string_to_match != NULL; i++) ndpi_load_category(ndpi_str, category_match[i].string_to_match, @@ -7956,7 +7975,7 @@ int ndpi_enable_loaded_categories(struct ndpi_detection_module_struct *ndpi_str) } #else ndpi_domain_classify_free(ndpi_str->custom_categories.sc_hostnames); - ndpi_domain_classify_finalize(ndpi_str->custom_categories.sc_hostnames_shadow); + // ndpi_domain_classify_finalize(ndpi_str->custom_categories.sc_hostnames_shadow); ndpi_str->custom_categories.sc_hostnames = ndpi_str->custom_categories.sc_hostnames_shadow; ndpi_str->custom_categories.sc_hostnames_shadow = ndpi_domain_classify_alloc(); #endif diff --git a/src/lib/ndpi_utils.c b/src/lib/ndpi_utils.c index 7795f59d5..49d86e28d 100644 --- a/src/lib/ndpi_utils.c +++ b/src/lib/ndpi_utils.c @@ -1,7 +1,7 @@ /* * ndpi_utils.c * - * Copyright (C) 2011-23 - ntop.org and contributors + * Copyright (C) 2011-24 - ntop.org and contributors * * This file is part of nDPI, an open source deep packet inspection * library based on the OpenDPI and PACE technology by ipoque GmbH @@ -38,6 +38,7 @@ #include "ahocorasick.h" #include "libcache.h" +#include "shoco.h" #include <time.h> #ifndef WIN32 @@ -72,21 +73,11 @@ struct pcre2_struct { }; #endif -/* - * Please keep this strcture in sync with - * `struct ndpi_str_hash` in src/include/ndpi_typedefs.h - */ - -typedef struct ndpi_str_hash_private { - unsigned int hash; - void *value; - // u_int8_t private_data[1]; /* Avoid error C2466 and do not initiate private data with 0 */ +typedef struct { + char *key; + u_int16_t value16; UT_hash_handle hh; -} ndpi_str_hash_private; -#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L 
-_Static_assert(sizeof(struct ndpi_str_hash) == sizeof(struct ndpi_str_hash_private) - sizeof(UT_hash_handle), - "Please keep `struct ndpi_str_hash` and `struct ndpi_str_hash_private` syncd."); -#endif +} ndpi_str_hash_priv; /* ****************************************** */ @@ -1246,15 +1237,15 @@ static void ndpi_tls2json(ndpi_serializer *serializer, struct ndpi_flow_struct * static char* print_ndpi_address_port(ndpi_address_port *ap, char *buf, u_int buf_len) { char ipbuf[INET6_ADDRSTRLEN]; - + if(ap->is_ipv6) { inet_ntop(AF_INET6, &ap->address, ipbuf, sizeof(ipbuf)); } else { inet_ntop(AF_INET, &ap->address, ipbuf, sizeof(ipbuf)); } - + snprintf(buf, buf_len, "%s:%u", ipbuf, ap->port); - + return(buf); } @@ -2280,12 +2271,9 @@ ndpi_http_method ndpi_http_str2method(const char* method, u_int16_t method_len) /* ******************************************************************** */ -int ndpi_hash_init(ndpi_str_hash **h) -{ +int ndpi_hash_init(ndpi_str_hash **h) { if (h == NULL) - { return 1; - } *h = NULL; return 0; @@ -2293,77 +2281,69 @@ int ndpi_hash_init(ndpi_str_hash **h) /* ******************************************************************** */ -void ndpi_hash_free(ndpi_str_hash **h, void (*cleanup_func)(ndpi_str_hash *h)) -{ - struct ndpi_str_hash_private *h_priv; - struct ndpi_str_hash_private *current, *tmp; - - if (h == NULL) - { - return; - } - h_priv = *(struct ndpi_str_hash_private **)h; - - HASH_ITER(hh, h_priv, current, tmp) { - HASH_DEL(h_priv, current); - if (cleanup_func != NULL) - { - cleanup_func((ndpi_str_hash *)current); +void ndpi_hash_free(ndpi_str_hash **h) { + if(h != NULL) { + ndpi_str_hash_priv *h_priv = *((ndpi_str_hash_priv **)h); + ndpi_str_hash_priv *current, *tmp; + + HASH_ITER(hh, h_priv, current, tmp) { + HASH_DEL(h_priv, current); + ndpi_free(current->key); + ndpi_free(current); } - ndpi_free(current); + + *h = NULL; } - - *h = NULL; } /* ******************************************************************** */ -int 
ndpi_hash_find_entry(ndpi_str_hash *h, char *key, u_int key_len, void **value) -{ - struct ndpi_str_hash_private *h_priv = (struct ndpi_str_hash_private *)h; - struct ndpi_str_hash_private *found; - unsigned int hash_value; +int ndpi_hash_find_entry(ndpi_str_hash *h, char *key, u_int key_len, u_int16_t *value) { + ndpi_str_hash_priv *h_priv = (ndpi_str_hash_priv *)h; + ndpi_str_hash_priv *item; + + HASH_FIND(hh, h_priv, key, key_len, item); + + if (item != NULL) { + if(value != NULL) + *value = item->value16; - HASH_VALUE(key, key_len, hash_value); - HASH_FIND_INT(h_priv, &hash_value, found); - if (found != NULL) - { - if (value != NULL) - { - *value = found->value; - } return 0; - } else { + } else return 1; - } } /* ******************************************************************** */ -int ndpi_hash_add_entry(ndpi_str_hash **h, char *key, u_int8_t key_len, void *value) -{ - struct ndpi_str_hash_private **h_priv = (struct ndpi_str_hash_private **)h; - struct ndpi_str_hash_private *new = ndpi_calloc(1, sizeof(*new)); - struct ndpi_str_hash_private *found; - unsigned int hash_value; +int ndpi_hash_add_entry(ndpi_str_hash **h, char *key, u_int8_t key_len, u_int16_t value) { + ndpi_str_hash_priv *h_priv = (ndpi_str_hash_priv *)*h; + ndpi_str_hash_priv *item; - if (new == NULL) - { - return 1; + HASH_FIND(hh, h_priv, key, key_len, item); + + if(item != NULL) { + item->value16 = value; + return(1); /* Entry already present */ } - HASH_VALUE(key, key_len, hash_value); - new->hash = hash_value; - new->value = value; - HASH_ADD_INT(*h_priv, hash, new); + item = ndpi_calloc(1, sizeof(ndpi_str_hash_priv)); + if(item == NULL) + return(2); - HASH_FIND_INT(*h_priv, &hash_value, found); - if (found == NULL) /* The insertion failed (because of a memory allocation error) */ - { - ndpi_free(new); - return 1; + item->key = ndpi_malloc(key_len+1); + + if(item->key == NULL) { + ndpi_free(item); + return(1); + } else { + memcpy(item->key, key, key_len); + item->key[key_len] = '\0'; } 
+ item->value16 = value; + + HASH_ADD(hh, *((ndpi_str_hash_priv **)h), key[0], key_len, item); + return 0; } @@ -2502,7 +2482,7 @@ void ndpi_handle_risk_exceptions(struct ndpi_detection_module_struct *ndpi_str, if(host && (host[0] != '\0')) { /* Check host exception */ ndpi_check_hostname_risk_exception(ndpi_str, flow, host); - + if(flow->risk_mask == 0) { u_int i; @@ -2555,7 +2535,7 @@ void ndpi_set_risk(struct ndpi_flow_struct *flow, ndpi_risk_enum r, /* In case there is an exception set, take it into account */ if(flow->host_risk_mask_evaluated) v &= flow->risk_mask; - + // NDPI_SET_BIT(flow->risk, (u_int32_t)r); flow->risk |= v; @@ -3149,3 +3129,141 @@ const char *ndpi_lru_cache_idx_to_name(lru_cache_type idx) return "unknown"; return names[idx]; } + +/* ******************************************* */ + +size_t ndpi_compress_str(const char * in, size_t len, char * out, size_t bufsize) { + size_t ret = shoco_compress(in, len, out, bufsize); + + if(ret > bufsize) + return(0); /* Better not to compress data (it is longer than the uncompressed data) */ + + return(ret); +} + +/* ******************************************* */ + +size_t ndpi_decompress_str(const char * in, size_t len, char * out, size_t bufsize) { + return(shoco_decompress(in, len, out, bufsize)); + +} + +/* ******************************************* */ + +static u_char ndpi_domain_mapper[256]; +static bool ndpi_domain_mapper_initialized = false; + +#define IGNORE_CHAR 0xFF +#define NUM_BITS_NIBBLE 6 /* each 'nibble' is encoded with 6 bits */ +#define NIBBLE_ELEM_OFFSET 24 + +/* Used fo encoding domain names 8 bits -> 6 bits */ +static void ndpi_domain_mapper_init() { + u_int i; + u_char idx = 1 /* start from 1 to make sure 0 is no ambiguous */; + + memset(ndpi_domain_mapper, IGNORE_CHAR, 256); + + for(i='a'; i<= 'z'; i++) + ndpi_domain_mapper[i] = idx++; + + for(i='0'; i<= '9'; i++) + ndpi_domain_mapper[i] = idx++; + + ndpi_domain_mapper['-'] = idx++; + ndpi_domain_mapper['_'] = idx++; + 
ndpi_domain_mapper['.'] = idx++; +} + +/* ************************************************ */ + +u_int ndpi_encode_domain(struct ndpi_detection_module_struct *ndpi_str, + char *domain, char *out, u_int out_len) { + u_int out_idx = 0, i, buf_shift = 0, domain_buf_len, compressed_len, suffix_len, domain_len; + u_int32_t value = 0; + u_char domain_buf[256], compressed[128]; + u_int16_t domain_id = 0; + const char *suffix; + + if(!ndpi_domain_mapper_initialized) { + ndpi_domain_mapper_init(); + ndpi_domain_mapper_initialized = true; + } + + domain_len = strlen(domain); + + if(domain_len >= (out_len-3)) + return(0); + + if(domain_len <= 4) + return((u_int)snprintf(out, out_len, "%s", domain)); /* Too short */ + + /* [1] Encode the domain in 6 bits */ + suffix = ndpi_get_host_domain_suffix(ndpi_str, domain, &domain_id); + + if(suffix == NULL) + return((u_int)snprintf(out, out_len, "%s", domain)); /* Unknown suffix */ + + snprintf((char*)domain_buf, sizeof(domain_buf), "%s", domain); + domain_buf_len = strlen((char*)domain_buf), suffix_len = strlen(suffix); + + if(domain_buf_len > suffix_len) { + snprintf((char*)domain_buf, sizeof(domain_buf), "%s", domain); + domain_buf_len = domain_buf_len-suffix_len-1; + domain_buf[domain_buf_len] = '\0'; + + for(i=0; domain_buf[i] != '\0'; i++) { + u_int32_t mapped_idx = ndpi_domain_mapper[domain_buf[i]]; + + if(mapped_idx != IGNORE_CHAR) { + mapped_idx <<= buf_shift; + value |= mapped_idx, buf_shift += NUM_BITS_NIBBLE; + + if(buf_shift == NIBBLE_ELEM_OFFSET) { + memcpy(&out[out_idx], &value, 3); + out_idx += 3; + buf_shift = 0; /* Move to the next buffer */ + value = 0; + } + } + } + + if(buf_shift != 0) { + u_int bytes = buf_shift / NUM_BITS_NIBBLE; + + memcpy(&out[out_idx], &value, bytes); + out_idx += bytes; + } + } + + /* [2] Check if compressing the string is more efficient */ + compressed_len = ndpi_compress_str((char*)domain_buf, domain_buf_len, + (char*)compressed, sizeof(compressed)); + + if((compressed_len > 0) && ((out_idx 
== 0) || (compressed_len < out_idx))) { + if(compressed_len >= domain_len) { + /* Compression creates a longer buffer */ + return((u_int)snprintf(out, out_len, "%s", domain)); + } else { + compressed_len = ndpi_min(ndpi_min(compressed_len, sizeof(compressed)), out_len-3); + memcpy(out, compressed, compressed_len); + out_idx = compressed_len; + } + } + + /* Add trailer domainId value */ + out[out_idx++] = (domain_id >> 8) & 0xFF; + out[out_idx++] = domain_id & 0xFF; + +#ifdef DEBUG + { + u_int i; + + fprintf(stdout, "%s [len: %u][", domain, out_idx); + for(i=0; i<out_idx; i++) fprintf(stdout, "%02X", out[i] & 0xFF); + fprintf(stdout, "]\n"); + } +#endif + + return(out_idx); +} diff --git a/src/lib/protocols/dns.c b/src/lib/protocols/dns.c index 70b8cd451..d9eaf1e84 100644 --- a/src/lib/protocols/dns.c +++ b/src/lib/protocols/dns.c @@ -766,7 +766,7 @@ static void ndpi_search_dns(struct ndpi_detection_module_struct *ndpi_struct, st ndpi_hostname_sni_set(flow, (const u_int8_t *)_hostname, len, is_mdns ? 
NDPI_HOSTNAME_NORM_LC : NDPI_HOSTNAME_NORM_ALL); if (hostname_is_valid == 0) - ndpi_set_risk(flow, NDPI_INVALID_CHARACTERS, NULL); + ndpi_set_risk(flow, NDPI_INVALID_CHARACTERS, "Invalid chars detected in domain name"); dot = strchr(_hostname, '.'); if(dot) { diff --git a/src/lib/protocols/fastcgi.c b/src/lib/protocols/fastcgi.c index a9f9113d3..10384a13e 100644 --- a/src/lib/protocols/fastcgi.c +++ b/src/lib/protocols/fastcgi.c @@ -221,7 +221,7 @@ static void ndpi_search_fastcgi(struct ndpi_detection_module_struct *ndpi_struct ndpi_set_risk(flow, NDPI_INVALID_CHARACTERS, str); /* This looks like an attack */ - ndpi_set_risk(flow, NDPI_POSSIBLE_EXPLOIT, NULL); + ndpi_set_risk(flow, NDPI_POSSIBLE_EXPLOIT, "Suspicious hostname: attack ?"); } ndpi_int_fastcgi_add_connection(ndpi_struct, flow, &ret_match); } diff --git a/src/lib/protocols/http.c b/src/lib/protocols/http.c index a85f1c44c..8fc82dd67 100644 --- a/src/lib/protocols/http.c +++ b/src/lib/protocols/http.c @@ -1007,7 +1007,7 @@ static void check_content_type_and_change_protocol(struct ndpi_detection_module_ ndpi_set_risk(flow, NDPI_INVALID_CHARACTERS, str); /* This looks like an attack */ - ndpi_set_risk(flow, NDPI_POSSIBLE_EXPLOIT, NULL); + ndpi_set_risk(flow, NDPI_POSSIBLE_EXPLOIT, "Suspicious hostname: attack ?"); } double_col = strchr((char*)flow->host_server_name, ':'); diff --git a/src/lib/protocols/quic.c b/src/lib/protocols/quic.c index 4734433e0..345f77c47 100644 --- a/src/lib/protocols/quic.c +++ b/src/lib/protocols/quic.c @@ -1475,7 +1475,7 @@ void process_chlo(struct ndpi_detection_module_struct *ndpi_struct, ndpi_set_risk(flow, NDPI_INVALID_CHARACTERS, str); /* This looks like an attack */ - ndpi_set_risk(flow, NDPI_POSSIBLE_EXPLOIT, NULL); + ndpi_set_risk(flow, NDPI_POSSIBLE_EXPLOIT, "Suspicious hostname: attack ?"); } sni_found = 1; @@ -1503,7 +1503,7 @@ void process_chlo(struct ndpi_detection_module_struct *ndpi_struct, /* Add check for missing SNI */ if(flow->host_server_name[0] == '\0') { /* 
This is a bit suspicious */ - ndpi_set_risk(flow, NDPI_TLS_MISSING_SNI, NULL); + ndpi_set_risk(flow, NDPI_TLS_MISSING_SNI, "SNI should be present all time: attack ?"); } } diff --git a/src/lib/protocols/tls.c b/src/lib/protocols/tls.c index 882f463fb..54061d10c 100644 --- a/src/lib/protocols/tls.c +++ b/src/lib/protocols/tls.c @@ -643,7 +643,7 @@ void processCertificateElements(struct ndpi_detection_module_struct *ndpi_struct ndpi_set_risk(flow, NDPI_INVALID_CHARACTERS, dNSName); /* This looks like an attack */ - ndpi_set_risk(flow, NDPI_POSSIBLE_EXPLOIT, NULL); + ndpi_set_risk(flow, NDPI_POSSIBLE_EXPLOIT, "Invalid dNSName name"); } if(matched_name == 0) { @@ -695,10 +695,13 @@ void processCertificateElements(struct ndpi_detection_module_struct *ndpi_struct i += len; } else { + char buf[32]; + + snprintf(buf, sizeof(buf), "Unknown extension %02X", general_name_type); #if DEBUG_TLS printf("[TLS] Leftover %u bytes", packet->payload_packet_len - i); #endif - ndpi_set_risk(flow, NDPI_TLS_SUSPICIOUS_EXTENSION, NULL); + ndpi_set_risk(flow, NDPI_TLS_SUSPICIOUS_EXTENSION, buf); break; } } else { @@ -781,7 +784,7 @@ int processCertificate(struct ndpi_detection_module_struct *ndpi_struct, if((packet->payload_packet_len != (length + 4 + (is_dtls ? 
8 : 0))) || (packet->payload[1] != 0x0) || certificates_offset >= packet->payload_packet_len) { - ndpi_set_risk(flow, NDPI_MALFORMED_PACKET, NULL); + ndpi_set_risk(flow, NDPI_MALFORMED_PACKET, "Unvalid lenght"); return(-1); /* Invalid length */ } @@ -790,7 +793,7 @@ int processCertificate(struct ndpi_detection_module_struct *ndpi_struct, packet->payload[certificates_offset - 1]; if((packet->payload[certificates_offset - 3] != 0x0) || ((certificates_length+3) != length)) { - ndpi_set_risk(flow, NDPI_MALFORMED_PACKET, NULL); + ndpi_set_risk(flow, NDPI_MALFORMED_PACKET, "Invalid certificate offset"); return(-2); /* Invalid length */ } @@ -1056,7 +1059,7 @@ static int ndpi_search_tls_tcp(struct ndpi_detection_module_struct *ndpi_struct, u_int8_t alert_level = message->buffer[5]; if(alert_level == 2 /* Warning (1), Fatal (2) */) - ndpi_set_risk(flow, NDPI_TLS_FATAL_ALERT, NULL); + ndpi_set_risk(flow, NDPI_TLS_FATAL_ALERT, "Found fatal TLS alert"); } u_int16_t const alert_len = ntohs(*(u_int16_t const *)&message->buffer[3]); @@ -1516,7 +1519,7 @@ static void checkExtensions(struct ndpi_detection_module_struct *ndpi_struct, printf("[TLS] extension length exceeds remaining packet length: %u > %u.\n", extension_len, packet->payload_packet_len - extension_payload_offset); #endif - ndpi_set_risk(flow, NDPI_TLS_SUSPICIOUS_EXTENSION, NULL); + ndpi_set_risk(flow, NDPI_TLS_SUSPICIOUS_EXTENSION, "Invalid extension len"); return; } @@ -2264,7 +2267,7 @@ int processClientServerHello(struct ndpi_detection_module_struct *ndpi_struct, ndpi_set_risk(flow, NDPI_INVALID_CHARACTERS, sni); /* This looks like an attack */ - ndpi_set_risk(flow, NDPI_POSSIBLE_EXPLOIT, NULL); + ndpi_set_risk(flow, NDPI_POSSIBLE_EXPLOIT, "Invalid chars found in SNI: exploit or misconfiguration?"); } if(!is_quic) { @@ -2847,7 +2850,7 @@ compute_ja3c: && (flow->protos.tls_quic.encrypted_sni.esni == NULL) /* No ESNI */ ) { /* This is a bit suspicious */ - ndpi_set_risk(flow, NDPI_TLS_MISSING_SNI, NULL); + 
ndpi_set_risk(flow, NDPI_TLS_MISSING_SNI, "SNI should always be present"); if(flow->protos.tls_quic.advertised_alpns != NULL) { char buf[256], *tmp, *item; @@ -2859,7 +2862,7 @@ compute_ja3c: while(item != NULL) { if(item[0] == 'h') { /* Example 'h2' */ - ndpi_set_risk(flow, NDPI_TLS_ALPN_SNI_MISMATCH, NULL); + ndpi_set_risk(flow, NDPI_TLS_ALPN_SNI_MISMATCH, item); break; } else item = strtok_r(NULL, ",", &tmp); diff --git a/src/lib/third_party/include/shoco.h b/src/lib/third_party/include/shoco.h new file mode 100644 index 000000000..0772ac656 --- /dev/null +++ b/src/lib/third_party/include/shoco.h @@ -0,0 +1,24 @@ +#pragma once + +#include <stddef.h> + +#if defined(_MSC_VER) +#define shoco_restrict __restrict +#elif __GNUC__ +#define shoco_restrict __restrict__ +#else +#define shoco_restrict restrict +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +size_t shoco_compress(const char * const shoco_restrict in, size_t len, char * const shoco_restrict out, size_t bufsize); +size_t shoco_decompress(const char * const shoco_restrict in, size_t len, char * const shoco_restrict out, size_t bufsize); + +#ifdef __cplusplus +} +#endif + + diff --git a/src/lib/third_party/include/shoco_domains_model.h b/src/lib/third_party/include/shoco_domains_model.h new file mode 100644 index 000000000..be1b50a8e --- /dev/null +++ b/src/lib/third_party/include/shoco_domains_model.h @@ -0,0 +1,172 @@ +/* + Note + This file has been generated (by ntop) as indicated in https://github.com/Ed-von-Schleck/shoco + using generate_compression_model.py and trained using domain names for obtaining optimal + performance when used with Internet domain names +*/ + +#ifndef _SHOCO_INTERNAL +#error This header file is only to be included by 'shoco.c'. +#endif +#pragma once +/* +This file was generated by 'generate_compressor_model.py' +so don't edit this by hand. Also, do not include this file +anywhere. It is internal to 'shoco.c'. Include 'shoco.h' +if you want to use shoco in your project. 
+*/ + +#define MIN_CHR 45 +#define MAX_CHR 123 + +static const char chrs_by_chr_id[32] = { + 'o', '.', 'c', 'e', 'a', 'i', 'r', 'n', 's', 't', 'l', 'd', 'm', 'u', 'p', 'g', 'h', 'b', 'f', 'k', 'y', 'v', 'w', '-', 'x', 'j', 'z', 'q', '1', '2', '0', '3' +}; + +static const int8_t chr_ids_by_chr[256] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 23, 1, -1, 30, 28, 29, 31, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 4, 17, 2, 11, 3, 18, 15, 16, 5, 25, 19, 10, 12, 7, 0, 14, 27, 6, 8, 9, 13, 21, 22, 24, 20, 26, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 +}; + +static const int8_t successor_ids_by_chr_id_and_chr_id[32][32] = { + {7, 3, 11, -1, -1, -1, 1, 2, 6, 8, 5, 12, 0, 4, 9, 15, -1, 13, -1, -1, -1, 10, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {2, -1, 0, 6, 8, 3, 10, 1, 15, 11, -1, 5, 13, 4, 12, 7, -1, 9, -1, -1, -1, -1, -1, -1, -1, 14, -1, -1, -1, -1, -1, -1}, + {0, 4, 14, 3, 1, 8, 10, 9, 12, 6, 5, 13, -1, 11, -1, -1, 2, -1, -1, 7, -1, -1, -1, -1, -1, -1, 15, -1, -1, -1, -1, -1}, + {-1, 1, 8, 10, 7, -1, 0, 4, 3, 2, 6, 5, 9, -1, 13, -1, -1, 12, -1, -1, -1, 15, 11, -1, 14, -1, -1, -1, -1, -1, -1, -1}, + {-1, 4, 6, -1, -1, 10, 1, 0, 5, 3, 2, 8, 7, 12, 9, 11, -1, 13, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
-1}, + {3, 10, 2, 8, 6, -1, 9, 0, 4, 1, 5, 7, 11, -1, 14, 12, -1, -1, 15, -1, -1, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {2, 5, 9, 0, 1, 4, -1, 12, 8, 6, -1, 10, 13, 7, -1, 3, -1, -1, -1, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {9, 2, 8, 0, 5, 7, -1, 13, 6, 1, 10, 4, -1, 14, -1, 3, -1, -1, 12, 11, -1, -1, -1, 15, -1, -1, -1, -1, -1, -1, -1, -1}, + {6, 0, 8, 2, 4, 3, -1, -1, 7, 1, -1, 15, 11, 10, 9, -1, 5, -1, -1, 12, 14, -1, -1, 13, -1, -1, -1, -1, -1, -1, -1, -1}, + {4, 1, 11, 0, 3, 2, 5, -1, 7, 10, 14, -1, 15, 8, -1, -1, 6, -1, -1, -1, 9, 13, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {3, 4, 11, 1, 2, 0, -1, -1, 8, 7, 5, 10, 13, 6, 12, -1, -1, 15, 14, -1, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {5, 2, 9, 0, 3, 1, 8, 7, 6, -1, 12, 14, 11, 4, -1, 10, -1, 13, -1, -1, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {3, 2, 11, 1, 0, 4, -1, -1, 7, 13, -1, 15, 10, 8, 5, 14, -1, 9, -1, -1, 6, -1, -1, -1, 12, -1, -1, -1, -1, -1, -1, -1}, + {-1, 5, 14, 12, 11, 13, 1, 2, 0, 4, 9, 6, 8, -1, 7, 15, -1, 10, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {1, 5, 12, 3, 0, 6, 2, -1, 8, 9, 4, 14, 13, 11, 7, -1, 10, -1, -1, -1, -1, -1, -1, 15, -1, -1, -1, -1, -1, -1, -1, -1}, + {0, 2, 13, 1, 3, 5, 4, 10, 9, 12, 6, -1, 15, 7, -1, 14, 8, -1, -1, -1, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {1, 4, 10, 0, 2, 3, 7, 9, 8, 6, 13, -1, 14, 5, 15, -1, -1, -1, -1, 12, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {4, 5, 9, 1, 0, 3, 2, 15, 8, 12, 6, -1, 14, 7, -1, 13, -1, 11, -1, -1, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {0, 9, 10, 3, 4, 1, 2, -1, 11, 6, 5, 15, 13, 7, 14, -1, -1, -1, 8, -1, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {6, 0, 12, 1, 3, 2, 5, 11, 4, 8, 9, -1, 14, 7, 13, -1, -1, -1, -1, -1, 10, -1, -1, 15, -1, -1, -1, -1, -1, -1, -1, -1}, + {3, 0, 4, 5, 2, 14, -1, 9, 1, 6, 10, -1, 7, 13, 8, -1, -1, 11, -1, -1, -1, -1, -1, 15, -1, -1, 12, -1, -1, -1, -1, -1}, + {4, 2, 6, 0, 3, 1, 8, 5, 7, 11, -1, 14, 
12, 10, 9, -1, -1, -1, -1, -1, 15, -1, -1, 13, -1, -1, -1, -1, -1, -1, -1, -1}, + {2, 5, 8, 0, 1, 3, 9, 7, 4, 12, 11, 13, 14, -1, 10, -1, 6, -1, -1, -1, -1, -1, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {13, -1, 1, 7, 2, 5, 14, 12, 0, 6, 10, 8, 3, -1, 4, 11, -1, 9, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {10, 0, 6, 7, 5, 1, -1, 12, 8, 3, 15, -1, 13, 11, 2, -1, -1, -1, -1, -1, 4, -1, -1, 14, 9, -1, -1, -1, -1, -1, -1, -1}, + {1, 6, 8, 3, 2, 5, -1, 11, 7, 12, -1, 10, 9, 4, 0, -1, 14, 13, -1, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {4, 2, 11, 0, 1, 3, -1, 14, 10, 15, 9, 12, 13, 6, -1, -1, 7, -1, -1, -1, 8, -1, -1, -1, -1, -1, 5, -1, -1, -1, -1, -1}, + {7, 1, 4, 12, 3, 2, 8, -1, 6, 9, 5, 15, 10, 0, 14, -1, -1, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, 13, -1, -1, -1, -1}, + {-1, 0, 14, -1, 11, -1, -1, -1, 12, -1, -1, -1, -1, -1, 15, -1, -1, -1, -1, -1, -1, -1, -1, 13, -1, -1, -1, -1, 3, 1, 2, 6}, + {-1, 0, 7, -1, 15, -1, -1, -1, 8, -1, -1, 11, 14, -1, 10, 12, -1, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 4, 5, 3, 2}, + {-1, 0, 13, -1, 10, -1, -1, -1, 14, -1, -1, -1, 15, -1, -1, -1, -1, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 3, 1, 7}, + {-1, 0, 5, -1, 14, -1, -1, -1, 13, -1, -1, 2, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 8, 6, 3, 4} +}; + +static const int8_t chrs_by_chr_and_successor_id[MAX_CHR - MIN_CHR][16] = { + {'s', 'c', 'a', 'm', 'p', 'i', 't', 'e', 'd', 'b', 'l', 'g', 'n', 'o', 'r', 'f'}, + {'c', 'n', 'o', 'i', 'u', 'd', 'e', 'g', 'a', 'b', 'r', 't', 'p', 'm', 'j', 's'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'.', '0', '1', '2', '7', '5', '4', '3', '8', '6', 'a', '9', 'b', 'c', 's', 'm'}, + {'.', '2', '0', '1', '8', '6', '3', '9', '7', '4', '5', 'a', 's', '-', 'c', 'p'}, + {'.', '4', '3', '0', '1', '2', 'b', 'c', 's', '5', 'p', 'd', 'g', '6', 'm', 'a'}, + {'.', '6', 'd', '0', '3', 'c', '2', '4', '1', 
'8', '5', '7', '9', 's', 'a', 'm'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', 
'\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + 
{'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}, + {'n', 'r', 'l', 
't', '.', 's', 'c', 'm', 'd', 'p', 'i', 'g', 'u', 'b', 'y', 'v'}, + {'a', 'e', 'r', 'i', 'o', '.', 'l', 'u', 's', 'c', 'y', 'b', 't', 'g', 'm', 'n'}, + {'o', 'a', 'h', 'e', '.', 'l', 't', 'k', 'i', 'n', 'r', 'u', 's', 'd', 'c', 'z'}, + {'e', 'i', '.', 'a', 'u', 'o', 's', 'n', 'r', 'c', 'g', 'm', 'l', 'b', 'd', 'y'}, + {'r', '.', 't', 's', 'n', 'd', 'l', 'a', 'c', 'm', 'e', 'w', 'b', 'p', 'x', 'v'}, + {'o', 'i', 'r', 'e', 'a', 'l', 't', 'u', 'f', '.', 'c', 's', 'y', 'm', 'p', 'd'}, + {'o', 'e', '.', 'a', 'r', 'i', 'l', 'u', 'h', 's', 'n', 'y', 't', 'c', 'g', 'm'}, + {'e', 'o', 'a', 'i', '.', 'u', 't', 'r', 's', 'n', 'c', 'y', 'k', 'l', 'm', 'p'}, + {'n', 't', 'c', 'o', 's', 'l', 'a', 'd', 'e', 'r', '.', 'm', 'g', 'v', 'p', 'f'}, + {'p', 'o', 'a', 'e', 'u', 'i', '.', 's', 'c', 'm', 'd', 'n', 't', 'b', 'h', 'k'}, + {'.', 'e', 'i', 'a', 's', 'r', 'o', 'u', 't', 'l', 'y', 'n', 'c', 'p', 'm', '-'}, + {'i', 'e', 'a', 'o', '.', 'l', 'u', 't', 's', 'y', 'd', 'c', 'p', 'm', 'f', 'b'}, + {'a', 'e', '.', 'o', 'i', 'p', 'y', 's', 'u', 'b', 'm', 'c', 'x', 't', 'g', 'd'}, + {'e', 't', '.', 'g', 'd', 'a', 's', 'i', 'c', 'o', 'l', 'k', 'f', 'n', 'u', '-'}, + {'m', 'r', 'n', '.', 'u', 'l', 's', 'o', 't', 'p', 'v', 'c', 'd', 'b', 'w', 'g'}, + {'a', 'o', 'r', 'e', 'l', '.', 'i', 'p', 's', 't', 'h', 'u', 'c', 'm', 'd', '-'}, + {'u', '.', 'i', 'a', 'c', 'l', 's', 'o', 'r', 't', 'm', 'b', 'e', 'q', 'p', 'd'}, + {'e', 'a', 'o', 'g', 'i', '.', 't', 'u', 's', 'c', 'd', 'k', 'n', 'm', 'y', 'v'}, + {'.', 't', 'e', 'i', 'a', 'h', 'o', 's', 'c', 'p', 'u', 'm', 'k', '-', 'y', 'd'}, + {'e', '.', 'i', 'a', 'o', 'r', 'h', 's', 'u', 'y', 't', 'c', 'w', 'v', 'l', 'm'}, + {'s', 'r', 'n', 'k', 't', '.', 'd', 'p', 'm', 'l', 'b', 'a', 'e', 'i', 'c', 'g'}, + {'e', 'i', '.', 'a', 'o', 'n', 'c', 's', 'r', 'p', 'u', 't', 'm', '-', 'd', 'y'}, + {'e', 'a', 'o', 'i', 's', '.', 'h', 'n', 'c', 'r', 'p', 'l', 't', 'd', 'm', 'w'}, + {'.', 'i', 'p', 't', 'y', 'a', 'c', 'e', 's', 'x', 'o', 'u', 'n', 'm', '-', 'l'}, + 
{'.', 's', 'a', 'o', 'c', 'e', 't', 'm', 'p', 'n', 'l', 'b', 'z', 'u', 'i', '-'}, + {'e', 'a', '.', 'i', 'o', 'z', 'u', 'h', 'y', 'l', 's', 'c', 'd', 'm', 'n', 't'} +}; + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4324) // structure was padded due to __declspec(align()) +#endif + +typedef struct Pack { + const uint32_t word; + const unsigned int bytes_packed; + const unsigned int bytes_unpacked; + const unsigned int offsets[8]; + const int16_t _ALIGNED masks[8]; + const unsigned char header_mask; + const unsigned char header; +} Pack; + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +#define PACK_COUNT 3 +#define MAX_SUCCESSOR_N 7 + +static const Pack packs[PACK_COUNT] = { + { 0x80000000, 1, 2, { 26, 24, 24, 24, 24, 24, 24, 24 }, { 15, 3, 0, 0, 0, 0, 0, 0 }, 0xc0, 0x80 }, + { 0xc0000000, 2, 4, { 25, 22, 19, 16, 16, 16, 16, 16 }, { 15, 7, 7, 7, 0, 0, 0, 0 }, 0xe0, 0xc0 }, + { 0xe0000000, 4, 8, { 23, 19, 15, 11, 8, 5, 2, 0 }, { 31, 15, 15, 15, 7, 7, 7, 3 }, 0xf0, 0xe0 } +}; diff --git a/src/lib/third_party/src/shoco.c b/src/lib/third_party/src/shoco.c new file mode 100644 index 000000000..5b4ea3f6d --- /dev/null +++ b/src/lib/third_party/src/shoco.c @@ -0,0 +1,233 @@ +/* https://github.com/Ed-von-Schleck/shoco */ + +#include <stdint.h> + +#if (defined (__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) || __BIG_ENDIAN__) + #define swap(x) (x) +#else + #if defined(_MSC_VER) + #include <stdlib.h> + #define swap(x) _byteswap_ulong(x) + #elif defined (__GNUC__) + #if defined(__builtin_bswap32) + #define swap(x) __builtin_bswap32(x) + #else + #define swap(x) ((x<<24) + ((x&0x0000FF00)<<8) + ((x&0x00FF0000)>>8) + (x>>24)) + #endif + #else + #include <byteswap.h> + #define swap(x) bswap_32(x) + #endif +#endif + +#if defined(_MSC_VER) + #define _ALIGNED __declspec(align(16)) + #define inline __inline +#elif defined(__GNUC__) + #define _ALIGNED __attribute__ ((aligned(16))) +#else + #define _ALIGNED +#endif + +#if defined(_M_X64) || defined 
(_M_AMD64) || defined (__x86_64__) + #include "emmintrin.h" + #define HAVE_SSE2 +#endif + +#include "shoco.h" +#define _SHOCO_INTERNAL +#include "shoco_domains_model.h" /* we have built a model trained on domain names */ + +static inline int decode_header(unsigned char val) { + int i = -1; + while ((signed char)val < 0) { + val <<= 1; + ++i; + } + return i; +} + +union Code { + uint32_t word; + char bytes[4]; +}; + +#ifdef HAVE_SSE2 +static inline int check_indices(const int16_t * shoco_restrict indices, int pack_n) { + __m128i zero = _mm_setzero_si128(); + __m128i indis = _mm_load_si128 ((__m128i *)indices); + __m128i masks = _mm_load_si128 ((__m128i *)packs[pack_n].masks); + __m128i cmp = _mm_cmpgt_epi16 (indis, masks); + __m128i mmask = _mm_cmpgt_epi16 (masks, zero); + cmp = _mm_and_si128 (cmp, mmask); + int result = _mm_movemask_epi8 (cmp); + return (result == 0); +} +#else +static inline int check_indices(const int16_t * shoco_restrict indices, int pack_n) { + unsigned int i; + + for (i = 0; i < packs[pack_n].bytes_unpacked; ++i) + if (indices[i] > packs[pack_n].masks[i]) + return 0; + return 1; +} +#endif + +static inline int find_best_encoding(const int16_t * shoco_restrict indices, unsigned int n_consecutive) { + int p; + + for (p = PACK_COUNT - 1; p >= 0; --p) + if ((n_consecutive >= packs[p].bytes_unpacked) && (check_indices(indices, p))) + return p; + return -1; +} + +size_t shoco_compress(const char * const shoco_restrict original, size_t strlen, char * const shoco_restrict out, size_t bufsize) { + char *o = out; + char * const out_end = out + bufsize; + const char *in = original; + int16_t _ALIGNED indices[MAX_SUCCESSOR_N + 1] = { 0 }; + int last_chr_index; + int current_index; + int successor_index; + unsigned int n_consecutive; + union Code code; + int pack_n; + unsigned int rest; + const char * const in_end = original + strlen; + + while ((*in != '\0')) { + if (strlen && (in == in_end)) + break; + + // find the longest string of known successors + 
indices[0] = chr_ids_by_chr[(unsigned char)in[0]]; + last_chr_index = indices[0]; + if (last_chr_index < 0) + goto last_resort; + + rest = in_end - in; + for (n_consecutive = 1; n_consecutive <= MAX_SUCCESSOR_N; ++n_consecutive) { + if (strlen && (n_consecutive == rest)) + break; + + current_index = chr_ids_by_chr[(unsigned char)in[n_consecutive]]; + if (current_index < 0) // '\0' is always -1 + break; + + successor_index = successor_ids_by_chr_id_and_chr_id[last_chr_index][current_index]; + if (successor_index < 0) + break; + + indices[n_consecutive] = (int16_t)successor_index; + last_chr_index = current_index; + } + if (n_consecutive < 2) + goto last_resort; + + pack_n = find_best_encoding(indices, n_consecutive); + if (pack_n >= 0) { + unsigned int i; + + if (o + packs[pack_n].bytes_packed > out_end) + return bufsize + 1; + + code.word = packs[pack_n].word; + for (i = 0; i < packs[pack_n].bytes_unpacked; ++i) + code.word |= indices[i] << packs[pack_n].offsets[i]; + + // In the little-endian world, we need to swap what's + // in the register to match the memory representation. + // On big-endian systems, this is a dummy. 
+ code.word = swap(code.word); + + // if we'd just copy the word, we might write over the end + // of the output string + for (i = 0; i < packs[pack_n].bytes_packed; ++i) + o[i] = code.bytes[i]; + + o += packs[pack_n].bytes_packed; + in += packs[pack_n].bytes_unpacked; + } else { +last_resort: + if (*in & 0x80) { + // non-ascii case + if (o + 2 > out_end) + return bufsize + 1; + // put in a sentinel byte + *o++ = 0x00; + } else { + // an ascii byte + if (o + 1 > out_end) + return bufsize + 1; + } + *o++ = *in++; + } + } + + return o - out; +} + +size_t shoco_decompress(const char * const shoco_restrict original, size_t complen, char * const shoco_restrict out, size_t bufsize) { + char *o = out; + char * const out_end = out + bufsize; + const char *in = original; + char last_chr; + union Code code = { 0 }; + int offset; + int mask; + int mark; + const char * const in_end = original + complen; + + while (in < in_end) { + mark = decode_header(*in); + if (mark < 0) { + if (o >= out_end) + return bufsize + 1; + + // ignore the sentinel value for non-ascii chars + if (*in == 0x00) { + if (++in >= in_end) + return SIZE_MAX; + } + + *o++ = *in++; + } else { + unsigned int i; + + if (o + packs[mark].bytes_unpacked > out_end) + return bufsize + 1; + else if (in + packs[mark].bytes_packed > in_end) + return SIZE_MAX; + + // This should be OK as well, but it fails with emscripten. + // Test this with new versions of emcc. 
+ //code.word = swap(*(uint32_t *)in); + for (i = 0; i < packs[mark].bytes_packed; ++i) + code.bytes[i] = in[i]; + code.word = swap(code.word); + + // unpack the leading char + offset = packs[mark].offsets[0]; + mask = packs[mark].masks[0]; + last_chr = o[0] = chrs_by_chr_id[(code.word >> offset) & mask]; + + // unpack the successor chars + for (i = 1; i < packs[mark].bytes_unpacked; ++i) { + offset = packs[mark].offsets[i]; + mask = packs[mark].masks[i]; + last_chr = o[i] = chrs_by_chr_and_successor_id[(unsigned char)last_chr - MIN_CHR][(code.word >> offset) & mask]; + } + + o += packs[mark].bytes_unpacked; + in += packs[mark].bytes_packed; + } + } + + // append a 0-terminator if it fits + if (o < out_end) + *o = '\0'; + + return o - out; +} |