aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorLuca Deri <lucaderi@users.noreply.github.com>2024-04-18 23:21:40 +0200
committerGitHub <noreply@github.com>2024-04-18 23:21:40 +0200
commitad117bfaabd3bc75dc70d0ddbc4ba18c86c40dbd (patch)
tree3b1fb6016da1e114bca190ed6a868421fd9c32f1 /src
parent108b8331d5b345e110c9ef110a6aa95a2767a640 (diff)
Domain Classification Improvements (#2396)
* Added size_t ndpi_compress_str(const char * in, size_t len, char * out, size_t bufsize); size_t ndpi_decompress_str(const char * in, size_t len, char * out, size_t bufsize); used to compress short strings such as domain names. This code is based on https://github.com/Ed-von-Schleck/shoco * Major code rewrite for ndpi_hash and ndpi_domain_classify * Improvements to make sure custom categories are loaded and enabled * Fixed string encoding * Extended SalesForce/Cloudflare domains list
Diffstat (limited to 'src')
16 files changed, 874 insertions, 351 deletions
diff --git a/src/include/ndpi_api.h b/src/include/ndpi_api.h
index 34617c535..acc01fb0f 100644
--- a/src/include/ndpi_api.h
+++ b/src/include/ndpi_api.h
@@ -1931,11 +1931,9 @@ extern "C" {
* Free the hashmap.
*
* @par h = pointer to the hash map [in, out]
- * @par cleanup_func = pointer to a optional callback function
- * called for each element in the hashmap [in]
*
*/
- void ndpi_hash_free(ndpi_str_hash **h, void (*cleanup_func)(ndpi_str_hash *h));
+ void ndpi_hash_free(ndpi_str_hash **h);
/**
* Search for an entry in the hashmap.
@@ -1949,7 +1947,7 @@ extern "C" {
* @return 0 if an entry with that key was found, 1 otherwise
*
*/
- int ndpi_hash_find_entry(ndpi_str_hash *h, char *key, u_int key_len, void **value);
+ int ndpi_hash_find_entry(ndpi_str_hash *h, char *key, u_int key_len, u_int16_t *value);
/**
* Add an entry to the hashmap.
@@ -1957,12 +1955,12 @@ extern "C" {
* @par h = pointer to the hash map [in, out]
* @par key = character string (no '\0' required) [in]
* @par key_len = length of the character string @key [in]
- * @par value = pointer to the value to add [in]
+ * @par value = value to add [in]
*
* @return 0 if the entry was added, 1 otherwise
*
*/
- int ndpi_hash_add_entry(ndpi_str_hash **h, char *key, u_int8_t key_len, void *value);
+ int ndpi_hash_add_entry(ndpi_str_hash **h, char *key, u_int8_t key_len, u_int16_t value);
/* ******************************* */
@@ -2076,23 +2074,21 @@ extern "C" {
for substring domain matching and classification
*/
- ndpi_domain_classify* ndpi_domain_classify_alloc(void);
- void ndpi_domain_classify_free(ndpi_domain_classify *s);
- u_int32_t ndpi_domain_classify_size(ndpi_domain_classify *s);
- bool ndpi_domain_classify_add(ndpi_domain_classify *s,
- u_int8_t class_id, const char *domain);
- u_int32_t ndpi_domain_classify_add_domains(ndpi_domain_classify *s,
- u_int8_t class_id,
- char *file_path);
- bool ndpi_domain_classify_finalize(ndpi_domain_classify *s);
- const char* ndpi_domain_classify_longest_prefix(ndpi_domain_classify *s,
- u_int8_t *class_id /* out */,
- const char *hostnname,
- bool return_subprefix);
- bool ndpi_domain_classify_contains(ndpi_domain_classify *s,
- u_int8_t *class_id /* out */,
- const char *domain);
-
+ ndpi_domain_classify* ndpi_domain_classify_alloc();
+ void ndpi_domain_classify_free(ndpi_domain_classify *s);
+ u_int32_t ndpi_domain_classify_size(ndpi_domain_classify *s);
+ bool ndpi_domain_classify_add(struct ndpi_detection_module_struct *ndpi_mod,
+ ndpi_domain_classify *s,
+ u_int16_t class_id, char *domain);
+ u_int32_t ndpi_domain_classify_add_domains(struct ndpi_detection_module_struct *ndpi_mod,
+ ndpi_domain_classify *s,
+ u_int16_t class_id,
+ char *file_path);
+ bool ndpi_domain_classify_hostname(struct ndpi_detection_module_struct *ndpi_mod,
+ ndpi_domain_classify *s,
+ u_int16_t *class_id /* out */,
+ char *hostname);
+
/* ******************************* */
/*
@@ -2160,12 +2156,14 @@ extern "C" {
*
* @par ndpi_str = the struct created for the protocol detection
* @par hostname = the hostname from which the domain name has to be extracted
+ * @par suffix_id = the id of the returned domain
*
 * @return The host domain name suffix or the host itself if not found.
*
*/
const char* ndpi_get_host_domain_suffix(struct ndpi_detection_module_struct *ndpi_str,
- const char *hostname);
+ const char *hostname,
+ u_int16_t *suffix_id /* out */);
/**
* Returns the domain (including the TLS) suffix out of the specified hostname.
@@ -2217,6 +2215,20 @@ extern "C" {
/* ******************************* */
+ size_t ndpi_compress_str(const char * in, size_t len, char * out, size_t bufsize);
+ size_t ndpi_decompress_str(const char * in, size_t len, char * out, size_t bufsize);
+
+ /* ******************************* */
+
+ /* NOTE
+ this function works best if you have loaded in memory domain
+ suffixes using ndpi_load_domain_suffixes()
+ */
+ u_int ndpi_encode_domain(struct ndpi_detection_module_struct *ndpi_str,
+ char *domain, char *out, u_int out_len);
+
+ /* ******************************* */
+
const char *ndpi_lru_cache_idx_to_name(lru_cache_type idx);
/**
diff --git a/src/include/ndpi_private.h b/src/include/ndpi_private.h
index ee6302626..ece904278 100644
--- a/src/include/ndpi_private.h
+++ b/src/include/ndpi_private.h
@@ -400,7 +400,7 @@ struct ndpi_detection_module_struct {
u_int16_t max_payload_track_len;
- ndpi_domain_classify *public_domain_suffixes;
+ ndpi_str_hash *public_domain_suffixes;
};
diff --git a/src/include/ndpi_typedefs.h b/src/include/ndpi_typedefs.h
index 9dc3fdc98..86c7df8ab 100644
--- a/src/include/ndpi_typedefs.h
+++ b/src/include/ndpi_typedefs.h
@@ -1127,11 +1127,7 @@ typedef struct _ndpi_automa {
struct ndpi_automa_stats stats;
} ndpi_automa;
-typedef struct ndpi_str_hash {
- unsigned int hash;
- void *value;
- // u_int8_t private_data[1]; /* Avoid error C2466 and do not initiate private data with 0 */
-} ndpi_str_hash;
+typedef void ndpi_str_hash;
typedef struct ndpi_proto {
/*
@@ -1164,10 +1160,7 @@ typedef struct {
#define MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS 16
typedef struct {
- struct {
- u_int16_t class_id;
- ndpi_bitmap64_fuse *domains;
- } classes[MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS];
+ ndpi_str_hash *domains;
} ndpi_domain_classify;
typedef enum {
diff --git a/src/lib/ndpi_content_match.c.inc b/src/lib/ndpi_content_match.c.inc
index e37d04b5e..d850581f7 100644
--- a/src/lib/ndpi_content_match.c.inc
+++ b/src/lib/ndpi_content_match.c.inc
@@ -385,6 +385,7 @@ static ndpi_protocol_match host_match[] =
{ "bloombergvault.com", "Bloomberg", NDPI_PROTOCOL_BLOOMBERG, NDPI_PROTOCOL_CATEGORY_CLOUD, NDPI_PROTOCOL_SAFE, NDPI_PROTOCOL_DEFAULT_LEVEL },
{ "bloomberg.com", "Bloomberg", NDPI_PROTOCOL_BLOOMBERG, NDPI_PROTOCOL_CATEGORY_CLOUD, NDPI_PROTOCOL_SAFE, NDPI_PROTOCOL_DEFAULT_LEVEL },
{ "salesforce.com", "Salesforce", NDPI_PROTOCOL_SALESFORCE, NDPI_PROTOCOL_CATEGORY_CLOUD, NDPI_PROTOCOL_SAFE, NDPI_PROTOCOL_DEFAULT_LEVEL },
+ { "force.com", "Salesforce", NDPI_PROTOCOL_SALESFORCE, NDPI_PROTOCOL_CATEGORY_CLOUD, NDPI_PROTOCOL_SAFE, NDPI_PROTOCOL_DEFAULT_LEVEL },
{ "salesforceliveagent.com", "Salesforce", NDPI_PROTOCOL_SALESFORCE, NDPI_PROTOCOL_CATEGORY_WEB, NDPI_PROTOCOL_SAFE, NDPI_PROTOCOL_DEFAULT_LEVEL },
{ "apple-dns.net", "Apple", NDPI_PROTOCOL_APPLE, NDPI_PROTOCOL_CATEGORY_WEB, NDPI_PROTOCOL_SAFE, NDPI_PROTOCOL_DEFAULT_LEVEL },
{ "origin-apple.com.akadns.net", "Apple", NDPI_PROTOCOL_APPLE, NDPI_PROTOCOL_CATEGORY_WEB, NDPI_PROTOCOL_SAFE, NDPI_PROTOCOL_DEFAULT_LEVEL },
@@ -920,8 +921,9 @@ static ndpi_protocol_match host_match[] =
{ "whiteboard.microsoft.com", "Microsoft365", NDPI_PROTOCOL_MICROSOFT_365, NDPI_PROTOCOL_CATEGORY_COLLABORATIVE, NDPI_PROTOCOL_ACCEPTABLE, NDPI_PROTOCOL_DEFAULT_LEVEL },
{ "events.data.microsoft.com", "Microsoft365", NDPI_PROTOCOL_MICROSOFT_365, NDPI_PROTOCOL_CATEGORY_COLLABORATIVE, NDPI_PROTOCOL_ACCEPTABLE, NDPI_PROTOCOL_DEFAULT_LEVEL },
- { "cloudflare.com", "Cloudflare", NDPI_PROTOCOL_CLOUDFLARE, NDPI_PROTOCOL_CATEGORY_WEB, NDPI_PROTOCOL_ACCEPTABLE, NDPI_PROTOCOL_DEFAULT_LEVEL },
- { "cdnjs.cloudflare.com", "Cloudflare", NDPI_PROTOCOL_CLOUDFLARE, NDPI_PROTOCOL_CATEGORY_MEDIA, NDPI_PROTOCOL_ACCEPTABLE, NDPI_PROTOCOL_DEFAULT_LEVEL },
+ { "cloudflare.com", "Cloudflare", NDPI_PROTOCOL_CLOUDFLARE, NDPI_PROTOCOL_CATEGORY_WEB, NDPI_PROTOCOL_ACCEPTABLE, NDPI_PROTOCOL_DEFAULT_LEVEL },
+ { "cdnjs.cloudflare.com", "Cloudflare", NDPI_PROTOCOL_CLOUDFLARE, NDPI_PROTOCOL_CATEGORY_MEDIA, NDPI_PROTOCOL_ACCEPTABLE, NDPI_PROTOCOL_DEFAULT_LEVEL },
+ { "cf-ipfs.com", "Cloudflare", NDPI_PROTOCOL_CLOUDFLARE, NDPI_PROTOCOL_CATEGORY_MEDIA, NDPI_PROTOCOL_ACCEPTABLE, NDPI_PROTOCOL_DEFAULT_LEVEL },
{ "d295hzzivaok4k.cloudfront.net","OpenDNS", NDPI_PROTOCOL_OPENDNS, NDPI_PROTOCOL_CATEGORY_WEB, NDPI_PROTOCOL_ACCEPTABLE, NDPI_PROTOCOL_DEFAULT_LEVEL },
{ "opendns.com", "OpenDNS", NDPI_PROTOCOL_OPENDNS, NDPI_PROTOCOL_CATEGORY_NETWORK, NDPI_PROTOCOL_ACCEPTABLE, NDPI_PROTOCOL_DEFAULT_LEVEL },
diff --git a/src/lib/ndpi_domain_classify.c b/src/lib/ndpi_domain_classify.c
index fce10d072..f62800527 100644
--- a/src/lib/ndpi_domain_classify.c
+++ b/src/lib/ndpi_domain_classify.c
@@ -1,7 +1,7 @@
/*
* ndpi_domain_classify.c
*
- * Copyright (C) 2011-23 - ntop.org and contributors
+ * Copyright (C) 2011-24 - ntop.org and contributors
*
* nDPI is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
@@ -27,40 +27,31 @@
#include "ndpi_config.h"
#include "ndpi_api.h"
-#if 0
-#define DEBUG_ADD
-#define DEBUG_CONTAINS
-#endif
+#define ENCODE_DATA
/* ********************************************************** */
ndpi_domain_classify* ndpi_domain_classify_alloc() {
- int i;
- ndpi_domain_classify *cat = (ndpi_domain_classify*)ndpi_malloc(sizeof(ndpi_domain_classify));
+ ndpi_domain_classify *s = (ndpi_domain_classify*)ndpi_malloc(sizeof(ndpi_domain_classify));
- if(!cat)
+ if(!s)
return NULL;
- for(i=0; i<MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS; i++)
- cat->classes[i].class_id = 0, cat->classes[i].domains = NULL;
+ if(ndpi_hash_init(&s->domains) != 0) {
+ ndpi_free(s);
+ return(NULL);
+ }
- return((ndpi_domain_classify*)cat);
+ return((ndpi_domain_classify*)s);
}
/* ********************************************************** */
void ndpi_domain_classify_free(ndpi_domain_classify *s) {
- u_int32_t i;
-
if(!s)
return;
- for(i=0; i<MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS; i++) {
- if(s->classes[i].domains != NULL) {
- ndpi_bitmap64_fuse_free(s->classes[i].domains);
- } else
- break;
- }
+ ndpi_hash_free(&s->domains);
ndpi_free(s);
}
@@ -68,28 +59,26 @@ void ndpi_domain_classify_free(ndpi_domain_classify *s) {
/* ********************************************************** */
u_int32_t ndpi_domain_classify_size(ndpi_domain_classify *s) {
- u_int32_t i, tot_len = sizeof(ndpi_domain_classify);
+ u_int32_t tot_len = sizeof(ndpi_domain_classify);
if(!s)
return(0);
- for(i=0; i<MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS; i++) {
- if(s->classes[i].domains != NULL) {
- tot_len += ndpi_bitmap64_fuse_size(s->classes[i].domains);
- } else
- break;
- }
+ /* TODO */
return(tot_len);
}
/* ********************************************************** */
-bool ndpi_domain_classify_add(ndpi_domain_classify *s,
- u_int8_t class_id,
- const char *domain) {
- u_int32_t i;
- u_int64_t hash;
+bool ndpi_domain_classify_add(struct ndpi_detection_module_struct *ndpi_str,
+ ndpi_domain_classify *s,
+ u_int16_t class_id,
+ char *domain) {
+#ifdef ENCODE_DATA
+ u_int32_t out_len;
+ char out[256];
+#endif
if((!s) || (!domain))
return(false);
@@ -97,43 +86,27 @@ bool ndpi_domain_classify_add(ndpi_domain_classify *s,
/* Skip initial string . in domain names */
while(domain[0] == '.') domain++;
-#if 0
- char *dot = strrchr(domain, '.');
+ //printf("%s\n", domain);
+ // fprintf(stdout, "."); fflush(stdout);
- if(dot) {
- if((!strcmp(dot, ".arpa")) || (!strcmp(dot, ".local")))
- return(false);
- }
-#endif
+#ifdef ENCODE_DATA
+ out_len = ndpi_encode_domain(ndpi_str, domain, out, sizeof(out));
- for(i=0; i<MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS; i++) {
- if(s->classes[i].class_id == class_id) {
- break;
- } else if(s->classes[i].class_id == 0) {
- s->classes[i].class_id = class_id;
- s->classes[i].domains = ndpi_bitmap64_fuse_alloc();
-
- if(!s->classes[i].domains)
- s->classes[i].class_id = 0;
-
- break;
- }
- }
-
- if(i == MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS)
- return(false);
-
- hash = ndpi_quick_hash64(domain, strlen(domain));
+ ndpi_hash_add_entry(&s->domains, out, out_len, class_id);
+#else
+ ndpi_hash_add_entry(&s->domains, domain, strlen(domain), class_id);
+#endif
- return(ndpi_bitmap64_fuse_set(s->classes[i].domains, hash));
+ return(true);
}
/* ********************************************************** */
-u_int32_t ndpi_domain_classify_add_domains(ndpi_domain_classify *s,
- u_int8_t class_id,
+u_int32_t ndpi_domain_classify_add_domains(struct ndpi_detection_module_struct *ndpi_mod,
+ ndpi_domain_classify *s,
+ u_int16_t class_id,
char *file_path) {
- u_int32_t i, num_added = 0;
+ u_int32_t num_added = 0;
char buf[256];
FILE *fd;
char *line;
@@ -141,30 +114,12 @@ u_int32_t ndpi_domain_classify_add_domains(ndpi_domain_classify *s,
if((!s) || (!file_path))
return(false);
- for(i=0; i<MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS; i++) {
- if(s->classes[i].class_id == class_id) {
- break;
- } else if(s->classes[i].class_id == 0) {
- s->classes[i].class_id = class_id;
- s->classes[i].domains = ndpi_bitmap64_fuse_alloc();
- if(!s->classes[i].domains)
- s->classes[i].class_id = 0;
- break;
- }
- }
-
- if(i == MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS)
- return(false);
-
- /* *************************************** */
-
fd = fopen(file_path, "r");
if(fd == NULL)
return(false);
while((line = fgets(buf, sizeof(buf), fd)) != NULL) {
u_int len;
- u_int64_t hash;
if((line[0] == '#') || (line[0] == '\0'))
continue;
@@ -177,9 +132,7 @@ u_int32_t ndpi_domain_classify_add_domains(ndpi_domain_classify *s,
line[len] = '\0';
}
- hash = ndpi_quick_hash64(line, strlen(line));
-
- if(ndpi_bitmap64_fuse_set(s->classes[i].domains, hash))
+ if(ndpi_domain_classify_add(ndpi_mod, s, class_id, line))
num_added++;
}
@@ -191,104 +144,53 @@ u_int32_t ndpi_domain_classify_add_domains(ndpi_domain_classify *s,
/* ********************************************************** */
bool ndpi_domain_classify_finalize(ndpi_domain_classify *s) {
- u_int32_t i;
-
if(!s)
return(false);
- for(i=0; i<MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS; i++) {
- if(s->classes[i].class_id != 0) {
- ndpi_bitmap64_fuse_compress(s->classes[i].domains);
- }
- }
return(true);
}
/* ********************************************************** */
-static bool is_valid_domain_char(u_char c) {
- if(((c >= 'A') && (c <= 'Z'))
- || ((c >= 'a') && (c <= 'z'))
- || ((c >= '0') && (c <= '9'))
- || (c == '_')
- || (c == '-')
- || (c == '.'))
- return(true);
- else
- return(false);
-}
-
-/* ********************************************************** */
+bool ndpi_domain_classify_hostname(struct ndpi_detection_module_struct *ndpi_mod,
+ ndpi_domain_classify *s,
+ u_int16_t *class_id /* out */,
+ char *hostname) {
+ u_int32_t len;
+ const char *dot;
+ char *item;
-const char* ndpi_domain_classify_longest_prefix(ndpi_domain_classify *s,
- u_int8_t *class_id /* out */,
- const char *hostname,
- bool return_subprefix) {
- u_int32_t i, len;
- const char *dot, *elem, *prev_elem;
+ // ndpi_enable_loaded_categories(ndpi_mod); /* Make sure they have been enabled */
*class_id = 0; /* Unknown class_id */
- if(!hostname || !s) return(hostname);
- if((len = strlen(hostname)) == 0) return(hostname);
- if((dot = strrchr(hostname, '.')) == NULL) return(hostname);
- if((!strcmp(dot, ".arpa")) || (!strcmp(dot, ".local"))) return(hostname);
+ if(!hostname || !s) return(false);
+ if((len = strlen(hostname)) == 0) return(false);
+ if((dot = strrchr(hostname, '.')) == NULL) return(false);
+ if((!strcmp(dot, ".arpa")) || (!strcmp(dot, ".local"))) return(false);
- /* This is a number or a numeric IP or similar */
- if(ndpi_isdigit(hostname[len-1]) && isdigit(hostname[0])) {
-#ifdef DEBUG_CONTAINS
- printf("[contains] %s INVALID\n", hostname);
-#endif
+ item = hostname;
- return(hostname);
- }
+ while(true) {
+ char *next;
- if(!is_valid_domain_char(hostname[0])) {
-#ifdef DEBUG_CONTAINS
- printf("[contains] %s INVALID\n", hostname);
-#endif
+ /* This looks like a match so let's check the hash now */
+#ifdef ENCODE_DATA
+ char out[256];
+ u_int32_t out_len = ndpi_encode_domain(ndpi_mod, item, out, sizeof(out));
- return(hostname);
- }
-
- elem = prev_elem = hostname;
-
- while(elem != NULL) {
- u_int64_t hash = ndpi_quick_hash64(elem, strlen(elem));
-
- for(i=0; i<MAX_NUM_NDPI_DOMAIN_CLASSIFICATIONS; i++) {
- if(s->classes[i].class_id != 0) {
- if(ndpi_bitmap64_fuse_isset(s->classes[i].domains, hash)) {
-#ifdef DEBUG_CONTAINS
- printf("[contains] %s = %d [%llu]\n",
- hostname, s->classes[i].class_id, hash);
+ if(ndpi_hash_find_entry(s->domains, out, out_len, class_id) == 0)
+ return(true);
+#else
+ if(ndpi_hash_find_entry(s->domains, item, strlen(item), class_id) == 0)
+ return(true);
#endif
- *class_id = s->classes[i].class_id;
- return(return_subprefix ? prev_elem : elem);
- }
- } else
- break;
- }
-
- prev_elem = elem;
- elem = strchr(elem, '.');
- if(elem == NULL) break;
- // if(elem == dot) break;
+ next = strchr(item, '.');
- elem = &elem[1];
- } /* while */
+ if(!next) break; else item = &next[1];
+ }
/* Not found */
- return(hostname);
-}
-
-/* ********************************************************** */
-
-bool ndpi_domain_classify_contains(ndpi_domain_classify *s,
- u_int8_t *class_id /* out */,
- const char *domain) {
- (void)ndpi_domain_classify_longest_prefix(s, class_id, domain, false); /* UNUSED */
-
- return((*class_id == 0) ? false : true);
+ return(false);
}
diff --git a/src/lib/ndpi_domains.c b/src/lib/ndpi_domains.c
index 699beaf1f..c7f131304 100644
--- a/src/lib/ndpi_domains.c
+++ b/src/lib/ndpi_domains.c
@@ -29,29 +29,29 @@ int ndpi_load_domain_suffixes(struct ndpi_detection_module_struct *ndpi_str,
char *public_suffix_list_path) {
char buf[256], *line;
FILE *fd;
- u_int num_domains = 0;
-
+ u_int16_t domain_id = 1;
+
if(ndpi_str == NULL || public_suffix_list_path == NULL)
return(-1);
if((fd = fopen(public_suffix_list_path, "r")) == NULL)
return(-2);
-
+
if(ndpi_str->public_domain_suffixes != NULL) {
 /* An existing suffix list was already loaded: free it and start over */
- ndpi_domain_classify_free(ndpi_str->public_domain_suffixes);
+ ndpi_hash_free(&ndpi_str->public_domain_suffixes);
}
- if((ndpi_str->public_domain_suffixes = ndpi_domain_classify_alloc()) == NULL)
+ if(ndpi_hash_init(&ndpi_str->public_domain_suffixes) != 0)
return(-3);
while((line = fgets(buf, sizeof(buf), fd)) != NULL) {
u_int offset, len;
-
+
/* Skip private domains */
if(strstr(line, "// ===END ICANN DOMAINS==="))
break;
-
+
/* Skip empty lines or comments */
if((line[0] == '\0') || (line[0] == '/') || (line[0] == '\n') || (line[0] == '\r'))
continue;
@@ -65,54 +65,99 @@ int ndpi_load_domain_suffixes(struct ndpi_detection_module_struct *ndpi_str,
while((len > 0) && (line[len] == '\n'))
line[len--] = '\0';
- if(!ndpi_domain_classify_add(ndpi_str->public_domain_suffixes,
- 1 /* dummy */, &line[offset])) {
+ if(ndpi_hash_add_entry(&ndpi_str->public_domain_suffixes,
+ &line[offset], strlen(&line[offset]), domain_id) != 0) {
+
NDPI_LOG_ERR(ndpi_str, "Error while processing domain %s\n", &line[offset]);
} else
- num_domains++;
+ domain_id++;
}
fclose(fd);
-
- if(!ndpi_domain_classify_finalize(ndpi_str->public_domain_suffixes)) {
- NDPI_LOG_ERR(ndpi_str, "Error while finalizing domain processing\n");
- }
- if(num_domains > 0) {
- NDPI_LOG_DBG(ndpi_str, "Loaded %u domains\n", num_domains);
- }
-
+ if(domain_id > 0)
+ NDPI_LOG_DBG(ndpi_str, "Loaded %u domains\n", domain_id-1);
+
return(0);
}
/* ******************************* */
+/*
+ Example
+ - www.ntop.org -> org
+ - www.bbc.co.uk -> co.uk
+*/
+
const char* ndpi_get_host_domain_suffix(struct ndpi_detection_module_struct *ndpi_str,
- const char *hostname) {
+ const char *hostname,
+ u_int16_t *domain_id /* out */) {
+ char *dot, *prev_dot;
+
+ *domain_id = 0;
+
if(!ndpi_str)
return NULL;
+
if(ndpi_str->public_domain_suffixes == NULL)
return(hostname);
- else {
- u_int8_t class_id;
+
+ prev_dot = dot = strrchr(hostname, '.');
+
+ while(dot != NULL) {
+ while((dot != hostname) && (dot[0] != '.'))
+ dot--;
- return(ndpi_domain_classify_longest_prefix(ndpi_str->public_domain_suffixes,
- &class_id, hostname, false));
+ if((dot == hostname)
+ || (ndpi_hash_find_entry(ndpi_str->public_domain_suffixes,
+ &dot[1], strlen(&dot[1]), domain_id) != 0)) {
+ /* Not found: end of search */
+ return(&prev_dot[1]);
+ }
+
+ prev_dot = dot;
+ dot--;
}
+
+ return(hostname);
}
/* ******************************* */
+/*
+ Example
+ - www.ntop.org -> ntop.org
+ - www.bbc.co.uk -> bbc.co.uk
+*/
const char* ndpi_get_host_domain(struct ndpi_detection_module_struct *ndpi_str,
const char *hostname) {
+ const char *ret;
+ char *dot;
+ u_int16_t domain_id;
+
if(!ndpi_str)
return NULL;
+
if(ndpi_str->public_domain_suffixes == NULL)
return(hostname);
- else {
- u_int8_t class_id;
-
- return(ndpi_domain_classify_longest_prefix(ndpi_str->public_domain_suffixes,
- &class_id, hostname, true));
+
+ ret = ndpi_get_host_domain_suffix(ndpi_str, hostname, &domain_id);
+
+ if((ret == NULL) || (ret == hostname))
+ return(hostname);
+
+ dot = strstr(hostname, ret);
+
+ if(dot == NULL)
+ return(hostname);
+
+ dot--;
+ while(dot != hostname) {
+ dot--;
+
+ if(dot[0] == '.')
+ return(&dot[1]);
}
+
+ return(hostname);
}
diff --git a/src/lib/ndpi_main.c b/src/lib/ndpi_main.c
index 9b04727bb..4c5c67557 100644
--- a/src/lib/ndpi_main.c
+++ b/src/lib/ndpi_main.c
@@ -954,7 +954,7 @@ static void init_string_based_protocols(struct ndpi_detection_module_struct *ndp
/* ************************ */
- ndpi_enable_loaded_categories(ndpi_str);
+ //ndpi_enable_loaded_categories(ndpi_str);
if(!ndpi_xgrams_inited) {
ndpi_xgrams_inited = 1;
@@ -3454,11 +3454,17 @@ static int is_ip_list_enabled(struct ndpi_detection_module_struct *ndpi_str, int
return 1;
}
+/* *********************************************** */
+
int ndpi_finalize_initialization(struct ndpi_detection_module_struct *ndpi_str) {
u_int i;
if(!ndpi_str)
return -1;
+
+ if(!ndpi_str->custom_categories.categories_loaded)
+ ndpi_enable_loaded_categories(ndpi_str);
+
if(ndpi_str->finalized) /* Already finalized */
return 0;
@@ -4038,15 +4044,18 @@ int ndpi_match_custom_category(struct ndpi_detection_module_struct *ndpi_str,
return(id != NDPI_PROTOCOL_UNKNOWN ? 0 : -1);
#else
char buf[128];
- u_int8_t class_id;
+ u_int16_t class_id;
u_int max_len = sizeof(buf)-1;
+ if(!ndpi_str->custom_categories.categories_loaded)
+ ndpi_enable_loaded_categories(ndpi_str);
+
if(name_len > max_len) name_len = max_len;
memcpy(buf, name, name_len);
buf[name_len] = '\0';
- if(ndpi_domain_classify_contains(ndpi_str->custom_categories.sc_hostnames,
- &class_id, buf)) {
+ if(ndpi_domain_classify_hostname(ndpi_str, ndpi_str->custom_categories.sc_hostnames,
+ &class_id, buf)) {
*category = (ndpi_protocol_category_t)class_id;
return(0);
} else
@@ -4066,9 +4075,11 @@ int ndpi_get_custom_category_match(struct ndpi_detection_module_struct *ndpi_str
ndpi_patricia_node_t *node;
u_int cp_len = ndpi_min(sizeof(ipbuf) - 1, name_len);
+ *id = 0;
+
if(!ndpi_str->custom_categories.categories_loaded)
- return(-1);
-
+ ndpi_enable_loaded_categories(ndpi_str);
+
if(cp_len > 0) {
memcpy(ipbuf, name_or_ip, cp_len);
ipbuf[cp_len] = '\0';
@@ -4085,7 +4096,8 @@ int ndpi_get_custom_category_match(struct ndpi_detection_module_struct *ndpi_str
/* Search IPv4 */
/* Make sure all in network byte order otherwise compares wont work */
- ndpi_fill_prefix_v4(&prefix, &pin, 32, ((ndpi_patricia_tree_t *) ndpi_str->custom_categories.ipAddresses)->maxbits);
+ ndpi_fill_prefix_v4(&prefix, &pin, 32,
+ ((ndpi_patricia_tree_t *) ndpi_str->custom_categories.ipAddresses)->maxbits);
node = ndpi_patricia_search_best(ndpi_str->custom_categories.ipAddresses, &prefix);
if(node) {
@@ -4095,7 +4107,8 @@ int ndpi_get_custom_category_match(struct ndpi_detection_module_struct *ndpi_str
return(-1);
} else if(inet_pton(AF_INET6, ipbuf, &pin6) == 1) {
/* Search IPv6 */
- ndpi_fill_prefix_v6(&prefix, &pin6, 128, ((ndpi_patricia_tree_t *) ndpi_str->custom_categories.ipAddresses6)->maxbits);
+ ndpi_fill_prefix_v6(&prefix, &pin6, 128,
+ ((ndpi_patricia_tree_t *) ndpi_str->custom_categories.ipAddresses6)->maxbits);
node = ndpi_patricia_search_best(ndpi_str->custom_categories.ipAddresses6, &prefix);
if(node) {
@@ -4196,10 +4209,10 @@ void ndpi_exit_detection_module(struct ndpi_detection_module_struct *ndpi_str) {
ac_automata_release((AC_AUTOMATA_t *) ndpi_str->tls_cert_subject_automa.ac_automa, 0);
if(ndpi_str->malicious_ja3_hashmap != NULL)
- ndpi_hash_free(&ndpi_str->malicious_ja3_hashmap, NULL);
+ ndpi_hash_free(&ndpi_str->malicious_ja3_hashmap);
if(ndpi_str->malicious_sha1_hashmap != NULL)
- ndpi_hash_free(&ndpi_str->malicious_sha1_hashmap, NULL);
+ ndpi_hash_free(&ndpi_str->malicious_sha1_hashmap);
#ifdef USE_LEGACY_AHO_CORASICK
if(ndpi_str->custom_categories.hostnames.ac_automa != NULL)
@@ -4260,7 +4273,7 @@ void ndpi_exit_detection_module(struct ndpi_detection_module_struct *ndpi_str) {
ndpi_free(ndpi_str->callback_buffer_tcp_payload);
if(ndpi_str->public_domain_suffixes)
- ndpi_domain_classify_free(ndpi_str->public_domain_suffixes);
+ ndpi_hash_free(&ndpi_str->public_domain_suffixes);
ndpi_free(ndpi_str);
}
@@ -4865,6 +4878,8 @@ int ndpi_load_categories_file(struct ndpi_detection_module_struct *ndpi_str,
return rc;
}
+/* ******************************************************************** */
+
int load_categories_file_fd(struct ndpi_detection_module_struct *ndpi_str,
FILE *fd, void *user_data) {
char buffer[512], *line, *name, *category, *saveptr;
@@ -5064,10 +5079,10 @@ int ndpi_load_categories_dir(struct ndpi_detection_module_struct *ndpi_str,
if(failed_files)
return(-1 * failed_files);
+
return(num_loaded);
}
-
/* ******************************************************************** */
static int ndpi_load_risky_domain(struct ndpi_detection_module_struct *ndpi_str,
@@ -5209,7 +5224,7 @@ int load_malicious_ja3_file_fd(struct ndpi_detection_module_struct *ndpi_str, FI
continue;
}
- if(ndpi_hash_add_entry(&ndpi_str->malicious_ja3_hashmap, line, len, NULL) == 0)
+ if(ndpi_hash_add_entry(&ndpi_str->malicious_ja3_hashmap, line, len, 0) == 0)
num++;
}
@@ -5287,7 +5302,8 @@ int load_malicious_sha1_file_fd(struct ndpi_detection_module_struct *ndpi_str, F
for (i = 0; i < 40; ++i)
first_comma[i] = toupper(first_comma[i]);
- if(ndpi_hash_add_entry(&ndpi_str->malicious_sha1_hashmap, first_comma, second_comma - first_comma, NULL) == 0)
+ if(ndpi_hash_add_entry(&ndpi_str->malicious_sha1_hashmap, first_comma,
+ second_comma - first_comma, 0) == 0)
num++;
}
@@ -6225,42 +6241,42 @@ static int ndpi_callback_init(struct ndpi_detection_module_struct *ndpi_str) {
/* ******************************************************************** */
static inline int ndpi_proto_cb_tcp_payload(const struct ndpi_detection_module_struct *ndpi_str, uint32_t idx) {
- return (ndpi_str->callback_buffer[idx].ndpi_selection_bitmask &
- (NDPI_SELECTION_BITMASK_PROTOCOL_INT_TCP |
- NDPI_SELECTION_BITMASK_PROTOCOL_INT_TCP_OR_UDP |
- NDPI_SELECTION_BITMASK_PROTOCOL_COMPLETE_TRAFFIC)) != 0;
+ return (ndpi_str->callback_buffer[idx].ndpi_selection_bitmask &
+ (NDPI_SELECTION_BITMASK_PROTOCOL_INT_TCP |
+ NDPI_SELECTION_BITMASK_PROTOCOL_INT_TCP_OR_UDP |
+ NDPI_SELECTION_BITMASK_PROTOCOL_COMPLETE_TRAFFIC)) != 0;
}
/* ******************************************************************** */
static inline int ndpi_proto_cb_tcp_nopayload(const struct ndpi_detection_module_struct *ndpi_str, uint32_t idx) {
- return (ndpi_str->callback_buffer[idx].ndpi_selection_bitmask &
- (NDPI_SELECTION_BITMASK_PROTOCOL_INT_TCP |
- NDPI_SELECTION_BITMASK_PROTOCOL_INT_TCP_OR_UDP |
- NDPI_SELECTION_BITMASK_PROTOCOL_COMPLETE_TRAFFIC)) != 0
- && (ndpi_str->callback_buffer[idx].ndpi_selection_bitmask &
- NDPI_SELECTION_BITMASK_PROTOCOL_HAS_PAYLOAD) == 0;
+ return (ndpi_str->callback_buffer[idx].ndpi_selection_bitmask &
+ (NDPI_SELECTION_BITMASK_PROTOCOL_INT_TCP |
+ NDPI_SELECTION_BITMASK_PROTOCOL_INT_TCP_OR_UDP |
+ NDPI_SELECTION_BITMASK_PROTOCOL_COMPLETE_TRAFFIC)) != 0
+ && (ndpi_str->callback_buffer[idx].ndpi_selection_bitmask &
+ NDPI_SELECTION_BITMASK_PROTOCOL_HAS_PAYLOAD) == 0;
}
/* ******************************************************************** */
static inline int ndpi_proto_cb_udp(const struct ndpi_detection_module_struct *ndpi_str, uint32_t idx) {
- return (ndpi_str->callback_buffer[idx].ndpi_selection_bitmask &
- (NDPI_SELECTION_BITMASK_PROTOCOL_INT_UDP |
- NDPI_SELECTION_BITMASK_PROTOCOL_INT_TCP_OR_UDP |
- NDPI_SELECTION_BITMASK_PROTOCOL_COMPLETE_TRAFFIC)) != 0;
+ return (ndpi_str->callback_buffer[idx].ndpi_selection_bitmask &
+ (NDPI_SELECTION_BITMASK_PROTOCOL_INT_UDP |
+ NDPI_SELECTION_BITMASK_PROTOCOL_INT_TCP_OR_UDP |
+ NDPI_SELECTION_BITMASK_PROTOCOL_COMPLETE_TRAFFIC)) != 0;
}
/* ******************************************************************** */
static inline int ndpi_proto_cb_other(const struct ndpi_detection_module_struct *ndpi_str, uint32_t idx) {
- return (ndpi_str->callback_buffer[idx].ndpi_selection_bitmask &
- (NDPI_SELECTION_BITMASK_PROTOCOL_INT_TCP |
- NDPI_SELECTION_BITMASK_PROTOCOL_INT_UDP |
- NDPI_SELECTION_BITMASK_PROTOCOL_INT_TCP_OR_UDP)) == 0
- ||
- (ndpi_str->callback_buffer[idx].ndpi_selection_bitmask &
- NDPI_SELECTION_BITMASK_PROTOCOL_COMPLETE_TRAFFIC) != 0;
+ return (ndpi_str->callback_buffer[idx].ndpi_selection_bitmask &
+ (NDPI_SELECTION_BITMASK_PROTOCOL_INT_TCP |
+ NDPI_SELECTION_BITMASK_PROTOCOL_INT_UDP |
+ NDPI_SELECTION_BITMASK_PROTOCOL_INT_TCP_OR_UDP)) == 0
+ ||
+ (ndpi_str->callback_buffer[idx].ndpi_selection_bitmask &
+ NDPI_SELECTION_BITMASK_PROTOCOL_COMPLETE_TRAFFIC) != 0;
}
/* ******************************************************************** */
@@ -7896,7 +7912,7 @@ int ndpi_load_hostname_category(struct ndpi_detection_module_struct *ndpi_str,
if(ndpi_str->custom_categories.sc_hostnames_shadow == NULL)
return(-1);
- return(ndpi_domain_classify_add(ndpi_str->custom_categories.sc_hostnames_shadow,
+ return(ndpi_domain_classify_add(ndpi_str, ndpi_str->custom_categories.sc_hostnames_shadow,
(u_int16_t)category, (char*)name_to_add) ? 0 : -1);
#endif
}
@@ -7931,6 +7947,9 @@ int ndpi_enable_loaded_categories(struct ndpi_detection_module_struct *ndpi_str)
int i;
static char *built_in = "built-in";
+ if(ndpi_str->custom_categories.categories_loaded)
+ return(-1); /* Already loaded */
+
/* First add the nDPI known categories matches */
for(i = 0; category_match[i].string_to_match != NULL; i++)
ndpi_load_category(ndpi_str, category_match[i].string_to_match,
@@ -7956,7 +7975,7 @@ int ndpi_enable_loaded_categories(struct ndpi_detection_module_struct *ndpi_str)
}
#else
ndpi_domain_classify_free(ndpi_str->custom_categories.sc_hostnames);
- ndpi_domain_classify_finalize(ndpi_str->custom_categories.sc_hostnames_shadow);
+ // ndpi_domain_classify_finalize(ndpi_str->custom_categories.sc_hostnames_shadow);
ndpi_str->custom_categories.sc_hostnames = ndpi_str->custom_categories.sc_hostnames_shadow;
ndpi_str->custom_categories.sc_hostnames_shadow = ndpi_domain_classify_alloc();
#endif
diff --git a/src/lib/ndpi_utils.c b/src/lib/ndpi_utils.c
index 7795f59d5..49d86e28d 100644
--- a/src/lib/ndpi_utils.c
+++ b/src/lib/ndpi_utils.c
@@ -1,7 +1,7 @@
/*
* ndpi_utils.c
*
- * Copyright (C) 2011-23 - ntop.org and contributors
+ * Copyright (C) 2011-24 - ntop.org and contributors
*
* This file is part of nDPI, an open source deep packet inspection
* library based on the OpenDPI and PACE technology by ipoque GmbH
@@ -38,6 +38,7 @@
#include "ahocorasick.h"
#include "libcache.h"
+#include "shoco.h"
#include <time.h>
#ifndef WIN32
@@ -72,21 +73,11 @@ struct pcre2_struct {
};
#endif
-/*
- * Please keep this strcture in sync with
- * `struct ndpi_str_hash` in src/include/ndpi_typedefs.h
- */
-
-typedef struct ndpi_str_hash_private {
- unsigned int hash;
- void *value;
- // u_int8_t private_data[1]; /* Avoid error C2466 and do not initiate private data with 0 */
+typedef struct {
+ char *key;
+ u_int16_t value16;
UT_hash_handle hh;
-} ndpi_str_hash_private;
-#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
-_Static_assert(sizeof(struct ndpi_str_hash) == sizeof(struct ndpi_str_hash_private) - sizeof(UT_hash_handle),
- "Please keep `struct ndpi_str_hash` and `struct ndpi_str_hash_private` syncd.");
-#endif
+} ndpi_str_hash_priv;
/* ****************************************** */
@@ -1246,15 +1237,15 @@ static void ndpi_tls2json(ndpi_serializer *serializer, struct ndpi_flow_struct *
static char* print_ndpi_address_port(ndpi_address_port *ap, char *buf, u_int buf_len) {
char ipbuf[INET6_ADDRSTRLEN];
-
+
if(ap->is_ipv6) {
inet_ntop(AF_INET6, &ap->address, ipbuf, sizeof(ipbuf));
} else {
inet_ntop(AF_INET, &ap->address, ipbuf, sizeof(ipbuf));
}
-
+
snprintf(buf, buf_len, "%s:%u", ipbuf, ap->port);
-
+
return(buf);
}
@@ -2280,12 +2271,9 @@ ndpi_http_method ndpi_http_str2method(const char* method, u_int16_t method_len)
/* ******************************************************************** */
-int ndpi_hash_init(ndpi_str_hash **h)
-{
+int ndpi_hash_init(ndpi_str_hash **h) {
if (h == NULL)
- {
return 1;
- }
*h = NULL;
return 0;
@@ -2293,77 +2281,69 @@ int ndpi_hash_init(ndpi_str_hash **h)
/* ******************************************************************** */
-void ndpi_hash_free(ndpi_str_hash **h, void (*cleanup_func)(ndpi_str_hash *h))
-{
- struct ndpi_str_hash_private *h_priv;
- struct ndpi_str_hash_private *current, *tmp;
-
- if (h == NULL)
- {
- return;
- }
- h_priv = *(struct ndpi_str_hash_private **)h;
-
- HASH_ITER(hh, h_priv, current, tmp) {
- HASH_DEL(h_priv, current);
- if (cleanup_func != NULL)
- {
- cleanup_func((ndpi_str_hash *)current);
+void ndpi_hash_free(ndpi_str_hash **h) {
+ if(h != NULL) {
+ ndpi_str_hash_priv *h_priv = *((ndpi_str_hash_priv **)h);
+ ndpi_str_hash_priv *current, *tmp;
+
+ HASH_ITER(hh, h_priv, current, tmp) {
+ HASH_DEL(h_priv, current);
+ ndpi_free(current->key);
+ ndpi_free(current);
}
- ndpi_free(current);
+
+ *h = NULL;
}
-
- *h = NULL;
}
/* ******************************************************************** */
-int ndpi_hash_find_entry(ndpi_str_hash *h, char *key, u_int key_len, void **value)
-{
- struct ndpi_str_hash_private *h_priv = (struct ndpi_str_hash_private *)h;
- struct ndpi_str_hash_private *found;
- unsigned int hash_value;
+int ndpi_hash_find_entry(ndpi_str_hash *h, char *key, u_int key_len, u_int16_t *value) {
+ ndpi_str_hash_priv *h_priv = (ndpi_str_hash_priv *)h;
+ ndpi_str_hash_priv *item;
+
+ HASH_FIND(hh, h_priv, key, key_len, item);
+
+ if (item != NULL) {
+ if(value != NULL)
+ *value = item->value16;
- HASH_VALUE(key, key_len, hash_value);
- HASH_FIND_INT(h_priv, &hash_value, found);
- if (found != NULL)
- {
- if (value != NULL)
- {
- *value = found->value;
- }
return 0;
- } else {
+ } else
return 1;
- }
}
/* ******************************************************************** */
-int ndpi_hash_add_entry(ndpi_str_hash **h, char *key, u_int8_t key_len, void *value)
-{
- struct ndpi_str_hash_private **h_priv = (struct ndpi_str_hash_private **)h;
- struct ndpi_str_hash_private *new = ndpi_calloc(1, sizeof(*new));
- struct ndpi_str_hash_private *found;
- unsigned int hash_value;
+int ndpi_hash_add_entry(ndpi_str_hash **h, char *key, u_int8_t key_len, u_int16_t value) {
+ ndpi_str_hash_priv *h_priv = (ndpi_str_hash_priv *)*h;
+ ndpi_str_hash_priv *item;
- if (new == NULL)
- {
- return 1;
+ HASH_FIND(hh, h_priv, key, key_len, item);
+
+ if(item != NULL) {
+ item->value16 = value;
+ return(1); /* Entry already present */
}
- HASH_VALUE(key, key_len, hash_value);
- new->hash = hash_value;
- new->value = value;
- HASH_ADD_INT(*h_priv, hash, new);
+ item = ndpi_calloc(1, sizeof(ndpi_str_hash_priv));
+ if(item == NULL)
+ return(2);
- HASH_FIND_INT(*h_priv, &hash_value, found);
- if (found == NULL) /* The insertion failed (because of a memory allocation error) */
- {
- ndpi_free(new);
- return 1;
+ item->key = ndpi_malloc(key_len+1);
+
+ if(item->key == NULL) {
+ ndpi_free(item);
+ return(1);
+ } else {
+ memcpy(item->key, key, key_len);
+ item->key[key_len] = '\0';
}
+ item->value16 = value;
+
+ HASH_ADD(hh, *((ndpi_str_hash_priv **)h), key[0], key_len, item);
+
return 0;
}
@@ -2502,7 +2482,7 @@ void ndpi_handle_risk_exceptions(struct ndpi_detection_module_struct *ndpi_str,
if(host && (host[0] != '\0')) {
/* Check host exception */
ndpi_check_hostname_risk_exception(ndpi_str, flow, host);
-
+
if(flow->risk_mask == 0) {
u_int i;
@@ -2555,7 +2535,7 @@ void ndpi_set_risk(struct ndpi_flow_struct *flow, ndpi_risk_enum r,
/* In case there is an exception set, take it into account */
if(flow->host_risk_mask_evaluated)
v &= flow->risk_mask;
-
+
// NDPI_SET_BIT(flow->risk, (u_int32_t)r);
flow->risk |= v;
@@ -3149,3 +3129,141 @@ const char *ndpi_lru_cache_idx_to_name(lru_cache_type idx)
return "unknown";
return names[idx];
}
+
+/* ******************************************* */
+
+size_t ndpi_compress_str(const char * in, size_t len, char * out, size_t bufsize) {
+ size_t ret = shoco_compress(in, len, out, bufsize);
+
+ if(ret > bufsize)
+ return(0); /* The reported size exceeds bufsize, i.e. the capacity of out (not the input length len):
+   return 0 so that the caller can keep the uncompressed string.
+   NOTE(review): presumably this is how shoco_compress() signals a too-small output buffer - confirm in shoco.c */
+
+ return(ret);
+}
+
+/* ******************************************* */
+
+size_t ndpi_decompress_str(const char * in, size_t len, char * out, size_t bufsize) {
+ return(shoco_decompress(in, len, out, bufsize));
+
+}
+
+/* ******************************************* */
+
+static u_char ndpi_domain_mapper[256];
+static bool ndpi_domain_mapper_initialized = false;
+
+#define IGNORE_CHAR 0xFF
+#define NUM_BITS_NIBBLE 6 /* each 'nibble' is encoded with 6 bits */
+#define NIBBLE_ELEM_OFFSET 24
+
+/* Used for encoding domain names 8 bits -> 6 bits */
+static void ndpi_domain_mapper_init() {
+ u_int i;
+ u_char idx = 1 /* start from 1 to make sure 0 is not ambiguous */;
+
+ memset(ndpi_domain_mapper, IGNORE_CHAR, 256); /* every byte not assigned below stays IGNORE_CHAR (uppercase letters included) */
+
+ for(i='a'; i<= 'z'; i++)
+ ndpi_domain_mapper[i] = idx++; /* 'a'..'z' -> 1..26 */
+
+ for(i='0'; i<= '9'; i++)
+ ndpi_domain_mapper[i] = idx++; /* '0'..'9' -> 27..36 */
+
+ ndpi_domain_mapper['-'] = idx++; /* 37 */
+ ndpi_domain_mapper['_'] = idx++; /* 38 */
+ ndpi_domain_mapper['.'] = idx++; /* 39: the largest code assigned, below 64 so it fits in 6 bits */
+}
+
+/* ************************************************ */
+
+/*
+ * Encode a domain name into a compact key written to 'out'.
+ *
+ * ndpi_str = detection module, passed to ndpi_get_host_domain_suffix()
+ * domain   = NUL-terminated domain name to encode
+ * out      = destination buffer
+ * out_len  = size of 'out' in bytes
+ *
+ * Returns the number of bytes stored in 'out', or 0 when the domain cannot
+ * be encoded (NULL arguments, or 'out' too small for the domain plus the
+ * trailer).
+ *
+ * Two output shapes are possible:
+ *  - the domain copied verbatim as a NUL-terminated string (domain of 4
+ *    characters or less, domain that does not fit the internal work buffer,
+ *    unknown suffix, or compression that does not shrink it);
+ *  - a binary (not NUL-terminated) key: the domain without its suffix, either
+ *    packed with 6 bits per mapped character or compressed with
+ *    ndpi_compress_str() (whichever is shorter), followed by the 16 bit
+ *    domain id, most significant byte first.
+ */
+u_int ndpi_encode_domain(struct ndpi_detection_module_struct *ndpi_str,
+                         char *domain, char *out, u_int out_len) {
+  u_int out_idx = 0, i, buf_shift = 0, domain_buf_len, compressed_len, suffix_len, domain_len;
+  u_int32_t value = 0;
+  u_char domain_buf[256], compressed[128];
+  u_int16_t domain_id = 0;
+  const char *suffix;
+
+  /* out_len is unsigned: without this guard (out_len-3) below would wrap
+     for out_len < 3 and the size check would let the writes overrun 'out' */
+  if((domain == NULL) || (out == NULL) || (out_len <= 3))
+    return(0);
+
+  if(!ndpi_domain_mapper_initialized) {
+    ndpi_domain_mapper_init();
+    ndpi_domain_mapper_initialized = true;
+  }
+
+  domain_len = strlen(domain);
+
+  if(domain_len >= (out_len-3))
+    return(0);
+
+  /* Too short to be worth encoding, or too long for domain_buf: a truncated
+     copy would have the wrong bytes stripped as suffix, so keep it verbatim */
+  if((domain_len <= 4) || (domain_len >= sizeof(domain_buf)))
+    return((u_int)snprintf(out, out_len, "%s", domain));
+
+  /* [1] Encode the domain in 6 bits */
+  suffix = ndpi_get_host_domain_suffix(ndpi_str, domain, &domain_id);
+
+  if(suffix == NULL)
+    return((u_int)snprintf(out, out_len, "%s", domain)); /* Unknown suffix */
+
+  snprintf((char*)domain_buf, sizeof(domain_buf), "%s", domain);
+  domain_buf_len = strlen((char*)domain_buf), suffix_len = strlen(suffix);
+
+  if(domain_buf_len > suffix_len) {
+    /* Drop the suffix and the dot that precedes it */
+    domain_buf_len = domain_buf_len-suffix_len-1;
+    domain_buf[domain_buf_len] = '\0';
+
+    for(i=0; domain_buf[i] != '\0'; i++) {
+      u_int32_t mapped_idx = ndpi_domain_mapper[domain_buf[i]];
+
+      if(mapped_idx != IGNORE_CHAR) {
+        value |= (mapped_idx << buf_shift), buf_shift += NUM_BITS_NIBBLE;
+
+        if(buf_shift == NIBBLE_ELEM_OFFSET) {
+          /* Four 6 bit codes fill 24 bits: emit them least significant byte
+             first, regardless of the host byte order */
+          out[out_idx++] = (char)(value & 0xFF);
+          out[out_idx++] = (char)((value >> 8) & 0xFF);
+          out[out_idx++] = (char)((value >> 16) & 0xFF);
+          buf_shift = 0; /* Move to the next buffer */
+          value = 0;
+        }
+      }
+    }
+
+    if(buf_shift != 0) {
+      /* 1, 2 or 3 pending codes (6, 12 or 18 bits) need 1, 2 or 3 bytes */
+      u_int bytes = buf_shift / NUM_BITS_NIBBLE;
+
+      for(i=0; i<bytes; i++)
+        out[out_idx++] = (char)((value >> (8*i)) & 0xFF);
+    }
+  }
+
+  /* [2] Check if compressing the string is more efficient */
+  compressed_len = ndpi_compress_str((char*)domain_buf, domain_buf_len,
+                                     (char*)compressed, sizeof(compressed));
+
+  if((compressed_len > 0) && ((out_idx == 0) || (compressed_len < out_idx))) {
+    if(compressed_len >= domain_len) {
+      /* Compression creates a longer buffer */
+      return((u_int)snprintf(out, out_len, "%s", domain));
+    } else {
+      compressed_len = ndpi_min(ndpi_min(compressed_len, sizeof(compressed)), out_len-3);
+      memcpy(out, compressed, compressed_len);
+      out_idx = compressed_len;
+    }
+  }
+
+  /* Add trailer domainId value */
+  out[out_idx++] = (domain_id >> 8) & 0xFF;
+  out[out_idx++] = domain_id & 0xFF;
+
+#ifdef DEBUG
+  fprintf(stdout, "%s [len: %u][", domain, out_idx);
+  for(i=0; i<out_idx; i++) fprintf(stdout, "%02X", out[i] & 0xFF);
+  fprintf(stdout, "]\n");
+#endif
+
+  return(out_idx);
+}
diff --git a/src/lib/protocols/dns.c b/src/lib/protocols/dns.c
index 70b8cd451..d9eaf1e84 100644
--- a/src/lib/protocols/dns.c
+++ b/src/lib/protocols/dns.c
@@ -766,7 +766,7 @@ static void ndpi_search_dns(struct ndpi_detection_module_struct *ndpi_struct, st
ndpi_hostname_sni_set(flow, (const u_int8_t *)_hostname, len, is_mdns ? NDPI_HOSTNAME_NORM_LC : NDPI_HOSTNAME_NORM_ALL);
if (hostname_is_valid == 0)
- ndpi_set_risk(flow, NDPI_INVALID_CHARACTERS, NULL);
+ ndpi_set_risk(flow, NDPI_INVALID_CHARACTERS, "Invalid chars detected in domain name");
dot = strchr(_hostname, '.');
if(dot) {
diff --git a/src/lib/protocols/fastcgi.c b/src/lib/protocols/fastcgi.c
index a9f9113d3..10384a13e 100644
--- a/src/lib/protocols/fastcgi.c
+++ b/src/lib/protocols/fastcgi.c
@@ -221,7 +221,7 @@ static void ndpi_search_fastcgi(struct ndpi_detection_module_struct *ndpi_struct
ndpi_set_risk(flow, NDPI_INVALID_CHARACTERS, str);
/* This looks like an attack */
- ndpi_set_risk(flow, NDPI_POSSIBLE_EXPLOIT, NULL);
+ ndpi_set_risk(flow, NDPI_POSSIBLE_EXPLOIT, "Suspicious hostname: attack ?");
}
ndpi_int_fastcgi_add_connection(ndpi_struct, flow, &ret_match);
}
diff --git a/src/lib/protocols/http.c b/src/lib/protocols/http.c
index a85f1c44c..8fc82dd67 100644
--- a/src/lib/protocols/http.c
+++ b/src/lib/protocols/http.c
@@ -1007,7 +1007,7 @@ static void check_content_type_and_change_protocol(struct ndpi_detection_module_
ndpi_set_risk(flow, NDPI_INVALID_CHARACTERS, str);
/* This looks like an attack */
- ndpi_set_risk(flow, NDPI_POSSIBLE_EXPLOIT, NULL);
+ ndpi_set_risk(flow, NDPI_POSSIBLE_EXPLOIT, "Suspicious hostname: attack ?");
}
double_col = strchr((char*)flow->host_server_name, ':');
diff --git a/src/lib/protocols/quic.c b/src/lib/protocols/quic.c
index 4734433e0..345f77c47 100644
--- a/src/lib/protocols/quic.c
+++ b/src/lib/protocols/quic.c
@@ -1475,7 +1475,7 @@ void process_chlo(struct ndpi_detection_module_struct *ndpi_struct,
ndpi_set_risk(flow, NDPI_INVALID_CHARACTERS, str);
/* This looks like an attack */
- ndpi_set_risk(flow, NDPI_POSSIBLE_EXPLOIT, NULL);
+ ndpi_set_risk(flow, NDPI_POSSIBLE_EXPLOIT, "Suspicious hostname: attack ?");
}
sni_found = 1;
@@ -1503,7 +1503,7 @@ void process_chlo(struct ndpi_detection_module_struct *ndpi_struct,
/* Add check for missing SNI */
if(flow->host_server_name[0] == '\0') {
/* This is a bit suspicious */
- ndpi_set_risk(flow, NDPI_TLS_MISSING_SNI, NULL);
+ ndpi_set_risk(flow, NDPI_TLS_MISSING_SNI, "SNI should be present all time: attack ?");
}
}
diff --git a/src/lib/protocols/tls.c b/src/lib/protocols/tls.c
index 882f463fb..54061d10c 100644
--- a/src/lib/protocols/tls.c
+++ b/src/lib/protocols/tls.c
@@ -643,7 +643,7 @@ void processCertificateElements(struct ndpi_detection_module_struct *ndpi_struct
ndpi_set_risk(flow, NDPI_INVALID_CHARACTERS, dNSName);
/* This looks like an attack */
- ndpi_set_risk(flow, NDPI_POSSIBLE_EXPLOIT, NULL);
+ ndpi_set_risk(flow, NDPI_POSSIBLE_EXPLOIT, "Invalid dNSName name");
}
if(matched_name == 0) {
@@ -695,10 +695,13 @@ void processCertificateElements(struct ndpi_detection_module_struct *ndpi_struct
i += len;
} else {
+ char buf[32];
+
+ snprintf(buf, sizeof(buf), "Unknown extension %02X", general_name_type);
#if DEBUG_TLS
printf("[TLS] Leftover %u bytes", packet->payload_packet_len - i);
#endif
- ndpi_set_risk(flow, NDPI_TLS_SUSPICIOUS_EXTENSION, NULL);
+ ndpi_set_risk(flow, NDPI_TLS_SUSPICIOUS_EXTENSION, buf);
break;
}
} else {
@@ -781,7 +784,7 @@ int processCertificate(struct ndpi_detection_module_struct *ndpi_struct,
if((packet->payload_packet_len != (length + 4 + (is_dtls ? 8 : 0))) || (packet->payload[1] != 0x0) ||
certificates_offset >= packet->payload_packet_len) {
- ndpi_set_risk(flow, NDPI_MALFORMED_PACKET, NULL);
+ ndpi_set_risk(flow, NDPI_MALFORMED_PACKET, "Unvalid lenght");
return(-1); /* Invalid length */
}
@@ -790,7 +793,7 @@ int processCertificate(struct ndpi_detection_module_struct *ndpi_struct,
packet->payload[certificates_offset - 1];
if((packet->payload[certificates_offset - 3] != 0x0) || ((certificates_length+3) != length)) {
- ndpi_set_risk(flow, NDPI_MALFORMED_PACKET, NULL);
+ ndpi_set_risk(flow, NDPI_MALFORMED_PACKET, "Invalid certificate offset");
return(-2); /* Invalid length */
}
@@ -1056,7 +1059,7 @@ static int ndpi_search_tls_tcp(struct ndpi_detection_module_struct *ndpi_struct,
u_int8_t alert_level = message->buffer[5];
if(alert_level == 2 /* Warning (1), Fatal (2) */)
- ndpi_set_risk(flow, NDPI_TLS_FATAL_ALERT, NULL);
+ ndpi_set_risk(flow, NDPI_TLS_FATAL_ALERT, "Found fatal TLS alert");
}
u_int16_t const alert_len = ntohs(*(u_int16_t const *)&message->buffer[3]);
@@ -1516,7 +1519,7 @@ static void checkExtensions(struct ndpi_detection_module_struct *ndpi_struct,
printf("[TLS] extension length exceeds remaining packet length: %u > %u.\n",
extension_len, packet->payload_packet_len - extension_payload_offset);
#endif
- ndpi_set_risk(flow, NDPI_TLS_SUSPICIOUS_EXTENSION, NULL);
+ ndpi_set_risk(flow, NDPI_TLS_SUSPICIOUS_EXTENSION, "Invalid extension len");
return;
}
@@ -2264,7 +2267,7 @@ int processClientServerHello(struct ndpi_detection_module_struct *ndpi_struct,
ndpi_set_risk(flow, NDPI_INVALID_CHARACTERS, sni);
/* This looks like an attack */
- ndpi_set_risk(flow, NDPI_POSSIBLE_EXPLOIT, NULL);
+ ndpi_set_risk(flow, NDPI_POSSIBLE_EXPLOIT, "Invalid chars found in SNI: exploit or misconfiguration?");
}
if(!is_quic) {
@@ -2847,7 +2850,7 @@ compute_ja3c:
&& (flow->protos.tls_quic.encrypted_sni.esni == NULL) /* No ESNI */
) {
/* This is a bit suspicious */
- ndpi_set_risk(flow, NDPI_TLS_MISSING_SNI, NULL);
+ ndpi_set_risk(flow, NDPI_TLS_MISSING_SNI, "SNI should always be present");
if(flow->protos.tls_quic.advertised_alpns != NULL) {
char buf[256], *tmp, *item;
@@ -2859,7 +2862,7 @@ compute_ja3c:
while(item != NULL) {
if(item[0] == 'h') {
/* Example 'h2' */
- ndpi_set_risk(flow, NDPI_TLS_ALPN_SNI_MISMATCH, NULL);
+ ndpi_set_risk(flow, NDPI_TLS_ALPN_SNI_MISMATCH, item);
break;
} else
item = strtok_r(NULL, ",", &tmp);
diff --git a/src/lib/third_party/include/shoco.h b/src/lib/third_party/include/shoco.h
new file mode 100644
index 000000000..0772ac656
--- /dev/null
+++ b/src/lib/third_party/include/shoco.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#include <stddef.h>
+
+#if defined(_MSC_VER)
+#define shoco_restrict __restrict
+#elif __GNUC__
+#define shoco_restrict __restrict__
+#else
+#define shoco_restrict restrict
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+size_t shoco_compress(const char * const shoco_restrict in, size_t len, char * const shoco_restrict out, size_t bufsize);
+size_t shoco_decompress(const char * const shoco_restrict in, size_t len, char * const shoco_restrict out, size_t bufsize);
+
+#ifdef __cplusplus
+}
+#endif
+
+
diff --git a/src/lib/third_party/include/shoco_domains_model.h b/src/lib/third_party/include/shoco_domains_model.h
new file mode 100644
index 000000000..be1b50a8e
--- /dev/null
+++ b/src/lib/third_party/include/shoco_domains_model.h
@@ -0,0 +1,172 @@
+/*
+ Note
+ This file has been generated (by ntop) as indicated in https://github.com/Ed-von-Schleck/shoco
+ using generate_compression_model.py and trained using domain names for obtaining optimal
+ performance when used with Internet domain names
+*/
+
+#ifndef _SHOCO_INTERNAL
+#error This header file is only to be included by 'shoco.c'.
+#endif
+#pragma once
+/*
+This file was generated by 'generate_compressor_model.py'
+so don't edit this by hand. Also, do not include this file
+anywhere. It is internal to 'shoco.c'. Include 'shoco.h'
+if you want to use shoco in your project.
+*/
+
+#define MIN_CHR 45
+#define MAX_CHR 123
+
+static const char chrs_by_chr_id[32] = {
+ 'o', '.', 'c', 'e', 'a', 'i', 'r', 'n', 's', 't', 'l', 'd', 'm', 'u', 'p', 'g', 'h', 'b', 'f', 'k', 'y', 'v', 'w', '-', 'x', 'j', 'z', 'q', '1', '2', '0', '3'
+};
+
+static const int8_t chr_ids_by_chr[256] = {
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 23, 1, -1, 30, 28, 29, 31, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 4, 17, 2, 11, 3, 18, 15, 16, 5, 25, 19, 10, 12, 7, 0, 14, 27, 6, 8, 9, 13, 21, 22, 24, 20, 26, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+};
+
+static const int8_t successor_ids_by_chr_id_and_chr_id[32][32] = {
+ {7, 3, 11, -1, -1, -1, 1, 2, 6, 8, 5, 12, 0, 4, 9, 15, -1, 13, -1, -1, -1, 10, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+ {2, -1, 0, 6, 8, 3, 10, 1, 15, 11, -1, 5, 13, 4, 12, 7, -1, 9, -1, -1, -1, -1, -1, -1, -1, 14, -1, -1, -1, -1, -1, -1},
+ {0, 4, 14, 3, 1, 8, 10, 9, 12, 6, 5, 13, -1, 11, -1, -1, 2, -1, -1, 7, -1, -1, -1, -1, -1, -1, 15, -1, -1, -1, -1, -1},
+ {-1, 1, 8, 10, 7, -1, 0, 4, 3, 2, 6, 5, 9, -1, 13, -1, -1, 12, -1, -1, -1, 15, 11, -1, 14, -1, -1, -1, -1, -1, -1, -1},
+ {-1, 4, 6, -1, -1, 10, 1, 0, 5, 3, 2, 8, 7, 12, 9, 11, -1, 13, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+ {3, 10, 2, 8, 6, -1, 9, 0, 4, 1, 5, 7, 11, -1, 14, 12, -1, -1, 15, -1, -1, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+ {2, 5, 9, 0, 1, 4, -1, 12, 8, 6, -1, 10, 13, 7, -1, 3, -1, -1, -1, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+ {9, 2, 8, 0, 5, 7, -1, 13, 6, 1, 10, 4, -1, 14, -1, 3, -1, -1, 12, 11, -1, -1, -1, 15, -1, -1, -1, -1, -1, -1, -1, -1},
+ {6, 0, 8, 2, 4, 3, -1, -1, 7, 1, -1, 15, 11, 10, 9, -1, 5, -1, -1, 12, 14, -1, -1, 13, -1, -1, -1, -1, -1, -1, -1, -1},
+ {4, 1, 11, 0, 3, 2, 5, -1, 7, 10, 14, -1, 15, 8, -1, -1, 6, -1, -1, -1, 9, 13, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+ {3, 4, 11, 1, 2, 0, -1, -1, 8, 7, 5, 10, 13, 6, 12, -1, -1, 15, 14, -1, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+ {5, 2, 9, 0, 3, 1, 8, 7, 6, -1, 12, 14, 11, 4, -1, 10, -1, 13, -1, -1, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+ {3, 2, 11, 1, 0, 4, -1, -1, 7, 13, -1, 15, 10, 8, 5, 14, -1, 9, -1, -1, 6, -1, -1, -1, 12, -1, -1, -1, -1, -1, -1, -1},
+ {-1, 5, 14, 12, 11, 13, 1, 2, 0, 4, 9, 6, 8, -1, 7, 15, -1, 10, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+ {1, 5, 12, 3, 0, 6, 2, -1, 8, 9, 4, 14, 13, 11, 7, -1, 10, -1, -1, -1, -1, -1, -1, 15, -1, -1, -1, -1, -1, -1, -1, -1},
+ {0, 2, 13, 1, 3, 5, 4, 10, 9, 12, 6, -1, 15, 7, -1, 14, 8, -1, -1, -1, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+ {1, 4, 10, 0, 2, 3, 7, 9, 8, 6, 13, -1, 14, 5, 15, -1, -1, -1, -1, 12, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+ {4, 5, 9, 1, 0, 3, 2, 15, 8, 12, 6, -1, 14, 7, -1, 13, -1, 11, -1, -1, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+ {0, 9, 10, 3, 4, 1, 2, -1, 11, 6, 5, 15, 13, 7, 14, -1, -1, -1, 8, -1, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+ {6, 0, 12, 1, 3, 2, 5, 11, 4, 8, 9, -1, 14, 7, 13, -1, -1, -1, -1, -1, 10, -1, -1, 15, -1, -1, -1, -1, -1, -1, -1, -1},
+ {3, 0, 4, 5, 2, 14, -1, 9, 1, 6, 10, -1, 7, 13, 8, -1, -1, 11, -1, -1, -1, -1, -1, 15, -1, -1, 12, -1, -1, -1, -1, -1},
+ {4, 2, 6, 0, 3, 1, 8, 5, 7, 11, -1, 14, 12, 10, 9, -1, -1, -1, -1, -1, 15, -1, -1, 13, -1, -1, -1, -1, -1, -1, -1, -1},
+ {2, 5, 8, 0, 1, 3, 9, 7, 4, 12, 11, 13, 14, -1, 10, -1, 6, -1, -1, -1, -1, -1, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+ {13, -1, 1, 7, 2, 5, 14, 12, 0, 6, 10, 8, 3, -1, 4, 11, -1, 9, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+ {10, 0, 6, 7, 5, 1, -1, 12, 8, 3, 15, -1, 13, 11, 2, -1, -1, -1, -1, -1, 4, -1, -1, 14, 9, -1, -1, -1, -1, -1, -1, -1},
+ {1, 6, 8, 3, 2, 5, -1, 11, 7, 12, -1, 10, 9, 4, 0, -1, 14, 13, -1, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+ {4, 2, 11, 0, 1, 3, -1, 14, 10, 15, 9, 12, 13, 6, -1, -1, 7, -1, -1, -1, 8, -1, -1, -1, -1, -1, 5, -1, -1, -1, -1, -1},
+ {7, 1, 4, 12, 3, 2, 8, -1, 6, 9, 5, 15, 10, 0, 14, -1, -1, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, 13, -1, -1, -1, -1},
+ {-1, 0, 14, -1, 11, -1, -1, -1, 12, -1, -1, -1, -1, -1, 15, -1, -1, -1, -1, -1, -1, -1, -1, 13, -1, -1, -1, -1, 3, 1, 2, 6},
+ {-1, 0, 7, -1, 15, -1, -1, -1, 8, -1, -1, 11, 14, -1, 10, 12, -1, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 4, 5, 3, 2},
+ {-1, 0, 13, -1, 10, -1, -1, -1, 14, -1, -1, -1, 15, -1, -1, -1, -1, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 3, 1, 7},
+ {-1, 0, 5, -1, 14, -1, -1, -1, 13, -1, -1, 2, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 8, 6, 3, 4}
+};
+
+static const int8_t chrs_by_chr_and_successor_id[MAX_CHR - MIN_CHR][16] = {
+ {'s', 'c', 'a', 'm', 'p', 'i', 't', 'e', 'd', 'b', 'l', 'g', 'n', 'o', 'r', 'f'},
+ {'c', 'n', 'o', 'i', 'u', 'd', 'e', 'g', 'a', 'b', 'r', 't', 'p', 'm', 'j', 's'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'.', '0', '1', '2', '7', '5', '4', '3', '8', '6', 'a', '9', 'b', 'c', 's', 'm'},
+ {'.', '2', '0', '1', '8', '6', '3', '9', '7', '4', '5', 'a', 's', '-', 'c', 'p'},
+ {'.', '4', '3', '0', '1', '2', 'b', 'c', 's', '5', 'p', 'd', 'g', '6', 'm', 'a'},
+ {'.', '6', 'd', '0', '3', 'c', '2', '4', '1', '8', '5', '7', '9', 's', 'a', 'm'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'},
+ {'n', 'r', 'l', 't', '.', 's', 'c', 'm', 'd', 'p', 'i', 'g', 'u', 'b', 'y', 'v'},
+ {'a', 'e', 'r', 'i', 'o', '.', 'l', 'u', 's', 'c', 'y', 'b', 't', 'g', 'm', 'n'},
+ {'o', 'a', 'h', 'e', '.', 'l', 't', 'k', 'i', 'n', 'r', 'u', 's', 'd', 'c', 'z'},
+ {'e', 'i', '.', 'a', 'u', 'o', 's', 'n', 'r', 'c', 'g', 'm', 'l', 'b', 'd', 'y'},
+ {'r', '.', 't', 's', 'n', 'd', 'l', 'a', 'c', 'm', 'e', 'w', 'b', 'p', 'x', 'v'},
+ {'o', 'i', 'r', 'e', 'a', 'l', 't', 'u', 'f', '.', 'c', 's', 'y', 'm', 'p', 'd'},
+ {'o', 'e', '.', 'a', 'r', 'i', 'l', 'u', 'h', 's', 'n', 'y', 't', 'c', 'g', 'm'},
+ {'e', 'o', 'a', 'i', '.', 'u', 't', 'r', 's', 'n', 'c', 'y', 'k', 'l', 'm', 'p'},
+ {'n', 't', 'c', 'o', 's', 'l', 'a', 'd', 'e', 'r', '.', 'm', 'g', 'v', 'p', 'f'},
+ {'p', 'o', 'a', 'e', 'u', 'i', '.', 's', 'c', 'm', 'd', 'n', 't', 'b', 'h', 'k'},
+ {'.', 'e', 'i', 'a', 's', 'r', 'o', 'u', 't', 'l', 'y', 'n', 'c', 'p', 'm', '-'},
+ {'i', 'e', 'a', 'o', '.', 'l', 'u', 't', 's', 'y', 'd', 'c', 'p', 'm', 'f', 'b'},
+ {'a', 'e', '.', 'o', 'i', 'p', 'y', 's', 'u', 'b', 'm', 'c', 'x', 't', 'g', 'd'},
+ {'e', 't', '.', 'g', 'd', 'a', 's', 'i', 'c', 'o', 'l', 'k', 'f', 'n', 'u', '-'},
+ {'m', 'r', 'n', '.', 'u', 'l', 's', 'o', 't', 'p', 'v', 'c', 'd', 'b', 'w', 'g'},
+ {'a', 'o', 'r', 'e', 'l', '.', 'i', 'p', 's', 't', 'h', 'u', 'c', 'm', 'd', '-'},
+ {'u', '.', 'i', 'a', 'c', 'l', 's', 'o', 'r', 't', 'm', 'b', 'e', 'q', 'p', 'd'},
+ {'e', 'a', 'o', 'g', 'i', '.', 't', 'u', 's', 'c', 'd', 'k', 'n', 'm', 'y', 'v'},
+ {'.', 't', 'e', 'i', 'a', 'h', 'o', 's', 'c', 'p', 'u', 'm', 'k', '-', 'y', 'd'},
+ {'e', '.', 'i', 'a', 'o', 'r', 'h', 's', 'u', 'y', 't', 'c', 'w', 'v', 'l', 'm'},
+ {'s', 'r', 'n', 'k', 't', '.', 'd', 'p', 'm', 'l', 'b', 'a', 'e', 'i', 'c', 'g'},
+ {'e', 'i', '.', 'a', 'o', 'n', 'c', 's', 'r', 'p', 'u', 't', 'm', '-', 'd', 'y'},
+ {'e', 'a', 'o', 'i', 's', '.', 'h', 'n', 'c', 'r', 'p', 'l', 't', 'd', 'm', 'w'},
+ {'.', 'i', 'p', 't', 'y', 'a', 'c', 'e', 's', 'x', 'o', 'u', 'n', 'm', '-', 'l'},
+ {'.', 's', 'a', 'o', 'c', 'e', 't', 'm', 'p', 'n', 'l', 'b', 'z', 'u', 'i', '-'},
+ {'e', 'a', '.', 'i', 'o', 'z', 'u', 'h', 'y', 'l', 's', 'c', 'd', 'm', 'n', 't'}
+};
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable: 4324) // structure was padded due to __declspec(align())
+#endif
+
+typedef struct Pack {
+ const uint32_t word;
+ const unsigned int bytes_packed;
+ const unsigned int bytes_unpacked;
+ const unsigned int offsets[8];
+ const int16_t _ALIGNED masks[8];
+ const unsigned char header_mask;
+ const unsigned char header;
+} Pack;
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+#define PACK_COUNT 3
+#define MAX_SUCCESSOR_N 7
+
+static const Pack packs[PACK_COUNT] = {
+ { 0x80000000, 1, 2, { 26, 24, 24, 24, 24, 24, 24, 24 }, { 15, 3, 0, 0, 0, 0, 0, 0 }, 0xc0, 0x80 },
+ { 0xc0000000, 2, 4, { 25, 22, 19, 16, 16, 16, 16, 16 }, { 15, 7, 7, 7, 0, 0, 0, 0 }, 0xe0, 0xc0 },
+ { 0xe0000000, 4, 8, { 23, 19, 15, 11, 8, 5, 2, 0 }, { 31, 15, 15, 15, 7, 7, 7, 3 }, 0xf0, 0xe0 }
+};
diff --git a/src/lib/third_party/src/shoco.c b/src/lib/third_party/src/shoco.c
new file mode 100644
index 000000000..5b4ea3f6d
--- /dev/null
+++ b/src/lib/third_party/src/shoco.c
@@ -0,0 +1,233 @@
+/* https://github.com/Ed-von-Schleck/shoco */
+
+#include <stdint.h>
+
+/*
+ * swap(x): convert a 32-bit code word between the host representation and the
+ * in-memory layout of the compressed stream (most significant byte first).
+ * On big-endian hosts this is a no-op.
+ */
+#if (defined (__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) || __BIG_ENDIAN__)
+  #define swap(x) (x)
+#else
+  #if defined(_MSC_VER)
+    #include <stdlib.h>
+    #define swap(x) _byteswap_ulong(x)
+  #elif defined (__GNUC__)
+    /* __builtin_bswap32 is a compiler builtin, not a preprocessor macro, so
+     * defined(__builtin_bswap32) can never be true; probe it with __has_builtin
+     * where the compiler provides that operator. */
+    #if defined(__has_builtin)
+      #if __has_builtin(__builtin_bswap32)
+        #define swap(x) __builtin_bswap32(x)
+      #endif
+    #endif
+    /* portable fallback; the argument is parenthesized so expressions expand safely */
+    #ifndef swap
+      #define swap(x) (((x)<<24) + (((x)&0x0000FF00)<<8) + (((x)&0x00FF0000)>>8) + ((x)>>24))
+    #endif
+  #else
+    #include <byteswap.h>
+    #define swap(x) bswap_32(x)
+  #endif
+#endif
+
+/* _ALIGNED: request 16-byte alignment for a declaration (expands to nothing on unknown compilers) */
+#if defined(_MSC_VER)
+ #define _ALIGNED __declspec(align(16))
+ /* under MSVC the inline keyword is remapped to the __inline spelling */
+ #define inline __inline
+#elif defined(__GNUC__)
+ #define _ALIGNED __attribute__ ((aligned(16)))
+#else
+ #define _ALIGNED
+#endif
+
+/* On x86-64 targets, pull in the SSE2 intrinsics and select the vectorized check_indices() */
+#if defined(_M_X64) || defined (_M_AMD64) || defined (__x86_64__)
+ #include "emmintrin.h"
+ #define HAVE_SSE2
+#endif
+
+#include "shoco.h"
+#define _SHOCO_INTERNAL
+#include "shoco_domains_model.h" /* we have built a model trained on domain names */
+
+/*
+ * Count the consecutive 1 bits at the top of val and return that count minus one.
+ * A byte whose top bit is clear therefore yields -1.
+ */
+static inline int decode_header(unsigned char val) {
+  int leading_ones = 0;
+
+  for (; val & 0x80; val = (unsigned char)(val << 1))
+    ++leading_ones;
+
+  return leading_ones - 1;
+}
+
+/* One code word, accessible as a 32-bit value or as its four individual bytes */
+union Code {
+ uint32_t word;
+ char bytes[4];
+};
+
+/*
+ * check_indices: return 1 when the indices fit into pack number pack_n,
+ * i.e. no index is larger than the pack's mask for its position; 0 otherwise.
+ */
+#ifdef HAVE_SSE2
+static inline int check_indices(const int16_t * shoco_restrict indices, int pack_n) {
+  // both loads require 16-byte aligned memory
+  const __m128i limits = _mm_load_si128 ((__m128i *)packs[pack_n].masks);
+  const __m128i values = _mm_load_si128 ((__m128i *)indices);
+  // lanes whose index exceeds the limit for that position
+  const __m128i too_big = _mm_cmpgt_epi16 (values, limits);
+  // lanes the pack really uses (positions with a zero mask are ignored)
+  const __m128i in_use = _mm_cmpgt_epi16 (limits, _mm_setzero_si128());
+
+  return (_mm_movemask_epi8 (_mm_and_si128 (too_big, in_use)) == 0);
+}
+#else
+static inline int check_indices(const int16_t * shoco_restrict indices, int pack_n) {
+  const Pack *pack = &packs[pack_n];
+  unsigned int pos = 0;
+
+  while (pos < pack->bytes_unpacked) {
+    if (indices[pos] > pack->masks[pos])
+      return 0;
+    ++pos;
+  }
+  return 1;
+}
+#endif
+
+/*
+ * Pick the pack to use for a run of n_consecutive characters described by indices.
+ * Packs are tried from the last table entry to the first; the first one that is
+ * not longer than the run and whose masks accept the indices wins.
+ * Returns the pack number, or -1 when no pack fits.
+ */
+static inline int find_best_encoding(const int16_t * shoco_restrict indices, unsigned int n_consecutive) {
+  int candidate = PACK_COUNT;
+
+  while (candidate-- > 0) {
+    if (n_consecutive < packs[candidate].bytes_unpacked)
+      continue;
+    if (check_indices(indices, candidate))
+      return candidate;
+  }
+  return -1;
+}
+
+/*
+ * Compress the string at original into out.
+ *
+ * original: input bytes; reading stops at the first '\0'
+ * strlen:   when non-zero, at most this many input bytes are read; when zero,
+ *           the input must be '\0'-terminated
+ * out:      destination buffer (no terminator is appended)
+ * bufsize:  capacity of out in bytes
+ *
+ * Returns the number of bytes written to out, or bufsize + 1 when out is too small.
+ */
+size_t shoco_compress(const char * const shoco_restrict original, size_t strlen, char * const shoco_restrict out, size_t bufsize) {
+  char *o = out;
+  char * const out_end = out + bufsize;
+  const char *in = original;
+  int16_t _ALIGNED indices[MAX_SUCCESSOR_N + 1] = { 0 };
+  int last_chr_index;
+  int current_index;
+  int successor_index;
+  unsigned int n_consecutive;
+  union Code code;
+  int pack_n;
+  unsigned int rest;
+  const char * const in_end = original + strlen;
+
+  for (;;) {
+    // with an explicit length, test the bound BEFORE dereferencing: the byte
+    // at in_end does not belong to the input and may not be readable
+    if (strlen && (in == in_end))
+      break;
+    if (*in == '\0')
+      break;
+
+    // find the longest string of known successors
+    indices[0] = chr_ids_by_chr[(unsigned char)in[0]];
+    last_chr_index = indices[0];
+    if (last_chr_index < 0)
+      goto last_resort;
+
+    // only meaningful when strlen != 0 (it is never consulted otherwise)
+    rest = in_end - in;
+    for (n_consecutive = 1; n_consecutive <= MAX_SUCCESSOR_N; ++n_consecutive) {
+      if (strlen && (n_consecutive == rest))
+        break;
+
+      current_index = chr_ids_by_chr[(unsigned char)in[n_consecutive]];
+      if (current_index < 0) // '\0' is always -1
+        break;
+
+      successor_index = successor_ids_by_chr_id_and_chr_id[last_chr_index][current_index];
+      if (successor_index < 0)
+        break;
+
+      indices[n_consecutive] = (int16_t)successor_index;
+      last_chr_index = current_index;
+    }
+    if (n_consecutive < 2)
+      goto last_resort;
+
+    pack_n = find_best_encoding(indices, n_consecutive);
+    if (pack_n >= 0) {
+      unsigned int i;
+
+      if (o + packs[pack_n].bytes_packed > out_end)
+        return bufsize + 1;
+
+      code.word = packs[pack_n].word;
+      for (i = 0; i < packs[pack_n].bytes_unpacked; ++i)
+        code.word |= indices[i] << packs[pack_n].offsets[i];
+
+      // In the little-endian world, we need to swap what's
+      // in the register to match the memory representation.
+      // On big-endian systems, this is a dummy.
+      code.word = swap(code.word);
+
+      // if we'd just copy the word, we might write over the end
+      // of the output string
+      for (i = 0; i < packs[pack_n].bytes_packed; ++i)
+        o[i] = code.bytes[i];
+
+      o += packs[pack_n].bytes_packed;
+      in += packs[pack_n].bytes_unpacked;
+    } else {
+last_resort:
+      // no pack applies: copy the byte verbatim
+      if (*in & 0x80) {
+        // non-ascii case
+        if (o + 2 > out_end)
+          return bufsize + 1;
+        // put in a sentinel byte
+        *o++ = 0x00;
+      } else {
+        // an ascii byte
+        if (o + 1 > out_end)
+          return bufsize + 1;
+      }
+      *o++ = *in++;
+    }
+  }
+
+  return o - out;
+}
+
+/*
+ * Decompress complen bytes at original into out.
+ *
+ * original: compressed input
+ * complen:  number of input bytes
+ * out:      destination buffer; a '\0' terminator is appended only if it fits
+ * bufsize:  capacity of out in bytes
+ *
+ * Returns the number of bytes written (terminator excluded), bufsize + 1 when
+ * out is too small, or SIZE_MAX when the input is truncated or malformed.
+ */
+size_t shoco_decompress(const char * const shoco_restrict original, size_t complen, char * const shoco_restrict out, size_t bufsize) {
+  char *o = out;
+  char * const out_end = out + bufsize;
+  const char *in = original;
+  char last_chr;
+  union Code code = { 0 };
+  int offset;
+  int mask;
+  int mark;
+  const char * const in_end = original + complen;
+
+  while (in < in_end) {
+    mark = decode_header(*in);
+    if (mark < 0) {
+      // a literal byte, copied as is
+      if (o >= out_end)
+        return bufsize + 1;
+
+      // ignore the sentinel value for non-ascii chars
+      if (*in == 0x00) {
+        if (++in >= in_end)
+          return SIZE_MAX;
+      }
+
+      *o++ = *in++;
+    } else {
+      unsigned int i;
+
+      // header bytes 0xf0..0xff decode to a pack number that does not exist;
+      // reject them instead of indexing past the end of packs[]
+      if (mark >= PACK_COUNT)
+        return SIZE_MAX;
+
+      if (o + packs[mark].bytes_unpacked > out_end)
+        return bufsize + 1;
+      else if (in + packs[mark].bytes_packed > in_end)
+        return SIZE_MAX;
+
+      // This should be OK as well, but it fails with emscripten.
+      // Test this with new versions of emcc.
+      //code.word = swap(*(uint32_t *)in);
+      for (i = 0; i < packs[mark].bytes_packed; ++i)
+        code.bytes[i] = in[i];
+      code.word = swap(code.word);
+
+      // unpack the leading char
+      offset = packs[mark].offsets[0];
+      mask = packs[mark].masks[0];
+      last_chr = o[0] = chrs_by_chr_id[(code.word >> offset) & mask];
+
+      // unpack the successor chars
+      for (i = 1; i < packs[mark].bytes_unpacked; ++i) {
+        offset = packs[mark].offsets[i];
+        mask = packs[mark].masks[i];
+        last_chr = o[i] = chrs_by_chr_and_successor_id[(unsigned char)last_chr - MIN_CHR][(code.word >> offset) & mask];
+      }
+
+      o += packs[mark].bytes_unpacked;
+      in += packs[mark].bytes_packed;
+    }
+  }
+
+  // append a 0-terminator if it fits
+  if (o < out_end)
+    *o = '\0';
+
+  return o - out;
+}