aboutsummaryrefslogtreecommitdiff
path: root/src/lib/ndpi_main.c
diff options
context:
space:
mode:
authorLuca Deri <deri@ntop.org>2021-03-03 19:30:01 +0100
committerLuca Deri <deri@ntop.org>2021-03-03 19:30:01 +0100
commit0f8a9948415bceac84148e1965e12e3f19c5050e (patch)
treee0615fc8fbf465178d29c78472638fb56b6ba5e3 /src/lib/ndpi_main.c
parent49843509e54aa70dc69005dab5f02f32df7866e2 (diff)
Improved DGA detection
Before Accuracy 66%, Precision 86%, Recall 38% After Accuracy 71%, Precision 89%, Recall 49%
Diffstat (limited to 'src/lib/ndpi_main.c')
-rw-r--r--src/lib/ndpi_main.c227
1 files changed, 127 insertions, 100 deletions
diff --git a/src/lib/ndpi_main.c b/src/lib/ndpi_main.c
index 4b5f38101..daf02f76a 100644
--- a/src/lib/ndpi_main.c
+++ b/src/lib/ndpi_main.c
@@ -58,6 +58,8 @@ static int _ndpi_debug_callbacks = 0;
/* #define DGA_DEBUG 1 */
/* #define MATCH_DEBUG 1 */
+u_int ndpi_verbose_dga_detection = 0;
+
/* ****************************************** */
static void *(*_ndpi_flow_malloc)(size_t size);
@@ -497,7 +499,7 @@ static int ndpi_string_to_automa(struct ndpi_detection_module_struct *ndpi_str,
dot = len -1;
memset(&ac_pattern, 0, sizeof(ac_pattern));
-
+
if((!add_ends_with) || ndpi_is_middle_string_char(value[dot])) {
ac_pattern.length = len;
ac_pattern.astring = value;
@@ -621,14 +623,14 @@ static void init_string_based_protocols(struct ndpi_detection_module_struct *ndp
ndpi_init_protocol_match(ndpi_str, &host_match[i]);
/* ************************ */
-
+
for(i = 0; tls_certificate_match[i].string_to_match != NULL; i++) {
#if 0
printf("%s() %s / %u\n", __FUNCTION__,
tls_certificate_match[i].string_to_match,
tls_certificate_match[i].protocol_id);
#endif
-
+
ndpi_add_string_value_to_automa(ndpi_str->tls_cert_subject_automa.ac_automa,
tls_certificate_match[i].string_to_match,
tls_certificate_match[i].protocol_id);
@@ -2997,7 +2999,7 @@ int ndpi_load_malicious_ja3_file(struct ndpi_detection_module_struct *ndpi_str,
if(ndpi_str->malicious_ja3_automa.ac_automa == NULL)
ndpi_str->malicious_ja3_automa.ac_automa = ac_automata_init(ac_match_handler);
-
+
fd = fopen(path, "r");
if(fd == NULL) {
@@ -3007,7 +3009,7 @@ int ndpi_load_malicious_ja3_file(struct ndpi_detection_module_struct *ndpi_str,
while(1) {
char *comma;
-
+
line = fgets(buffer, sizeof(buffer), fd);
if(line == NULL)
@@ -5014,7 +5016,7 @@ uint8_t ndpi_connection_tracking(struct ndpi_detection_module_struct *ndpi_str,
NDPI_CLR_BIT(flow->risk, NDPI_SUSPICIOUS_DGA_DOMAIN);
}
#endif
-
+
switch(ret->app_protocol) {
/*
Skype for a host doing MS Teams means MS Teams
@@ -6839,16 +6841,14 @@ uint8_t ndpi_connection_tracking(struct ndpi_detection_module_struct *ndpi_str,
if((rc == 0) && (match.number != 0))
rc = 1;
-#ifdef TRIGRAM_CHECK
- if(rc && match.number) {
+ if(ndpi_verbose_dga_detection && rc && match.number) {
printf("[%s:%d] [NDPI] Trigram %c%c%c\n",
__FILE__, __LINE__,
trigram_to_match[0],
trigram_to_match[1],
trigram_to_match[2]);
}
-#endif
-
+
return(rc ? match.number : 0);
}
@@ -7223,11 +7223,11 @@ uint8_t ndpi_connection_tracking(struct ndpi_detection_module_struct *ndpi_str,
static int enough(int a, int b) {
u_int8_t percentage = 20;
- if(b == 0) return(0);
+ if(b <= 1) return(0);
if(a == 0) return(1);
if(b > (((a+1)*percentage)/100)) return(1);
-
+
return(0);
}
@@ -7267,6 +7267,7 @@ uint8_t ndpi_connection_tracking(struct ndpi_detection_module_struct *ndpi_str,
case 'o':
case 'u':
case 'y': // Not a real vowel...
+ case 'x': // Not a real vowel...
return(1);
break;
@@ -7274,39 +7275,45 @@ uint8_t ndpi_connection_tracking(struct ndpi_detection_module_struct *ndpi_str,
return(0);
}
}
-
+
/* ******************************************************************** */
-
+
int ndpi_check_dga_name(struct ndpi_detection_module_struct *ndpi_str,
struct ndpi_flow_struct *flow,
char *name, u_int8_t is_hostname) {
- int len, rc = 0;
- u_int8_t max_num_char_repetitions = 0, last_char = 0, num_char_repetitions = 0, num_dots = 0;
+ int len, rc = 0, trigram_char_skip = 0;
+ u_int8_t max_num_char_repetitions = 0, last_char = 0, num_char_repetitions = 0, num_dots = 0, num_trigram_dots = 0;
u_int8_t max_domain_element_len = 0, curr_domain_element_len = 0, first_element_is_numeric = 1;
- if(!name) return(0);
-
+ if((!name)
+ || (strchr(name, '_') != NULL)
+ || (endsWith(name, "in-addr.arpa", 12)))
+ return(0);
+
if(flow && (flow->packet.detected_protocol_stack[1] != NDPI_PROTOCOL_UNKNOWN))
return(0); /* Ignore DGA check for protocols already fully detected */
-#ifdef DGA_DEBUG
- printf("[DGA] %s\n", name);
-#endif
+ if(strncmp(name, "www.", 4) == 0)
+ name = &name[4];
+
+ if(ndpi_verbose_dga_detection)
+ printf("[DGA check] %s\n", name);
len = strlen(name);
if(len >= 5) {
int i, j, num_found = 0, num_impossible = 0, num_bigram_checks = 0,
- num_trigram_found = 0, num_trigram_checked = 0,
- num_digits = 0, num_vowels = 0, num_words = 0;
+ num_trigram_found = 0, num_trigram_checked = 0, num_dash = 0,
+ num_digits = 0, num_vowels = 0, num_trigram_vowels = 0, num_words = 0, skip_next_bigram = 0;
char tmp[128], *word, *tok_tmp;
u_int max_tmp_len = sizeof(tmp)-1;
len = snprintf(tmp, max_tmp_len, "%s", name);
if(len < 0) {
-#ifdef DGA_DEBUG
- printf("[DGA] Too short");
-#endif
+
+ if(ndpi_verbose_dga_detection)
+ printf("[DGA] Too short");
+
return(0);
} else
tmp[len < max_tmp_len ? len : max_tmp_len] = '\0';
@@ -7314,19 +7321,32 @@ uint8_t ndpi_connection_tracking(struct ndpi_detection_module_struct *ndpi_str,
for(i=0, j=0; (i<len) && (j<max_tmp_len); i++) {
tmp[j] = tolower(name[i]);
- if(tmp[j] == '.')
+ if(tmp[j] == '.') {
num_dots++;
- else if(num_dots == 0) {
+ } else if(num_dots == 0) {
if(!isdigit(tmp[j]))
first_element_is_numeric = 0;
}
-
+
+ if(ndpi_is_vowel(tmp[j]))
+ num_vowels++;
+
if(last_char == tmp[j]) {
if(++num_char_repetitions > max_num_char_repetitions)
max_num_char_repetitions = num_char_repetitions;
} else
num_char_repetitions = 1, last_char = tmp[j];
-
+
+ if(isdigit(tmp[j])) {
+ num_digits++;
+
+ if(((j+2)<len) && isdigit(tmp[j+1]) && (tmp[j+2] == '.')) {
+ /* Check if there are too many digits */
+ if(num_digits < 4)
+ return(0); /* Double digits */
+ }
+ }
+
switch(tmp[j]) {
case '.':
case '-':
@@ -7360,22 +7380,20 @@ uint8_t ndpi_connection_tracking(struct ndpi_detection_module_struct *ndpi_str,
j++;
}
- if(num_dots < 2) /* At least XXX.YYY.ZZZ */
+ if(num_dots == 0) /* Doesn't look like a domain name */
return(0);
if(curr_domain_element_len > max_domain_element_len)
max_domain_element_len = curr_domain_element_len;
-#ifdef DGA_DEBUG
- printf("[DGA] [max_num_char_repetitions: %u][max_domain_element_len: %u]\n",
- max_num_char_repetitions, max_domain_element_len);
-#endif
+ if(ndpi_verbose_dga_detection)
+ printf("[DGA] [max_num_char_repetitions: %u][max_domain_element_len: %u]\n",
+ max_num_char_repetitions, max_domain_element_len);
if(
(is_hostname
&& (num_dots > 5)
&& (!first_element_is_numeric)
- && (!endsWith(tmp, "in-addr.arpa", 12))
)
|| (max_num_char_repetitions > 5 /* num or consecutive repeated chars */)
/*
@@ -7389,9 +7407,10 @@ uint8_t ndpi_connection_tracking(struct ndpi_detection_module_struct *ndpi_str,
|| (max_domain_element_len >= 19 /* word too long. Example bbcbedxhgjmdobdprmen.com */)
) {
if(flow) ndpi_set_risk(flow, NDPI_SUSPICIOUS_DGA_DOMAIN);
-#ifdef DGA_DEBUG
- printf("[DGA] Found!");
-#endif
+
+ if(ndpi_verbose_dga_detection)
+ printf("[DGA] Found!");
+
return(1);
}
@@ -7405,21 +7424,15 @@ uint8_t ndpi_connection_tracking(struct ndpi_detection_module_struct *ndpi_str,
if(strlen(word) < 3) continue;
-#ifdef DGA_DEBUG
- printf("-> %s [%s][len: %u]\n", word, name, (unsigned int)strlen(word));
-#endif
+ if(ndpi_verbose_dga_detection)
+ printf("-> word(%s) [%s][len: %u]\n", word, name, (unsigned int)strlen(word));
+ trigram_char_skip = 0;
+
for(i = 0; word[i+1] != '\0'; i++) {
- if(isdigit(word[i])) {
- num_digits++;
-
- // if(!isdigit(word[i+1])) num_impossible++;
-
- continue;
- }
-
switch(word[i]) {
case '-':
+ num_dash++;
/*
Let's check for double+consecutive --
that are usually ok
@@ -7437,84 +7450,98 @@ uint8_t ndpi_connection_tracking(struct ndpi_detection_module_struct *ndpi_str,
continue;
break;
}
-
-#if 0
- switch(word[i]) {
- case 'a':
- case 'e':
- case 'i':
- case 'o':
- case 'u':
- num_vowels++;
- break;
- }
-#endif
- if(isdigit(word[i+1])) {
- num_digits++;
- // num_impossible++;
- continue;
- }
-
num_bigram_checks++;
-#ifdef DGA_DEBUG
- printf("-> Checking %c%c\n", word[i], word[i+1]);
-#endif
+ if(ndpi_verbose_dga_detection)
+ printf("-> Checking %c%c\n", word[i], word[i+1]);
if(ndpi_match_bigram(ndpi_str,
&ndpi_str->impossible_bigrams_automa,
&word[i])) {
-#ifdef DGA_DEBUG
- printf("IMPOSSIBLE %s\n", &word[i]);
-#endif
+ if(ndpi_verbose_dga_detection)
+ printf("IMPOSSIBLE %s\n", &word[i]);
+
num_impossible++;
- } else if(ndpi_match_bigram(ndpi_str, &ndpi_str->bigrams_automa, &word[i])) {
- num_found++;
+ } else {
+ if(!skip_next_bigram) {
+ if(ndpi_match_bigram(ndpi_str, &ndpi_str->bigrams_automa, &word[i])) {
+ num_found++, skip_next_bigram = 1;
+ }
+ } else
+ skip_next_bigram = 0;
}
- if((i > 0) && (word[0] != '_') && (word[i+2] != '\0')) {
+ if((num_trigram_dots < 2) && (word[i+2] != '\0')) {
+ if(ndpi_verbose_dga_detection)
+ printf("***> %s [trigram_char_skip: %u]\n", &word[i], trigram_char_skip);
+
if(ndpi_is_trigram_char(word[i]) && ndpi_is_trigram_char(word[i+1]) && ndpi_is_trigram_char(word[i+2])) {
- num_trigram_checked++;
-
- if(ndpi_match_trigram(ndpi_str, &ndpi_str->trigrams_automa, &word[i])) {
- num_trigram_found++;
+ if(trigram_char_skip) {
+ trigram_char_skip--;
+ } else {
+ num_trigram_checked++;
+
+ if(ndpi_match_trigram(ndpi_str, &ndpi_str->trigrams_automa, &word[i]))
+ num_trigram_found++, trigram_char_skip = 2 /* 1 char overlap */;
+ else if(ndpi_verbose_dga_detection)
+ printf("[NDPI] NO Trigram %c%c%c\n", word[i], word[i+1], word[i+2]);
+
+ /* Count vowels */
+ num_trigram_vowels += ndpi_is_vowel(word[i]) + ndpi_is_vowel(word[i+1]) + ndpi_is_vowel(word[i+2]);
}
+ } else {
+ if(word[i] == '.')
+ num_trigram_dots++;
+
+ trigram_char_skip = 0;
}
-
- /* Count vowels */
- num_vowels += ndpi_is_vowel(word[i]) + ndpi_is_vowel(word[i+1]) + ndpi_is_vowel(word[i+2]);
}
} /* for */
} /* for */
-#ifdef DGA_DEBUG
- printf("[%s][num_found: %u][num_impossible: %u][num_digits: %u][num_bigram_checks: %u][num_vowels: %u/%u][num_trigram_found: %u/%u][vowels: %u]\n",
- name, num_found, num_impossible, num_digits, num_bigram_checks, num_vowels, j-num_vowels,
- num_trigram_checked, num_trigram_found, num_vowels);
-#endif
+ if(ndpi_verbose_dga_detection)
+ printf("[%s][num_found: %u][num_impossible: %u][num_digits: %u][num_bigram_checks: %u][num_vowels: %u/%u][num_trigram_vowels: %u][num_trigram_found: %u/%u][vowels: %u][rc: %u]\n",
+ name, num_found, num_impossible, num_digits, num_bigram_checks, num_vowels, len, num_trigram_vowels,
+ num_trigram_checked, num_trigram_found, num_vowels, rc);
+ if((len > 16) && (num_dots < 3) && ((num_vowels*4) < (len-num_dots))) {
+ if((num_trigram_checked > 2) && (num_trigram_vowels >= (num_trigram_found-1)))
+ ; /* skip me */
+ else
+ rc = 1;
+ }
+
if(num_bigram_checks
+ && (num_dots > 0)
&& ((num_found == 0) || ((num_digits > 5) && (num_words <= 3))
- || enough(num_found, num_impossible) || ((num_trigram_checked > 3) && (num_trigram_found < (num_trigram_checked/2)))))
+ || enough(num_found, num_impossible)
+ || ((num_trigram_checked > 2)
+ && ((num_trigram_found < (num_trigram_checked/2))
+ || ((num_trigram_vowels < (num_trigram_found-1)) && (num_dash == 0) && (num_dots > 1)))
+ )
+ )
+ )
rc = 1;
- if(num_trigram_checked && (num_vowels == 0))
+ if((num_trigram_checked > 2) && (num_vowels == 0))
rc = 1;
+
+ if(num_dash > 2)
+ rc = 0;
-#ifdef DGA_DEBUG
- if(rc)
- printf("DGA %s [num_found: %u][num_impossible: %u]\n",
- name, num_found, num_impossible);
-#endif
+ if(ndpi_verbose_dga_detection) {
+ if(rc)
+ printf("DGA %s [num_found: %u][num_impossible: %u]\n",
+ name, num_found, num_impossible);
+ }
}
-#ifdef DGA_DEBUG
- printf("[DGA] Result: %u", rc);
-#endif
+ if(ndpi_verbose_dga_detection)
+ printf("[DGA] Result: %u\n", rc);
if(rc && flow)
ndpi_set_risk(flow, NDPI_SUSPICIOUS_DGA_DOMAIN);
-
+
return(rc);
}