aboutsummaryrefslogtreecommitdiff
path: root/src/lib
diff options
context:
space:
mode:
authorVitaly Lavrov <vel21ripn@gmail.com>2021-07-12 15:39:43 +0000
committerGitHub <noreply@github.com>2021-07-12 17:39:43 +0200
commitc418b7110b9385c5c3748c10e198df27ae0f7083 (patch)
tree046941f8085b48bf27b03cd60bfaee180906af21 /src/lib
parent78b1295dc18e297c1da53006bde1e0870e278db9 (diff)
ahocorasick. Code review. Part 2. (#1236)
Simplified the process of adding patterns to AC_AUTOMATA_t. Use the ndpi_string_to_automa() function to add patterns with domain names. For other cases, use ndpi_add_string_value_to_automa(). ac_automata_feature(ac_automa, AC_FEATURE_LC) allows adding and comparing data in a case-insensitive manner. To require that a pattern match at the end of the string, the "ac_pattern.rep.at_end=1" flag is used. This eliminates unnecessary conversions to lowercase and the appending of "$" for end-of-string matching in domain name patterns. ac_match_handler() has been renamed to ac_domain_match_handler() and has been greatly simplified. ac_domain_match_handler() looks for the pattern with the highest domain level. For special cases it is possible to specify the domain level manually. Added a test for checking ambiguous domain names such as: - short.weixin.qq.com is QQ, not Wechat - instagram.faae1-1.fna.fbcdn.net is Instagram, not Facebook. If you specify a NULL handler when creating the AC_AUTOMATA_t structure, then the longest pattern that satisfies the search conditions will be found (exact match, from the beginning of the string, from the end of the string, or a substring). Added debugging for ac_automata_search. To use it, enable debugging globally with ac_automata_enable_debug(1) and enable debugging in the AC_AUTOMATA_t structure with ac_automata_name(ac_automa, "name", AC_FEATURE_DEBUG). The search will display "name" and a list of matching patterns. Running "AHO_DEBUG=1 ndpiReader ..." will show the strings that were searched for patterns and which patterns were found. The ac_automata_dump() prototype has been changed. It now writes its output to a file. If the file is specified as NULL, the output is directed to stdout. If you need to get the data as a string, use open_memstream(). Added the ability to run individual tests via the do.sh script.
Diffstat (limited to 'src/lib')
-rw-r--r--src/lib/ndpi_content_match.c.inc3
-rw-r--r--src/lib/ndpi_main.c414
-rw-r--r--src/lib/ndpi_utils.c16
-rw-r--r--src/lib/third_party/include/ahocorasick.h32
-rw-r--r--src/lib/third_party/src/ahocorasick.c135
5 files changed, 309 insertions, 291 deletions
diff --git a/src/lib/ndpi_content_match.c.inc b/src/lib/ndpi_content_match.c.inc
index 723f4b21c..8682418f1 100644
--- a/src/lib/ndpi_content_match.c.inc
+++ b/src/lib/ndpi_content_match.c.inc
@@ -8852,7 +8852,7 @@ static ndpi_protocol_match host_match[] =
/* Detected "instagram.c10r.facebook.com". Omitted "*amazonaws.com" and "*facebook.com" CDNs e.g. "ig-telegraph-shv-04-frc3.facebook.com" */
{ ".instagram.", "Instagram", NDPI_PROTOCOL_INSTAGRAM, NDPI_PROTOCOL_CATEGORY_SOCIAL_NETWORK, NDPI_PROTOCOL_FUN },
- { "instagram.", "Instagram", NDPI_PROTOCOL_INSTAGRAM, NDPI_PROTOCOL_CATEGORY_SOCIAL_NETWORK, NDPI_PROTOCOL_FUN },
+ { "instagram.", "Instagram", NDPI_PROTOCOL_INSTAGRAM, NDPI_PROTOCOL_CATEGORY_SOCIAL_NETWORK, NDPI_PROTOCOL_FUN, 15 },
{ ".cdninstagram.com", "Instagram", NDPI_PROTOCOL_INSTAGRAM, NDPI_PROTOCOL_CATEGORY_SOCIAL_NETWORK, NDPI_PROTOCOL_FUN },
{ "igcdn-photos-", "Instagram", NDPI_PROTOCOL_INSTAGRAM, NDPI_PROTOCOL_CATEGORY_SOCIAL_NETWORK, NDPI_PROTOCOL_FUN },
@@ -9020,6 +9020,7 @@ static ndpi_protocol_match host_match[] =
{ ".gmail.", "GMail", NDPI_PROTOCOL_GMAIL, NDPI_PROTOCOL_CATEGORY_MAIL, NDPI_PROTOCOL_ACCEPTABLE },
{ "mail.google.", "GMail", NDPI_PROTOCOL_GMAIL, NDPI_PROTOCOL_CATEGORY_MAIL, NDPI_PROTOCOL_ACCEPTABLE },
+ { "google.com", "Google", NDPI_PROTOCOL_GOOGLE, NDPI_PROTOCOL_CATEGORY_WEB, NDPI_PROTOCOL_SAFE },
{ "google.", "Google", NDPI_PROTOCOL_GOOGLE, NDPI_PROTOCOL_CATEGORY_WEB, NDPI_PROTOCOL_SAFE },
{ ".google.", "Google", NDPI_PROTOCOL_GOOGLE, NDPI_PROTOCOL_CATEGORY_WEB, NDPI_PROTOCOL_SAFE },
{ ".gstatic.com", "Google", NDPI_PROTOCOL_GOOGLE, NDPI_PROTOCOL_CATEGORY_WEB, NDPI_PROTOCOL_SAFE },
diff --git a/src/lib/ndpi_main.c b/src/lib/ndpi_main.c
index 91dacd001..a1c48d781 100644
--- a/src/lib/ndpi_main.c
+++ b/src/lib/ndpi_main.c
@@ -542,7 +542,6 @@ static u_int8_t ndpi_is_middle_string_char(char c) {
switch(c) {
case '.':
case '-':
- case '$': /* Do not add a double $$ */
return(1);
break;
@@ -551,59 +550,85 @@ static u_int8_t ndpi_is_middle_string_char(char c) {
}
}
+/*******************************************************/
+
+static const u_int8_t ndpi_domain_level_automat[4][4]= {
+ /* symbol,'.','-',inc */
+ { 2,1,2,0 }, // start state
+ { 2,0,0,0 }, // first char is '.'; disable .. or .-
+ { 2,3,2,0 }, // part of domain name
+ { 2,0,0,1 } // next level domain name; disable .. or .-
+};
+
+/*
+ * domain level
+ * a. = 1
+ * .a. = 1
+ * a.b = 2
+ */
+
+static u_int8_t ndpi_domain_level(const char *name) {
+ u_int8_t level = 1, state = 0;
+ char c;
+ while((c = *name++) != '\0') {
+ c = c == '-' ? 2 : (c == '.' ? 1:0);
+ level += ndpi_domain_level_automat[state][3];
+ state = ndpi_domain_level_automat[state][(uint8_t)c];
+ if(!state) break;
+ }
+ return state >= 2 ? level:0;
+}
+
/* ****************************************************** */
static int ndpi_string_to_automa(struct ndpi_detection_module_struct *ndpi_str,
- ndpi_automa *automa, char *value,
+ AC_AUTOMATA_t *ac_automa, const char *value,
u_int16_t protocol_id, ndpi_protocol_category_t category,
- ndpi_protocol_breed_t breed,
- u_int8_t free_str_on_duplicate, u_int8_t add_ends_with) {
+ ndpi_protocol_breed_t breed, uint8_t level,
+ u_int8_t add_ends_with) {
AC_PATTERN_t ac_pattern;
AC_ERROR_t rc;
- char buf[96];
- u_int len, dot;
+ u_int len;
+ char *value_dup = NULL;
if(protocol_id >= (NDPI_MAX_SUPPORTED_PROTOCOLS + NDPI_MAX_NUM_CUSTOM_PROTOCOLS)) {
NDPI_LOG_ERR(ndpi_str, "[NDPI] protoId=%d: INTERNAL ERROR\n", protocol_id);
return(-1);
}
- if((automa->ac_automa == NULL) || (value == NULL) || !*value)
+ if((ac_automa == NULL) || (value == NULL) || !*value)
return(-2);
- len = strlen(value);
- dot = len -1;
+ value_dup = ndpi_strdup(value);
+ if(!value_dup)
+ return(-1);
memset(&ac_pattern, 0, sizeof(ac_pattern));
- if((!add_ends_with) || ndpi_is_middle_string_char(value[dot])) {
- ac_pattern.length = len;
- ac_pattern.astring = value;
- } else {
- u_int mlen = sizeof(buf)-2;
-
- len = ndpi_min(len, mlen);
- ac_pattern.length = snprintf(buf, mlen, "%s$", value);
- ndpi_free(value);
- value = ndpi_strdup(buf);
- ac_pattern.astring = value;
- }
+ len = strlen(value);
- ac_pattern.rep.number = protocol_id, ac_pattern.rep.category = (u_int16_t) category,
- ac_pattern.rep.breed = (u_int16_t) breed;
+ ac_pattern.astring = value_dup;
+ ac_pattern.length = len;
+ ac_pattern.rep.number = protocol_id;
+ ac_pattern.rep.category = (u_int16_t) category;
+ ac_pattern.rep.breed = (u_int16_t) breed;
+ ac_pattern.rep.level = level ? level : ndpi_domain_level(value);
+ ac_pattern.rep.at_end = add_ends_with && !ndpi_is_middle_string_char(value[len-1]); /* len != 0 */
+ ac_pattern.rep.dot = memchr(value,'.',len) != NULL;
#ifdef MATCH_DEBUG
- printf("Adding to automa [%s][protocol_id: %u][category: %u][breed: %u]\n",
- ac_pattern.astring, protocol_id, category, breed);
+ printf("Adding to %s %lx [%s%s][protocol_id: %u][category: %u][breed: %u][level: %u]\n",
+ ac_automa->name,(unsigned long int)ac_automa,
+ ac_pattern.astring,ac_pattern.rep.at_end? "$":"", protocol_id, category, breed,ac_pattern.rep.level);
#endif
- rc = ac_automata_add(((AC_AUTOMATA_t *) automa->ac_automa), &ac_pattern);
-
- if((rc != ACERR_DUPLICATE_PATTERN) && (rc != ACERR_SUCCESS))
- return(-2);
+ rc = ac_automata_add(ac_automa, &ac_pattern);
- if((rc == ACERR_DUPLICATE_PATTERN) && free_str_on_duplicate)
- ndpi_free(value);
+ if(rc != ACERR_SUCCESS) {
+ ndpi_free(value_dup);
+ if(rc != ACERR_DUPLICATE_PATTERN)
+ return (-2);
+ }
return(0);
}
@@ -611,25 +636,16 @@ static int ndpi_string_to_automa(struct ndpi_detection_module_struct *ndpi_str,
/* ****************************************************** */
static int ndpi_add_host_url_subprotocol(struct ndpi_detection_module_struct *ndpi_str,
- char *_value, int protocol_id,
+ char *value, int protocol_id,
ndpi_protocol_category_t category,
- ndpi_protocol_breed_t breed) {
- int rv;
- char *value = ndpi_strdup(_value);
-
- if(!value)
- return(-1);
-
-#ifdef DEBUG
+ ndpi_protocol_breed_t breed, uint8_t level) {
+#ifndef DEBUG
NDPI_LOG_DBG2(ndpi_str, "[NDPI] Adding [%s][%d]\n", value, protocol_id);
#endif
- rv = ndpi_string_to_automa(ndpi_str, &ndpi_str->host_automa, value, protocol_id, category, breed, 1, 1);
-
- if(rv != 0)
- ndpi_free(value);
+ return ndpi_string_to_automa(ndpi_str, (AC_AUTOMATA_t *)ndpi_str->host_automa.ac_automa,
+ value, protocol_id, category, breed, level, 1);
- return(rv);
}
/* ****************************************************** */
@@ -668,7 +684,7 @@ void ndpi_init_protocol_match(struct ndpi_detection_module_struct *ndpi_str,
ndpi_add_host_url_subprotocol(ndpi_str, match->string_to_match,
match->protocol_id, match->protocol_category,
- match->protocol_breed);
+ match->protocol_breed, match->level);
}
/* ******************************************************************** */
@@ -1732,75 +1748,57 @@ static void ndpi_init_protocol_defaults(struct ndpi_detection_module_struct *ndp
/* ****************************************************** */
-static int ac_match_handler(AC_MATCH_t *m, AC_TEXT_t *txt, AC_REP_t *match) {
- int min_len = (txt->length < m->patterns->length) ? txt->length : m->patterns->length;
- char buf[64] = {'\0'}, *whatfound;
- int min_buf_len = (txt->length > 63 /* sizeof(buf)-1 */) ? 63 : txt->length;
- u_int buf_len = strlen(buf);
-
- strncpy(buf, txt->astring, min_buf_len);
- buf[min_buf_len] = '\0';
+#define MATCH_DEBUG_INFO(fmt, ...) if(txt->option & AC_FEATURE_DEBUG) printf(fmt, ##__VA_ARGS__)
-#ifdef MATCH_DEBUG
- printf("Searching [to search: %s/%u][pattern: %s/%u] [len: %d][match_num: %u][%s]\n", buf,
- (unsigned int) txt->length, m->patterns->astring,
- (unsigned int) m->patterns->length, min_len, m->match_num,
- m->patterns->astring);
-#endif
-
- whatfound = strstr(buf, m->patterns->astring);
-
-#ifdef MATCH_DEBUG
- printf("[NDPI] %s() [searching=%s][pattern=%s][%s][%c]\n", __FUNCTION__, buf, m->patterns->astring,
- whatfound ? whatfound : "<NULL>", whatfound[-1]);
-#endif
+static int ac_domain_match_handler(AC_MATCH_t *m, AC_TEXT_t *txt, AC_REP_t *match) {
+ AC_PATTERN_t *pattern = m->patterns;
+ int i,start,end = m->position;
- if(whatfound) {
+ for(i=0; i < m->match_num; i++,pattern++) {
/*
- The patch below allows in case of pattern ws.amazon.com
- to avoid matching aws.amazon.com whereas a.ws.amazon.com
- has to match
- */
- if((whatfound != buf)
- && (strchr(whatfound, '=') == NULL) /* This is not a match from tls_certificate_match[] */
- && (m->patterns->astring[0] != '.') /* The searched pattern does not start with . */
- && strchr(m->patterns->astring, '.') /* The matched pattern has a . (e.g. numeric or sym IPs) */) {
- int len = strlen(m->patterns->astring);
-
- if(((whatfound[-1] != '.') && (whatfound[0] != '-') && (whatfound[-1] != '-'))
- || ((m->patterns->astring[len - 1] != '.')
- && (whatfound[len] != '\0') /* endsWith does not hold here */)) {
- return(0);
- } else {
- memcpy(match, &m->patterns[0].rep, sizeof(AC_REP_t)); /* Partial match? */
- return(0); /* Keep searching as probably there is a better match */
- }
+ * See ac_automata_exact_match()
+ * The bit is set if the pattern exactly matches AND
+ * the length of the pattern is longer than that of the previous one.
+ * Skip shorter (less precise) templates.
+ */
+ if(!(m->match_map & (1 << i)))
+ continue;
+ start = end - pattern->length;
+
+ MATCH_DEBUG_INFO("[NDPI] Searching: [to search: %.*s/%u][pattern: %s%.*s%s/%u l:%u] %d-%d\n",
+ txt->length, txt->astring,(unsigned int) txt->length,
+ m->patterns[0].rep.from_start ? "^":"",
+ (unsigned int) pattern->length, pattern->astring,
+ m->patterns[0].rep.at_end ? "$":"", (unsigned int) pattern->length,m->patterns[0].rep.level,
+ start,end);
+
+ if(start == 0 && end == txt->length) {
+ *match = pattern->rep; txt->match.last = pattern;
+ MATCH_DEBUG_INFO("[NDPI] Searching: Found exact match. Proto %d \n",pattern->rep.number);
+ return 1;
+ }
+ /* pattern is DOMAIN.NAME and string x.DOMAIN.NAME ? */
+ if(start > 1 && !ndpi_is_middle_string_char(pattern->astring[0]) && pattern->rep.dot) {
+ /*
+ The patch below allows in case of pattern ws.amazon.com
+ to avoid matching aws.amazon.com whereas a.ws.amazon.com
+ has to match
+ */
+ if(ndpi_is_middle_string_char(txt->astring[start-1])) {
+ if(!txt->match.last || txt->match.last->rep.level < pattern->rep.level) {
+ txt->match.last = pattern; *match = pattern->rep;
+ MATCH_DEBUG_INFO("[NDPI] Searching: Found domain match. Proto %d \n",pattern->rep.number);
+ }
+ }
+ continue;
}
- }
-
- /*
- Return 1 for stopping to the first match.
- We might consider searching for the more
- specific match, paying more cpu cycles.
- */
- memcpy(match, &m->patterns[0].rep, sizeof(AC_REP_t));
- if(((buf_len >= min_len) && (strncmp(&buf[buf_len - min_len], m->patterns->astring, min_len) == 0)) ||
- (strncmp(buf, m->patterns->astring, min_len) == 0) /* begins with */
- ) {
-#ifdef MATCH_DEBUG
- printf("Found match [%s][%s] [len: %d]"
- // "[proto_id: %u]"
- "\n",
- buf, m->patterns->astring, min_len /* , *matching_protocol_id */);
-#endif
- return(1); /* If the pattern found matches the string at the beginning we stop here */
- } else {
-#ifdef MATCH_DEBUG
- printf("NO match found: continue\n");
-#endif
- return(0); /* 0 to continue searching, !0 to stop */
+ if(!txt->match.last || txt->match.last->rep.level < pattern->rep.level) {
+ txt->match.last = pattern; *match = pattern->rep;
+ MATCH_DEBUG_INFO("[NDPI] Searching: matched. Proto %d \n",pattern->rep.number);
+ }
}
+ return 0;
}
/* ******************************************************************** */
@@ -2335,9 +2333,9 @@ struct ndpi_detection_module_struct *ndpi_init_detection_module(ndpi_init_prefs
ndpi_str->ndpi_num_supported_protocols = NDPI_MAX_SUPPORTED_PROTOCOLS;
ndpi_str->ndpi_num_custom_protocols = 0;
- ndpi_str->host_automa.ac_automa = ac_automata_init(ac_match_handler);
- ndpi_str->content_automa.ac_automa = ac_automata_init(ac_match_handler);
- ndpi_str->tls_cert_subject_automa.ac_automa = ac_automata_init(ac_match_handler);
+ ndpi_str->host_automa.ac_automa = ac_automata_init(ac_domain_match_handler);
+ ndpi_str->content_automa.ac_automa = ac_automata_init(ac_domain_match_handler);
+ ndpi_str->tls_cert_subject_automa.ac_automa = ac_automata_init(NULL);
ndpi_str->malicious_ja3_automa.ac_automa = NULL; /* Initialized on demand */
ndpi_str->malicious_sha1_automa.ac_automa = NULL; /* Initialized on demand */
ndpi_str->risky_domain_automa.ac_automa = NULL; /* Initialized on demand */
@@ -2348,19 +2346,41 @@ struct ndpi_detection_module_struct *ndpi_init_detection_module(ndpi_init_prefs
return(NULL);
}
- ndpi_str->custom_categories.hostnames.ac_automa = ac_automata_init(ac_match_handler);
- ndpi_str->custom_categories.hostnames_shadow.ac_automa = ac_automata_init(ac_match_handler);
+ ndpi_str->custom_categories.hostnames.ac_automa = ac_automata_init(ac_domain_match_handler);
+ ndpi_str->custom_categories.hostnames_shadow.ac_automa = ac_automata_init(ac_domain_match_handler);
ndpi_str->custom_categories.ipAddresses = ndpi_patricia_new(32 /* IPv4 */);
ndpi_str->custom_categories.ipAddresses_shadow = ndpi_patricia_new(32 /* IPv4 */);
- if(ndpi_str->host_automa.ac_automa)
+ if(ndpi_str->host_automa.ac_automa)
ac_automata_feature(ndpi_str->host_automa.ac_automa,AC_FEATURE_LC);
+
if(ndpi_str->custom_categories.hostnames.ac_automa)
ac_automata_feature(ndpi_str->custom_categories.hostnames.ac_automa,AC_FEATURE_LC);
+
if(ndpi_str->custom_categories.hostnames_shadow.ac_automa)
ac_automata_feature(ndpi_str->custom_categories.hostnames_shadow.ac_automa,AC_FEATURE_LC);
+ if(ndpi_str->tls_cert_subject_automa.ac_automa)
+ ac_automata_feature(ndpi_str->tls_cert_subject_automa.ac_automa,AC_FEATURE_LC);
+
+ if(ndpi_str->content_automa.ac_automa)
+ ac_automata_feature(ndpi_str->content_automa.ac_automa,AC_FEATURE_LC);
+
+ /* ahocorasick debug */
+ /* Needed ac_automata_enable_debug(1) for show debug */
+ if(ndpi_str->host_automa.ac_automa)
+ ac_automata_name(ndpi_str->host_automa.ac_automa,"host",AC_FEATURE_DEBUG);
+ if(ndpi_str->custom_categories.hostnames.ac_automa)
+ ac_automata_name(ndpi_str->custom_categories.hostnames.ac_automa,"ccat",0);
+ if(ndpi_str->custom_categories.hostnames_shadow.ac_automa)
+ ac_automata_name(ndpi_str->custom_categories.hostnames_shadow.ac_automa,"ccat_sh",0);
+ if(ndpi_str->tls_cert_subject_automa.ac_automa)
+ ac_automata_name(ndpi_str->tls_cert_subject_automa.ac_automa,"tls_cert",AC_FEATURE_DEBUG);
+ if(ndpi_str->content_automa.ac_automa)
+ ac_automata_name(ndpi_str->content_automa.ac_automa,"content",AC_FEATURE_DEBUG);
+
+
if((ndpi_str->custom_categories.ipAddresses == NULL) || (ndpi_str->custom_categories.ipAddresses_shadow == NULL)) {
NDPI_LOG_ERR(ndpi_str, "[NDPI] Error allocating Patricia trees\n");
return(NULL);
@@ -2420,7 +2440,7 @@ void ndpi_finalize_initialization(struct ndpi_detection_module_struct *ndpi_str)
/* Wrappers */
void *ndpi_init_automa(void) {
- return(ac_automata_init(ac_match_handler));
+ return(ac_automata_init(ac_domain_match_handler));
}
/* ****************************************************** */
@@ -2463,7 +2483,7 @@ void ndpi_finalize_automa(void *_automa) {
/* ****************************************************** */
static int ndpi_match_string_common(AC_AUTOMATA_t *automa, char *string_to_match,size_t string_len,
- u_int16_t *protocol_id, ndpi_protocol_category_t *category,
+ u_int32_t *protocol_id, ndpi_protocol_category_t *category,
ndpi_protocol_breed_t *breed) {
AC_REP_t match = { NDPI_PROTOCOL_UNKNOWN, NDPI_PROTOCOL_CATEGORY_UNSPECIFIED, NDPI_PROTOCOL_UNRATED };
AC_TEXT_t ac_input_text;
@@ -2481,17 +2501,9 @@ static int ndpi_match_string_common(AC_AUTOMATA_t *automa, char *string_to_match
}
ac_input_text.astring = string_to_match, ac_input_text.length = string_len;
- ac_input_text.ignore_case = 0;
+ ac_input_text.option = 0;
rc = ac_automata_search(automa, &ac_input_text, &match);
- /*
- As ac_automata_search can detect partial matches and continue the search process
- in case rc == 0 (i.e. no match), we need to check if there is a partial match
- and in this case return it
- */
- if((rc == 0) && (match.number != 0))
- rc = 1;
-
if(protocol_id)
*protocol_id = rc ? match.number : NDPI_PROTOCOL_UNKNOWN;
@@ -2507,7 +2519,7 @@ static int ndpi_match_string_common(AC_AUTOMATA_t *automa, char *string_to_match
/* ****************************************************** */
int ndpi_match_string(void *_automa, char *string_to_match) {
- uint16_t proto_id;
+ uint32_t proto_id;
int rc;
if(!string_to_match)
@@ -2526,58 +2538,32 @@ int ndpi_match_string_protocol_id(void *automa, char *string_to_match,
u_int match_len, u_int16_t *protocol_id,
ndpi_protocol_category_t *category,
ndpi_protocol_breed_t *breed) {
-
+ u_int32_t proto_id;
int rc = ndpi_match_string_common((AC_AUTOMATA_t*)automa, string_to_match,
- match_len, protocol_id, category, breed);
+ match_len, &proto_id, category, breed);
if(rc < 0) return rc;
-
- return(*protocol_id != NDPI_PROTOCOL_UNKNOWN ? 0 : -1);
+ *protocol_id = (u_int16_t)proto_id;
+ return(proto_id != NDPI_PROTOCOL_UNKNOWN ? 0 : -1);
}
/* ****************************************************** */
-int ndpi_match_string_value(void *_automa, char *string_to_match,
- u_int match_len, u_int32_t *num) {
- AC_REP_t match = { NDPI_PROTOCOL_UNKNOWN, NDPI_PROTOCOL_CATEGORY_UNSPECIFIED, NDPI_PROTOCOL_UNRATED };
- AC_TEXT_t ac_input_text;
- int rc;
- AC_AUTOMATA_t *automa = (AC_AUTOMATA_t*)_automa;
-
- if(num) *num = 0;
-
- if((automa == NULL) || (string_to_match == NULL) || (string_to_match[0] == '\0')) {
- return(-2);
- }
-
- if(automa->automata_open) {
- printf("[%s:%d] [NDPI] Internal error: please call ndpi_finalize_initialization()\n", __FILE__, __LINE__);
- return(-1);
- }
-
- ac_input_text.astring = string_to_match, ac_input_text.length = match_len;
- ac_input_text.ignore_case = 0;
- rc = ac_automata_search(automa, &ac_input_text, &match);
-
- /*
- As ac_automata_search can detect partial matches and continue the search process
- in case rc == 0 (i.e. no match), we need to check if there is a partial match
- and in this case return it
- */
- if((rc == 0) && (match.number != 0))
- rc = 1;
-
- if(num && rc)
- *num = match.number;
+int ndpi_match_string_value(void *automa, char *string_to_match,
+ u_int match_len, u_int32_t *num) {
+ int rc = ndpi_match_string_common((AC_AUTOMATA_t *)automa, string_to_match,
+ match_len, num, NULL, NULL);
+ if(rc < 0) return rc;
return rc ? 0 : -1;
-}
+ }
+
/* *********************************************** */
int ndpi_match_custom_category(struct ndpi_detection_module_struct *ndpi_str,
char *name, u_int name_len,
ndpi_protocol_category_t *category) {
- u_int16_t id;
+ u_int32_t id;
int rc = ndpi_match_string_common(ndpi_str->custom_categories.hostnames.ac_automa,
name, name_len, &id, category, NULL);
if(rc < 0) return rc;
@@ -2993,7 +2979,7 @@ int ndpi_handle_rule(struct ndpi_detection_module_struct *ndpi_str, char *rule,
} else {
if(do_add)
ndpi_add_host_url_subprotocol(ndpi_str, value, subprotocol_id, NDPI_PROTOCOL_CATEGORY_UNSPECIFIED,
- NDPI_PROTOCOL_ACCEPTABLE);
+ NDPI_PROTOCOL_ACCEPTABLE,0);
else
ndpi_remove_host_url_subprotocol(ndpi_str, value, subprotocol_id);
}
@@ -3062,28 +3048,17 @@ int ndpi_load_categories_file(struct ndpi_detection_module_struct *ndpi_str, con
static int ndpi_load_risky_domain(struct ndpi_detection_module_struct *ndpi_str,
char* domain_name) {
if(ndpi_str->risky_domain_automa.ac_automa == NULL) {
- ndpi_str->risky_domain_automa.ac_automa = ac_automata_init(ac_match_handler);
- if(ndpi_str->risky_domain_automa.ac_automa)
- ac_automata_feature(ndpi_str->risky_domain_automa.ac_automa,AC_FEATURE_LC);
+ ndpi_str->risky_domain_automa.ac_automa = ac_automata_init(ac_domain_match_handler);
+ if(!ndpi_str->risky_domain_automa.ac_automa) return -1;
+ ac_automata_feature(ndpi_str->risky_domain_automa.ac_automa,AC_FEATURE_LC);
+ ac_automata_name(ndpi_str->risky_domain_automa.ac_automa,"risky",0);
}
- if(ndpi_str->risky_domain_automa.ac_automa) {
- char buf[64], *str;
- u_int i, len;
-
- snprintf(buf, sizeof(buf)-1, "%s$", domain_name);
- for(i = 0, len = strlen(buf)-1 /* Skip $ */; i < len; i++) buf[i] = tolower(buf[i]);
-
- str = ndpi_strdup(buf);
- if (str == NULL) {
- NDPI_LOG_ERR(ndpi_str, "Memory allocation failure\n");
- return -1;
- };
-
- return(ndpi_add_string_to_automa(ndpi_str->risky_domain_automa.ac_automa, str));
- }
+ if(!ndpi_str->risky_domain_automa.ac_automa)
+ return -1;
- return(-1);
+ return ndpi_string_to_automa(ndpi_str, (AC_AUTOMATA_t *)ndpi_str->risky_domain_automa.ac_automa,
+ domain_name, 1, 0, 0, 0, 1); /* domain, protocol, category, breed, level , at_end */
}
/* ******************************************************************** */
@@ -3147,7 +3122,9 @@ int ndpi_load_malicious_ja3_file(struct ndpi_detection_module_struct *ndpi_str,
int len, num = 0;
if(ndpi_str->malicious_ja3_automa.ac_automa == NULL)
- ndpi_str->malicious_ja3_automa.ac_automa = ac_automata_init(ac_match_handler);
+ ndpi_str->malicious_ja3_automa.ac_automa = ac_automata_init(NULL);
+ if(ndpi_str->malicious_ja3_automa.ac_automa)
+ ac_automata_name(ndpi_str->malicious_ja3_automa.ac_automa,"ja3",0);
fd = fopen(path, "r");
@@ -3208,7 +3185,9 @@ int ndpi_load_malicious_sha1_file(struct ndpi_detection_module_struct *ndpi_str,
int num = 0;
if (ndpi_str->malicious_sha1_automa.ac_automa == NULL)
- ndpi_str->malicious_sha1_automa.ac_automa = ac_automata_init(ac_match_handler);
+ ndpi_str->malicious_sha1_automa.ac_automa = ac_automata_init(NULL);
+ if(ndpi_str->malicious_sha1_automa.ac_automa)
+ ac_automata_name(ndpi_str->malicious_sha1_automa.ac_automa,"sha1",0);
fd = fopen(path, "r");
@@ -4826,45 +4805,15 @@ int ndpi_load_ip_category(struct ndpi_detection_module_struct *ndpi_str, const c
int ndpi_load_hostname_category(struct ndpi_detection_module_struct *ndpi_str, const char *name_to_add,
ndpi_protocol_category_t category) {
- char *name;
- u_int len;
- AC_PATTERN_t ac_pattern;
- AC_ERROR_t rc;
- if(name_to_add == NULL)
- return(-1);
- else
- len = strlen(name_to_add);
-
- if((name = (char*)ndpi_malloc(len+3)) == NULL)
+ if(ndpi_str->custom_categories.hostnames_shadow.ac_automa == NULL)
return(-1);
- memset(&ac_pattern, 0, sizeof(ac_pattern));
- ac_pattern.length = snprintf(name, len+2, "%s%s", name_to_add,
- ndpi_is_middle_string_char(name_to_add[len-1]) ? "" : "$");
-
-#if 0
- printf("===> %s() Loading %s as %u\n", __FUNCTION__, name, category);
-#endif
-
- if(ndpi_str->custom_categories.hostnames_shadow.ac_automa == NULL) {
- ndpi_free(name);
- return(-1);
- }
-
- ac_pattern.astring = name;
- ac_pattern.rep.number = (u_int32_t) category, ac_pattern.rep.category = category;
-
- rc = ac_automata_add(ndpi_str->custom_categories.hostnames_shadow.ac_automa, &ac_pattern);
- if(rc != ACERR_DUPLICATE_PATTERN && rc != ACERR_SUCCESS) {
- ndpi_free(name);
+ if(name_to_add == NULL)
return(-1);
- }
-
- if(rc == ACERR_DUPLICATE_PATTERN)
- ndpi_free(name);
- return(0);
+ return ndpi_string_to_automa(ndpi_str,(AC_AUTOMATA_t *)ndpi_str->custom_categories.hostnames_shadow.ac_automa,
+ name_to_add,category,category, 0, 0, 1); /* at_end */
}
/* ********************************************************************************* */
@@ -4905,7 +4854,11 @@ int ndpi_enable_loaded_categories(struct ndpi_detection_module_struct *ndpi_str)
ndpi_str->custom_categories.hostnames.ac_automa = ndpi_str->custom_categories.hostnames_shadow.ac_automa;
/* Realloc */
- ndpi_str->custom_categories.hostnames_shadow.ac_automa = ac_automata_init(ac_match_handler);
+ ndpi_str->custom_categories.hostnames_shadow.ac_automa = ac_automata_init(ac_domain_match_handler);
+ if(ndpi_str->custom_categories.hostnames_shadow.ac_automa) {
+ ac_automata_feature(ndpi_str->custom_categories.hostnames_shadow.ac_automa,AC_FEATURE_LC);
+ ac_automata_name(ndpi_str->custom_categories.hostnames_shadow.ac_automa,"ccat_sh",0);
+ }
if(ndpi_str->custom_categories.ipAddresses != NULL)
ndpi_patricia_destroy((ndpi_patricia_tree_t *) ndpi_str->custom_categories.ipAddresses, free_ptree_data);
@@ -6779,20 +6732,14 @@ u_int16_t ndpi_match_host_subprotocol(struct ndpi_detection_module_struct *ndpi_
char *string_to_match, u_int string_to_match_len,
ndpi_protocol_match_result *ret_match,
u_int16_t master_protocol_id) {
- u_int16_t rc, buf_len, i;
+ u_int16_t rc;
ndpi_protocol_category_t id;
- char buf[96];
-
- buf_len = ndpi_min(string_to_match_len, sizeof(buf)-2);
- for(i=0; i<buf_len; i++) buf[i] = tolower(string_to_match[i]);
- buf[i++] = '$'; /* Add trailer $ */
- buf[i] = '\0';
- rc = ndpi_automa_match_string_subprotocol(ndpi_str, flow, buf, i,
+ rc = ndpi_automa_match_string_subprotocol(ndpi_str, flow, string_to_match, string_to_match_len,
master_protocol_id, ret_match, 1);
id = ret_match->protocol_category;
- if(ndpi_get_custom_category_match(ndpi_str, buf, i, &id) != -1) {
+ if(ndpi_get_custom_category_match(ndpi_str, string_to_match, string_to_match_len, &id) != -1) {
/* if(id != -1) */ {
flow->category = ret_match->protocol_category = id;
rc = master_protocol_id;
@@ -6800,8 +6747,9 @@ u_int16_t ndpi_match_host_subprotocol(struct ndpi_detection_module_struct *ndpi_
}
if(ndpi_str->risky_domain_automa.ac_automa != NULL) {
- u_int16_t rc1 = ndpi_match_string(ndpi_str->risky_domain_automa.ac_automa, buf);
-
+ u_int32_t proto_id;
+ u_int16_t rc1 = ndpi_match_string_common(ndpi_str->risky_domain_automa.ac_automa,
+ string_to_match,string_to_match_len, &proto_id, NULL, NULL);
if(rc1 > 0)
ndpi_set_risk(flow, NDPI_RISKY_DOMAIN);
}
diff --git a/src/lib/ndpi_utils.c b/src/lib/ndpi_utils.c
index 9814733f7..95f0a4345 100644
--- a/src/lib/ndpi_utils.c
+++ b/src/lib/ndpi_utils.c
@@ -731,7 +731,7 @@ const char* ndpi_cipher2str(u_int32_t cipher) {
/* ******************************************************************** */
-static int ndpi_is_other_char(char c) {
+static inline int ndpi_is_other_char(char c) {
return((c == '.')
|| (c == ' ')
|| (c == '@')
@@ -741,7 +741,7 @@ static int ndpi_is_other_char(char c) {
/* ******************************************************************** */
-static int ndpi_is_valid_char(char c) {
+static int _ndpi_is_valid_char(char c) {
if(ispunct(c) && (!ndpi_is_other_char(c)))
return(0);
else
@@ -749,6 +749,18 @@ static int ndpi_is_valid_char(char c) {
|| isalpha(c)
|| ndpi_is_other_char(c));
}
+static char ndpi_is_valid_char_tbl[256],ndpi_is_valid_char_tbl_init=0;
+
+static void _ndpi_is_valid_char_init(void) {
+ int c;
+ for(c=0; c < 256; c++) ndpi_is_valid_char_tbl[c] = _ndpi_is_valid_char(c);
+ ndpi_is_valid_char_tbl_init = 1;
+}
+static inline int ndpi_is_valid_char(char c) {
+ if(!ndpi_is_valid_char_tbl_init)
+ _ndpi_is_valid_char_init();
+ return ndpi_is_valid_char_tbl[(unsigned char)c];
+}
/* ******************************************************************** */
diff --git a/src/lib/third_party/include/ahocorasick.h b/src/lib/third_party/include/ahocorasick.h
index 71fc22d0d..5efbc05f2 100644
--- a/src/lib/third_party/include/ahocorasick.h
+++ b/src/lib/third_party/include/ahocorasick.h
@@ -54,8 +54,11 @@ typedef char AC_ALPHABET_t;
**/
typedef struct {
uint32_t number; /* Often used to store procotolId */
- uint16_t breed,
- category:14,from_start:1,at_end:1;
+ uint16_t breed, category;
+ uint16_t level, /* Domain level for comparison */
+ from_start:1, /* match from start of string */
+ at_end:1, /* match at end of string */
+ dot:1; /* is domain name */
} AC_REP_t;
/* AC_PATTERN_t:
@@ -103,8 +106,10 @@ typedef struct {
typedef struct
{
- AC_PATTERN_t *matched[4]; /* for ac_automata_exact_match() */
- AC_PATTERN_t *patterns; /* Array of matched pattern */
+ AC_PATTERN_t *matched[4], /* for ac_automata_exact_match() */
+ *last; /* for callback */
+ AC_PATTERN_t *patterns; /* Array of matched pattern */
+ unsigned int match_map; /* Matched patterns (bitmap) */
unsigned int position; /* The end position of matching pattern(s) in the text */
unsigned short int match_num; /* Number of matched patterns */
unsigned short int match_counter; /* Counter of found matches */
@@ -120,7 +125,7 @@ typedef struct
AC_MATCH_t match;
AC_ALPHABET_t * astring; /* String of alphabets */
unsigned short int length, /* Length of string */
- ignore_case;
+ option; /* AC_FEATURE_LC | AC_FEATURE_DEBUG */;
} AC_TEXT_t;
@@ -218,7 +223,7 @@ typedef struct
* means not finalized (is open). after finalizing automata you can not
* add pattern to automata anymore. */
unsigned short automata_open,
- to_lc:1, no_root_range:1; /* lowercase match */
+ to_lc:1, no_root_range:1,debug:1; /* lowercase match */
/* Statistic Variables */
unsigned long total_patterns; /* Total patterns in the automata */
@@ -229,17 +234,20 @@ typedef struct
int id; /* node id */
int add_to_range; /* for convert to range */
int n_oc,n_range,n_find; /* statistics */
+ char name[32]; /* if debug != 0 */
} AC_AUTOMATA_t;
typedef AC_ERROR_t (*NODE_CALLBACK_f)(AC_AUTOMATA_t *, AC_NODE_t *,int idx, void *);
typedef void (*ALPHA_CALLBACK_f)(AC_AUTOMATA_t *, AC_NODE_t *,AC_NODE_t *,int ,void *);
-#define AC_FEATURE_LC 1
-#define AC_FEATURE_NO_ROOT_RANGE 2
+#define AC_FEATURE_DEBUG 1
+#define AC_FEATURE_LC 2
+#define AC_FEATURE_NO_ROOT_RANGE 4
AC_AUTOMATA_t * ac_automata_init (MATCH_CALLBACK_f mc);
AC_ERROR_t ac_automata_feature (AC_AUTOMATA_t * thiz, unsigned int feature);
+AC_ERROR_t ac_automata_name (AC_AUTOMATA_t * thiz, char *name, int debug);
AC_ERROR_t ac_automata_add (AC_AUTOMATA_t * thiz, AC_PATTERN_t * str);
AC_ERROR_t ac_automata_finalize (AC_AUTOMATA_t * thiz);
AC_ERROR_t ac_automata_walk (AC_AUTOMATA_t * thiz, NODE_CALLBACK_f node_cb,
@@ -252,7 +260,9 @@ int ac_automata_exact_match(AC_PATTERNS_t *mp,int pos, AC_TEXT_t *);
void ac_automata_clean (AC_AUTOMATA_t * thiz);
void ac_automata_release (AC_AUTOMATA_t * thiz, uint8_t free_pattern);
#ifndef __KERNEL__
-void ac_automata_dump (AC_AUTOMATA_t * thiz,
- char *buf, size_t bufsize, char repcast);
+/* Global debug control. */
+void ac_automata_enable_debug (int debug);
+/* See open_memstream(3) to capture the dump as a string. */
+void ac_automata_dump (AC_AUTOMATA_t * thiz, FILE *);
+#endif
#endif
-#endif
diff --git a/src/lib/third_party/src/ahocorasick.c b/src/lib/third_party/src/ahocorasick.c
index ab9c5d333..06ed56a27 100644
--- a/src/lib/third_party/src/ahocorasick.c
+++ b/src/lib/third_party/src/ahocorasick.c
@@ -69,9 +69,11 @@ struct aho_dump_info {
int buf_pos,ip;
char *bufstr;
size_t bufstr_len;
+ FILE *file;
};
static void dump_node_header(AC_NODE_t * n, struct aho_dump_info *);
+static int ac_automata_global_debug = 0;
#endif
/* Private function prototype */
@@ -195,6 +197,20 @@ AC_ERROR_t ac_automata_feature (AC_AUTOMATA_t * thiz, unsigned int feature)
return ACERR_SUCCESS;
}
+/*
+ * Store a label for the automaton and set its per-automaton debug bit.
+ * thiz:  automaton to update.
+ * name:  NUL-terminated label; it is truncated to fit thiz->name.
+ * debug: non-zero sets thiz->debug, zero clears it.
+ * Returns ACERR_ERROR when thiz or name is NULL, ACERR_SUCCESS otherwise.
+ */
+AC_ERROR_t ac_automata_name (AC_AUTOMATA_t * thiz, char *name, int debug)
+{
+ if(!thiz || !name) return ACERR_ERROR;
+ strncpy(thiz->name,name,sizeof(thiz->name)-1);
+ /* strncpy never writes the last byte here, so terminate explicitly
+  * instead of relying on the structure having been zeroed. */
+ thiz->name[sizeof(thiz->name)-1] = '\0';
+ thiz->debug = debug != 0;
+ return ACERR_SUCCESS;
+}
+
+#ifndef __KERNEL__
+/* Set the global debug switch: any non-zero value enables it, zero disables it.
+ * ac_automata_search only prints when this switch and the automaton's own
+ * debug bit are both set. */
+void ac_automata_enable_debug (int debug) {
+ ac_automata_global_debug = debug ? 1 : 0;
+}
+#endif
+
/******************************************************************************
* FUNCTION: ac_automata_add
* Adds pattern to the automata.
@@ -368,30 +384,28 @@ int ac_automata_exact_match(AC_PATTERNS_t *mp,int pos, AC_TEXT_t *txt) {
AC_PATTERN_t *patterns = mp->patterns;
AC_PATTERN_t **matched = txt->match.matched;
int i;
- for(i=0; i < mp->num; i++,patterns++) {
+ int match_map = 0;
+ for(i=0; i < mp->num && i < (__SIZEOF_INT__*8-1); i++,patterns++) {
do {
if(patterns->rep.from_start && patterns->rep.at_end) {
if(pos == txt->length && patterns->length == pos)
- matched[0] = patterns;
+ matched[0] = patterns, match_map |= 1 << i;
break;
}
if(patterns->rep.from_start) {
- if(patterns->length == pos)
- if(!matched[1] || patterns->length > matched[1]->length)
- matched[1] = patterns;
+ if(patterns->length == pos)
+ matched[1] = patterns, match_map |= 1 << i;
break;
}
if(patterns->rep.at_end) {
- if(pos == txt->length)
- if(!matched[2] || patterns->length > matched[2]->length)
- matched[2] = patterns;
+ if(pos == txt->length)
+ matched[2] = patterns, match_map |= 1 << i;
break;
}
- if(!matched[3] || patterns->length > matched[3]->length)
- matched[3] = patterns;
+ matched[3] = patterns, match_map |= 1 << i;
} while(0);
}
- return 0;
+ return match_map;
}
/******************************************************************************
@@ -414,7 +428,7 @@ int ac_automata_search (AC_AUTOMATA_t * thiz,
AC_TEXT_t * txt, AC_REP_t * param)
{
unsigned long position;
- int icase = 0,i;
+ int icase = 0,i,debug=0;
AC_MATCH_t *match;
AC_NODE_t *curr;
AC_NODE_t *next;
@@ -426,14 +440,20 @@ int ac_automata_search (AC_AUTOMATA_t * thiz,
position = 0;
curr = thiz->root;
apos = txt->astring;
+#ifndef __KERNEL__
+ if(thiz->debug && ac_automata_global_debug) debug = 1;
+ if(debug) {
+ txt->option = debug; /* for callback */
+ printf("aho %s: search %.*s\n", thiz->name[0] ? thiz->name:"unknown", txt->length, apos);
+ }
+#endif
match = &txt->match;
memset((char*)match,0,sizeof(*match));
- icase = !thiz->to_lc;
/* The 'txt->ignore_case' option is checked
* separately otherwise clang will detect
* uninitialized memory usage much later. */
- if(txt->ignore_case == 1) icase = 1;
+ if(txt->option & AC_FEATURE_LC) icase = 1;
/* This is the main search loop.
* it must be keep as lightweight as possible. */
while (position < txt->length) {
@@ -448,19 +468,35 @@ int ac_automata_search (AC_AUTOMATA_t * thiz,
curr = next;
position++;
if(curr->final) {
- match->match_counter++; /* we have a matching */
/* select best match */
- ac_automata_exact_match(curr->matched_patterns,position,txt);
- if(thiz->match_handler) {
- /* We check 'next' to find out if we came here after a alphabet
- * transition or due to a fail. in second case we should not report
- * matching because it was reported in previous node */
- match->position = position;
- match->match_num = curr->matched_patterns->num;
- match->patterns = curr->matched_patterns->patterns;
- if (thiz->match_handler(match, txt, param))
- return 1;
- }
+ match->match_map = ac_automata_exact_match(curr->matched_patterns,position,txt);
+ if(match->match_map) {
+ match->match_counter++; /* we have a matching */
+#ifndef __KERNEL__
+ if(debug) {
+ int i;
+ AC_PATTERN_t *patterns = curr->matched_patterns->patterns;
+ for(i=0; i < curr->matched_patterns->num; i++) {
+ if(!(match->match_map & (1 << i))) continue;
+ printf(" match%d: %c%.*s%c [%u]\n",i+1,
+ patterns[i].rep.from_start ? '^':' ',
+ patterns[i].length,patterns[i].astring,
+ patterns[i].rep.at_end ? '$':' ',
+ patterns[i].rep.number);
+ }
+ }
+#endif
+ if(thiz->match_handler) {
+ /* We check 'next' to find out if we came here after a alphabet
+ * transition or due to a fail. in second case we should not report
+ * matching because it was reported in previous node */
+ match->position = position;
+ match->match_num = curr->matched_patterns->num;
+ match->patterns = curr->matched_patterns->patterns;
+ if (thiz->match_handler(match, txt, param))
+ return 1;
+ }
+ } /* match->match_map */
}
}
}
@@ -470,6 +506,16 @@ int ac_automata_search (AC_AUTOMATA_t * thiz,
for(i = 0; i < 4; i++)
if(txt->match.matched[i]) {
*param = (txt->match.matched[i])->rep;
+#ifndef __KERNEL__
+ if(debug) {
+ AC_PATTERN_t *pattern = txt->match.matched[i];
+ printf("best match: %c%.*s%c [%u]\n",
+ pattern->rep.from_start ? '^':' ',
+ pattern->length,pattern->astring,
+ pattern->rep.at_end ? '$':' ',
+ pattern->rep.number);
+ }
+#endif
return 1;
}
return 0;
@@ -538,26 +584,26 @@ void ac_automata_release (AC_AUTOMATA_t * thiz, uint8_t free_pattern) {
static void dump_node_header(AC_NODE_t * n, struct aho_dump_info *ai) {
char *c;
int i;
- printf("%04d: ",n->id);
- if(n->failure_node) printf(" failure %04d:",n->failure_node->id);
- printf(" d:%d %c",n->depth, n->use ? '+':'-');
+ fprintf(ai->file,"%04d: ",n->id);
+ if(n->failure_node) fprintf(ai->file," failure %04d:",n->failure_node->id);
+ fprintf(ai->file," d:%d %c",n->depth, n->use ? '+':'-');
ai->memcnt += sizeof(*n);
if(n->matched_patterns) {
ai->memcnt += sizeof(n->matched_patterns) +
n->matched_patterns->max*sizeof(n->matched_patterns->patterns[0]);
}
- if(!n->use) { printf("\n"); return; }
+ if(!n->use) { fprintf(ai->file,"\n"); return; }
if(n->one) {
(ai->node_oc)++;
- printf(" '%c' next->%d\n",n->one_alpha,
+ fprintf(ai->file," '%c' next->%d\n",n->one_alpha,
n->outgoing ? ((AC_NODE_t *)n->outgoing)->id : -1);
return;
}
if(!n->outgoing) {
- printf(" BUG! !outgoing\n");
+ fprintf(ai->file," BUG! !outgoing\n");
return;
}
- printf("%s\n",n->range ? " RANGE":"");
+ fprintf(ai->file,"%s\n",n->range ? " RANGE":"");
c = (char *)edge_get_alpha(n->outgoing);
if(n->outgoing->degree <= 8)
(ai->node_8c)++;
@@ -566,7 +612,7 @@ static void dump_node_header(AC_NODE_t * n, struct aho_dump_info *ai) {
if(n->range)
(ai->node_xr)++;
for(i=0; i < n->outgoing->degree; i++) {
- printf(" %d: \"%c\" -> %d\n",i,c[i],
+ fprintf(ai->file," %d: \"%c\" -> %d\n",i,c[i],
n->outgoing->next[i] ? n->outgoing->next[i]->id:-1);
}
ai->memcnt += sizeof(n->outgoing) + edge_data_size(n->outgoing->max);
@@ -580,7 +626,7 @@ static AC_ERROR_t dump_node_common(AC_AUTOMATA_t * thiz,
if(idx) return ACERR_SUCCESS;
dump_node_header(n,ai);
if (n->matched_patterns && n->matched_patterns->num && n->final) {
- char lbuf[300];
+ char lbuf[512];
int nl = 0,j;
nl = snprintf(lbuf,sizeof(lbuf),"'%.100s' N:%d{",rstr,n->matched_patterns->num);
@@ -593,7 +639,7 @@ static AC_ERROR_t dump_node_common(AC_AUTOMATA_t * thiz,
sid->astring,
sid->rep.number & 0x4000 ? '$':' ');
}
- printf("%s}\n",lbuf);
+ fprintf(ai->file,"%s}\n",lbuf);
}
return ACERR_SUCCESS;
}
@@ -615,22 +661,23 @@ static void dump_node_str(AC_AUTOMATA_t * thiz, AC_NODE_t * node,
* char repcast: 'n': print AC_REP_t as number, 's': print AC_REP_t as string
******************************************************************************/
-void ac_automata_dump(AC_AUTOMATA_t * thiz, char *rstr, size_t rstr_size, char repcast) {
+/*
+ * Write a textual dump of the automaton, followed by summary statistics.
+ * thiz: automaton to dump; nothing is written when it is NULL.
+ * file: destination stream; NULL selects stdout.
+ * The scratch string handed to the walk callbacks is allocated here and
+ * released before returning. If that allocation fails, only the header
+ * line is written.
+ */
+void ac_automata_dump(AC_AUTOMATA_t * thiz, FILE *file) {
struct aho_dump_info ai;
memset((char *)&ai,0,sizeof(ai));
-
- printf("---DUMP- all nodes %u - max strlen %u -%s---\n",
+ if(!thiz) return;
+ ai.file = file ? file : stdout;
+ fprintf(ai.file,"---DUMP- all nodes %u - max strlen %u -%s---\n",
(unsigned int)thiz->all_nodes_num,
(unsigned int)thiz->max_str_len,
thiz->automata_open ? "open":"ready");
- printf("root: %px\n",thiz->root);
- *rstr = '\0';
- ai.bufstr = rstr;
- ai.bufstr_len = rstr_size;
+
+ ai.bufstr = acho_malloc(AC_PATTRN_MAX_LENGTH+1);
+ ai.bufstr_len = AC_PATTRN_MAX_LENGTH;
+ if(!ai.bufstr) return;
+ ai.bufstr[0] = '\0';
ac_automata_walk(thiz,dump_node_common,dump_node_str,(void *)&ai);
- printf("---\n mem size %zu avg node size %d, node one char %d, <=8c %d, >8c %d, range %d\n---DUMP-END-\n",
+ fprintf(ai.file,"---\n mem size %zu avg node size %d, node one char %d, <=8c %d, >8c %d, range %d\n---DUMP-END-\n",
ai.memcnt,(int)ai.memcnt/(thiz->all_nodes_num+1),(int)ai.node_oc,(int)ai.node_8c,(int)ai.node_xc,(int)ai.node_xr);
+ /* The scratch buffer is owned by this call; release it so repeated
+  * dumps do not leak (acho_free is the counterpart of acho_malloc). */
+ acho_free(ai.bufstr);
}
#endif