diff options
author | Vitaly Lavrov <vel21ripn@gmail.com> | 2021-07-12 15:39:43 +0000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-07-12 17:39:43 +0200 |
commit | c418b7110b9385c5c3748c10e198df27ae0f7083 (patch) | |
tree | 046941f8085b48bf27b03cd60bfaee180906af21 /src/lib/third_party | |
parent | 78b1295dc18e297c1da53006bde1e0870e278db9 (diff) |
ahoсorasick. Code review. Part 2. (#1236)
Simplified the process of adding lines to AC_AUTOMATA_t.
Use the ndpi_string_to_automa() function to add patterns with domain names.
For other cases can use ndpi_add_string_value_to_automa().
ac_automata_feature(ac_automa, AC_FEATURE_LC) allows adding
and compare data in a case insensitive manner. For mandatory pattern comparison
from the end of the line, the "ac_pattern.rep.at_end=1" flag is used.
This eliminated unnecessary conversions to lowercase and adding "$" for
end-of-line matching in domain name patterns.
ac_match_handler() has been renamed ac_domain_match_handler() and has been greatly simplified.
ac_domain_match_handler() looks for the template with the highest domain level.
For special cases it is possible to manually specify the domain level.
Added test for checking ambiguous domain names like:
- short.weixin.qq.com is QQ, not Wechat
- instagram.faae1-1.fna.fbcdn.net is Instagram, not Facebook
If you specify a NULL handler when creating the AC_AUTOMATA_t structure,
then a pattern with the maximum length that satisfies the search conditions will be found
(exact match, from the beginning of the string, from the end of the string, or a substring).
Added debugging for ac_automata_search.
To do this, you need to enable debugging globally using ac_automata_enable_debug(1) and
enable debugging in the AC_AUTOMATA_t structure using ac_automata_name("name", AC_FEATURE_DEBUG).
The search will display "name" and a list of matching patterns.
Running "AHO_DEBUG=1 ndpiReader ..." will show the lines that were searched for templates
and which templates were found.
The ac_automata_dump() prototype has been changed. Now it outputs data to a file.
If it is specified as NULL, then the output will be directed to stdout.
If you need to get data as a string, then use open_memstream().
Added the ability to run individual tests via the do.sh script
Diffstat (limited to 'src/lib/third_party')
-rw-r--r-- | src/lib/third_party/include/ahocorasick.h | 32 | ||||
-rw-r--r-- | src/lib/third_party/src/ahocorasick.c | 135 |
2 files changed, 112 insertions, 55 deletions
diff --git a/src/lib/third_party/include/ahocorasick.h b/src/lib/third_party/include/ahocorasick.h index 71fc22d0d..5efbc05f2 100644 --- a/src/lib/third_party/include/ahocorasick.h +++ b/src/lib/third_party/include/ahocorasick.h @@ -54,8 +54,11 @@ typedef char AC_ALPHABET_t; **/ typedef struct { uint32_t number; /* Often used to store procotolId */ - uint16_t breed, - category:14,from_start:1,at_end:1; + uint16_t breed, category; + uint16_t level, /* Domain level for comparison */ + from_start:1, /* match from start of string */ + at_end:1, /* match at end of string */ + dot:1; /* is domain name */ } AC_REP_t; /* AC_PATTERN_t: @@ -103,8 +106,10 @@ typedef struct { typedef struct { - AC_PATTERN_t *matched[4]; /* for ac_automata_exact_match() */ - AC_PATTERN_t *patterns; /* Array of matched pattern */ + AC_PATTERN_t *matched[4], /* for ac_automata_exact_match() */ + *last; /* for callback */ + AC_PATTERN_t *patterns; /* Array of matched pattern */ + unsigned int match_map; /* Matched patterns (bitmap) */ unsigned int position; /* The end position of matching pattern(s) in the text */ unsigned short int match_num; /* Number of matched patterns */ unsigned short int match_counter; /* Counter of found matches */ @@ -120,7 +125,7 @@ typedef struct AC_MATCH_t match; AC_ALPHABET_t * astring; /* String of alphabets */ unsigned short int length, /* Length of string */ - ignore_case; + option; /* AC_FEATURE_LC | AC_FEATURE_DEBUG */; } AC_TEXT_t; @@ -218,7 +223,7 @@ typedef struct * means not finalized (is open). after finalizing automata you can not * add pattern to automata anymore. */ unsigned short automata_open, - to_lc:1, no_root_range:1; /* lowercase match */ + to_lc:1, no_root_range:1,debug:1; /* lowercase match */ /* Statistic Variables */ unsigned long total_patterns; /* Total patterns in the automata */ @@ -229,17 +234,20 @@ typedef struct int id; /* node id */ int add_to_range; /* for convert to range */ int n_oc,n_range,n_find; /* statistics */ + char name[32]; /* if debug != 0 */ } AC_AUTOMATA_t; typedef AC_ERROR_t (*NODE_CALLBACK_f)(AC_AUTOMATA_t *, AC_NODE_t *,int idx, void *); typedef void (*ALPHA_CALLBACK_f)(AC_AUTOMATA_t *, AC_NODE_t *,AC_NODE_t *,int ,void *); -#define AC_FEATURE_LC 1 -#define AC_FEATURE_NO_ROOT_RANGE 2 +#define AC_FEATURE_DEBUG 1 +#define AC_FEATURE_LC 2 +#define AC_FEATURE_NO_ROOT_RANGE 4 AC_AUTOMATA_t * ac_automata_init (MATCH_CALLBACK_f mc); AC_ERROR_t ac_automata_feature (AC_AUTOMATA_t * thiz, unsigned int feature); +AC_ERROR_t ac_automata_name (AC_AUTOMATA_t * thiz, char *name, int debug); AC_ERROR_t ac_automata_add (AC_AUTOMATA_t * thiz, AC_PATTERN_t * str); AC_ERROR_t ac_automata_finalize (AC_AUTOMATA_t * thiz); AC_ERROR_t ac_automata_walk (AC_AUTOMATA_t * thiz, NODE_CALLBACK_f node_cb, @@ -252,7 +260,9 @@ int ac_automata_exact_match(AC_PATTERNS_t *mp,int pos, AC_TEXT_t *); void ac_automata_clean (AC_AUTOMATA_t * thiz); void ac_automata_release (AC_AUTOMATA_t * thiz, uint8_t free_pattern); #ifndef __KERNEL__ -void ac_automata_dump (AC_AUTOMATA_t * thiz, - char *buf, size_t bufsize, char repcast); +/* Global debug control. */ +void ac_automata_enable_debug (int debug); +/* See man open_memstream() for get result as string */ +void ac_automata_dump (AC_AUTOMATA_t * thiz, FILE *); +#endif #endif -#endif diff --git a/src/lib/third_party/src/ahocorasick.c b/src/lib/third_party/src/ahocorasick.c index ab9c5d333..06ed56a27 100644 --- a/src/lib/third_party/src/ahocorasick.c +++ b/src/lib/third_party/src/ahocorasick.c @@ -69,9 +69,11 @@ struct aho_dump_info { int buf_pos,ip; char *bufstr; size_t bufstr_len; + FILE *file; }; static void dump_node_header(AC_NODE_t * n, struct aho_dump_info *); +static int ac_automata_global_debug = 0; #endif /* Private function prototype */ @@ -195,6 +197,20 @@ AC_ERROR_t ac_automata_feature (AC_AUTOMATA_t * thiz, unsigned int feature) return ACERR_SUCCESS; } +AC_ERROR_t ac_automata_name (AC_AUTOMATA_t * thiz, char *name, int debug) +{ + if(!thiz) return ACERR_ERROR; + strncpy(thiz->name,name,sizeof(thiz->name)-1); + thiz->debug = debug != 0; + return ACERR_SUCCESS; +} + +#ifndef __KERNEL__ +void ac_automata_enable_debug (int debug) { + ac_automata_global_debug = debug != 0; +} +#endif + /****************************************************************************** * FUNCTION: ac_automata_add * Adds pattern to the automata. @@ -368,30 +384,28 @@ int ac_automata_exact_match(AC_PATTERNS_t *mp,int pos, AC_TEXT_t *txt) { AC_PATTERN_t *patterns = mp->patterns; AC_PATTERN_t **matched = txt->match.matched; int i; - for(i=0; i < mp->num; i++,patterns++) { + int match_map = 0; + for(i=0; i < mp->num && i < (__SIZEOF_INT__*8-1); i++,patterns++) { do { if(patterns->rep.from_start && patterns->rep.at_end) { if(pos == txt->length && patterns->length == pos) - matched[0] = patterns; + matched[0] = patterns, match_map |= 1 << i; break; } if(patterns->rep.from_start) { - if(patterns->length == pos) - if(!matched[1] || patterns->length > matched[1]->length) - matched[1] = patterns; + if(patterns->length == pos) + matched[1] = patterns, match_map |= 1 << i; break; } if(patterns->rep.at_end) { - if(pos == txt->length) - if(!matched[2] || patterns->length > matched[2]->length) - matched[2] = patterns; + if(pos == txt->length) + matched[2] = patterns, match_map |= 1 << i; break; } - if(!matched[3] || patterns->length > matched[3]->length) - matched[3] = patterns; + matched[3] = patterns, match_map |= 1 << i; } while(0); } - return 0; + return match_map; } /****************************************************************************** @@ -414,7 +428,7 @@ int ac_automata_search (AC_AUTOMATA_t * thiz, AC_TEXT_t * txt, AC_REP_t * param) { unsigned long position; - int icase = 0,i; + int icase = 0,i,debug=0; AC_MATCH_t *match; AC_NODE_t *curr; AC_NODE_t *next; @@ -426,14 +440,20 @@ int ac_automata_search (AC_AUTOMATA_t * thiz, position = 0; curr = thiz->root; apos = txt->astring; +#ifndef __KERNEL__ + if(thiz->debug && ac_automata_global_debug) debug = 1; + if(debug) { + txt->option = debug; /* for callback */ + printf("aho %s: search %.*s\n", thiz->name[0] ? thiz->name:"unknown", txt->length, apos); + } +#endif match = &txt->match; memset((char*)match,0,sizeof(*match)); - icase = !thiz->to_lc; /* The 'txt->ignore_case' option is checked * separately otherwise clang will detect * uninitialized memory usage much later. */ - if(txt->ignore_case == 1) icase = 1; + if(txt->option & AC_FEATURE_LC) icase = 1; /* This is the main search loop. * it must be keep as lightweight as possible. */ while (position < txt->length) { @@ -448,19 +468,35 @@ int ac_automata_search (AC_AUTOMATA_t * thiz, curr = next; position++; if(curr->final) { - match->match_counter++; /* we have a matching */ /* select best match */ - ac_automata_exact_match(curr->matched_patterns,position,txt); - if(thiz->match_handler) { - /* We check 'next' to find out if we came here after a alphabet - * transition or due to a fail. in second case we should not report - * matching because it was reported in previous node */ - match->position = position; - match->match_num = curr->matched_patterns->num; - match->patterns = curr->matched_patterns->patterns; - if (thiz->match_handler(match, txt, param)) - return 1; - } + match->match_map = ac_automata_exact_match(curr->matched_patterns,position,txt); + if(match->match_map) { + match->match_counter++; /* we have a matching */ +#ifndef __KERNEL__ + if(debug) { + int i; + AC_PATTERN_t *patterns = curr->matched_patterns->patterns; + for(i=0; i < curr->matched_patterns->num; i++) { + if(!(match->match_map & (1 << i))) continue; + printf(" match%d: %c%.*s%c [%u]\n",i+1, + patterns[i].rep.from_start ? '^':' ', + patterns[i].length,patterns[i].astring, + patterns[i].rep.at_end ? '$':' ', + patterns[i].rep.number); + } + } +#endif + if(thiz->match_handler) { + /* We check 'next' to find out if we came here after a alphabet + * transition or due to a fail. in second case we should not report + * matching because it was reported in previous node */ + match->position = position; + match->match_num = curr->matched_patterns->num; + match->patterns = curr->matched_patterns->patterns; + if (thiz->match_handler(match, txt, param)) + return 1; + } + } /* match->match_map */ } } } @@ -470,6 +506,16 @@ int ac_automata_search (AC_AUTOMATA_t * thiz, for(i = 0; i < 4; i++) if(txt->match.matched[i]) { *param = (txt->match.matched[i])->rep; +#ifndef __KERNEL__ + if(debug) { + AC_PATTERN_t *pattern = txt->match.matched[i]; + printf("best match: %c%.*s%c [%u]\n", + pattern->rep.from_start ? '^':' ', + pattern->length,pattern->astring, + pattern->rep.at_end ? '$':' ', + pattern->rep.number); + } +#endif return 1; } return 0; @@ -538,26 +584,26 @@ void ac_automata_release (AC_AUTOMATA_t * thiz, uint8_t free_pattern) { static void dump_node_header(AC_NODE_t * n, struct aho_dump_info *ai) { char *c; int i; - printf("%04d: ",n->id); - if(n->failure_node) printf(" failure %04d:",n->failure_node->id); - printf(" d:%d %c",n->depth, n->use ? '+':'-'); + fprintf(ai->file,"%04d: ",n->id); + if(n->failure_node) fprintf(ai->file," failure %04d:",n->failure_node->id); + fprintf(ai->file," d:%d %c",n->depth, n->use ? '+':'-'); ai->memcnt += sizeof(*n); if(n->matched_patterns) { ai->memcnt += sizeof(n->matched_patterns) + n->matched_patterns->max*sizeof(n->matched_patterns->patterns[0]); } - if(!n->use) { printf("\n"); return; } + if(!n->use) { fprintf(ai->file,"\n"); return; } if(n->one) { (ai->node_oc)++; - printf(" '%c' next->%d\n",n->one_alpha, + fprintf(ai->file," '%c' next->%d\n",n->one_alpha, n->outgoing ? ((AC_NODE_t *)n->outgoing)->id : -1); return; } if(!n->outgoing) { - printf(" BUG! !outgoing\n"); + fprintf(ai->file," BUG! !outgoing\n"); return; } - printf("%s\n",n->range ? " RANGE":""); + fprintf(ai->file,"%s\n",n->range ? " RANGE":""); c = (char *)edge_get_alpha(n->outgoing); if(n->outgoing->degree <= 8) (ai->node_8c)++; @@ -566,7 +612,7 @@ static void dump_node_header(AC_NODE_t * n, struct aho_dump_info *ai) { if(n->range) (ai->node_xr)++; for(i=0; i < n->outgoing->degree; i++) { - printf(" %d: \"%c\" -> %d\n",i,c[i], + fprintf(ai->file," %d: \"%c\" -> %d\n",i,c[i], n->outgoing->next[i] ? n->outgoing->next[i]->id:-1); } ai->memcnt += sizeof(n->outgoing) + edge_data_size(n->outgoing->max); @@ -580,7 +626,7 @@ static AC_ERROR_t dump_node_common(AC_AUTOMATA_t * thiz, if(idx) return ACERR_SUCCESS; dump_node_header(n,ai); if (n->matched_patterns && n->matched_patterns->num && n->final) { - char lbuf[300]; + char lbuf[512]; int nl = 0,j; nl = snprintf(lbuf,sizeof(lbuf),"'%.100s' N:%d{",rstr,n->matched_patterns->num); @@ -593,7 +639,7 @@ static AC_ERROR_t dump_node_common(AC_AUTOMATA_t * thiz, sid->astring, sid->rep.number & 0x4000 ? '$':' '); } - printf("%s}\n",lbuf); + fprintf(ai->file,"%s}\n",lbuf); } return ACERR_SUCCESS; } @@ -615,22 +661,23 @@ static void dump_node_str(AC_AUTOMATA_t * thiz, AC_NODE_t * node, * char repcast: 'n': print AC_REP_t as number, 's': print AC_REP_t as string ******************************************************************************/ -void ac_automata_dump(AC_AUTOMATA_t * thiz, char *rstr, size_t rstr_size, char repcast) { +void ac_automata_dump(AC_AUTOMATA_t * thiz, FILE *file) { struct aho_dump_info ai; memset((char *)&ai,0,sizeof(ai)); - - printf("---DUMP- all nodes %u - max strlen %u -%s---\n", + ai.file = file ? file : stdout; + fprintf(ai.file,"---DUMP- all nodes %u - max strlen %u -%s---\n", (unsigned int)thiz->all_nodes_num, (unsigned int)thiz->max_str_len, thiz->automata_open ? "open":"ready"); - printf("root: %px\n",thiz->root); - *rstr = '\0'; - ai.bufstr = rstr; - ai.bufstr_len = rstr_size; + + ai.bufstr = acho_malloc(AC_PATTRN_MAX_LENGTH+1); + ai.bufstr_len = AC_PATTRN_MAX_LENGTH; + if(!ai.bufstr) return; + ai.bufstr[0] = '\0'; ac_automata_walk(thiz,dump_node_common,dump_node_str,(void *)&ai); - printf("---\n mem size %zu avg node size %d, node one char %d, <=8c %d, >8c %d, range %d\n---DUMP-END-\n", + fprintf(ai.file,"---\n mem size %zu avg node size %d, node one char %d, <=8c %d, >8c %d, range %d\n---DUMP-END-\n", ai.memcnt,(int)ai.memcnt/(thiz->all_nodes_num+1),(int)ai.node_oc,(int)ai.node_8c,(int)ai.node_xc,(int)ai.node_xr); } #endif |