aboutsummaryrefslogtreecommitdiff
path: root/src/lib/third_party
diff options
context:
space:
mode:
authorVitaly Lavrov <vel21ripn@gmail.com>2021-07-12 15:39:43 +0000
committerGitHub <noreply@github.com>2021-07-12 17:39:43 +0200
commitc418b7110b9385c5c3748c10e198df27ae0f7083 (patch)
tree046941f8085b48bf27b03cd60bfaee180906af21 /src/lib/third_party
parent78b1295dc18e297c1da53006bde1e0870e278db9 (diff)
ahoсorasick. Code review. Part 2. (#1236)
Simplified the process of adding lines to AC_AUTOMATA_t. Use the ndpi_string_to_automa() function to add patterns with domain names. For other cases can use ndpi_add_string_value_to_automa(). ac_automata_feature(ac_automa, AC_FEATURE_LC) allows adding and compare data in a case insensitive manner. For mandatory pattern comparison from the end of the line, the "ac_pattern.rep.at_end=1" flag is used. This eliminated unnecessary conversions to lowercase and adding "$" for end-of-line matching in domain name patterns. ac_match_handler() has been renamed ac_domain_match_handler() and has been greatly simplified. ac_domain_match_handler() looks for the template with the highest domain level. For special cases it is possible to manually specify the domain level. Added test for checking ambiguous domain names like: - short.weixin.qq.com is QQ, not Wechat - instagram.faae1-1.fna.fbcdn.net is Instagram, not Facebook If you specify a NULL handler when creating the AC_AUTOMATA_t structure, then a pattern with the maximum length that satisfies the search conditions will be found (exact match, from the beginning of the string, from the end of the string, or a substring). Added debugging for ac_automata_search. To do this, you need to enable debugging globally using ac_automata_enable_debug(1) and enable debugging in the AC_AUTOMATA_t structure using ac_automata_name("name", AC_FEATURE_DEBUG). The search will display "name" and a list of matching patterns. Running "AHO_DEBUG=1 ndpiReader ..." will show the lines that were searched for templates and which templates were found. The ac_automata_dump() prototype has been changed. Now it outputs data to a file. If it is specified as NULL, then the output will be directed to stdout. If you need to get data as a string, then use open_memstream(). Added the ability to run individual tests via the do.sh script
Diffstat (limited to 'src/lib/third_party')
-rw-r--r--src/lib/third_party/include/ahocorasick.h32
-rw-r--r--src/lib/third_party/src/ahocorasick.c135
2 files changed, 112 insertions, 55 deletions
diff --git a/src/lib/third_party/include/ahocorasick.h b/src/lib/third_party/include/ahocorasick.h
index 71fc22d0d..5efbc05f2 100644
--- a/src/lib/third_party/include/ahocorasick.h
+++ b/src/lib/third_party/include/ahocorasick.h
@@ -54,8 +54,11 @@ typedef char AC_ALPHABET_t;
**/
typedef struct {
uint32_t number; /* Often used to store procotolId */
- uint16_t breed,
- category:14,from_start:1,at_end:1;
+ uint16_t breed, category;
+ uint16_t level, /* Domain level for comparison */
+ from_start:1, /* match from start of string */
+ at_end:1, /* match at end of string */
+ dot:1; /* is domain name */
} AC_REP_t;
/* AC_PATTERN_t:
@@ -103,8 +106,10 @@ typedef struct {
typedef struct
{
- AC_PATTERN_t *matched[4]; /* for ac_automata_exact_match() */
- AC_PATTERN_t *patterns; /* Array of matched pattern */
+ AC_PATTERN_t *matched[4], /* for ac_automata_exact_match() */
+ *last; /* for callback */
+ AC_PATTERN_t *patterns; /* Array of matched pattern */
+ unsigned int match_map; /* Matched patterns (bitmap) */
unsigned int position; /* The end position of matching pattern(s) in the text */
unsigned short int match_num; /* Number of matched patterns */
unsigned short int match_counter; /* Counter of found matches */
@@ -120,7 +125,7 @@ typedef struct
AC_MATCH_t match;
AC_ALPHABET_t * astring; /* String of alphabets */
unsigned short int length, /* Length of string */
- ignore_case;
+ option; /* AC_FEATURE_LC | AC_FEATURE_DEBUG */;
} AC_TEXT_t;
@@ -218,7 +223,7 @@ typedef struct
* means not finalized (is open). after finalizing automata you can not
* add pattern to automata anymore. */
unsigned short automata_open,
- to_lc:1, no_root_range:1; /* lowercase match */
+ to_lc:1, no_root_range:1,debug:1; /* lowercase match */
/* Statistic Variables */
unsigned long total_patterns; /* Total patterns in the automata */
@@ -229,17 +234,20 @@ typedef struct
int id; /* node id */
int add_to_range; /* for convert to range */
int n_oc,n_range,n_find; /* statistics */
+ char name[32]; /* if debug != 0 */
} AC_AUTOMATA_t;
typedef AC_ERROR_t (*NODE_CALLBACK_f)(AC_AUTOMATA_t *, AC_NODE_t *,int idx, void *);
typedef void (*ALPHA_CALLBACK_f)(AC_AUTOMATA_t *, AC_NODE_t *,AC_NODE_t *,int ,void *);
-#define AC_FEATURE_LC 1
-#define AC_FEATURE_NO_ROOT_RANGE 2
+#define AC_FEATURE_DEBUG 1
+#define AC_FEATURE_LC 2
+#define AC_FEATURE_NO_ROOT_RANGE 4
AC_AUTOMATA_t * ac_automata_init (MATCH_CALLBACK_f mc);
AC_ERROR_t ac_automata_feature (AC_AUTOMATA_t * thiz, unsigned int feature);
+AC_ERROR_t ac_automata_name (AC_AUTOMATA_t * thiz, char *name, int debug);
AC_ERROR_t ac_automata_add (AC_AUTOMATA_t * thiz, AC_PATTERN_t * str);
AC_ERROR_t ac_automata_finalize (AC_AUTOMATA_t * thiz);
AC_ERROR_t ac_automata_walk (AC_AUTOMATA_t * thiz, NODE_CALLBACK_f node_cb,
@@ -252,7 +260,9 @@ int ac_automata_exact_match(AC_PATTERNS_t *mp,int pos, AC_TEXT_t *);
void ac_automata_clean (AC_AUTOMATA_t * thiz);
void ac_automata_release (AC_AUTOMATA_t * thiz, uint8_t free_pattern);
#ifndef __KERNEL__
-void ac_automata_dump (AC_AUTOMATA_t * thiz,
- char *buf, size_t bufsize, char repcast);
+/* Global debug control. */
+void ac_automata_enable_debug (int debug);
+/* See man open_memstream() for get result as string */
+void ac_automata_dump (AC_AUTOMATA_t * thiz, FILE *);
+#endif
#endif
-#endif
diff --git a/src/lib/third_party/src/ahocorasick.c b/src/lib/third_party/src/ahocorasick.c
index ab9c5d333..06ed56a27 100644
--- a/src/lib/third_party/src/ahocorasick.c
+++ b/src/lib/third_party/src/ahocorasick.c
@@ -69,9 +69,11 @@ struct aho_dump_info {
int buf_pos,ip;
char *bufstr;
size_t bufstr_len;
+ FILE *file;
};
static void dump_node_header(AC_NODE_t * n, struct aho_dump_info *);
+static int ac_automata_global_debug = 0;
#endif
/* Private function prototype */
@@ -195,6 +197,20 @@ AC_ERROR_t ac_automata_feature (AC_AUTOMATA_t * thiz, unsigned int feature)
return ACERR_SUCCESS;
}
+AC_ERROR_t ac_automata_name (AC_AUTOMATA_t * thiz, char *name, int debug)
+{
+ if(!thiz) return ACERR_ERROR;
+ strncpy(thiz->name,name,sizeof(thiz->name)-1);
+ thiz->debug = debug != 0;
+ return ACERR_SUCCESS;
+}
+
+#ifndef __KERNEL__
+void ac_automata_enable_debug (int debug) {
+ ac_automata_global_debug = debug != 0;
+}
+#endif
+
/******************************************************************************
* FUNCTION: ac_automata_add
* Adds pattern to the automata.
@@ -368,30 +384,28 @@ int ac_automata_exact_match(AC_PATTERNS_t *mp,int pos, AC_TEXT_t *txt) {
AC_PATTERN_t *patterns = mp->patterns;
AC_PATTERN_t **matched = txt->match.matched;
int i;
- for(i=0; i < mp->num; i++,patterns++) {
+ int match_map = 0;
+ for(i=0; i < mp->num && i < (__SIZEOF_INT__*8-1); i++,patterns++) {
do {
if(patterns->rep.from_start && patterns->rep.at_end) {
if(pos == txt->length && patterns->length == pos)
- matched[0] = patterns;
+ matched[0] = patterns, match_map |= 1 << i;
break;
}
if(patterns->rep.from_start) {
- if(patterns->length == pos)
- if(!matched[1] || patterns->length > matched[1]->length)
- matched[1] = patterns;
+ if(patterns->length == pos)
+ matched[1] = patterns, match_map |= 1 << i;
break;
}
if(patterns->rep.at_end) {
- if(pos == txt->length)
- if(!matched[2] || patterns->length > matched[2]->length)
- matched[2] = patterns;
+ if(pos == txt->length)
+ matched[2] = patterns, match_map |= 1 << i;
break;
}
- if(!matched[3] || patterns->length > matched[3]->length)
- matched[3] = patterns;
+ matched[3] = patterns, match_map |= 1 << i;
} while(0);
}
- return 0;
+ return match_map;
}
/******************************************************************************
@@ -414,7 +428,7 @@ int ac_automata_search (AC_AUTOMATA_t * thiz,
AC_TEXT_t * txt, AC_REP_t * param)
{
unsigned long position;
- int icase = 0,i;
+ int icase = 0,i,debug=0;
AC_MATCH_t *match;
AC_NODE_t *curr;
AC_NODE_t *next;
@@ -426,14 +440,20 @@ int ac_automata_search (AC_AUTOMATA_t * thiz,
position = 0;
curr = thiz->root;
apos = txt->astring;
+#ifndef __KERNEL__
+ if(thiz->debug && ac_automata_global_debug) debug = 1;
+ if(debug) {
+ txt->option = debug; /* for callback */
+ printf("aho %s: search %.*s\n", thiz->name[0] ? thiz->name:"unknown", txt->length, apos);
+ }
+#endif
match = &txt->match;
memset((char*)match,0,sizeof(*match));
- icase = !thiz->to_lc;
/* The 'txt->ignore_case' option is checked
* separately otherwise clang will detect
* uninitialized memory usage much later. */
- if(txt->ignore_case == 1) icase = 1;
+ if(txt->option & AC_FEATURE_LC) icase = 1;
/* This is the main search loop.
* it must be keep as lightweight as possible. */
while (position < txt->length) {
@@ -448,19 +468,35 @@ int ac_automata_search (AC_AUTOMATA_t * thiz,
curr = next;
position++;
if(curr->final) {
- match->match_counter++; /* we have a matching */
/* select best match */
- ac_automata_exact_match(curr->matched_patterns,position,txt);
- if(thiz->match_handler) {
- /* We check 'next' to find out if we came here after a alphabet
- * transition or due to a fail. in second case we should not report
- * matching because it was reported in previous node */
- match->position = position;
- match->match_num = curr->matched_patterns->num;
- match->patterns = curr->matched_patterns->patterns;
- if (thiz->match_handler(match, txt, param))
- return 1;
- }
+ match->match_map = ac_automata_exact_match(curr->matched_patterns,position,txt);
+ if(match->match_map) {
+ match->match_counter++; /* we have a matching */
+#ifndef __KERNEL__
+ if(debug) {
+ int i;
+ AC_PATTERN_t *patterns = curr->matched_patterns->patterns;
+ for(i=0; i < curr->matched_patterns->num; i++) {
+ if(!(match->match_map & (1 << i))) continue;
+ printf(" match%d: %c%.*s%c [%u]\n",i+1,
+ patterns[i].rep.from_start ? '^':' ',
+ patterns[i].length,patterns[i].astring,
+ patterns[i].rep.at_end ? '$':' ',
+ patterns[i].rep.number);
+ }
+ }
+#endif
+ if(thiz->match_handler) {
+ /* We check 'next' to find out if we came here after a alphabet
+ * transition or due to a fail. in second case we should not report
+ * matching because it was reported in previous node */
+ match->position = position;
+ match->match_num = curr->matched_patterns->num;
+ match->patterns = curr->matched_patterns->patterns;
+ if (thiz->match_handler(match, txt, param))
+ return 1;
+ }
+ } /* match->match_map */
}
}
}
@@ -470,6 +506,16 @@ int ac_automata_search (AC_AUTOMATA_t * thiz,
for(i = 0; i < 4; i++)
if(txt->match.matched[i]) {
*param = (txt->match.matched[i])->rep;
+#ifndef __KERNEL__
+ if(debug) {
+ AC_PATTERN_t *pattern = txt->match.matched[i];
+ printf("best match: %c%.*s%c [%u]\n",
+ pattern->rep.from_start ? '^':' ',
+ pattern->length,pattern->astring,
+ pattern->rep.at_end ? '$':' ',
+ pattern->rep.number);
+ }
+#endif
return 1;
}
return 0;
@@ -538,26 +584,26 @@ void ac_automata_release (AC_AUTOMATA_t * thiz, uint8_t free_pattern) {
static void dump_node_header(AC_NODE_t * n, struct aho_dump_info *ai) {
char *c;
int i;
- printf("%04d: ",n->id);
- if(n->failure_node) printf(" failure %04d:",n->failure_node->id);
- printf(" d:%d %c",n->depth, n->use ? '+':'-');
+ fprintf(ai->file,"%04d: ",n->id);
+ if(n->failure_node) fprintf(ai->file," failure %04d:",n->failure_node->id);
+ fprintf(ai->file," d:%d %c",n->depth, n->use ? '+':'-');
ai->memcnt += sizeof(*n);
if(n->matched_patterns) {
ai->memcnt += sizeof(n->matched_patterns) +
n->matched_patterns->max*sizeof(n->matched_patterns->patterns[0]);
}
- if(!n->use) { printf("\n"); return; }
+ if(!n->use) { fprintf(ai->file,"\n"); return; }
if(n->one) {
(ai->node_oc)++;
- printf(" '%c' next->%d\n",n->one_alpha,
+ fprintf(ai->file," '%c' next->%d\n",n->one_alpha,
n->outgoing ? ((AC_NODE_t *)n->outgoing)->id : -1);
return;
}
if(!n->outgoing) {
- printf(" BUG! !outgoing\n");
+ fprintf(ai->file," BUG! !outgoing\n");
return;
}
- printf("%s\n",n->range ? " RANGE":"");
+ fprintf(ai->file,"%s\n",n->range ? " RANGE":"");
c = (char *)edge_get_alpha(n->outgoing);
if(n->outgoing->degree <= 8)
(ai->node_8c)++;
@@ -566,7 +612,7 @@ static void dump_node_header(AC_NODE_t * n, struct aho_dump_info *ai) {
if(n->range)
(ai->node_xr)++;
for(i=0; i < n->outgoing->degree; i++) {
- printf(" %d: \"%c\" -> %d\n",i,c[i],
+ fprintf(ai->file," %d: \"%c\" -> %d\n",i,c[i],
n->outgoing->next[i] ? n->outgoing->next[i]->id:-1);
}
ai->memcnt += sizeof(n->outgoing) + edge_data_size(n->outgoing->max);
@@ -580,7 +626,7 @@ static AC_ERROR_t dump_node_common(AC_AUTOMATA_t * thiz,
if(idx) return ACERR_SUCCESS;
dump_node_header(n,ai);
if (n->matched_patterns && n->matched_patterns->num && n->final) {
- char lbuf[300];
+ char lbuf[512];
int nl = 0,j;
nl = snprintf(lbuf,sizeof(lbuf),"'%.100s' N:%d{",rstr,n->matched_patterns->num);
@@ -593,7 +639,7 @@ static AC_ERROR_t dump_node_common(AC_AUTOMATA_t * thiz,
sid->astring,
sid->rep.number & 0x4000 ? '$':' ');
}
- printf("%s}\n",lbuf);
+ fprintf(ai->file,"%s}\n",lbuf);
}
return ACERR_SUCCESS;
}
@@ -615,22 +661,23 @@ static void dump_node_str(AC_AUTOMATA_t * thiz, AC_NODE_t * node,
* char repcast: 'n': print AC_REP_t as number, 's': print AC_REP_t as string
******************************************************************************/
-void ac_automata_dump(AC_AUTOMATA_t * thiz, char *rstr, size_t rstr_size, char repcast) {
+void ac_automata_dump(AC_AUTOMATA_t * thiz, FILE *file) {
struct aho_dump_info ai;
memset((char *)&ai,0,sizeof(ai));
-
- printf("---DUMP- all nodes %u - max strlen %u -%s---\n",
+ ai.file = file ? file : stdout;
+ fprintf(ai.file,"---DUMP- all nodes %u - max strlen %u -%s---\n",
(unsigned int)thiz->all_nodes_num,
(unsigned int)thiz->max_str_len,
thiz->automata_open ? "open":"ready");
- printf("root: %px\n",thiz->root);
- *rstr = '\0';
- ai.bufstr = rstr;
- ai.bufstr_len = rstr_size;
+
+ ai.bufstr = acho_malloc(AC_PATTRN_MAX_LENGTH+1);
+ ai.bufstr_len = AC_PATTRN_MAX_LENGTH;
+ if(!ai.bufstr) return;
+ ai.bufstr[0] = '\0';
ac_automata_walk(thiz,dump_node_common,dump_node_str,(void *)&ai);
- printf("---\n mem size %zu avg node size %d, node one char %d, <=8c %d, >8c %d, range %d\n---DUMP-END-\n",
+ fprintf(ai.file,"---\n mem size %zu avg node size %d, node one char %d, <=8c %d, >8c %d, range %d\n---DUMP-END-\n",
ai.memcnt,(int)ai.memcnt/(thiz->all_nodes_num+1),(int)ai.node_oc,(int)ai.node_8c,(int)ai.node_xc,(int)ai.node_xr);
}
#endif