diff options
author | Vitaly Lavrov <vel21ripn@gmail.com> | 2021-06-07 12:19:40 +0000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-06-07 14:19:40 +0200 |
commit | 65678dbeeabcf51e02ba3cc9ffbd36324a92a971 (patch) | |
tree | 6ad7729fe9fcd32edb1157ad18adde15b6e26b72 /src/lib/third_party | |
parent | 2fcf641e87edbd7188b5c8390c3e12128638f01a (diff) |
New version of the ahocorasick library (#1200)
The new version is about 25% faster with -O2 and 45% faster with -O3.
No recursion is used (smaller stack size required).
Uses less memory (by valgrind info)
bigram:
- original 1796 allocs, 247864 bytes allocated
- new 1232 allocs, 158880 bytes allocated
host_match:
- original 18038 allocs, 3004576 bytes allocated
- new 6861 allocs, 396624 bytes allocated
The function ac_automata_search() is thread safe.
Optional case-insensitive comparison.
Matching at the beginning and at the end of the string is supported.
One code file and one header file.
Diffstat (limited to 'src/lib/third_party')
-rw-r--r-- | src/lib/third_party/include/actypes.h | 136 | ||||
-rw-r--r-- | src/lib/third_party/include/ahocorasick.h | 246 | ||||
-rw-r--r-- | src/lib/third_party/include/node.h | 66 | ||||
-rw-r--r-- | src/lib/third_party/include/sort.h | 6 | ||||
-rw-r--r-- | src/lib/third_party/src/ahocorasick.c | 1198 | ||||
-rw-r--r-- | src/lib/third_party/src/node.c | 274 | ||||
-rw-r--r-- | src/lib/third_party/src/sort.c | 92 |
7 files changed, 1227 insertions, 791 deletions
diff --git a/src/lib/third_party/include/actypes.h b/src/lib/third_party/include/actypes.h deleted file mode 100644 index b775f6ac9..000000000 --- a/src/lib/third_party/include/actypes.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * actypes.h: Includes basic data types of ahocorasick library - * This file is part of multifast. - * - Copyright 2010-2012 Kamiar Kanani <kamiar.kanani@gmail.com> - - multifast is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - multifast is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public License - along with multifast. If not, see <http://www.gnu.org/licenses/>. -*/ - -#ifndef _AC_TYPES_H_ -#define _AC_TYPES_H_ - -/* AC_ALPHABET_t: - * defines the alphabet type. - * Actually defining AC_ALPHABET_t as a char will work, but sometimes we deal - * with streams of other (bigger) types e.g. integers, specific enum, objects. - * Although they consists of string of bytes (chars), but using their specific - * types for AC_ALPHABET_t will lead to a better performance. so instead of - * dealing with strings of chars, we assume dealing with strings of - * AC_ALPHABET_t and leave it optional for other developers to define their - * own alphabets. - **/ -typedef char AC_ALPHABET_t; - -/* AC_REP_t: - * Provides a more readable representative for a pattern. - * because patterns themselves are not always suitable for displaying - * (e.g. for hex patterns), we offer this type to improve intelligibility - * of output. furthermore, sometimes it is useful, for example while - * retrieving patterns from a database, to maintain their identifiers in the - * automata for further reference. we provisioned two possible types as a - * union for this purpose. you can add your desired type in it. - **/ -typedef struct { - u_int32_t number; /* Often used to store procotolId */ - u_int16_t category, breed; -} AC_REP_t; - -/* AC_PATTERN_t: - * This is the pattern type that must be fed into AC automata. - * the 'astring' field is not null-terminated, due to it can contain zero - * value bytes. the 'length' field determines the number of AC_ALPHABET_t it - * carries. the 'representative' field is described in AC_REP_t. despite - * 'astring', 'representative' can have duplicate values for different given - * AC_PATTERN_t. it is an optional field and you can just fill it with 0. - * CAUTION: - * Not always the 'astring' points to the correct position in memory. - * it is the responsibility of your program to maintain a permanent allocation - * for astring field of the added pattern to automata. - **/ -typedef struct -{ - AC_ALPHABET_t * astring; /* String of alphabets */ - unsigned int length; /* Length of pattern */ - u_int8_t is_existing; /* 1 if the node is already part of another AC_PATTERN_t */ - AC_REP_t rep; /* Representative string (optional) */ -} AC_PATTERN_t; - -/* AC_TEXT_t: - * The input text type that is fed to ac_automata_search() to be searched. - * it is similar to AC_PATTERN_t. actually we could use AC_PATTERN_t as input - * text, but for the purpose of being more readable, we defined this new type. - **/ -typedef struct -{ - AC_ALPHABET_t * astring; /* String of alphabets */ - unsigned int length; /* Length of string */ -} AC_TEXT_t; - -/* AC_MATCH_t: - * Provides the structure for reporting a match event. - * a match event occurs when the automata reaches a final node. any final - * node can match one or more pattern at a position in a text. the - * 'patterns' field holds these matched patterns. obviously these - * matched patterns have same end-position in the text. there is a relationship - * between matched patterns: the shorter one is a factor (tail) of the longer - * one. the 'position' maintains the end position of matched patterns. the - * start position of patterns could be found by knowing their 'length' in - * AC_PATTERN_t. e.g. suppose "recent" and "cent" are matched at - * position 40 in the text, then the start position of them are 34 and 36 - * respectively. finally the field 'match_num' maintains the number of - * matched patterns. - **/ -typedef struct -{ - AC_PATTERN_t * patterns; /* Array of matched pattern */ - long position; /* The end position of matching pattern(s) in the text */ - unsigned int match_num; /* Number of matched patterns */ -} AC_MATCH_t; - -/* AC_ERROR_t: - * Error that may occur while adding a pattern to the automata. - * it is returned by ac_automata_add(). - **/ -typedef enum - { - ACERR_SUCCESS = 0, /* No error occurred */ - ACERR_DUPLICATE_PATTERN, /* Duplicate patterns */ - ACERR_LONG_PATTERN, /* Pattern length is longer than AC_PATTRN_MAX_LENGTH */ - ACERR_ZERO_PATTERN, /* Empty pattern (zero length) */ - ACERR_AUTOMATA_CLOSED, /* Automata is closed. after calling - ac_automata_finalize() you can not add new patterns to the automata. */ - } AC_ERROR_t; - -/* MATCH_CALLBACK_t: - * This is the call-back function type that must be given to automata at - * initialization to report match occurrence to the caller. - * at a match event, the automata will reach you using this function and sends - * you a pointer to AC_MATCH_t. using that pointer you can handle - * matches. you can send parameters to the call-back function when you call - * ac_automata_search(). at call-back, the automata will sent you those - * parameters as the second parameter (void *) of MATCH_CALLBACK_t. inside - * the call-back function you can cast it to whatever you want. - * If you return 0 from MATCH_CALLBACK_t function to the automata, it will - * continue searching, otherwise it will return from ac_automata_search() - * to your calling function. - **/ -typedef int (*MATCH_CALLBACK_f)(AC_MATCH_t *, AC_TEXT_t *, AC_REP_t *); - -/* AC_PATTRN_MAX_LENGTH: - * Maximum acceptable pattern length in AC_PATTERN_t.length - **/ -#define AC_PATTRN_MAX_LENGTH 1024 - -#endif diff --git a/src/lib/third_party/include/ahocorasick.h b/src/lib/third_party/include/ahocorasick.h index 943be88eb..551a922f9 100644 --- a/src/lib/third_party/include/ahocorasick.h +++ b/src/lib/third_party/include/ahocorasick.h @@ -1,5 +1,5 @@ /* - * ahocorasick.h: the main ahocorasick header file. + * ahocorasick.h: Includes all types of ahocorasick library * This file is part of multifast. * Copyright 2010-2012 Kamiar Kanani <kamiar.kanani@gmail.com> @@ -18,53 +18,245 @@ along with multifast. If not, see <http://www.gnu.org/licenses/>. */ -#ifndef _AUTOMATA_H_ -#define _AUTOMATA_H_ +#ifndef _AC_TYPES_H_ +#define _AC_TYPES_H_ -#include "node.h" +#define AC_AHOCORASICK_NEW +#define AC_PATTRN_MAX_LENGTH 256 + +/* reallocation step for AC_NODE_t.matched_patterns */ +#define REALLOC_CHUNK_MATCHSTR 8 + +/* reallocation step for AC_NODE_t.outgoing array */ +/* Must be a multiple of __SIZEOF_LONG__ ! */ +#define REALLOC_CHUNK_OUTGOING 8 + +/* AC_ALPHABET_t: + * defines the alphabet type. + * Actually defining AC_ALPHABET_t as a char will work, but sometimes we deal + * with streams of other (bigger) types e.g. integers, specific enum, objects. + * Although they consists of string of bytes (chars), but using their specific + * types for AC_ALPHABET_t will lead to a better performance. so instead of + * dealing with strings of chars, we assume dealing with strings of + * AC_ALPHABET_t and leave it optional for other developers to define their + * own alphabets. + **/ +typedef char AC_ALPHABET_t; + +/* AC_REP_t: + * Provides a more readable representative for a pattern. + * because patterns themselves are not always suitable for displaying + * (e.g. for hex patterns), we offer this type to improve intelligibility + * of output. furthermore, sometimes it is useful, for example while + * retrieving patterns from a database, to maintain their identifiers in the + * automata for further reference. we provisioned two possible types as a + * union for this purpose. you can add your desired type in it. + **/ +typedef struct { + uint32_t number; /* Often used to store procotolId */ + uint16_t breed, + category:14,from_start:1,at_end:1; +} AC_REP_t; + +/* AC_PATTERN_t: + * This is the pattern type that must be fed into AC automata. + * the 'astring' field is not null-terminated, due to it can contain zero + * value bytes. the 'length' field determines the number of AC_ALPHABET_t it + * carries. the 'representative' field is described in AC_REP_t. despite + * 'astring', 'representative' can have duplicate values for different given + * AC_PATTERN_t. it is an optional field and you can just fill it with 0. + * CAUTION: + * Not always the 'astring' points to the correct position in memory. + * it is the responsibility of your program to maintain a permanent allocation + * for astring field of the added pattern to automata. + **/ typedef struct { - /* The root of the Aho-Corasick trie */ - AC_NODE_t * root; + AC_ALPHABET_t * astring; /* String of alphabets */ + uint16_t length, /* Length of pattern */ + is_existing; /* not union_matchstr */ + AC_REP_t rep; /* Representative string (optional) */ +} AC_PATTERN_t; - /* maintain all nodes pointers. it will be used to access or release - * all nodes. */ - AC_NODE_t ** all_nodes; +typedef struct { + unsigned short num; /* Number of matched patterns at this node */ + unsigned short max; /* Max capacity of allocated memory for matched_patterns */ + AC_PATTERN_t patterns[]; +} AC_PATTERNS_t; - unsigned int all_nodes_num; /* Number of all nodes in the automata */ - unsigned int all_nodes_max; /* Current max allocated memory for *all_nodes */ - AC_MATCH_t match; /* Any match is reported with this */ - MATCH_CALLBACK_f match_callback; /* Match call-back function */ +/* AC_MATCH_t: + * Provides the structure for reporting a match event. + * a match event occurs when the automata reaches a final node. any final + * node can match one or more pattern at a position in a text. the + * 'patterns' field holds these matched patterns. obviously these + * matched patterns have same end-position in the text. there is a relationship + * between matched patterns: the shorter one is a factor (tail) of the longer + * one. the 'position' maintains the end position of matched patterns. the + * start position of patterns could be found by knowing their 'length' in + * AC_PATTERN_t. e.g. suppose "recent" and "cent" are matched at + * position 40 in the text, then the start position of them are 34 and 36 + * respectively. finally the field 'match_num' maintains the number of + * matched patterns. + **/ + +typedef struct +{ + AC_PATTERN_t *matched[4]; /* for ac_automata_exact_match() */ + AC_PATTERN_t *patterns; /* Array of matched pattern */ + unsigned int position; /* The end position of matching pattern(s) in the text */ + unsigned short int match_num; /* Number of matched patterns */ + unsigned short int match_counter; /* Counter of found matches */ +} AC_MATCH_t; + +/* AC_TEXT_t: + * The input text type that is fed to ac_automata_search() to be searched. + * it is similar to AC_PATTERN_t. actually we could use AC_PATTERN_t as input + * text, but for the purpose of being more readable, we defined this new type. + **/ +typedef struct +{ + AC_MATCH_t match; + AC_ALPHABET_t * astring; /* String of alphabets */ + unsigned short int length, /* Length of string */ + ignore_case; +} AC_TEXT_t; + + +/* AC_ERROR_t: + * Error that may occur while adding a pattern to the automata. + * it is returned by ac_automata_add(). + **/ +typedef enum + { + ACERR_SUCCESS = 0, /* No error occurred */ + ACERR_DUPLICATE_PATTERN, /* Duplicate patterns */ + ACERR_LONG_PATTERN, /* Pattern length is longer than AC_PATTRN_MAX_LENGTH */ + ACERR_ZERO_PATTERN, /* Empty pattern (zero length) */ + ACERR_AUTOMATA_CLOSED, /* Automata is closed. after calling + ac_automata_finalize() you can not add new patterns to the automata. */ + ACERR_ERROR, /* common error */ + } AC_ERROR_t; + +/* MATCH_CALLBACK_t: + * This is the call-back function type that must be given to automata at + * initialization to report match occurrence to the caller. + * at a match event, the automata will reach you using this function and sends + * you a pointer to AC_MATCH_t. using that pointer you can handle + * matches. you can send parameters to the call-back function when you call + * ac_automata_search(). at call-back, the automata will sent you those + * parameters as the second parameter (void *) of MATCH_CALLBACK_t. inside + * the call-back function you can cast it to whatever you want. + * If you return 0 from MATCH_CALLBACK_t function to the automata, it will + * continue searching, otherwise it will return from ac_automata_search() + * to your calling function. + **/ +typedef int (*MATCH_CALLBACK_f)(AC_MATCH_t *, AC_TEXT_t *, AC_REP_t *); + +/* AC_PATTRN_MAX_LENGTH: + * Maximum acceptable pattern length in AC_PATTERN_t.length + **/ + +/* Forward Declaration */ +struct edge; + +/* + * automata node + * 4 pointers + 8 bytes : 40/24 bytes for 64/32 bit + */ +typedef struct ac_node +{ + int id; /* Node ID : set after finalize(), only for ac_automata_dump */ + AC_ALPHABET_t one_alpha, + one:1, /* one_char: yes/no */ + range:1, /* range symbols start from one_alpha */ + root:1, /* is root node */ + final:1, /* 0: no ; 1: yes, it is a final node */ + use:1, /* use: yes/no */ + ff:1; /* finalized node */ + unsigned short depth; /* depth: distance between this node and the root */ + + AC_PATTERNS_t *matched_patterns; /* Array of matched patterns */ + struct edge *outgoing; /* Array of outgoing edges */ + + struct ac_node *failure_node; /* The failure node of this node */ + AC_ALPHABET_t *a_ptr; +} AC_NODE_t; + +#ifndef __SIZEOF_POINTER__ +#error SIZEOF_POINTER not defined! +#endif + +struct edge { + unsigned short degree; /* Number of outgoing edges */ + unsigned short max; /* Max capacity of allocated memory for outgoing */ + uint32_t cmap[8]; /* 256 bit */ + AC_NODE_t *next[]; + /* + * first N elements used for 'next' pointers + + * M elements used for symbols storage + * M = (max + sizeof(void*)-1) & ( ~(sizeof(void*)-1)) + * + * if sizeof(void*)==8 + * for max = 8 we must alloc next[9]; + * for max = 16 we must alloc next[18]; + * + */ +}; + +struct ac_path { + AC_NODE_t * n; + unsigned short int idx,l; +}; + +typedef struct +{ + /* The root of the Aho-Corasick trie */ + AC_NODE_t * root; + MATCH_CALLBACK_f match_handler; + unsigned int all_nodes_num; /* Number of all nodes in the automata */ /* this flag indicates that if automata is finalized by * ac_automata_finalize() or not. 1 means finalized and 0 * means not finalized (is open). after finalizing automata you can not * add pattern to automata anymore. */ - unsigned short automata_open; + unsigned short automata_open, + to_lc:1, no_root_range:1; /* lowercase match */ /* Statistic Variables */ unsigned long total_patterns; /* Total patterns in the automata */ + unsigned long max_str_len; /* largest pattern length. Update by ac_automata_finalize() */ + + struct ac_path ac_path[AC_PATTRN_MAX_LENGTH+4]; /* for ac_automata_walk */ + int id; /* node id */ + int add_to_range; /* for convert to range */ + int n_oc,n_range,n_find; /* statistics */ } AC_AUTOMATA_t; -typedef struct { - /* It is possible to feed a large input to the automata chunk by chunk to - * be searched using ac_automata_search(). in fact by default automata - * thinks that all chunks are related unless you do ac_automata_reset(). - * followings are variables that keep track of searching state. */ - AC_NODE_t * current_node; /* Pointer to current node while searching */ - unsigned long base_position; /* Represents the position of current chunk - related to whole input text */ -} AC_SEARCH_t; +typedef AC_ERROR_t (*NODE_CALLBACK_f)(AC_AUTOMATA_t *, AC_NODE_t *,int idx, void *); + +typedef void (*ALPHA_CALLBACK_f)(AC_AUTOMATA_t *, AC_NODE_t *,AC_NODE_t *,int ,void *); +#define AC_FEATURE_LC 1 +#define AC_FEATURE_NO_ROOT_RANGE 2 AC_AUTOMATA_t * ac_automata_init (MATCH_CALLBACK_f mc); +AC_ERROR_t ac_automata_feature (AC_AUTOMATA_t * thiz, unsigned int feature); AC_ERROR_t ac_automata_add (AC_AUTOMATA_t * thiz, AC_PATTERN_t * str); -void ac_automata_finalize (AC_AUTOMATA_t * thiz); -int ac_automata_search (AC_AUTOMATA_t * thiz, AC_TEXT_t * str, AC_REP_t * param); -void ac_automata_release (AC_AUTOMATA_t * thiz, u_int8_t free_pattern); -void ac_automata_display (AC_AUTOMATA_t * thiz, char repcast); +AC_ERROR_t ac_automata_finalize (AC_AUTOMATA_t * thiz); +AC_ERROR_t ac_automata_walk (AC_AUTOMATA_t * thiz, NODE_CALLBACK_f node_cb, + ALPHA_CALLBACK_f alpha_cb, void *); +int ac_automata_search (AC_AUTOMATA_t * thiz, + AC_TEXT_t * str, + AC_REP_t * param); +int ac_automata_exact_match(AC_PATTERNS_t *mp,int pos, AC_TEXT_t *); +void ac_automata_clean (AC_AUTOMATA_t * thiz); +void ac_automata_release (AC_AUTOMATA_t * thiz, uint8_t free_pattern); +#ifndef __KERNEL__ +void ac_automata_dump (AC_AUTOMATA_t * thiz, + char *buf, size_t bufsize, char repcast); #endif +#endif diff --git a/src/lib/third_party/include/node.h b/src/lib/third_party/include/node.h deleted file mode 100644 index c172112aa..000000000 --- a/src/lib/third_party/include/node.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * node.h: automata node header file - * This file is part of multifast. - * - Copyright 2010-2012 Kamiar Kanani <kamiar.kanani@gmail.com> - - multifast is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - multifast is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public License - along with multifast. If not, see <http://www.gnu.org/licenses/>. -*/ - -#ifndef _NODE_H_ -#define _NODE_H_ - -#include "actypes.h" - -/* Forward Declaration */ -struct edge; - -/* automata node */ -typedef struct ac_node -{ - int id; /* Node ID : for debugging purpose */ - short int final; /* 0: no ; 1: yes, it is a final node */ - struct ac_node * failure_node; /* The failure node of this node */ - unsigned short depth; /* depth: distance between this node and the root */ - - /* Matched patterns */ - AC_PATTERN_t * matched_patterns; /* Array of matched patterns */ - unsigned short matched_patterns_num; /* Number of matched patterns at this node */ - unsigned short matched_patterns_max; /* Max capacity of allocated memory for matched_patterns */ - - /* Outgoing Edges */ - struct edge * outgoing; /* Array of outgoing edges */ - unsigned short outgoing_degree; /* Number of outgoing edges */ - unsigned short outgoing_max; /* Max capacity of allocated memory for outgoing */ -} AC_NODE_t; - -/* The Edge of the Node */ -struct edge -{ - AC_ALPHABET_t alpha; /* Edge alpha */ - struct ac_node * next; /* Target of the edge */ -}; - - -AC_NODE_t * node_create (void); -AC_NODE_t * node_create_next (AC_NODE_t * thiz, AC_ALPHABET_t alpha); -void node_register_matchstr (AC_NODE_t * thiz, AC_PATTERN_t * str, u_int8_t is_existing); -void node_register_outgoing (AC_NODE_t * thiz, AC_NODE_t * next, AC_ALPHABET_t alpha); -AC_NODE_t * node_find_next (AC_NODE_t * thiz, AC_ALPHABET_t alpha); -AC_NODE_t * node_findbs_next (AC_NODE_t * thiz, AC_ALPHABET_t alpha); -void node_release (AC_NODE_t * thiz, u_int8_t free_pattern); -void node_assign_id (AC_NODE_t * thiz); -void node_sort_edges (AC_NODE_t * thiz); - -#endif diff --git a/src/lib/third_party/include/sort.h b/src/lib/third_party/include/sort.h deleted file mode 100644 index ee7df8a10..000000000 --- a/src/lib/third_party/include/sort.h +++ /dev/null @@ -1,6 +0,0 @@ -/* This is a function ported from the Linux kernel lib/sort.c */ - -void sort(void *base, size_t num, size_t len, - int (*cmp_func)(const void *, const void *), - void (*swap_func)(void *, void *, int size)); - diff --git a/src/lib/third_party/src/ahocorasick.c b/src/lib/third_party/src/ahocorasick.c index b17383edd..8e36d7fdb 100644 --- a/src/lib/third_party/src/ahocorasick.c +++ b/src/lib/third_party/src/ahocorasick.c @@ -4,7 +4,7 @@ * Copyright 2010-2012 Kamiar Kanani <kamiar.kanani@gmail.com> Copyright 2012-21 ntop.org (Incremental improvements) - + multifast is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or @@ -19,27 +19,131 @@ along with multifast. If not, see <http://www.gnu.org/licenses/>. */ +#ifndef __KERNEL__ #include <stdio.h> #include <stdlib.h> #include <string.h> #include <ctype.h> +#include <unistd.h> +#include <stdint.h> +#include <sys/types.h> +#else +#include <asm/byteorder.h> +#include <linux/kernel.h> +#include <linux/types.h> +typedef __kernel_size_t size_t; +#include <linux/string.h> +#include <linux/slab.h> +#endif #include "ndpi_api.h" #include "ahocorasick.h" -/* Allocation step for automata.all_nodes */ -#define REALLOC_CHUNK_ALLNODES 200 +/* TODO: For different depth of node, number of outgoing edges differs + considerably, It is efficient to use different chunk size for + different depths */ + +/* Private function prototype */ +static int node_edge_compare (struct edge * e, int a, int b); +static int node_has_matchstr (AC_NODE_t * thiz, AC_PATTERN_t * newstr); + +static AC_NODE_t * node_create (void); +static AC_NODE_t * node_create_next (AC_NODE_t * thiz, AC_ALPHABET_t alpha); +static int node_register_matchstr (AC_NODE_t * thiz, AC_PATTERN_t * str, int is_existing); +static int node_register_outgoing (AC_NODE_t * thiz, AC_NODE_t * next, AC_ALPHABET_t alpha); +static AC_NODE_t * node_find_next (AC_NODE_t * thiz, AC_ALPHABET_t alpha); +static AC_NODE_t * node_findbs_next (AC_NODE_t * thiz, uint8_t alpha); +static AC_NODE_t * node_findbs_next_ac (AC_NODE_t * thiz, uint8_t alpha,int icase); +static void node_release (AC_NODE_t * thiz, int free_pattern); +static void node_release_pattern (AC_NODE_t * thiz); +static int node_range_edges (AC_AUTOMATA_t *thiz, AC_NODE_t * node); +static inline void node_sort_edges (AC_NODE_t * thiz); + +#ifndef __KERNEL__ +struct aho_dump_info { + size_t memcnt,node_oc,node_8c,node_xc,node_xr; + int buf_pos,ip; + char *bufstr; + size_t bufstr_len; +}; + +static void dump_node_header(AC_NODE_t * n, struct aho_dump_info *); +#endif /* Private function prototype */ -static void ac_automata_register_nodeptr -(AC_AUTOMATA_t * thiz, AC_NODE_t * node); -static void ac_automata_union_matchstrs -(AC_NODE_t * node); +static int ac_automata_union_matchstrs (AC_NODE_t * node); static void ac_automata_set_failure -(AC_AUTOMATA_t * thiz, AC_NODE_t * node, AC_ALPHABET_t * alphas); + (AC_AUTOMATA_t * thiz, AC_NODE_t * node, AC_NODE_t * next, int idx, void *); static void ac_automata_traverse_setfailure -(AC_AUTOMATA_t * thiz, AC_NODE_t * node, AC_ALPHABET_t * alphas); + (AC_AUTOMATA_t * thiz); + +static inline AC_ALPHABET_t *edge_get_alpha(struct edge *e) { + return (AC_ALPHABET_t *)(&e->next[e->max]); +} +static inline size_t edge_data_size(int num) { + return sizeof(void *)*num + ((num + sizeof(void *) - 1) & ~(sizeof(void *)-1)); +} + +#ifdef __KERNEL__ +static inline void *acho_calloc(size_t nmemb, size_t size) { + return kcalloc(nmemb, size, GFP_ATOMIC); +} +static inline void *acho_malloc(size_t size) { + return kmalloc(size, GFP_ATOMIC); +} +static inline void acho_free(void *old) { + return kfree(old); +} +#else + +#define acho_calloc(a,b) ndpi_calloc(a,b) +#define acho_malloc(a) ndpi_malloc(a) +#define acho_free(a) ndpi_free(a) +#endif + +static void acho_sort(struct edge *e, size_t num, + int (*cmp_func)(struct edge *e, int a, int b), + void (*swap_func)(struct edge *e, int a, int b)); + +/* tolower() from glibc */ +static uint8_t aho_lc[256] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', +'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, +112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, +128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, +144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, +160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, +176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, +192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, +208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, +224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, +240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255 +}; +/* toupper() from glibc */ +static uint8_t aho_xc[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0, + 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; /****************************************************************************** * FUNCTION: ac_automata_init @@ -50,17 +154,42 @@ static void ac_automata_traverse_setfailure ******************************************************************************/ AC_AUTOMATA_t * ac_automata_init (MATCH_CALLBACK_f mc) { - AC_AUTOMATA_t * thiz = (AC_AUTOMATA_t *)ndpi_malloc(sizeof(AC_AUTOMATA_t)); - memset (thiz, 0, sizeof(AC_AUTOMATA_t)); + AC_AUTOMATA_t * thiz; +// if(!mc) return NULL; + thiz = (AC_AUTOMATA_t *)acho_calloc(1,sizeof(AC_AUTOMATA_t)); + if(!thiz) return NULL; thiz->root = node_create (); - thiz->all_nodes_max = REALLOC_CHUNK_ALLNODES; - thiz->all_nodes = (AC_NODE_t **) ndpi_malloc (thiz->all_nodes_max*sizeof(AC_NODE_t *)); - thiz->match_callback = mc; - ac_automata_register_nodeptr (thiz, thiz->root); + if(!thiz->root) { + acho_free(thiz); + return NULL; + } + thiz->root->id = 1; + thiz->root->root = 1; thiz->total_patterns = 0; thiz->automata_open = 1; + thiz->match_handler = mc; + thiz->to_lc = 0; + thiz->no_root_range = 0; + thiz->add_to_range = REALLOC_CHUNK_OUTGOING*2; return thiz; } +/****************************************************************************** + * FUNCTION: ac_automata_casecmp + * Case-insensitive comparison mode + * PARAMS: + * AC_AUTOMATA_t * thiz: the pointer to the automata + * lc: 1 for case-insensitive comparison mode + * RETUERN VALUE: AC_ERROR_t + * the return value indicates the success or failure of changes + ******************************************************************************/ +AC_ERROR_t ac_automata_feature (AC_AUTOMATA_t * thiz, unsigned int feature) +{ + if(!thiz) return ACERR_ERROR; + if(thiz->all_nodes_num || thiz->total_patterns) return ACERR_ERROR; + thiz->to_lc = (feature & AC_FEATURE_LC) != 0; + thiz->no_root_range = (feature & AC_FEATURE_NO_ROOT_RANGE) != 0; + return ACERR_SUCCESS; +} /****************************************************************************** * FUNCTION: ac_automata_add @@ -87,41 +216,124 @@ AC_ERROR_t ac_automata_add (AC_AUTOMATA_t * thiz, AC_PATTERN_t * patt) if (patt->length > AC_PATTRN_MAX_LENGTH) return ACERR_LONG_PATTERN; - for (i=0; i<patt->length; i++) - { - alpha = patt->astring[i]; - if ((next = node_find_next(n, alpha))) - { - n = next; - continue; - } - else - { - next = node_create_next(n, alpha); - next->depth = n->depth + 1; + for (i=0; i<patt->length; i++) { + alpha = patt->astring[i]; + if(thiz->to_lc) + alpha = (AC_ALPHABET_t)aho_lc[(uint8_t)alpha]; + + if((next = node_find_next(n, alpha)) != 0) { + n = next; + continue; + } + if(!(next = node_create_next(n, alpha))) + return ACERR_ERROR; + next->id = ++thiz->id; + thiz->all_nodes_num++; n = next; - ac_automata_register_nodeptr(thiz, n); } - } + if(thiz->max_str_len < patt->length) + thiz->max_str_len = patt->length; if(n->final) { -#if 0 - /* Original code */ + patt->rep.number = n->matched_patterns->patterns[0].rep.number; return ACERR_DUPLICATE_PATTERN; -#else - /* ntop */ - memcpy(&n->matched_patterns->rep, &patt->rep, sizeof(AC_REP_t)); - return ACERR_DUPLICATE_PATTERN; /* Caller might need to free patt->astring */ -#endif } - - n->final = 1; - node_register_matchstr(n, patt, 0); + + if(node_register_matchstr(n, patt, 0)) + return ACERR_ERROR; + thiz->total_patterns++; return ACERR_SUCCESS; } +AC_ERROR_t ac_automata_walk(AC_AUTOMATA_t * thiz, + NODE_CALLBACK_f node_cb, ALPHA_CALLBACK_f alpha_cb, void *data) +{ + unsigned int ip; + AC_NODE_t *next, *n; + struct ac_path * path = thiz->ac_path; + AC_ERROR_t r; + + ip = 1; + path[1].n = thiz->root; + path[1].idx = 0; + + while(ip) { + unsigned int i,last; + n = path[ip].n; + i = path[ip].idx; + last = !n->outgoing || (n->one && i > 0) || (!n->one && i >= n->outgoing->degree); + if(node_cb && (!i || last)) { + r = node_cb(thiz, n, i, data); + if(r != ACERR_SUCCESS) return r; + } + if(last) { + ip--; continue; + } + next = NULL; + if(n->one) { + next = (AC_NODE_t *)n->outgoing; + } else { + while(i < n->outgoing->degree) { + next = n->outgoing->next[i]; + if(next) break; + i++; + } + } + if(!next) { + if(!n->range || i >= n->outgoing->degree) { + r = node_cb ? node_cb(thiz, n, i, data):ACERR_SUCCESS; + if(r != ACERR_SUCCESS) return r; + } + ip--; continue; + } + + if(n->depth < AC_PATTRN_MAX_LENGTH) { + path[n->depth].l = n->one ? n->one_alpha: + edge_get_alpha(n->outgoing)[i]; + if(alpha_cb) + alpha_cb(thiz, n, next, i, data); + } + + path[ip].idx = i+1; + if(ip >= AC_PATTRN_MAX_LENGTH) + continue; + + ip++; + + path[ip].n = next; + path[ip].idx = 0; + + } + return ACERR_SUCCESS; +} + + +static AC_ERROR_t ac_finalize_node(AC_AUTOMATA_t * thiz,AC_NODE_t * n, int idx, void *data) { + if(!n->ff) { + n->id = ++(thiz->id); + n->ff = 1; + if(ac_automata_union_matchstrs (n)) + return ACERR_ERROR; + if(n->use) { + if(!n->one) { + if(node_range_edges (thiz,n)) { + node_sort_edges (n); + thiz->n_range++; + } else + thiz->n_find++; + } else + thiz->n_oc++; + } + } + if(!n->a_ptr && n->outgoing && !n->one) { + n->a_ptr = edge_get_alpha(n->outgoing); + } + return ACERR_SUCCESS; +} + + /****************************************************************************** * FUNCTION: ac_automata_finalize * Locate the failure node for all nodes and collect all matched pattern for @@ -131,24 +343,51 @@ AC_ERROR_t ac_automata_add (AC_AUTOMATA_t * thiz, AC_PATTERN_t * patt) * PARAMS: * AC_AUTOMATA_t * thiz: the pointer to the automata ******************************************************************************/ -void ac_automata_finalize (AC_AUTOMATA_t * thiz) -{ - unsigned int i; - AC_ALPHABET_t *alphas; - AC_NODE_t * node; - if((alphas = ndpi_malloc(AC_PATTRN_MAX_LENGTH)) != NULL) { - ac_automata_traverse_setfailure (thiz, thiz->root, alphas); +AC_ERROR_t ac_automata_finalize (AC_AUTOMATA_t * thiz) { - for (i=0; i < thiz->all_nodes_num; i++) - { - node = thiz->all_nodes[i]; - ac_automata_union_matchstrs (node); - node_sort_edges (node); + AC_ERROR_t r = ACERR_SUCCESS; + if(!thiz->automata_open) return r; + + ac_automata_traverse_setfailure (thiz); + thiz->id=0; + thiz->n_oc = 0; + thiz->n_range = 0; + thiz->n_find = 0; + r = ac_automata_walk(thiz,ac_finalize_node,NULL,NULL); + if(r == ACERR_SUCCESS) + thiz->automata_open = 0; + return r; +} + +int ac_automata_exact_match(AC_PATTERNS_t *mp,int pos, AC_TEXT_t *txt) { + AC_PATTERN_t *patterns = mp->patterns; + AC_PATTERN_t **matched = txt->match.matched; + int i; + for(i=0; i < mp->num; i++,patterns++) { + do { + if(patterns->rep.from_start && patterns->rep.at_end) { + if(pos == txt->length && patterns->length == pos) + matched[0] = patterns; + break; + } + if(patterns->rep.from_start) { + if(patterns->length == pos) + if(!matched[1] || patterns->length > matched[1]->length) + matched[1] = patterns; + break; + } + if(patterns->rep.at_end) { + if(pos == txt->length) + if(!matched[2] || patterns->length > matched[2]->length) + matched[2] = patterns; + break; + } + if(!matched[3] || patterns->length > matched[3]->length) + matched[3] = patterns; + } while(0); } - thiz->automata_open = 0; /* do not accept patterns any more */ - ndpi_free(alphas); - } + return 0; } /****************************************************************************** @@ -167,49 +406,69 @@ void ac_automata_finalize (AC_AUTOMATA_t * thiz) * 0: success; continue searching; call-back sent me a 0 value * 1: success; stop searching; call-back sent me a non-0 value ******************************************************************************/ -int ac_automata_search (AC_AUTOMATA_t * thiz, AC_TEXT_t * txt, AC_REP_t * param) { +int ac_automata_search (AC_AUTOMATA_t * thiz, + AC_TEXT_t * txt, AC_REP_t * param) +{ + uint8_t alpha; unsigned long position; + int icase = 0,i; + AC_MATCH_t *match; AC_NODE_t *curr; AC_NODE_t *next; - AC_SEARCH_t s; - + AC_ALPHABET_t *apos; + if(thiz->automata_open) /* you must call ac_automata_locate_failure() first */ return -1; - - /* Reset search */ - s.current_node = thiz->root; - s.base_position = 0; - position = 0; - curr = s.current_node; + curr = thiz->root; + apos = txt->astring; + match = &txt->match; + memset((char*)match,0,sizeof(*match)); + icase = !thiz->to_lc; + /* The 'txt->ignore_case' option is checked + * separately otherwise clang will detect + * uninitialized memory usage much later. */ + if(txt->ignore_case == 1) icase = 1; /* This is the main search loop. * it must be keep as lightweight as possible. */ while (position < txt->length) { - if(!(next = node_findbs_next(curr, txt->astring[position]))) { - if(curr->failure_node /* we are not in the root node */) - curr = curr->failure_node; - else - position++; - } else { - curr = next; - position++; - } - - if(curr->final && next) { - /* We check 'next' to find out if we came here after a alphabet - * transition or due to a fail. in second case we should not report - * matching because it was reported in previous node */ - thiz->match.position = position + s.base_position; - thiz->match.match_num = curr->matched_patterns_num; - thiz->match.patterns = curr->matched_patterns; - /* we found a match! do call-back */ - if (thiz->match_callback(&thiz->match, txt, param)) - return 1; - } + alpha = (uint8_t)apos[position]; + if(thiz->to_lc) alpha = aho_lc[alpha]; + if(!(next = node_findbs_next_ac(curr, (uint8_t)alpha, icase))) { + if(curr->failure_node) /* we are not in the root node */ + curr = curr->failure_node; + else + position++; + } else { + curr = next; + position++; + if(curr->final) { + match->match_counter++; /* we have a matching */ + /* select best match */ + ac_automata_exact_match(curr->matched_patterns,position,txt); + if(thiz->match_handler) { + /* We check 'next' to find out if we came here after a alphabet + * transition or due to a fail. in second case we should not report + * matching because it was reported in previous node */ + match->position = position; + match->match_num = curr->matched_patterns->num; + match->patterns = curr->matched_patterns->patterns; + if (thiz->match_handler(match, txt, param)) + return 1; + } + } + } } + if(thiz->match_handler) + return match->match_counter > 0 ? 1:0; + for(i = 0; i < 4; i++) + if(txt->match.matched[i]) { + *param = (txt->match.matched[i])->rep; + return 1; + } return 0; } @@ -218,112 +477,182 @@ int ac_automata_search (AC_AUTOMATA_t * thiz, AC_TEXT_t * txt, AC_REP_t * param) * Release all allocated memories to the automata * PARAMS: * AC_AUTOMATA_t * thiz: the pointer to the automata - * uint8_t free_pattern: if true, deallocate the patterns strings + * free_pattern: + * 0 - free all struct w/o free pattern + * 1 - free all struct and pattern + * 2 - clean struct w/o free pattern ******************************************************************************/ -void ac_automata_release (AC_AUTOMATA_t * thiz, u_int8_t free_pattern) -{ - unsigned int i; - AC_NODE_t * n; - for (i=0; i < thiz->all_nodes_num; i++) - { - n = thiz->all_nodes[i]; - node_release(n, free_pattern); - } - ndpi_free(thiz->all_nodes); - ndpi_free(thiz); +static AC_ERROR_t ac_automata_release_node(AC_AUTOMATA_t * thiz, + AC_NODE_t *n, int idx, void *data) { + + if(!n->outgoing || idx) { + if(n->outgoing) { + if(n->one) thiz->n_oc--; + else if(n->range) thiz->n_range--; + else thiz->n_find--; + } + node_release(n,data != NULL); + } + + return ACERR_SUCCESS; +} +void ac_automata_release (AC_AUTOMATA_t * thiz, uint8_t free_pattern) { + + ac_automata_walk(thiz,ac_automata_release_node,NULL,free_pattern ? (void *)1:NULL); + + if(free_pattern <= 1) { + node_release(thiz->root,free_pattern | 0x4); + thiz->root = NULL; + acho_free(thiz); + } else { + AC_NODE_t *n; + thiz->all_nodes_num = 0; + thiz->total_patterns = 0; + thiz->max_str_len = 0; + thiz->automata_open = 1; + + n = thiz->root; + n->failure_node = NULL; + n->id = 0; + n->final = 0; + n->depth = 0; + if(n->outgoing) { + acho_free(n->outgoing); + n->outgoing = NULL; + } + if(n->matched_patterns) { + acho_free(n->matched_patterns); + n->matched_patterns=NULL; + } + n->use = 0; + n->one = 0; + } +} + +#ifndef __KERNEL__ + +static void dump_node_header(AC_NODE_t * n, struct aho_dump_info *ai) { + char *c; + int i; + printf("%04d: ",n->id); + if(n->failure_node) printf(" failure %04d:",n->failure_node->id); + printf(" d:%d %c",n->depth, n->use ? '+':'-'); + ai->memcnt += sizeof(*n); + if(n->matched_patterns) { + ai->memcnt += sizeof(n->matched_patterns) + + n->matched_patterns->max*sizeof(n->matched_patterns->patterns[0]); + } + if(!n->use) { printf("\n"); return; } + if(n->one) { + (ai->node_oc)++; + printf(" '%c' next->%d\n",n->one_alpha, + n->outgoing ? ((AC_NODE_t *)n->outgoing)->id : -1); + return; + } + if(!n->outgoing) { + printf(" BUG! !outgoing\n"); + return; + } + printf("%s\n",n->range ? " RANGE":""); + c = (char *)edge_get_alpha(n->outgoing); + if(n->outgoing->degree <= 8) + (ai->node_8c)++; + else + (ai->node_xc)++; + if(n->range) + (ai->node_xr)++; + for(i=0; i < n->outgoing->degree; i++) { + printf(" %d: \"%c\" -> %d\n",i,c[i], + n->outgoing->next[i] ? n->outgoing->next[i]->id:-1); + } + ai->memcnt += sizeof(n->outgoing) + edge_data_size(n->outgoing->max); +} + +static AC_ERROR_t dump_node_common(AC_AUTOMATA_t * thiz, + AC_NODE_t * n, int idx, void *data) { + struct aho_dump_info *ai = (struct aho_dump_info *)data; + char *rstr = ai->bufstr; + + if(idx) return ACERR_SUCCESS; + dump_node_header(n,ai); + if (n->matched_patterns && n->matched_patterns->num && n->final) { + char lbuf[300]; + int nl = 0,j; + + nl = snprintf(lbuf,sizeof(lbuf),"'%.100s' N:%d{",rstr,n->matched_patterns->num); + for (j=0; j<n->matched_patterns->num; j++) { + AC_PATTERN_t *sid = &n->matched_patterns->patterns[j]; + if(j) nl += snprintf(&lbuf[nl],sizeof(lbuf)-nl-1,", "); + nl += snprintf(&lbuf[nl],sizeof(lbuf)-nl-1,"%d %c%.100s%c", + sid->rep.number & 0x3fff, + sid->rep.number & 0x8000 ? '^':' ', + sid->astring, + sid->rep.number & 0x4000 ? '$':' '); + } + printf("%s}\n",lbuf); + } + return ACERR_SUCCESS; +} +static void dump_node_str(AC_AUTOMATA_t * thiz, AC_NODE_t * node, + AC_NODE_t * next, int idx, void *data) { + struct aho_dump_info *ai = (struct aho_dump_info *)data; + ai->bufstr[node->depth] = thiz->ac_path[node->depth].l; + ai->bufstr[node->depth+1] = 0; } /****************************************************************************** - * FUNCTION: ac_automata_display + * FUNCTION: ac_automata_dump * Prints the automata to output in human readable form. it is useful for * debugging purpose. * PARAMS: * AC_AUTOMATA_t * thiz: the pointer to the automata + * rstr: char[] buffer + * rstr_size: size of rstr buffser * char repcast: 'n': print AC_REP_t as number, 's': print AC_REP_t as string ******************************************************************************/ -void ac_automata_display (AC_AUTOMATA_t * thiz, char repcast) -{ - unsigned int i, j; - AC_NODE_t * n; - struct edge * e; - AC_PATTERN_t sid; - - printf("---------------------------------\n"); - - for (i=0; i<thiz->all_nodes_num; i++) - { - n = thiz->all_nodes[i]; - printf("NODE(%3d)/----fail----> NODE(%3d)\n", - n->id, (n->failure_node)?n->failure_node->id:1); - for (j=0; j<n->outgoing_degree; j++) - { - e = &n->outgoing[j]; - printf(" |----("); - if(isgraph(e->alpha)) - printf("%c)---", e->alpha); - else - printf("0x%x)", e->alpha); - printf("--> NODE(%3d)\n", e->next->id); - } - if (n->matched_patterns_num) { - printf("Accepted patterns: {"); - for (j=0; j<n->matched_patterns_num; j++) - { - sid = n->matched_patterns[j]; - if(j) printf(", "); - switch (repcast) - { - case 'n': - printf("%u/%u/%u", - sid.rep.number, - sid.rep.category, - sid.rep.breed); - break; - } - } - printf("}\n"); - } - printf("---------------------------------\n"); - } -} -/****************************************************************************** - * FUNCTION: ac_automata_register_nodeptr - * Adds the node pointer to all_nodes. - ******************************************************************************/ -static void ac_automata_register_nodeptr (AC_AUTOMATA_t * thiz, AC_NODE_t * node) -{ - if(thiz->all_nodes_num >= thiz->all_nodes_max) - { - thiz->all_nodes = ndpi_realloc(thiz->all_nodes, - thiz->all_nodes_max*sizeof(AC_NODE_t *), - (REALLOC_CHUNK_ALLNODES+thiz->all_nodes_max)*sizeof(AC_NODE_t *) - ); - thiz->all_nodes_max += REALLOC_CHUNK_ALLNODES; - } - thiz->all_nodes[thiz->all_nodes_num++] = node; +void ac_automata_dump(AC_AUTOMATA_t * thiz, char *rstr, size_t rstr_size, char repcast) { + struct aho_dump_info ai; + + memset((char *)&ai,0,sizeof(ai)); + + printf("---DUMP- all nodes %u - max strlen %u -%s---\n", + (unsigned int)thiz->all_nodes_num, + (unsigned int)thiz->max_str_len, + thiz->automata_open ? "open":"ready"); + printf("root: %px\n",thiz->root); + *rstr = '\0'; + ai.bufstr = rstr; + ai.bufstr_len = rstr_size; + + ac_automata_walk(thiz,dump_node_common,dump_node_str,(void *)&ai); + printf("---\n mem size %zu avg node size %d, node one char %d, <=8c %d, >8c %d, range %d\n---DUMP-END-\n", + ai.memcnt,(int)ai.memcnt/(thiz->all_nodes_num+1),(int)ai.node_oc,(int)ai.node_8c,(int)ai.node_xc,(int)ai.node_xr); } +#endif /****************************************************************************** * FUNCTION: ac_automata_union_matchstrs * Collect accepted patterns of the node. the accepted patterns consist of the * node's own accepted pattern plus accepted patterns of its failure node. ******************************************************************************/ -static void ac_automata_union_matchstrs (AC_NODE_t * node) +static int ac_automata_union_matchstrs (AC_NODE_t * node) { unsigned int i; - AC_NODE_t * m = node; + AC_NODE_t * m; - while ((m = m->failure_node)) - { - for (i=0; i < m->matched_patterns_num; i++) - node_register_matchstr(node, &(m->matched_patterns[i]), 1 /* this is an existing node */); + for (m = node; m; m = m->failure_node) { + if(!m->matched_patterns) continue; - if (m->final) - node->final = 1; - } - // TODO : sort matched_patterns? is that necessary? I don't think so. + for (i=0; i < m->matched_patterns->num; i++) + if(node_register_matchstr(node, &(m->matched_patterns->patterns[i]), 1)) + return 1; + + if (m->final) + node->final = 1; + } + return 0; } /****************************************************************************** @@ -331,24 +660,24 @@ static void ac_automata_union_matchstrs (AC_NODE_t * node) * find failure node for the given node. ******************************************************************************/ static void ac_automata_set_failure -(AC_AUTOMATA_t * thiz, AC_NODE_t * node, AC_ALPHABET_t * alphas) +(AC_AUTOMATA_t * thiz, AC_NODE_t * node, AC_NODE_t * next, int idx, void *data) { unsigned int i, j; AC_NODE_t * m; + struct ac_path * path = thiz->ac_path; - for (i=1; i < node->depth; i++) - { - m = thiz->root; - for (j=i; j < node->depth && m; j++) - m = node_find_next (m, alphas[j]); - if (m) - { - node->failure_node = m; - break; - } + for (i=1; i < next->depth; i++) { + m = thiz->root; + for (j=i; j < next->depth && m; j++) { + m = node_find_next (m, path[j].l); + } + if (m) { + next->failure_node = m; + break; + } } - if (!node->failure_node) - node->failure_node = thiz->root; + if (!next->failure_node) + next->failure_node = thiz->root; } /****************************************************************************** @@ -358,20 +687,509 @@ static void ac_automata_set_failure * called after adding last pattern to automata. i.e. after calling this you * can not add further pattern to automata. ******************************************************************************/ -static void ac_automata_traverse_setfailure -(AC_AUTOMATA_t * thiz, AC_NODE_t * node, AC_ALPHABET_t * alphas) +static inline void ac_automata_traverse_setfailure (AC_AUTOMATA_t * thiz) +{ + ac_automata_walk(thiz,NULL,ac_automata_set_failure,NULL); +} + +/****************************************************************************** + * FUNCTION: node_create + * Create the node + ******************************************************************************/ +static inline AC_NODE_t * node_create(void) +{ + return (AC_NODE_t *) acho_calloc (1,sizeof(AC_NODE_t)); +} + + +static void node_release_pattern(AC_NODE_t * thiz) +{ + int i; + AC_PATTERN_t * str; + + if(!thiz->matched_patterns) return; + str = thiz->matched_patterns->patterns; + + for (i=0; i < thiz->matched_patterns->num; str++,i++) + { + if(!str->is_existing && str->astring) { + acho_free(str->astring); + str->astring = NULL; + } + } +} + + +/****************************************************************************** + * FUNCTION: node_release + * Release node + ******************************************************************************/ +static void node_release(AC_NODE_t * thiz, int free_pattern) +{ + if(thiz->root && (free_pattern & 0x4) == 0) return; + + if(free_pattern & 1) node_release_pattern(thiz); + + if(thiz->matched_patterns) { + acho_free(thiz->matched_patterns); + thiz->matched_patterns = NULL; + } + if(!thiz->one && thiz->outgoing) { + acho_free(thiz->outgoing); + } + thiz->outgoing = NULL; + acho_free(thiz); +} + +/* Nonzero if X is not aligned on a "long" boundary. */ +#define UNALIGNED(X) ((long)X & (__SIZEOF_LONG__ - 1)) +#define LBLOCKSIZE __SIZEOF_LONG__ + +#if __SIZEOF_LONG__ == 4 +#define DETECTNULL(X) (((X) - 0x01010101UL) & ~(X) & 0x80808080UL) +#define DUPC 0x01010101UL + +static inline size_t bsf(uint32_t bits) +{ +#ifdef __GNUC__ + return __builtin_ctz(bits); +#else + size_t i=0; + if(!bits) return i; + if((bits & 0xffff)bits == 0) { i+=16; bits >>=16; } + if((bits & 0xff) == 0) i+=8; + return i; +#endif +} + +#else +#define DETECTNULL(X) (((X) - 0x0101010101010101ULL) & ~(X) & 0x8080808080808080ULL) +#define DUPC 0x0101010101010101UL + +static inline size_t bsf(uint64_t bits) +{ +#ifdef __GNUC__ + return __builtin_ctzll(bits); +#else + size_t i=0; + if(!bits) return i; + if((bits & 0xffffffff) == 0) { i+=32; bits >>=32; } + if((bits & 0xffff) == 0) { i+=16; bits >>=16; } + if((bits & 0xff) == 0) i+=8; + return i; +#endif +} +#endif + +static inline char * +xmemchr(char *s, char i,int n) +{ + unsigned char c = (unsigned char)i; + + while(n > 0) { + if (n >= LBLOCKSIZE && !UNALIGNED (s)) { + unsigned long int mask; + mask = c * DUPC; + + while (n >= LBLOCKSIZE) { + unsigned long int nc = DETECTNULL((*(unsigned long int *)s) ^ mask); + if(nc) + return s + (bsf(nc) >> 3); + s += LBLOCKSIZE; + n -= LBLOCKSIZE; + } + if(!n) return NULL; + } + if (*s == c) return s; + s++; + n--; + } + return NULL; +} + + +/****************************************************************************** + * FUNCTION: node_find_next + * Find out the next node for a given Alpha to move. this function is used in + * the pre-processing stage in which edge array is not sorted. so it uses + * linear search. + ******************************************************************************/ +static AC_NODE_t * node_find_next(AC_NODE_t * thiz, AC_ALPHABET_t alpha) +{ + AC_ALPHABET_t *alphas, *fc; + + if(thiz->one) return alpha == thiz->one_alpha ? (AC_NODE_t *)thiz->outgoing:NULL; + if(!thiz->outgoing) return NULL; + + alphas = edge_get_alpha(thiz->outgoing); + fc = xmemchr((char *)alphas,(char)alpha,thiz->outgoing->degree); + return fc ? thiz->outgoing->next[fc-alphas] : NULL; +} + + +/****************************************************************************** + * FUNCTION: node_findbs_next + * Find out the next node for a given Alpha. this function is used after the + * pre-processing stage in which we sort edges. so it uses Binary Search. + ******************************************************************************/ + +static inline AC_NODE_t *node_findbs_next (AC_NODE_t * thiz, uint8_t alpha) +{ + + if(thiz->one) + return alpha == thiz->one_alpha ? (AC_NODE_t *)thiz->outgoing:NULL; + + if(!(thiz->outgoing->cmap[(uint8_t)alpha >> 5] & (1u << (alpha & 0x1f)))) + return NULL; + + if(thiz->range) + return thiz->outgoing->next[(uint8_t)alpha - (uint8_t)thiz->one_alpha]; + + return thiz->outgoing->next[ + xmemchr((char *)thiz->a_ptr,(char)alpha,thiz->outgoing->degree) - thiz->a_ptr]; +} + +static AC_NODE_t *node_findbs_next_ac (AC_NODE_t * thiz, uint8_t alpha,int icase) { + AC_NODE_t *next; + uint8_t alpha_c; + + if(!thiz->outgoing) return NULL; + + next = node_findbs_next(thiz,alpha); + if(next || !icase) return next; + + alpha_c = aho_xc[alpha]; + if(!alpha_c) return NULL; + return node_findbs_next(thiz, alpha ^ alpha_c); +} + +/****************************************************************************** + * FUNCTION: node_has_matchstr + * Determine if a final node contains a pattern in its accepted pattern list + * or not. return values: 1 = it has, 0 = it hasn't + ******************************************************************************/ +static int node_has_matchstr (AC_NODE_t * thiz, AC_PATTERN_t * newstr) +{ + int i; + AC_PATTERN_t * str; + if(!thiz->matched_patterns) return 0; + str = thiz->matched_patterns->patterns; + + for (i=0; i < thiz->matched_patterns->num; str++,i++) + { + if (str->length != newstr->length) + continue; + + if(!memcmp(str->astring,newstr->astring,str->length)) + return 1; + + } + return 0; +} + +/****************************************************************************** + * FUNCTION: node_create_next + * Create the next node for the given alpha. + ******************************************************************************/ +static AC_NODE_t * node_create_next (AC_NODE_t * thiz, AC_ALPHABET_t alpha) { - unsigned int i; AC_NODE_t * next; + next = node_find_next (thiz, alpha); + if (next) + /* The edge already exists */ + return NULL; + /* Otherwise register new edge */ + next = node_create (); + if(next) { + if(node_register_outgoing(thiz, next, alpha)) { + node_release(next,0); + return NULL; + } + next->depth = thiz->depth+1; + } + + return next; +} + +static inline int mp_data_size(int n) { + return sizeof(AC_PATTERNS_t) + n*sizeof(AC_PATTERN_t); +} - for (i=0; i < node->outgoing_degree; i++) { - alphas[node->depth] = node->outgoing[i].alpha; - next = node->outgoing[i].next; +static AC_PATTERNS_t * node_resize_mp(AC_PATTERNS_t *m) { +AC_PATTERNS_t *new_m; - /* At every node look for its failure node */ - ac_automata_set_failure (thiz, next, alphas); + if(!m) { + m = acho_calloc(1,mp_data_size(REALLOC_CHUNK_MATCHSTR)); + if(!m) return m; + m->max = REALLOC_CHUNK_MATCHSTR; + return m; + } + new_m = acho_malloc(mp_data_size(m->max+REALLOC_CHUNK_MATCHSTR)); + if(!new_m) return new_m; + memcpy((char *)new_m,(char *)m,mp_data_size(m->max)); + new_m->max += REALLOC_CHUNK_MATCHSTR; + acho_free(m); + return new_m; +} + +/****************************************************************************** + * FUNCTION: node_register_matchstr + * Adds the pattern to the list of accepted pattern. + ******************************************************************************/ +static int node_register_matchstr (AC_NODE_t * thiz, AC_PATTERN_t * str,int is_existing) +{ + AC_PATTERN_t *l; + + if(!is_existing) + thiz->final = 1; + /* Check if the new pattern already exists in the node list */ + if (thiz->matched_patterns && node_has_matchstr(thiz, str)) + return 0; + + if(!thiz->matched_patterns) + thiz->matched_patterns = node_resize_mp(thiz->matched_patterns); - /* Recursively call itself to traverse all nodes */ - ac_automata_traverse_setfailure (thiz, next, alphas); + /* Manage memory */ + if (thiz->matched_patterns->num >= thiz->matched_patterns->max) { + AC_PATTERNS_t *new_mp = node_resize_mp(thiz->matched_patterns); + if(!new_mp) return 1; + thiz->matched_patterns = new_mp; + } + l = &thiz->matched_patterns->patterns[thiz->matched_patterns->num]; + l->astring = str->astring; + l->length = str->length; + l->is_existing = is_existing; + l->rep = str->rep; + thiz->matched_patterns->num++; + return 0; +} + +static struct edge *node_resize_outgoing(struct edge * e,size_t added) { +struct edge *new_e; +int ds; + + if(!added) added = REALLOC_CHUNK_OUTGOING; + if(!e) { + e = acho_calloc(1,sizeof(struct edge) + edge_data_size(REALLOC_CHUNK_OUTGOING)); + if(!e) return e; + e->max = REALLOC_CHUNK_OUTGOING; + return e; + } + ds = edge_data_size(e->max + added); + new_e = acho_calloc(1,sizeof(struct edge) + ds); + if(!new_e) return new_e; + memcpy(new_e,e,sizeof(struct edge) + sizeof(AC_NODE_t *)*e->max); + new_e->max += added; + + if(e->degree) + memcpy(edge_get_alpha(new_e),edge_get_alpha(e),e->degree); + + acho_free(e); + return new_e; +} + +/****************************************************************************** + * FUNCTION: node_register_outgoing + * Establish an edge between two nodes + ******************************************************************************/ +static int node_register_outgoing +(AC_NODE_t * thiz, AC_NODE_t * next, AC_ALPHABET_t alpha) +{ + struct edge *o; + if(!thiz->use) { + thiz->use = 1; + thiz->one = 1; + thiz->one_alpha = alpha; + thiz->outgoing = (struct edge *)next; + return 0; } + if(thiz->one) { + o = node_resize_outgoing(NULL,0); + if(!o) return 1; + o->next[0] = (AC_NODE_t *)thiz->outgoing; + *edge_get_alpha(o) = thiz->one_alpha; + o->degree = 1; + thiz->one = 0; + thiz->one_alpha = 0; + thiz->outgoing = o; + } else + o = thiz->outgoing; + + if(!o) return 1; + + if(o->degree >= o->max) + { + struct edge *new_o = node_resize_outgoing(thiz->outgoing,0); + if(!new_o) return 1; + + thiz->outgoing = new_o; + o = new_o; + } + edge_get_alpha(o)[o->degree] = alpha; + o->next[o->degree] = next; + o->degree++; + return 0; } + +/****************************************************************************** + * FUNCTION: node_edge_compare + * Comparison function for qsort. see man qsort. + ******************************************************************************/ +static int node_edge_compare (struct edge * e, int a, int b) { + AC_ALPHABET_t *c = edge_get_alpha(e); + return c[a] >= c[b] ? 1:0; +} + +static void node_edge_swap (struct edge * e, int a, int b) +{ +AC_ALPHABET_t *c,tc; +AC_NODE_t *tn; + c = edge_get_alpha(e); + tc = c[a]; c[a] = c[b]; c[b] = tc; + tn = e->next[a]; e->next[a] = e->next[b]; e->next[b] = tn; +} + +/****************************************************************************** + * FUNCTION: acho_2range + * Adds missing characters in the range low - high + ******************************************************************************/ +static void acho_2range(AC_NODE_t * thiz,uint8_t low, uint8_t high) { + struct edge *e; + int i; + uint8_t *c = (uint8_t *)edge_get_alpha(thiz->outgoing); + + thiz->range = 1; + thiz->one_alpha = (AC_ALPHABET_t)low; + e = thiz->outgoing; + for (i=0; low <= high && i < e->max; i++,low++) { + if(e->cmap[(low >> 5) & 0x7] & (1u << (low & 0x1f))) continue; + c[e->degree] = low; + e->next[e->degree] = NULL; + e->degree++; + } +} + +/****************************************************************************** + * FUNCTION: node_range_edges + * Converts to a range if possible. + ******************************************************************************/ +static int node_range_edges (AC_AUTOMATA_t *thiz, AC_NODE_t * node) +{ + struct edge *e = node->outgoing; + uint8_t *c = (uint8_t *)edge_get_alpha(node->outgoing); + uint8_t low = 0xff,high = 0; + int i; + + memset((char *)&e->cmap,0,sizeof(e->cmap)); + for(i = 0; i < e->degree; i++) { + uint8_t cc = c[i]; + if(cc < low) low = cc; + if(cc > high) high = cc; + e->cmap[(cc >> 5) & 0x7] |= 1u << (cc & 0x1f); + } + if(high - low + 1 == e->degree) { + node->range = 1; + node->one_alpha = (AC_ALPHABET_t)low; + return 1; + } + if(high - low + 1 < e->max) { + acho_2range(node,low,high); + return 1; + } + +// if(e->degree < __SIZEOF_LONG__) return 0; + + i = (high - low)/8; + if (i < thiz->add_to_range) i = thiz->add_to_range; + i += REALLOC_CHUNK_OUTGOING-1; + i -= i % REALLOC_CHUNK_OUTGOING; + + if(high - low + 1 < e->max + i) { + int added = (high - low + 1) - e->max; + struct edge *new_o = node_resize_outgoing(node->outgoing,added); + if(new_o) { + node->outgoing = new_o; + acho_2range(node,low,high); + return 1; + } + return 0; + } + + if(node->root && !thiz->no_root_range) { + struct edge *new_o; + int added = (high - low + 1) - e->max; + new_o = node_resize_outgoing(node->outgoing,added); + if(new_o) { + node->outgoing = new_o; + acho_2range(node,low,high); + return 1; + } + } + return 0; +} +/****************************************************************************** + * FUNCTION: node_sort_edges + * sorts edges alphabets. + ******************************************************************************/ +static inline void node_sort_edges (AC_NODE_t * thiz) +{ + + acho_sort (thiz->outgoing, thiz->outgoing->degree, + node_edge_compare, node_edge_swap); +} + +/** + * sort - sort an array of elements + * @base: pointer to data to sort + * @num: number of elements + * @size: size of each element + * @cmp_func: pointer to comparison function + * @swap_func: pointer to swap function or NULL + * + * This function does a heapsort on the given array. You may provide a + * swap_func function optimized to your element type. + * + * Sorting time is O(n log n) both on average and worst-case. While + * qsort is about 20% faster on average, it suffers from exploitable + * O(n*n) worst-case behavior and extra memory requirements that make + * it less suitable for kernel use. + */ + + void acho_sort(struct edge *e, size_t num, + int (*cmp_func)(struct edge *e, int a, int b), + void (*swap_func)(struct edge *e, int a, int b)) +{ + /* pre-scale counters for performance */ + int i = (num/2 - 1) , n = num, c, r; + + if (!swap_func) return; + if (!cmp_func) return; + + /* heapify */ + for ( ; i >= 0; i -= 1) { + for (r = i; r * 2 + 1 < n; r = c) { + c = r * 2 + 1; + if (c < n - 1 && cmp_func(e, c, c + 1) == 0) + c += 1; + if (cmp_func(e, r, c) != 0) + break; + swap_func(e, r, c); + } + } + + /* sort */ + for (i = n - 1; i > 0; i -= 1) { + swap_func(e,0,i); + for (r = 0; r * 2 + 1 < i; r = c) { + c = r * 2 + 1; + if (c < i - 1 && cmp_func(e, c, c + 1) == 0) + c += 1; + if (cmp_func(e, r, c) != 0) + break; + swap_func(e, r, c); + } + } +} + +/* vim: set ts=4 sw=4 et : */ + diff --git a/src/lib/third_party/src/node.c b/src/lib/third_party/src/node.c deleted file mode 100644 index 10a434308..000000000 --- a/src/lib/third_party/src/node.c +++ /dev/null @@ -1,274 +0,0 @@ -/* - * node.c: implementation of automata node - * This file is part of multifast. - * - Copyright 2010-2012 Kamiar Kanani <kamiar.kanani@gmail.com> - - multifast is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - multifast is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public License - along with multifast. If not, see <http://www.gnu.org/licenses/>. -*/ - -#include <stdio.h> -#include <string.h> -#include <stdlib.h> -#include "ndpi_api.h" -#include "../include/node.h" -#include "sort.h" - -/* reallocation step for AC_NODE_t.matched_patterns */ -#define REALLOC_CHUNK_MATCHSTR 8 - -/* reallocation step for AC_NODE_t.outgoing array */ -#define REALLOC_CHUNK_OUTGOING 8 -/* TODO: For different depth of node, number of outgoing edges differs - considerably, It is efficient to use different chunk size for - different depths */ - -/* Private function prototype */ -void node_init (AC_NODE_t * thiz); -int node_edge_compare (const void * l, const void * r); -int node_has_matchstr (AC_NODE_t * thiz, AC_PATTERN_t * newstr); - - -/****************************************************************************** - * FUNCTION: node_create - * Create the node - ******************************************************************************/ -AC_NODE_t * node_create(void) -{ - AC_NODE_t * thiz = (AC_NODE_t *) ndpi_malloc (sizeof(AC_NODE_t)); - node_init(thiz); - node_assign_id(thiz); - return thiz; -} - -/****************************************************************************** - * FUNCTION: node_init - * Initialize node - ******************************************************************************/ -void node_init(AC_NODE_t * thiz) -{ - memset(thiz, 0, sizeof(AC_NODE_t)); - - thiz->outgoing_max = REALLOC_CHUNK_OUTGOING; - thiz->outgoing = (struct edge *) ndpi_malloc - (thiz->outgoing_max*sizeof(struct edge)); - - thiz->matched_patterns_max = REALLOC_CHUNK_MATCHSTR; - thiz->matched_patterns = (AC_PATTERN_t *) ndpi_malloc - (thiz->matched_patterns_max*sizeof(AC_PATTERN_t)); -} - -/****************************************************************************** - * FUNCTION: node_release - * Release node - ******************************************************************************/ -void node_release(AC_NODE_t * thiz, u_int8_t free_pattern) -{ - if(free_pattern) { - for(int i=0; i<thiz->matched_patterns_num; i++) { - if(!thiz->matched_patterns[i].is_existing) - ndpi_free(thiz->matched_patterns[i].astring); - } - } - - ndpi_free(thiz->matched_patterns); - ndpi_free(thiz->outgoing); - ndpi_free(thiz); -} - -/****************************************************************************** - * FUNCTION: node_find_next - * Find out the next node for a given Alpha to move. this function is used in - * the pre-processing stage in which edge array is not sorted. so it uses - * linear search. - ******************************************************************************/ -AC_NODE_t * node_find_next(AC_NODE_t * thiz, AC_ALPHABET_t alpha) -{ - int i; - - for (i=0; i < thiz->outgoing_degree; i++) - { - if(thiz->outgoing[i].alpha == alpha) - return (thiz->outgoing[i].next); - } - return NULL; -} - -/****************************************************************************** - * FUNCTION: node_findbs_next - * Find out the next node for a given Alpha. this function is used after the - * pre-processing stage in which we sort edges. so it uses Binary Search. - ******************************************************************************/ -AC_NODE_t * node_findbs_next (AC_NODE_t * thiz, AC_ALPHABET_t alpha) -{ - int min, max, mid; - AC_ALPHABET_t amid; - - min = 0; - max = thiz->outgoing_degree - 1; - - while (min <= max) - { - mid = (min+max) >> 1; - amid = thiz->outgoing[mid].alpha; - if (alpha > amid) - min = mid + 1; - else if (alpha < amid) - max = mid - 1; - else - return (thiz->outgoing[mid].next); - } - return NULL; -} - -/****************************************************************************** - * FUNCTION: node_has_matchstr - * Determine if a final node contains a pattern in its accepted pattern list - * or not. return values: 1 = it has, 0 = it hasn't - ******************************************************************************/ -int node_has_matchstr (AC_NODE_t * thiz, AC_PATTERN_t * newstr) -{ - int i; -#if 0 - int j; -#endif - AC_PATTERN_t * str; - - for (i=0; i < thiz->matched_patterns_num; i++) - { - str = &thiz->matched_patterns[i]; - - if (str->length != newstr->length) - continue; - -#if 0 - /* Original code */ - for (j=0; j<(int)str->length; j++) - if(str->astring[j] != newstr->astring[j]) - continue; - - if (j == str->length) - return 1; -#else - /* https://github.com/ntop/nDPI/issues/837 */ - if (strncmp(str->astring, newstr->astring, str->length) == 0) - return 1; -#endif - } - return 0; -} - -/****************************************************************************** - * FUNCTION: node_create_next - * Create the next node for the given alpha. - ******************************************************************************/ -AC_NODE_t * node_create_next (AC_NODE_t * thiz, AC_ALPHABET_t alpha) -{ - AC_NODE_t * next; - next = node_find_next (thiz, alpha); - if (next) - /* The edge already exists */ - return NULL; - /* Otherwise register new edge */ - next = node_create (); - node_register_outgoing(thiz, next, alpha); - - return next; -} - -/****************************************************************************** - * FUNCTION: node_register_matchstr - * Adds the pattern to the list of accepted pattern. - ******************************************************************************/ -void node_register_matchstr (AC_NODE_t * thiz, AC_PATTERN_t * str, u_int8_t is_existing) -{ - /* Check if the new pattern already exists in the node list */ - if (node_has_matchstr(thiz, str)) - return; - - /* Manage memory */ - if (thiz->matched_patterns_num >= thiz->matched_patterns_max) - { - thiz->matched_patterns = (AC_PATTERN_t *) ndpi_realloc - (thiz->matched_patterns, thiz->matched_patterns_max*sizeof(AC_PATTERN_t), - (REALLOC_CHUNK_MATCHSTR+thiz->matched_patterns_max)*sizeof(AC_PATTERN_t)); - - thiz->matched_patterns_max += REALLOC_CHUNK_MATCHSTR; - } - - thiz->matched_patterns[thiz->matched_patterns_num].astring = str->astring; - thiz->matched_patterns[thiz->matched_patterns_num].length = str->length; - thiz->matched_patterns[thiz->matched_patterns_num].is_existing = is_existing; - memcpy(&thiz->matched_patterns[thiz->matched_patterns_num].rep, &str->rep, sizeof(AC_REP_t)); - thiz->matched_patterns_num++; -} - -/****************************************************************************** - * FUNCTION: node_register_outgoing - * Establish an edge between two nodes - ******************************************************************************/ -void node_register_outgoing -(AC_NODE_t * thiz, AC_NODE_t * next, AC_ALPHABET_t alpha) -{ - if(thiz->outgoing_degree >= thiz->outgoing_max) - { - thiz->outgoing = (struct edge *) ndpi_realloc - (thiz->outgoing, thiz->outgoing_max*sizeof(struct edge), - (REALLOC_CHUNK_OUTGOING+thiz->outgoing_max)*sizeof(struct edge)); - thiz->outgoing_max += REALLOC_CHUNK_OUTGOING; - } - - thiz->outgoing[thiz->outgoing_degree].alpha = alpha; - thiz->outgoing[thiz->outgoing_degree++].next = next; -} - -/****************************************************************************** - * FUNCTION: node_assign_id - * assign a unique ID to the node (used for debugging purpose). - ******************************************************************************/ -void node_assign_id (AC_NODE_t * thiz) -{ - static int unique_id = 1; - thiz->id = unique_id ++; -} - -/****************************************************************************** - * FUNCTION: node_edge_compare - * Comparison function for qsort. see man qsort. - ******************************************************************************/ -int node_edge_compare (const void * l, const void * r) -{ - /* According to man page: - * The comparison function must return an integer less than, equal to, or - * greater than zero if the first argument is considered to be - * respectively less than, equal to, or greater than the second. if two - * members compare as equal, their order in the sorted array is undefined. - * - * NOTE: Because edge alphabets are unique in every node we ignore - * equivalence case. - **/ - if ( ((struct edge *)l)->alpha >= ((struct edge *)r)->alpha ) - return 1; - else - return -1; -} - -/****************************************************************************** - * FUNCTION: node_sort_edges - * sorts edges alphabets. - ******************************************************************************/ -void node_sort_edges (AC_NODE_t * thiz) -{ - sort ((void *)thiz->outgoing, thiz->outgoing_degree, sizeof(struct edge), node_edge_compare, NULL); -} diff --git a/src/lib/third_party/src/sort.c b/src/lib/third_party/src/sort.c deleted file mode 100644 index 35c8e9fdf..000000000 --- a/src/lib/third_party/src/sort.c +++ /dev/null @@ -1,92 +0,0 @@ -/* - * A fast, small, non-recursive O(nlog n) sort for the Linux kernel - * - * Jan 23 2005 Matt Mackall <mpm@selenic.com> - */ - -#ifdef WIN32 -#include <stdint.h> -typedef uint32_t u_int32_t; -#endif - -#include <stdlib.h> -#include <stdio.h> -#include <sys/types.h> - -/* This is a function ported from the Linux kernel lib/sort.c */ - -static void u_int32_t_swap(void *a, void *b, int size) -{ - u_int32_t t = *(u_int32_t *)a; - *(u_int32_t *)a = *(u_int32_t *)b; - *(u_int32_t *)b = t; -} - -static void generic_swap(void *_a, void *_b, int size) -{ - char t; - char *a = (char*)_a; - char *b = (char*)_b; - - do { - t = *a; - *a++ = *b; - *b++ = t; - } while (--size > 0); -} - -/** - * sort - sort an array of elements - * @base: pointer to data to sort - * @num: number of elements - * @size: size of each element - * @cmp_func: pointer to comparison function - * @swap_func: pointer to swap function or NULL - * - * This function does a heapsort on the given array. You may provide a - * swap_func function optimized to your element type. - * - * Sorting time is O(n log n) both on average and worst-case. While - * qsort is about 20% faster on average, it suffers from exploitable - * O(n*n) worst-case behavior and extra memory requirements that make - * it less suitable for kernel use. - */ - -void sort(void *_base, size_t num, size_t size, - int (*cmp_func)(const void *, const void *), - void (*swap_func)(void *, void *, int size)) -{ - /* pre-scale counters for performance */ - int i = (num/2 - 1) * size, n = num * size, c, r; - char *base = (char*)_base; - - if (!swap_func) - swap_func = (size == 4 ? u_int32_t_swap : generic_swap); - - /* heapify */ - for ( ; i >= 0; i -= size) { - for (r = i; r * 2 + size < n; r = c) { - c = r * 2 + size; - if (c < n - size && - cmp_func(base + c, base + c + size) < 0) - c += size; - if (cmp_func(base + r, base + c) >= 0) - break; - swap_func(base + r, base + c, size); - } - } - - /* sort */ - for (i = n - size; i > 0; i -= size) { - swap_func(base, base + i, size); - for (r = 0; r * 2 + size < i; r = c) { - c = r * 2 + size; - if (c < i - size && - cmp_func(base + c, base + c + size) < 0) - c += size; - if (cmp_func(base + r, base + c) >= 0) - break; - swap_func(base + r, base + c, size); - } - } -} |