diff options
Diffstat (limited to 'src/lib/third_party/include')
-rw-r--r-- | src/lib/third_party/include/actypes.h | 136 | ||||
-rw-r--r-- | src/lib/third_party/include/ahocorasick.h | 246 | ||||
-rw-r--r-- | src/lib/third_party/include/node.h | 66 | ||||
-rw-r--r-- | src/lib/third_party/include/sort.h | 6 |
4 files changed, 219 insertions, 235 deletions
diff --git a/src/lib/third_party/include/actypes.h b/src/lib/third_party/include/actypes.h deleted file mode 100644 index b775f6ac9..000000000 --- a/src/lib/third_party/include/actypes.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * actypes.h: Includes basic data types of ahocorasick library - * This file is part of multifast. - * - Copyright 2010-2012 Kamiar Kanani <kamiar.kanani@gmail.com> - - multifast is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - multifast is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public License - along with multifast. If not, see <http://www.gnu.org/licenses/>. -*/ - -#ifndef _AC_TYPES_H_ -#define _AC_TYPES_H_ - -/* AC_ALPHABET_t: - * defines the alphabet type. - * Actually defining AC_ALPHABET_t as a char will work, but sometimes we deal - * with streams of other (bigger) types e.g. integers, specific enum, objects. - * Although they consists of string of bytes (chars), but using their specific - * types for AC_ALPHABET_t will lead to a better performance. so instead of - * dealing with strings of chars, we assume dealing with strings of - * AC_ALPHABET_t and leave it optional for other developers to define their - * own alphabets. - **/ -typedef char AC_ALPHABET_t; - -/* AC_REP_t: - * Provides a more readable representative for a pattern. - * because patterns themselves are not always suitable for displaying - * (e.g. for hex patterns), we offer this type to improve intelligibility - * of output. furthermore, sometimes it is useful, for example while - * retrieving patterns from a database, to maintain their identifiers in the - * automata for further reference. we provisioned two possible types as a - * union for this purpose. you can add your desired type in it. - **/ -typedef struct { - u_int32_t number; /* Often used to store procotolId */ - u_int16_t category, breed; -} AC_REP_t; - -/* AC_PATTERN_t: - * This is the pattern type that must be fed into AC automata. - * the 'astring' field is not null-terminated, due to it can contain zero - * value bytes. the 'length' field determines the number of AC_ALPHABET_t it - * carries. the 'representative' field is described in AC_REP_t. despite - * 'astring', 'representative' can have duplicate values for different given - * AC_PATTERN_t. it is an optional field and you can just fill it with 0. - * CAUTION: - * Not always the 'astring' points to the correct position in memory. - * it is the responsibility of your program to maintain a permanent allocation - * for astring field of the added pattern to automata. - **/ -typedef struct -{ - AC_ALPHABET_t * astring; /* String of alphabets */ - unsigned int length; /* Length of pattern */ - u_int8_t is_existing; /* 1 if the node is already part of another AC_PATTERN_t */ - AC_REP_t rep; /* Representative string (optional) */ -} AC_PATTERN_t; - -/* AC_TEXT_t: - * The input text type that is fed to ac_automata_search() to be searched. - * it is similar to AC_PATTERN_t. actually we could use AC_PATTERN_t as input - * text, but for the purpose of being more readable, we defined this new type. - **/ -typedef struct -{ - AC_ALPHABET_t * astring; /* String of alphabets */ - unsigned int length; /* Length of string */ -} AC_TEXT_t; - -/* AC_MATCH_t: - * Provides the structure for reporting a match event. - * a match event occurs when the automata reaches a final node. any final - * node can match one or more pattern at a position in a text. the - * 'patterns' field holds these matched patterns. obviously these - * matched patterns have same end-position in the text. there is a relationship - * between matched patterns: the shorter one is a factor (tail) of the longer - * one. the 'position' maintains the end position of matched patterns. the - * start position of patterns could be found by knowing their 'length' in - * AC_PATTERN_t. e.g. suppose "recent" and "cent" are matched at - * position 40 in the text, then the start position of them are 34 and 36 - * respectively. finally the field 'match_num' maintains the number of - * matched patterns. - **/ -typedef struct -{ - AC_PATTERN_t * patterns; /* Array of matched pattern */ - long position; /* The end position of matching pattern(s) in the text */ - unsigned int match_num; /* Number of matched patterns */ -} AC_MATCH_t; - -/* AC_ERROR_t: - * Error that may occur while adding a pattern to the automata. - * it is returned by ac_automata_add(). - **/ -typedef enum - { - ACERR_SUCCESS = 0, /* No error occurred */ - ACERR_DUPLICATE_PATTERN, /* Duplicate patterns */ - ACERR_LONG_PATTERN, /* Pattern length is longer than AC_PATTRN_MAX_LENGTH */ - ACERR_ZERO_PATTERN, /* Empty pattern (zero length) */ - ACERR_AUTOMATA_CLOSED, /* Automata is closed. after calling - ac_automata_finalize() you can not add new patterns to the automata. */ - } AC_ERROR_t; - -/* MATCH_CALLBACK_t: - * This is the call-back function type that must be given to automata at - * initialization to report match occurrence to the caller. - * at a match event, the automata will reach you using this function and sends - * you a pointer to AC_MATCH_t. using that pointer you can handle - * matches. you can send parameters to the call-back function when you call - * ac_automata_search(). at call-back, the automata will sent you those - * parameters as the second parameter (void *) of MATCH_CALLBACK_t. inside - * the call-back function you can cast it to whatever you want. - * If you return 0 from MATCH_CALLBACK_t function to the automata, it will - * continue searching, otherwise it will return from ac_automata_search() - * to your calling function. - **/ -typedef int (*MATCH_CALLBACK_f)(AC_MATCH_t *, AC_TEXT_t *, AC_REP_t *); - -/* AC_PATTRN_MAX_LENGTH: - * Maximum acceptable pattern length in AC_PATTERN_t.length - **/ -#define AC_PATTRN_MAX_LENGTH 1024 - -#endif diff --git a/src/lib/third_party/include/ahocorasick.h b/src/lib/third_party/include/ahocorasick.h index 943be88eb..551a922f9 100644 --- a/src/lib/third_party/include/ahocorasick.h +++ b/src/lib/third_party/include/ahocorasick.h @@ -1,5 +1,5 @@ /* - * ahocorasick.h: the main ahocorasick header file. + * ahocorasick.h: Includes all types of ahocorasick library * This file is part of multifast. * Copyright 2010-2012 Kamiar Kanani <kamiar.kanani@gmail.com> @@ -18,53 +18,245 @@ along with multifast. If not, see <http://www.gnu.org/licenses/>. */ -#ifndef _AUTOMATA_H_ -#define _AUTOMATA_H_ +#ifndef _AC_TYPES_H_ +#define _AC_TYPES_H_ -#include "node.h" +#define AC_AHOCORASICK_NEW +#define AC_PATTRN_MAX_LENGTH 256 + +/* reallocation step for AC_NODE_t.matched_patterns */ +#define REALLOC_CHUNK_MATCHSTR 8 + +/* reallocation step for AC_NODE_t.outgoing array */ +/* Must be a multiple of __SIZEOF_LONG__ ! */ +#define REALLOC_CHUNK_OUTGOING 8 + +/* AC_ALPHABET_t: + * defines the alphabet type. + * Actually defining AC_ALPHABET_t as a char will work, but sometimes we deal + * with streams of other (bigger) types e.g. integers, specific enum, objects. + * Although they consists of string of bytes (chars), but using their specific + * types for AC_ALPHABET_t will lead to a better performance. so instead of + * dealing with strings of chars, we assume dealing with strings of + * AC_ALPHABET_t and leave it optional for other developers to define their + * own alphabets. + **/ +typedef char AC_ALPHABET_t; + +/* AC_REP_t: + * Provides a more readable representative for a pattern. + * because patterns themselves are not always suitable for displaying + * (e.g. for hex patterns), we offer this type to improve intelligibility + * of output. furthermore, sometimes it is useful, for example while + * retrieving patterns from a database, to maintain their identifiers in the + * automata for further reference. we provisioned two possible types as a + * union for this purpose. you can add your desired type in it. + **/ +typedef struct { + uint32_t number; /* Often used to store procotolId */ + uint16_t breed, + category:14,from_start:1,at_end:1; +} AC_REP_t; + +/* AC_PATTERN_t: + * This is the pattern type that must be fed into AC automata. + * the 'astring' field is not null-terminated, due to it can contain zero + * value bytes. the 'length' field determines the number of AC_ALPHABET_t it + * carries. the 'representative' field is described in AC_REP_t. despite + * 'astring', 'representative' can have duplicate values for different given + * AC_PATTERN_t. it is an optional field and you can just fill it with 0. + * CAUTION: + * Not always the 'astring' points to the correct position in memory. + * it is the responsibility of your program to maintain a permanent allocation + * for astring field of the added pattern to automata. + **/ typedef struct { - /* The root of the Aho-Corasick trie */ - AC_NODE_t * root; + AC_ALPHABET_t * astring; /* String of alphabets */ + uint16_t length, /* Length of pattern */ + is_existing; /* not union_matchstr */ + AC_REP_t rep; /* Representative string (optional) */ +} AC_PATTERN_t; - /* maintain all nodes pointers. it will be used to access or release - * all nodes. */ - AC_NODE_t ** all_nodes; +typedef struct { + unsigned short num; /* Number of matched patterns at this node */ + unsigned short max; /* Max capacity of allocated memory for matched_patterns */ + AC_PATTERN_t patterns[]; +} AC_PATTERNS_t; - unsigned int all_nodes_num; /* Number of all nodes in the automata */ - unsigned int all_nodes_max; /* Current max allocated memory for *all_nodes */ - AC_MATCH_t match; /* Any match is reported with this */ - MATCH_CALLBACK_f match_callback; /* Match call-back function */ +/* AC_MATCH_t: + * Provides the structure for reporting a match event. + * a match event occurs when the automata reaches a final node. any final + * node can match one or more pattern at a position in a text. the + * 'patterns' field holds these matched patterns. obviously these + * matched patterns have same end-position in the text. there is a relationship + * between matched patterns: the shorter one is a factor (tail) of the longer + * one. the 'position' maintains the end position of matched patterns. the + * start position of patterns could be found by knowing their 'length' in + * AC_PATTERN_t. e.g. suppose "recent" and "cent" are matched at + * position 40 in the text, then the start position of them are 34 and 36 + * respectively. finally the field 'match_num' maintains the number of + * matched patterns. + **/ + +typedef struct +{ + AC_PATTERN_t *matched[4]; /* for ac_automata_exact_match() */ + AC_PATTERN_t *patterns; /* Array of matched pattern */ + unsigned int position; /* The end position of matching pattern(s) in the text */ + unsigned short int match_num; /* Number of matched patterns */ + unsigned short int match_counter; /* Counter of found matches */ +} AC_MATCH_t; + +/* AC_TEXT_t: + * The input text type that is fed to ac_automata_search() to be searched. + * it is similar to AC_PATTERN_t. actually we could use AC_PATTERN_t as input + * text, but for the purpose of being more readable, we defined this new type. + **/ +typedef struct +{ + AC_MATCH_t match; + AC_ALPHABET_t * astring; /* String of alphabets */ + unsigned short int length, /* Length of string */ + ignore_case; +} AC_TEXT_t; + + +/* AC_ERROR_t: + * Error that may occur while adding a pattern to the automata. + * it is returned by ac_automata_add(). + **/ +typedef enum + { + ACERR_SUCCESS = 0, /* No error occurred */ + ACERR_DUPLICATE_PATTERN, /* Duplicate patterns */ + ACERR_LONG_PATTERN, /* Pattern length is longer than AC_PATTRN_MAX_LENGTH */ + ACERR_ZERO_PATTERN, /* Empty pattern (zero length) */ + ACERR_AUTOMATA_CLOSED, /* Automata is closed. after calling + ac_automata_finalize() you can not add new patterns to the automata. */ + ACERR_ERROR, /* common error */ + } AC_ERROR_t; + +/* MATCH_CALLBACK_t: + * This is the call-back function type that must be given to automata at + * initialization to report match occurrence to the caller. + * at a match event, the automata will reach you using this function and sends + * you a pointer to AC_MATCH_t. using that pointer you can handle + * matches. you can send parameters to the call-back function when you call + * ac_automata_search(). at call-back, the automata will sent you those + * parameters as the second parameter (void *) of MATCH_CALLBACK_t. inside + * the call-back function you can cast it to whatever you want. + * If you return 0 from MATCH_CALLBACK_t function to the automata, it will + * continue searching, otherwise it will return from ac_automata_search() + * to your calling function. + **/ +typedef int (*MATCH_CALLBACK_f)(AC_MATCH_t *, AC_TEXT_t *, AC_REP_t *); + +/* AC_PATTRN_MAX_LENGTH: + * Maximum acceptable pattern length in AC_PATTERN_t.length + **/ + +/* Forward Declaration */ +struct edge; + +/* + * automata node + * 4 pointers + 8 bytes : 40/24 bytes for 64/32 bit + */ +typedef struct ac_node +{ + int id; /* Node ID : set after finalize(), only for ac_automata_dump */ + AC_ALPHABET_t one_alpha, + one:1, /* one_char: yes/no */ + range:1, /* range symbols start from one_alpha */ + root:1, /* is root node */ + final:1, /* 0: no ; 1: yes, it is a final node */ + use:1, /* use: yes/no */ + ff:1; /* finalized node */ + unsigned short depth; /* depth: distance between this node and the root */ + + AC_PATTERNS_t *matched_patterns; /* Array of matched patterns */ + struct edge *outgoing; /* Array of outgoing edges */ + + struct ac_node *failure_node; /* The failure node of this node */ + AC_ALPHABET_t *a_ptr; +} AC_NODE_t; + +#ifndef __SIZEOF_POINTER__ +#error SIZEOF_POINTER not defined! +#endif + +struct edge { + unsigned short degree; /* Number of outgoing edges */ + unsigned short max; /* Max capacity of allocated memory for outgoing */ + uint32_t cmap[8]; /* 256 bit */ + AC_NODE_t *next[]; + /* + * first N elements used for 'next' pointers + + * M elements used for symbols storage + * M = (max + sizeof(void*)-1) & ( ~(sizeof(void*)-1)) + * + * if sizeof(void*)==8 + * for max = 8 we must alloc next[9]; + * for max = 16 we must alloc next[18]; + * + */ +}; + +struct ac_path { + AC_NODE_t * n; + unsigned short int idx,l; +}; + +typedef struct +{ + /* The root of the Aho-Corasick trie */ + AC_NODE_t * root; + MATCH_CALLBACK_f match_handler; + unsigned int all_nodes_num; /* Number of all nodes in the automata */ /* this flag indicates that if automata is finalized by * ac_automata_finalize() or not. 1 means finalized and 0 * means not finalized (is open). after finalizing automata you can not * add pattern to automata anymore. */ - unsigned short automata_open; + unsigned short automata_open, + to_lc:1, no_root_range:1; /* lowercase match */ /* Statistic Variables */ unsigned long total_patterns; /* Total patterns in the automata */ + unsigned long max_str_len; /* largest pattern length. Update by ac_automata_finalize() */ + + struct ac_path ac_path[AC_PATTRN_MAX_LENGTH+4]; /* for ac_automata_walk */ + int id; /* node id */ + int add_to_range; /* for convert to range */ + int n_oc,n_range,n_find; /* statistics */ } AC_AUTOMATA_t; -typedef struct { - /* It is possible to feed a large input to the automata chunk by chunk to - * be searched using ac_automata_search(). in fact by default automata - * thinks that all chunks are related unless you do ac_automata_reset(). - * followings are variables that keep track of searching state. */ - AC_NODE_t * current_node; /* Pointer to current node while searching */ - unsigned long base_position; /* Represents the position of current chunk - related to whole input text */ -} AC_SEARCH_t; +typedef AC_ERROR_t (*NODE_CALLBACK_f)(AC_AUTOMATA_t *, AC_NODE_t *,int idx, void *); + +typedef void (*ALPHA_CALLBACK_f)(AC_AUTOMATA_t *, AC_NODE_t *,AC_NODE_t *,int ,void *); +#define AC_FEATURE_LC 1 +#define AC_FEATURE_NO_ROOT_RANGE 2 AC_AUTOMATA_t * ac_automata_init (MATCH_CALLBACK_f mc); +AC_ERROR_t ac_automata_feature (AC_AUTOMATA_t * thiz, unsigned int feature); AC_ERROR_t ac_automata_add (AC_AUTOMATA_t * thiz, AC_PATTERN_t * str); -void ac_automata_finalize (AC_AUTOMATA_t * thiz); -int ac_automata_search (AC_AUTOMATA_t * thiz, AC_TEXT_t * str, AC_REP_t * param); -void ac_automata_release (AC_AUTOMATA_t * thiz, u_int8_t free_pattern); -void ac_automata_display (AC_AUTOMATA_t * thiz, char repcast); +AC_ERROR_t ac_automata_finalize (AC_AUTOMATA_t * thiz); +AC_ERROR_t ac_automata_walk (AC_AUTOMATA_t * thiz, NODE_CALLBACK_f node_cb, + ALPHA_CALLBACK_f alpha_cb, void *); +int ac_automata_search (AC_AUTOMATA_t * thiz, + AC_TEXT_t * str, + AC_REP_t * param); +int ac_automata_exact_match(AC_PATTERNS_t *mp,int pos, AC_TEXT_t *); +void ac_automata_clean (AC_AUTOMATA_t * thiz); +void ac_automata_release (AC_AUTOMATA_t * thiz, uint8_t free_pattern); +#ifndef __KERNEL__ +void ac_automata_dump (AC_AUTOMATA_t * thiz, + char *buf, size_t bufsize, char repcast); #endif +#endif diff --git a/src/lib/third_party/include/node.h b/src/lib/third_party/include/node.h deleted file mode 100644 index c172112aa..000000000 --- a/src/lib/third_party/include/node.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * node.h: automata node header file - * This file is part of multifast. - * - Copyright 2010-2012 Kamiar Kanani <kamiar.kanani@gmail.com> - - multifast is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - multifast is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public License - along with multifast. If not, see <http://www.gnu.org/licenses/>. -*/ - -#ifndef _NODE_H_ -#define _NODE_H_ - -#include "actypes.h" - -/* Forward Declaration */ -struct edge; - -/* automata node */ -typedef struct ac_node -{ - int id; /* Node ID : for debugging purpose */ - short int final; /* 0: no ; 1: yes, it is a final node */ - struct ac_node * failure_node; /* The failure node of this node */ - unsigned short depth; /* depth: distance between this node and the root */ - - /* Matched patterns */ - AC_PATTERN_t * matched_patterns; /* Array of matched patterns */ - unsigned short matched_patterns_num; /* Number of matched patterns at this node */ - unsigned short matched_patterns_max; /* Max capacity of allocated memory for matched_patterns */ - - /* Outgoing Edges */ - struct edge * outgoing; /* Array of outgoing edges */ - unsigned short outgoing_degree; /* Number of outgoing edges */ - unsigned short outgoing_max; /* Max capacity of allocated memory for outgoing */ -} AC_NODE_t; - -/* The Edge of the Node */ -struct edge -{ - AC_ALPHABET_t alpha; /* Edge alpha */ - struct ac_node * next; /* Target of the edge */ -}; - - -AC_NODE_t * node_create (void); -AC_NODE_t * node_create_next (AC_NODE_t * thiz, AC_ALPHABET_t alpha); -void node_register_matchstr (AC_NODE_t * thiz, AC_PATTERN_t * str, u_int8_t is_existing); -void node_register_outgoing (AC_NODE_t * thiz, AC_NODE_t * next, AC_ALPHABET_t alpha); -AC_NODE_t * node_find_next (AC_NODE_t * thiz, AC_ALPHABET_t alpha); -AC_NODE_t * node_findbs_next (AC_NODE_t * thiz, AC_ALPHABET_t alpha); -void node_release (AC_NODE_t * thiz, u_int8_t free_pattern); -void node_assign_id (AC_NODE_t * thiz); -void node_sort_edges (AC_NODE_t * thiz); - -#endif diff --git a/src/lib/third_party/include/sort.h b/src/lib/third_party/include/sort.h deleted file mode 100644 index ee7df8a10..000000000 --- a/src/lib/third_party/include/sort.h +++ /dev/null @@ -1,6 +0,0 @@ -/* This is a function ported from the Linux kernel lib/sort.c */ - -void sort(void *base, size_t num, size_t len, - int (*cmp_func)(const void *, const void *), - void (*swap_func)(void *, void *, int size)); - |