aboutsummaryrefslogtreecommitdiff
path: root/src/lib/third_party/include
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib/third_party/include')
-rw-r--r--src/lib/third_party/include/actypes.h136
-rw-r--r--src/lib/third_party/include/ahocorasick.h246
-rw-r--r--src/lib/third_party/include/node.h66
-rw-r--r--src/lib/third_party/include/sort.h6
4 files changed, 219 insertions, 235 deletions
diff --git a/src/lib/third_party/include/actypes.h b/src/lib/third_party/include/actypes.h
deleted file mode 100644
index b775f6ac9..000000000
--- a/src/lib/third_party/include/actypes.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * actypes.h: Includes basic data types of ahocorasick library
- * This file is part of multifast.
- *
- Copyright 2010-2012 Kamiar Kanani <kamiar.kanani@gmail.com>
-
- multifast is free software: you can redistribute it and/or modify
- it under the terms of the GNU Lesser General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- multifast is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public License
- along with multifast. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _AC_TYPES_H_
-#define _AC_TYPES_H_
-
-/* AC_ALPHABET_t:
- * defines the alphabet type.
- * Actually defining AC_ALPHABET_t as a char will work, but sometimes we deal
- * with streams of other (bigger) types e.g. integers, specific enum, objects.
- * Although they consists of string of bytes (chars), but using their specific
- * types for AC_ALPHABET_t will lead to a better performance. so instead of
- * dealing with strings of chars, we assume dealing with strings of
- * AC_ALPHABET_t and leave it optional for other developers to define their
- * own alphabets.
- **/
-typedef char AC_ALPHABET_t;
-
-/* AC_REP_t:
- * Provides a more readable representative for a pattern.
- * because patterns themselves are not always suitable for displaying
- * (e.g. for hex patterns), we offer this type to improve intelligibility
- * of output. furthermore, sometimes it is useful, for example while
- * retrieving patterns from a database, to maintain their identifiers in the
- * automata for further reference. we provisioned two possible types as a
- * union for this purpose. you can add your desired type in it.
- **/
-typedef struct {
- u_int32_t number; /* Often used to store procotolId */
- u_int16_t category, breed;
-} AC_REP_t;
-
-/* AC_PATTERN_t:
- * This is the pattern type that must be fed into AC automata.
- * the 'astring' field is not null-terminated, due to it can contain zero
- * value bytes. the 'length' field determines the number of AC_ALPHABET_t it
- * carries. the 'representative' field is described in AC_REP_t. despite
- * 'astring', 'representative' can have duplicate values for different given
- * AC_PATTERN_t. it is an optional field and you can just fill it with 0.
- * CAUTION:
- * Not always the 'astring' points to the correct position in memory.
- * it is the responsibility of your program to maintain a permanent allocation
- * for astring field of the added pattern to automata.
- **/
-typedef struct
-{
- AC_ALPHABET_t * astring; /* String of alphabets */
- unsigned int length; /* Length of pattern */
- u_int8_t is_existing; /* 1 if the node is already part of another AC_PATTERN_t */
- AC_REP_t rep; /* Representative string (optional) */
-} AC_PATTERN_t;
-
-/* AC_TEXT_t:
- * The input text type that is fed to ac_automata_search() to be searched.
- * it is similar to AC_PATTERN_t. actually we could use AC_PATTERN_t as input
- * text, but for the purpose of being more readable, we defined this new type.
- **/
-typedef struct
-{
- AC_ALPHABET_t * astring; /* String of alphabets */
- unsigned int length; /* Length of string */
-} AC_TEXT_t;
-
-/* AC_MATCH_t:
- * Provides the structure for reporting a match event.
- * a match event occurs when the automata reaches a final node. any final
- * node can match one or more pattern at a position in a text. the
- * 'patterns' field holds these matched patterns. obviously these
- * matched patterns have same end-position in the text. there is a relationship
- * between matched patterns: the shorter one is a factor (tail) of the longer
- * one. the 'position' maintains the end position of matched patterns. the
- * start position of patterns could be found by knowing their 'length' in
- * AC_PATTERN_t. e.g. suppose "recent" and "cent" are matched at
- * position 40 in the text, then the start position of them are 34 and 36
- * respectively. finally the field 'match_num' maintains the number of
- * matched patterns.
- **/
-typedef struct
-{
- AC_PATTERN_t * patterns; /* Array of matched pattern */
- long position; /* The end position of matching pattern(s) in the text */
- unsigned int match_num; /* Number of matched patterns */
-} AC_MATCH_t;
-
-/* AC_ERROR_t:
- * Error that may occur while adding a pattern to the automata.
- * it is returned by ac_automata_add().
- **/
-typedef enum
- {
- ACERR_SUCCESS = 0, /* No error occurred */
- ACERR_DUPLICATE_PATTERN, /* Duplicate patterns */
- ACERR_LONG_PATTERN, /* Pattern length is longer than AC_PATTRN_MAX_LENGTH */
- ACERR_ZERO_PATTERN, /* Empty pattern (zero length) */
- ACERR_AUTOMATA_CLOSED, /* Automata is closed. after calling
- ac_automata_finalize() you can not add new patterns to the automata. */
- } AC_ERROR_t;
-
-/* MATCH_CALLBACK_t:
- * This is the call-back function type that must be given to automata at
- * initialization to report match occurrence to the caller.
- * at a match event, the automata will reach you using this function and sends
- * you a pointer to AC_MATCH_t. using that pointer you can handle
- * matches. you can send parameters to the call-back function when you call
- * ac_automata_search(). at call-back, the automata will sent you those
- * parameters as the second parameter (void *) of MATCH_CALLBACK_t. inside
- * the call-back function you can cast it to whatever you want.
- * If you return 0 from MATCH_CALLBACK_t function to the automata, it will
- * continue searching, otherwise it will return from ac_automata_search()
- * to your calling function.
- **/
-typedef int (*MATCH_CALLBACK_f)(AC_MATCH_t *, AC_TEXT_t *, AC_REP_t *);
-
-/* AC_PATTRN_MAX_LENGTH:
- * Maximum acceptable pattern length in AC_PATTERN_t.length
- **/
-#define AC_PATTRN_MAX_LENGTH 1024
-
-#endif
diff --git a/src/lib/third_party/include/ahocorasick.h b/src/lib/third_party/include/ahocorasick.h
index 943be88eb..551a922f9 100644
--- a/src/lib/third_party/include/ahocorasick.h
+++ b/src/lib/third_party/include/ahocorasick.h
@@ -1,5 +1,5 @@
/*
- * ahocorasick.h: the main ahocorasick header file.
+ * ahocorasick.h: Includes all types of ahocorasick library
* This file is part of multifast.
*
Copyright 2010-2012 Kamiar Kanani <kamiar.kanani@gmail.com>
@@ -18,53 +18,245 @@
along with multifast. If not, see <http://www.gnu.org/licenses/>.
*/
-#ifndef _AUTOMATA_H_
-#define _AUTOMATA_H_
+#ifndef _AC_TYPES_H_
+#define _AC_TYPES_H_
-#include "node.h"
+#define AC_AHOCORASICK_NEW
+#define AC_PATTRN_MAX_LENGTH 256
+
+/* reallocation step for AC_NODE_t.matched_patterns */
+#define REALLOC_CHUNK_MATCHSTR 8
+
+/* reallocation step for AC_NODE_t.outgoing array */
+/* Must be a multiple of __SIZEOF_LONG__ ! */
+#define REALLOC_CHUNK_OUTGOING 8
+
+/* AC_ALPHABET_t:
+ * defines the alphabet type.
+ * Actually defining AC_ALPHABET_t as a char will work, but sometimes we deal
+ * with streams of other (bigger) types e.g. integers, specific enum, objects.
+ * Although they consists of string of bytes (chars), but using their specific
+ * types for AC_ALPHABET_t will lead to a better performance. so instead of
+ * dealing with strings of chars, we assume dealing with strings of
+ * AC_ALPHABET_t and leave it optional for other developers to define their
+ * own alphabets.
+ **/
+typedef char AC_ALPHABET_t;
+
+/* AC_REP_t:
+ * Provides a more readable representative for a pattern.
+ * because patterns themselves are not always suitable for displaying
+ * (e.g. for hex patterns), we offer this type to improve intelligibility
+ * of output. furthermore, sometimes it is useful, for example while
+ * retrieving patterns from a database, to maintain their identifiers in the
+ * automata for further reference. we provisioned two possible types as a
+ * union for this purpose. you can add your desired type in it.
+ **/
+typedef struct {
+ uint32_t number; /* Often used to store procotolId */
+ uint16_t breed,
+ category:14,from_start:1,at_end:1;
+} AC_REP_t;
+
+/* AC_PATTERN_t:
+ * This is the pattern type that must be fed into AC automata.
+ * the 'astring' field is not null-terminated, due to it can contain zero
+ * value bytes. the 'length' field determines the number of AC_ALPHABET_t it
+ * carries. the 'representative' field is described in AC_REP_t. despite
+ * 'astring', 'representative' can have duplicate values for different given
+ * AC_PATTERN_t. it is an optional field and you can just fill it with 0.
+ * CAUTION:
+ * Not always the 'astring' points to the correct position in memory.
+ * it is the responsibility of your program to maintain a permanent allocation
+ * for astring field of the added pattern to automata.
+ **/
typedef struct
{
- /* The root of the Aho-Corasick trie */
- AC_NODE_t * root;
+ AC_ALPHABET_t * astring; /* String of alphabets */
+ uint16_t length, /* Length of pattern */
+ is_existing; /* not union_matchstr */
+ AC_REP_t rep; /* Representative string (optional) */
+} AC_PATTERN_t;
- /* maintain all nodes pointers. it will be used to access or release
- * all nodes. */
- AC_NODE_t ** all_nodes;
+typedef struct {
+ unsigned short num; /* Number of matched patterns at this node */
+ unsigned short max; /* Max capacity of allocated memory for matched_patterns */
+ AC_PATTERN_t patterns[];
+} AC_PATTERNS_t;
- unsigned int all_nodes_num; /* Number of all nodes in the automata */
- unsigned int all_nodes_max; /* Current max allocated memory for *all_nodes */
- AC_MATCH_t match; /* Any match is reported with this */
- MATCH_CALLBACK_f match_callback; /* Match call-back function */
+/* AC_MATCH_t:
+ * Provides the structure for reporting a match event.
+ * a match event occurs when the automata reaches a final node. any final
+ * node can match one or more pattern at a position in a text. the
+ * 'patterns' field holds these matched patterns. obviously these
+ * matched patterns have same end-position in the text. there is a relationship
+ * between matched patterns: the shorter one is a factor (tail) of the longer
+ * one. the 'position' maintains the end position of matched patterns. the
+ * start position of patterns could be found by knowing their 'length' in
+ * AC_PATTERN_t. e.g. suppose "recent" and "cent" are matched at
+ * position 40 in the text, then the start position of them are 34 and 36
+ * respectively. finally the field 'match_num' maintains the number of
+ * matched patterns.
+ **/
+
+typedef struct
+{
+ AC_PATTERN_t *matched[4]; /* for ac_automata_exact_match() */
+ AC_PATTERN_t *patterns; /* Array of matched pattern */
+ unsigned int position; /* The end position of matching pattern(s) in the text */
+ unsigned short int match_num; /* Number of matched patterns */
+ unsigned short int match_counter; /* Counter of found matches */
+} AC_MATCH_t;
+
+/* AC_TEXT_t:
+ * The input text type that is fed to ac_automata_search() to be searched.
+ * it is similar to AC_PATTERN_t. actually we could use AC_PATTERN_t as input
+ * text, but for the purpose of being more readable, we defined this new type.
+ **/
+typedef struct
+{
+ AC_MATCH_t match;
+ AC_ALPHABET_t * astring; /* String of alphabets */
+ unsigned short int length, /* Length of string */
+ ignore_case;
+} AC_TEXT_t;
+
+
+/* AC_ERROR_t:
+ * Error that may occur while adding a pattern to the automata.
+ * it is returned by ac_automata_add().
+ **/
+typedef enum
+ {
+ ACERR_SUCCESS = 0, /* No error occurred */
+ ACERR_DUPLICATE_PATTERN, /* Duplicate patterns */
+ ACERR_LONG_PATTERN, /* Pattern length is longer than AC_PATTRN_MAX_LENGTH */
+ ACERR_ZERO_PATTERN, /* Empty pattern (zero length) */
+ ACERR_AUTOMATA_CLOSED, /* Automata is closed. after calling
+ ac_automata_finalize() you can not add new patterns to the automata. */
+ ACERR_ERROR, /* common error */
+ } AC_ERROR_t;
+
+/* MATCH_CALLBACK_t:
+ * This is the call-back function type that must be given to automata at
+ * initialization to report match occurrence to the caller.
+ * at a match event, the automata will reach you using this function and sends
+ * you a pointer to AC_MATCH_t. using that pointer you can handle
+ * matches. you can send parameters to the call-back function when you call
+ * ac_automata_search(). at call-back, the automata will sent you those
+ * parameters as the second parameter (void *) of MATCH_CALLBACK_t. inside
+ * the call-back function you can cast it to whatever you want.
+ * If you return 0 from MATCH_CALLBACK_t function to the automata, it will
+ * continue searching, otherwise it will return from ac_automata_search()
+ * to your calling function.
+ **/
+typedef int (*MATCH_CALLBACK_f)(AC_MATCH_t *, AC_TEXT_t *, AC_REP_t *);
+
+/* AC_PATTRN_MAX_LENGTH:
+ * Maximum acceptable pattern length in AC_PATTERN_t.length
+ **/
+
+/* Forward Declaration */
+struct edge;
+
+/*
+ * automata node
+ * 4 pointers + 8 bytes : 40/24 bytes for 64/32 bit
+ */
+typedef struct ac_node
+{
+ int id; /* Node ID : set after finalize(), only for ac_automata_dump */
+ AC_ALPHABET_t one_alpha,
+ one:1, /* one_char: yes/no */
+ range:1, /* range symbols start from one_alpha */
+ root:1, /* is root node */
+ final:1, /* 0: no ; 1: yes, it is a final node */
+ use:1, /* use: yes/no */
+ ff:1; /* finalized node */
+ unsigned short depth; /* depth: distance between this node and the root */
+
+ AC_PATTERNS_t *matched_patterns; /* Array of matched patterns */
+ struct edge *outgoing; /* Array of outgoing edges */
+
+ struct ac_node *failure_node; /* The failure node of this node */
+ AC_ALPHABET_t *a_ptr;
+} AC_NODE_t;
+
+#ifndef __SIZEOF_POINTER__
+#error SIZEOF_POINTER not defined!
+#endif
+
+struct edge {
+ unsigned short degree; /* Number of outgoing edges */
+ unsigned short max; /* Max capacity of allocated memory for outgoing */
+ uint32_t cmap[8]; /* 256 bit */
+ AC_NODE_t *next[];
+ /*
+ * first N elements used for 'next' pointers +
+ * M elements used for symbols storage
+ * M = (max + sizeof(void*)-1) & ( ~(sizeof(void*)-1))
+ *
+ * if sizeof(void*)==8
+ * for max = 8 we must alloc next[9];
+ * for max = 16 we must alloc next[18];
+ *
+ */
+};
+
+struct ac_path {
+ AC_NODE_t * n;
+ unsigned short int idx,l;
+};
+
+typedef struct
+{
+ /* The root of the Aho-Corasick trie */
+ AC_NODE_t * root;
+ MATCH_CALLBACK_f match_handler;
+ unsigned int all_nodes_num; /* Number of all nodes in the automata */
/* this flag indicates that if automata is finalized by
* ac_automata_finalize() or not. 1 means finalized and 0
* means not finalized (is open). after finalizing automata you can not
* add pattern to automata anymore. */
- unsigned short automata_open;
+ unsigned short automata_open,
+ to_lc:1, no_root_range:1; /* lowercase match */
/* Statistic Variables */
unsigned long total_patterns; /* Total patterns in the automata */
+ unsigned long max_str_len; /* largest pattern length. Update by ac_automata_finalize() */
+
+ struct ac_path ac_path[AC_PATTRN_MAX_LENGTH+4]; /* for ac_automata_walk */
+ int id; /* node id */
+ int add_to_range; /* for convert to range */
+ int n_oc,n_range,n_find; /* statistics */
} AC_AUTOMATA_t;
-typedef struct {
- /* It is possible to feed a large input to the automata chunk by chunk to
- * be searched using ac_automata_search(). in fact by default automata
- * thinks that all chunks are related unless you do ac_automata_reset().
- * followings are variables that keep track of searching state. */
- AC_NODE_t * current_node; /* Pointer to current node while searching */
- unsigned long base_position; /* Represents the position of current chunk
- related to whole input text */
-} AC_SEARCH_t;
+typedef AC_ERROR_t (*NODE_CALLBACK_f)(AC_AUTOMATA_t *, AC_NODE_t *,int idx, void *);
+
+typedef void (*ALPHA_CALLBACK_f)(AC_AUTOMATA_t *, AC_NODE_t *,AC_NODE_t *,int ,void *);
+#define AC_FEATURE_LC 1
+#define AC_FEATURE_NO_ROOT_RANGE 2
AC_AUTOMATA_t * ac_automata_init (MATCH_CALLBACK_f mc);
+AC_ERROR_t ac_automata_feature (AC_AUTOMATA_t * thiz, unsigned int feature);
AC_ERROR_t ac_automata_add (AC_AUTOMATA_t * thiz, AC_PATTERN_t * str);
-void ac_automata_finalize (AC_AUTOMATA_t * thiz);
-int ac_automata_search (AC_AUTOMATA_t * thiz, AC_TEXT_t * str, AC_REP_t * param);
-void ac_automata_release (AC_AUTOMATA_t * thiz, u_int8_t free_pattern);
-void ac_automata_display (AC_AUTOMATA_t * thiz, char repcast);
+AC_ERROR_t ac_automata_finalize (AC_AUTOMATA_t * thiz);
+AC_ERROR_t ac_automata_walk (AC_AUTOMATA_t * thiz, NODE_CALLBACK_f node_cb,
+ ALPHA_CALLBACK_f alpha_cb, void *);
+int ac_automata_search (AC_AUTOMATA_t * thiz,
+ AC_TEXT_t * str,
+ AC_REP_t * param);
+int ac_automata_exact_match(AC_PATTERNS_t *mp,int pos, AC_TEXT_t *);
+void ac_automata_clean (AC_AUTOMATA_t * thiz);
+void ac_automata_release (AC_AUTOMATA_t * thiz, uint8_t free_pattern);
+#ifndef __KERNEL__
+void ac_automata_dump (AC_AUTOMATA_t * thiz,
+ char *buf, size_t bufsize, char repcast);
#endif
+#endif
diff --git a/src/lib/third_party/include/node.h b/src/lib/third_party/include/node.h
deleted file mode 100644
index c172112aa..000000000
--- a/src/lib/third_party/include/node.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * node.h: automata node header file
- * This file is part of multifast.
- *
- Copyright 2010-2012 Kamiar Kanani <kamiar.kanani@gmail.com>
-
- multifast is free software: you can redistribute it and/or modify
- it under the terms of the GNU Lesser General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- multifast is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public License
- along with multifast. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _NODE_H_
-#define _NODE_H_
-
-#include "actypes.h"
-
-/* Forward Declaration */
-struct edge;
-
-/* automata node */
-typedef struct ac_node
-{
- int id; /* Node ID : for debugging purpose */
- short int final; /* 0: no ; 1: yes, it is a final node */
- struct ac_node * failure_node; /* The failure node of this node */
- unsigned short depth; /* depth: distance between this node and the root */
-
- /* Matched patterns */
- AC_PATTERN_t * matched_patterns; /* Array of matched patterns */
- unsigned short matched_patterns_num; /* Number of matched patterns at this node */
- unsigned short matched_patterns_max; /* Max capacity of allocated memory for matched_patterns */
-
- /* Outgoing Edges */
- struct edge * outgoing; /* Array of outgoing edges */
- unsigned short outgoing_degree; /* Number of outgoing edges */
- unsigned short outgoing_max; /* Max capacity of allocated memory for outgoing */
-} AC_NODE_t;
-
-/* The Edge of the Node */
-struct edge
-{
- AC_ALPHABET_t alpha; /* Edge alpha */
- struct ac_node * next; /* Target of the edge */
-};
-
-
-AC_NODE_t * node_create (void);
-AC_NODE_t * node_create_next (AC_NODE_t * thiz, AC_ALPHABET_t alpha);
-void node_register_matchstr (AC_NODE_t * thiz, AC_PATTERN_t * str, u_int8_t is_existing);
-void node_register_outgoing (AC_NODE_t * thiz, AC_NODE_t * next, AC_ALPHABET_t alpha);
-AC_NODE_t * node_find_next (AC_NODE_t * thiz, AC_ALPHABET_t alpha);
-AC_NODE_t * node_findbs_next (AC_NODE_t * thiz, AC_ALPHABET_t alpha);
-void node_release (AC_NODE_t * thiz, u_int8_t free_pattern);
-void node_assign_id (AC_NODE_t * thiz);
-void node_sort_edges (AC_NODE_t * thiz);
-
-#endif
diff --git a/src/lib/third_party/include/sort.h b/src/lib/third_party/include/sort.h
deleted file mode 100644
index ee7df8a10..000000000
--- a/src/lib/third_party/include/sort.h
+++ /dev/null
@@ -1,6 +0,0 @@
-/* This is a function ported from the Linux kernel lib/sort.c */
-
-void sort(void *base, size_t num, size_t len,
- int (*cmp_func)(const void *, const void *),
- void (*swap_func)(void *, void *, int size));
-