aboutsummaryrefslogtreecommitdiff
path: root/src/lib/third_party
diff options
context:
space:
mode:
authorVitaly Lavrov <vel21ripn@gmail.com>2021-06-07 12:19:40 +0000
committerGitHub <noreply@github.com>2021-06-07 14:19:40 +0200
commit65678dbeeabcf51e02ba3cc9ffbd36324a92a971 (patch)
tree6ad7729fe9fcd32edb1157ad18adde15b6e26b72 /src/lib/third_party
parent2fcf641e87edbd7188b5c8390c3e12128638f01a (diff)
New version of the ahocorasick library (#1200)
The new version is about 25% faster with -O2 and 45% faster with -O3. No recursion is used (smaller stack size required). Uses less memory (by valgrind info) bigram: - original 1796 allocs, 247864 bytes allocated - new 1232 allocs, 158880 bytes allocated host_match: - original 18038 allocs, 3004576 bytes allocated - new 6861 allocs, 396624 bytes allocated The function ac_automata_search() is thread safe. Optional case-insensitive comparison. Matching at the beginning and at the end of the string is supported. One code file and one header file.
Diffstat (limited to 'src/lib/third_party')
-rw-r--r--src/lib/third_party/include/actypes.h136
-rw-r--r--src/lib/third_party/include/ahocorasick.h246
-rw-r--r--src/lib/third_party/include/node.h66
-rw-r--r--src/lib/third_party/include/sort.h6
-rw-r--r--src/lib/third_party/src/ahocorasick.c1198
-rw-r--r--src/lib/third_party/src/node.c274
-rw-r--r--src/lib/third_party/src/sort.c92
7 files changed, 1227 insertions, 791 deletions
diff --git a/src/lib/third_party/include/actypes.h b/src/lib/third_party/include/actypes.h
deleted file mode 100644
index b775f6ac9..000000000
--- a/src/lib/third_party/include/actypes.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * actypes.h: Includes basic data types of ahocorasick library
- * This file is part of multifast.
- *
- Copyright 2010-2012 Kamiar Kanani <kamiar.kanani@gmail.com>
-
- multifast is free software: you can redistribute it and/or modify
- it under the terms of the GNU Lesser General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- multifast is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public License
- along with multifast. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _AC_TYPES_H_
-#define _AC_TYPES_H_
-
-/* AC_ALPHABET_t:
- * defines the alphabet type.
- * Actually defining AC_ALPHABET_t as a char will work, but sometimes we deal
- * with streams of other (bigger) types e.g. integers, specific enum, objects.
- * Although they consists of string of bytes (chars), but using their specific
- * types for AC_ALPHABET_t will lead to a better performance. so instead of
- * dealing with strings of chars, we assume dealing with strings of
- * AC_ALPHABET_t and leave it optional for other developers to define their
- * own alphabets.
- **/
-typedef char AC_ALPHABET_t;
-
-/* AC_REP_t:
- * Provides a more readable representative for a pattern.
- * because patterns themselves are not always suitable for displaying
- * (e.g. for hex patterns), we offer this type to improve intelligibility
- * of output. furthermore, sometimes it is useful, for example while
- * retrieving patterns from a database, to maintain their identifiers in the
- * automata for further reference. we provisioned two possible types as a
- * union for this purpose. you can add your desired type in it.
- **/
-typedef struct {
- u_int32_t number; /* Often used to store procotolId */
- u_int16_t category, breed;
-} AC_REP_t;
-
-/* AC_PATTERN_t:
- * This is the pattern type that must be fed into AC automata.
- * the 'astring' field is not null-terminated, due to it can contain zero
- * value bytes. the 'length' field determines the number of AC_ALPHABET_t it
- * carries. the 'representative' field is described in AC_REP_t. despite
- * 'astring', 'representative' can have duplicate values for different given
- * AC_PATTERN_t. it is an optional field and you can just fill it with 0.
- * CAUTION:
- * Not always the 'astring' points to the correct position in memory.
- * it is the responsibility of your program to maintain a permanent allocation
- * for astring field of the added pattern to automata.
- **/
-typedef struct
-{
- AC_ALPHABET_t * astring; /* String of alphabets */
- unsigned int length; /* Length of pattern */
- u_int8_t is_existing; /* 1 if the node is already part of another AC_PATTERN_t */
- AC_REP_t rep; /* Representative string (optional) */
-} AC_PATTERN_t;
-
-/* AC_TEXT_t:
- * The input text type that is fed to ac_automata_search() to be searched.
- * it is similar to AC_PATTERN_t. actually we could use AC_PATTERN_t as input
- * text, but for the purpose of being more readable, we defined this new type.
- **/
-typedef struct
-{
- AC_ALPHABET_t * astring; /* String of alphabets */
- unsigned int length; /* Length of string */
-} AC_TEXT_t;
-
-/* AC_MATCH_t:
- * Provides the structure for reporting a match event.
- * a match event occurs when the automata reaches a final node. any final
- * node can match one or more pattern at a position in a text. the
- * 'patterns' field holds these matched patterns. obviously these
- * matched patterns have same end-position in the text. there is a relationship
- * between matched patterns: the shorter one is a factor (tail) of the longer
- * one. the 'position' maintains the end position of matched patterns. the
- * start position of patterns could be found by knowing their 'length' in
- * AC_PATTERN_t. e.g. suppose "recent" and "cent" are matched at
- * position 40 in the text, then the start position of them are 34 and 36
- * respectively. finally the field 'match_num' maintains the number of
- * matched patterns.
- **/
-typedef struct
-{
- AC_PATTERN_t * patterns; /* Array of matched pattern */
- long position; /* The end position of matching pattern(s) in the text */
- unsigned int match_num; /* Number of matched patterns */
-} AC_MATCH_t;
-
-/* AC_ERROR_t:
- * Error that may occur while adding a pattern to the automata.
- * it is returned by ac_automata_add().
- **/
-typedef enum
- {
- ACERR_SUCCESS = 0, /* No error occurred */
- ACERR_DUPLICATE_PATTERN, /* Duplicate patterns */
- ACERR_LONG_PATTERN, /* Pattern length is longer than AC_PATTRN_MAX_LENGTH */
- ACERR_ZERO_PATTERN, /* Empty pattern (zero length) */
- ACERR_AUTOMATA_CLOSED, /* Automata is closed. after calling
- ac_automata_finalize() you can not add new patterns to the automata. */
- } AC_ERROR_t;
-
-/* MATCH_CALLBACK_t:
- * This is the call-back function type that must be given to automata at
- * initialization to report match occurrence to the caller.
- * at a match event, the automata will reach you using this function and sends
- * you a pointer to AC_MATCH_t. using that pointer you can handle
- * matches. you can send parameters to the call-back function when you call
- * ac_automata_search(). at call-back, the automata will sent you those
- * parameters as the second parameter (void *) of MATCH_CALLBACK_t. inside
- * the call-back function you can cast it to whatever you want.
- * If you return 0 from MATCH_CALLBACK_t function to the automata, it will
- * continue searching, otherwise it will return from ac_automata_search()
- * to your calling function.
- **/
-typedef int (*MATCH_CALLBACK_f)(AC_MATCH_t *, AC_TEXT_t *, AC_REP_t *);
-
-/* AC_PATTRN_MAX_LENGTH:
- * Maximum acceptable pattern length in AC_PATTERN_t.length
- **/
-#define AC_PATTRN_MAX_LENGTH 1024
-
-#endif
diff --git a/src/lib/third_party/include/ahocorasick.h b/src/lib/third_party/include/ahocorasick.h
index 943be88eb..551a922f9 100644
--- a/src/lib/third_party/include/ahocorasick.h
+++ b/src/lib/third_party/include/ahocorasick.h
@@ -1,5 +1,5 @@
/*
- * ahocorasick.h: the main ahocorasick header file.
+ * ahocorasick.h: Includes all types of ahocorasick library
* This file is part of multifast.
*
Copyright 2010-2012 Kamiar Kanani <kamiar.kanani@gmail.com>
@@ -18,53 +18,245 @@
along with multifast. If not, see <http://www.gnu.org/licenses/>.
*/
-#ifndef _AUTOMATA_H_
-#define _AUTOMATA_H_
+#ifndef _AC_TYPES_H_
+#define _AC_TYPES_H_
-#include "node.h"
+#define AC_AHOCORASICK_NEW
+#define AC_PATTRN_MAX_LENGTH 256
+
+/* reallocation step for AC_NODE_t.matched_patterns */
+#define REALLOC_CHUNK_MATCHSTR 8
+
+/* reallocation step for AC_NODE_t.outgoing array */
+/* Must be a multiple of __SIZEOF_LONG__ ! */
+#define REALLOC_CHUNK_OUTGOING 8
+
+/* AC_ALPHABET_t:
+ * defines the alphabet type.
+ * Actually defining AC_ALPHABET_t as a char will work, but sometimes we deal
+ * with streams of other (bigger) types e.g. integers, specific enum, objects.
+ * Although they consists of string of bytes (chars), but using their specific
+ * types for AC_ALPHABET_t will lead to a better performance. so instead of
+ * dealing with strings of chars, we assume dealing with strings of
+ * AC_ALPHABET_t and leave it optional for other developers to define their
+ * own alphabets.
+ **/
+typedef char AC_ALPHABET_t;
+
+/* AC_REP_t:
+ * Provides a more readable representative for a pattern.
+ * because patterns themselves are not always suitable for displaying
+ * (e.g. for hex patterns), we offer this type to improve intelligibility
+ * of output. furthermore, sometimes it is useful, for example while
+ * retrieving patterns from a database, to maintain their identifiers in the
+ * automata for further reference. we provisioned two possible types as a
+ * union for this purpose. you can add your desired type in it.
+ **/
+typedef struct {
+ uint32_t number; /* Often used to store procotolId */
+ uint16_t breed,
+ category:14,from_start:1,at_end:1;
+} AC_REP_t;
+
+/* AC_PATTERN_t:
+ * This is the pattern type that must be fed into AC automata.
+ * the 'astring' field is not null-terminated, due to it can contain zero
+ * value bytes. the 'length' field determines the number of AC_ALPHABET_t it
+ * carries. the 'representative' field is described in AC_REP_t. despite
+ * 'astring', 'representative' can have duplicate values for different given
+ * AC_PATTERN_t. it is an optional field and you can just fill it with 0.
+ * CAUTION:
+ * Not always the 'astring' points to the correct position in memory.
+ * it is the responsibility of your program to maintain a permanent allocation
+ * for astring field of the added pattern to automata.
+ **/
typedef struct
{
- /* The root of the Aho-Corasick trie */
- AC_NODE_t * root;
+ AC_ALPHABET_t * astring; /* String of alphabets */
+ uint16_t length, /* Length of pattern */
+ is_existing; /* not union_matchstr */
+ AC_REP_t rep; /* Representative string (optional) */
+} AC_PATTERN_t;
- /* maintain all nodes pointers. it will be used to access or release
- * all nodes. */
- AC_NODE_t ** all_nodes;
+typedef struct {
+ unsigned short num; /* Number of matched patterns at this node */
+ unsigned short max; /* Max capacity of allocated memory for matched_patterns */
+ AC_PATTERN_t patterns[];
+} AC_PATTERNS_t;
- unsigned int all_nodes_num; /* Number of all nodes in the automata */
- unsigned int all_nodes_max; /* Current max allocated memory for *all_nodes */
- AC_MATCH_t match; /* Any match is reported with this */
- MATCH_CALLBACK_f match_callback; /* Match call-back function */
+/* AC_MATCH_t:
+ * Provides the structure for reporting a match event.
+ * a match event occurs when the automata reaches a final node. any final
+ * node can match one or more pattern at a position in a text. the
+ * 'patterns' field holds these matched patterns. obviously these
+ * matched patterns have same end-position in the text. there is a relationship
+ * between matched patterns: the shorter one is a factor (tail) of the longer
+ * one. the 'position' maintains the end position of matched patterns. the
+ * start position of patterns could be found by knowing their 'length' in
+ * AC_PATTERN_t. e.g. suppose "recent" and "cent" are matched at
+ * position 40 in the text, then the start position of them are 34 and 36
+ * respectively. finally the field 'match_num' maintains the number of
+ * matched patterns.
+ **/
+
+typedef struct
+{
+ AC_PATTERN_t *matched[4]; /* for ac_automata_exact_match() */
+ AC_PATTERN_t *patterns; /* Array of matched pattern */
+ unsigned int position; /* The end position of matching pattern(s) in the text */
+ unsigned short int match_num; /* Number of matched patterns */
+ unsigned short int match_counter; /* Counter of found matches */
+} AC_MATCH_t;
+
+/* AC_TEXT_t:
+ * The input text type that is fed to ac_automata_search() to be searched.
+ * it is similar to AC_PATTERN_t. actually we could use AC_PATTERN_t as input
+ * text, but for the purpose of being more readable, we defined this new type.
+ **/
+typedef struct
+{
+ AC_MATCH_t match;
+ AC_ALPHABET_t * astring; /* String of alphabets */
+ unsigned short int length, /* Length of string */
+ ignore_case;
+} AC_TEXT_t;
+
+
+/* AC_ERROR_t:
+ * Error that may occur while adding a pattern to the automata.
+ * it is returned by ac_automata_add().
+ **/
+typedef enum
+ {
+ ACERR_SUCCESS = 0, /* No error occurred */
+ ACERR_DUPLICATE_PATTERN, /* Duplicate patterns */
+ ACERR_LONG_PATTERN, /* Pattern length is longer than AC_PATTRN_MAX_LENGTH */
+ ACERR_ZERO_PATTERN, /* Empty pattern (zero length) */
+ ACERR_AUTOMATA_CLOSED, /* Automata is closed. after calling
+ ac_automata_finalize() you can not add new patterns to the automata. */
+ ACERR_ERROR, /* common error */
+ } AC_ERROR_t;
+
+/* MATCH_CALLBACK_t:
+ * This is the call-back function type that must be given to automata at
+ * initialization to report match occurrence to the caller.
+ * at a match event, the automata will reach you using this function and sends
+ * you a pointer to AC_MATCH_t. using that pointer you can handle
+ * matches. you can send parameters to the call-back function when you call
+ * ac_automata_search(). at call-back, the automata will sent you those
+ * parameters as the second parameter (void *) of MATCH_CALLBACK_t. inside
+ * the call-back function you can cast it to whatever you want.
+ * If you return 0 from MATCH_CALLBACK_t function to the automata, it will
+ * continue searching, otherwise it will return from ac_automata_search()
+ * to your calling function.
+ **/
+typedef int (*MATCH_CALLBACK_f)(AC_MATCH_t *, AC_TEXT_t *, AC_REP_t *);
+
+/* AC_PATTRN_MAX_LENGTH:
+ * Maximum acceptable pattern length in AC_PATTERN_t.length
+ **/
+
+/* Forward Declaration */
+struct edge;
+
+/*
+ * automata node
+ * 4 pointers + 8 bytes : 40/24 bytes for 64/32 bit
+ */
+typedef struct ac_node
+{
+ int id; /* Node ID : set after finalize(), only for ac_automata_dump */
+ AC_ALPHABET_t one_alpha,
+ one:1, /* one_char: yes/no */
+ range:1, /* range symbols start from one_alpha */
+ root:1, /* is root node */
+ final:1, /* 0: no ; 1: yes, it is a final node */
+ use:1, /* use: yes/no */
+ ff:1; /* finalized node */
+ unsigned short depth; /* depth: distance between this node and the root */
+
+ AC_PATTERNS_t *matched_patterns; /* Array of matched patterns */
+ struct edge *outgoing; /* Array of outgoing edges */
+
+ struct ac_node *failure_node; /* The failure node of this node */
+ AC_ALPHABET_t *a_ptr;
+} AC_NODE_t;
+
+#ifndef __SIZEOF_POINTER__
+#error SIZEOF_POINTER not defined!
+#endif
+
+struct edge {
+ unsigned short degree; /* Number of outgoing edges */
+ unsigned short max; /* Max capacity of allocated memory for outgoing */
+ uint32_t cmap[8]; /* 256 bit */
+ AC_NODE_t *next[];
+ /*
+ * first N elements used for 'next' pointers +
+ * M elements used for symbols storage
+ * M = (max + sizeof(void*)-1) & ( ~(sizeof(void*)-1))
+ *
+ * if sizeof(void*)==8
+ * for max = 8 we must alloc next[9];
+ * for max = 16 we must alloc next[18];
+ *
+ */
+};
+
+struct ac_path {
+ AC_NODE_t * n;
+ unsigned short int idx,l;
+};
+
+typedef struct
+{
+ /* The root of the Aho-Corasick trie */
+ AC_NODE_t * root;
+ MATCH_CALLBACK_f match_handler;
+ unsigned int all_nodes_num; /* Number of all nodes in the automata */
/* this flag indicates that if automata is finalized by
* ac_automata_finalize() or not. 1 means finalized and 0
* means not finalized (is open). after finalizing automata you can not
* add pattern to automata anymore. */
- unsigned short automata_open;
+ unsigned short automata_open,
+ to_lc:1, no_root_range:1; /* lowercase match */
/* Statistic Variables */
unsigned long total_patterns; /* Total patterns in the automata */
+ unsigned long max_str_len; /* largest pattern length. Update by ac_automata_finalize() */
+
+ struct ac_path ac_path[AC_PATTRN_MAX_LENGTH+4]; /* for ac_automata_walk */
+ int id; /* node id */
+ int add_to_range; /* for convert to range */
+ int n_oc,n_range,n_find; /* statistics */
} AC_AUTOMATA_t;
-typedef struct {
- /* It is possible to feed a large input to the automata chunk by chunk to
- * be searched using ac_automata_search(). in fact by default automata
- * thinks that all chunks are related unless you do ac_automata_reset().
- * followings are variables that keep track of searching state. */
- AC_NODE_t * current_node; /* Pointer to current node while searching */
- unsigned long base_position; /* Represents the position of current chunk
- related to whole input text */
-} AC_SEARCH_t;
+typedef AC_ERROR_t (*NODE_CALLBACK_f)(AC_AUTOMATA_t *, AC_NODE_t *,int idx, void *);
+
+typedef void (*ALPHA_CALLBACK_f)(AC_AUTOMATA_t *, AC_NODE_t *,AC_NODE_t *,int ,void *);
+#define AC_FEATURE_LC 1
+#define AC_FEATURE_NO_ROOT_RANGE 2
AC_AUTOMATA_t * ac_automata_init (MATCH_CALLBACK_f mc);
+AC_ERROR_t ac_automata_feature (AC_AUTOMATA_t * thiz, unsigned int feature);
AC_ERROR_t ac_automata_add (AC_AUTOMATA_t * thiz, AC_PATTERN_t * str);
-void ac_automata_finalize (AC_AUTOMATA_t * thiz);
-int ac_automata_search (AC_AUTOMATA_t * thiz, AC_TEXT_t * str, AC_REP_t * param);
-void ac_automata_release (AC_AUTOMATA_t * thiz, u_int8_t free_pattern);
-void ac_automata_display (AC_AUTOMATA_t * thiz, char repcast);
+AC_ERROR_t ac_automata_finalize (AC_AUTOMATA_t * thiz);
+AC_ERROR_t ac_automata_walk (AC_AUTOMATA_t * thiz, NODE_CALLBACK_f node_cb,
+ ALPHA_CALLBACK_f alpha_cb, void *);
+int ac_automata_search (AC_AUTOMATA_t * thiz,
+ AC_TEXT_t * str,
+ AC_REP_t * param);
+int ac_automata_exact_match(AC_PATTERNS_t *mp,int pos, AC_TEXT_t *);
+void ac_automata_clean (AC_AUTOMATA_t * thiz);
+void ac_automata_release (AC_AUTOMATA_t * thiz, uint8_t free_pattern);
+#ifndef __KERNEL__
+void ac_automata_dump (AC_AUTOMATA_t * thiz,
+ char *buf, size_t bufsize, char repcast);
#endif
+#endif
diff --git a/src/lib/third_party/include/node.h b/src/lib/third_party/include/node.h
deleted file mode 100644
index c172112aa..000000000
--- a/src/lib/third_party/include/node.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * node.h: automata node header file
- * This file is part of multifast.
- *
- Copyright 2010-2012 Kamiar Kanani <kamiar.kanani@gmail.com>
-
- multifast is free software: you can redistribute it and/or modify
- it under the terms of the GNU Lesser General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- multifast is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public License
- along with multifast. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef _NODE_H_
-#define _NODE_H_
-
-#include "actypes.h"
-
-/* Forward Declaration */
-struct edge;
-
-/* automata node */
-typedef struct ac_node
-{
- int id; /* Node ID : for debugging purpose */
- short int final; /* 0: no ; 1: yes, it is a final node */
- struct ac_node * failure_node; /* The failure node of this node */
- unsigned short depth; /* depth: distance between this node and the root */
-
- /* Matched patterns */
- AC_PATTERN_t * matched_patterns; /* Array of matched patterns */
- unsigned short matched_patterns_num; /* Number of matched patterns at this node */
- unsigned short matched_patterns_max; /* Max capacity of allocated memory for matched_patterns */
-
- /* Outgoing Edges */
- struct edge * outgoing; /* Array of outgoing edges */
- unsigned short outgoing_degree; /* Number of outgoing edges */
- unsigned short outgoing_max; /* Max capacity of allocated memory for outgoing */
-} AC_NODE_t;
-
-/* The Edge of the Node */
-struct edge
-{
- AC_ALPHABET_t alpha; /* Edge alpha */
- struct ac_node * next; /* Target of the edge */
-};
-
-
-AC_NODE_t * node_create (void);
-AC_NODE_t * node_create_next (AC_NODE_t * thiz, AC_ALPHABET_t alpha);
-void node_register_matchstr (AC_NODE_t * thiz, AC_PATTERN_t * str, u_int8_t is_existing);
-void node_register_outgoing (AC_NODE_t * thiz, AC_NODE_t * next, AC_ALPHABET_t alpha);
-AC_NODE_t * node_find_next (AC_NODE_t * thiz, AC_ALPHABET_t alpha);
-AC_NODE_t * node_findbs_next (AC_NODE_t * thiz, AC_ALPHABET_t alpha);
-void node_release (AC_NODE_t * thiz, u_int8_t free_pattern);
-void node_assign_id (AC_NODE_t * thiz);
-void node_sort_edges (AC_NODE_t * thiz);
-
-#endif
diff --git a/src/lib/third_party/include/sort.h b/src/lib/third_party/include/sort.h
deleted file mode 100644
index ee7df8a10..000000000
--- a/src/lib/third_party/include/sort.h
+++ /dev/null
@@ -1,6 +0,0 @@
-/* This is a function ported from the Linux kernel lib/sort.c */
-
-void sort(void *base, size_t num, size_t len,
- int (*cmp_func)(const void *, const void *),
- void (*swap_func)(void *, void *, int size));
-
diff --git a/src/lib/third_party/src/ahocorasick.c b/src/lib/third_party/src/ahocorasick.c
index b17383edd..8e36d7fdb 100644
--- a/src/lib/third_party/src/ahocorasick.c
+++ b/src/lib/third_party/src/ahocorasick.c
@@ -4,7 +4,7 @@
*
Copyright 2010-2012 Kamiar Kanani <kamiar.kanani@gmail.com>
Copyright 2012-21 ntop.org (Incremental improvements)
-
+
multifast is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the License, or
@@ -19,27 +19,131 @@
along with multifast. If not, see <http://www.gnu.org/licenses/>.
*/
+#ifndef __KERNEL__
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <sys/types.h>
+#else
+#include <asm/byteorder.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+typedef __kernel_size_t size_t;
+#include <linux/string.h>
+#include <linux/slab.h>
+#endif
#include "ndpi_api.h"
#include "ahocorasick.h"
-/* Allocation step for automata.all_nodes */
-#define REALLOC_CHUNK_ALLNODES 200
+/* TODO: For different depth of node, number of outgoing edges differs
+ considerably, It is efficient to use different chunk size for
+ different depths */
+
+/* Private function prototype */
+static int node_edge_compare (struct edge * e, int a, int b);
+static int node_has_matchstr (AC_NODE_t * thiz, AC_PATTERN_t * newstr);
+
+static AC_NODE_t * node_create (void);
+static AC_NODE_t * node_create_next (AC_NODE_t * thiz, AC_ALPHABET_t alpha);
+static int node_register_matchstr (AC_NODE_t * thiz, AC_PATTERN_t * str, int is_existing);
+static int node_register_outgoing (AC_NODE_t * thiz, AC_NODE_t * next, AC_ALPHABET_t alpha);
+static AC_NODE_t * node_find_next (AC_NODE_t * thiz, AC_ALPHABET_t alpha);
+static AC_NODE_t * node_findbs_next (AC_NODE_t * thiz, uint8_t alpha);
+static AC_NODE_t * node_findbs_next_ac (AC_NODE_t * thiz, uint8_t alpha,int icase);
+static void node_release (AC_NODE_t * thiz, int free_pattern);
+static void node_release_pattern (AC_NODE_t * thiz);
+static int node_range_edges (AC_AUTOMATA_t *thiz, AC_NODE_t * node);
+static inline void node_sort_edges (AC_NODE_t * thiz);
+
+#ifndef __KERNEL__
+struct aho_dump_info {
+ size_t memcnt,node_oc,node_8c,node_xc,node_xr;
+ int buf_pos,ip;
+ char *bufstr;
+ size_t bufstr_len;
+};
+
+static void dump_node_header(AC_NODE_t * n, struct aho_dump_info *);
+#endif
/* Private function prototype */
-static void ac_automata_register_nodeptr
-(AC_AUTOMATA_t * thiz, AC_NODE_t * node);
-static void ac_automata_union_matchstrs
-(AC_NODE_t * node);
+static int ac_automata_union_matchstrs (AC_NODE_t * node);
static void ac_automata_set_failure
-(AC_AUTOMATA_t * thiz, AC_NODE_t * node, AC_ALPHABET_t * alphas);
+ (AC_AUTOMATA_t * thiz, AC_NODE_t * node, AC_NODE_t * next, int idx, void *);
static void ac_automata_traverse_setfailure
-(AC_AUTOMATA_t * thiz, AC_NODE_t * node, AC_ALPHABET_t * alphas);
+ (AC_AUTOMATA_t * thiz);
+
+static inline AC_ALPHABET_t *edge_get_alpha(struct edge *e) {
+ return (AC_ALPHABET_t *)(&e->next[e->max]);
+}
+static inline size_t edge_data_size(int num) {
+ return sizeof(void *)*num + ((num + sizeof(void *) - 1) & ~(sizeof(void *)-1));
+}
+
+#ifdef __KERNEL__
+static inline void *acho_calloc(size_t nmemb, size_t size) {
+ return kcalloc(nmemb, size, GFP_ATOMIC);
+}
+static inline void *acho_malloc(size_t size) {
+ return kmalloc(size, GFP_ATOMIC);
+}
+static inline void acho_free(void *old) {
+ return kfree(old);
+}
+#else
+
+#define acho_calloc(a,b) ndpi_calloc(a,b)
+#define acho_malloc(a) ndpi_malloc(a)
+#define acho_free(a) ndpi_free(a)
+#endif
+
+static void acho_sort(struct edge *e, size_t num,
+ int (*cmp_func)(struct edge *e, int a, int b),
+ void (*swap_func)(struct edge *e, int a, int b));
+
+/* tolower() from glibc */
+static uint8_t aho_lc[256] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 91, 92, 93, 94, 95,
+ 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
+128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
+144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
+160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
+176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
+192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
+208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
+224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
+};
+/* toupper() from glibc */
+static uint8_t aho_xc[256] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
+ 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
/******************************************************************************
* FUNCTION: ac_automata_init
@@ -50,17 +154,42 @@ static void ac_automata_traverse_setfailure
******************************************************************************/
AC_AUTOMATA_t * ac_automata_init (MATCH_CALLBACK_f mc)
{
- AC_AUTOMATA_t * thiz = (AC_AUTOMATA_t *)ndpi_malloc(sizeof(AC_AUTOMATA_t));
- memset (thiz, 0, sizeof(AC_AUTOMATA_t));
+ AC_AUTOMATA_t * thiz;
+// if(!mc) return NULL;
+ thiz = (AC_AUTOMATA_t *)acho_calloc(1,sizeof(AC_AUTOMATA_t));
+ if(!thiz) return NULL;
thiz->root = node_create ();
- thiz->all_nodes_max = REALLOC_CHUNK_ALLNODES;
- thiz->all_nodes = (AC_NODE_t **) ndpi_malloc (thiz->all_nodes_max*sizeof(AC_NODE_t *));
- thiz->match_callback = mc;
- ac_automata_register_nodeptr (thiz, thiz->root);
+ if(!thiz->root) {
+ acho_free(thiz);
+ return NULL;
+ }
+ thiz->root->id = 1;
+ thiz->root->root = 1;
thiz->total_patterns = 0;
thiz->automata_open = 1;
+ thiz->match_handler = mc;
+ thiz->to_lc = 0;
+ thiz->no_root_range = 0;
+ thiz->add_to_range = REALLOC_CHUNK_OUTGOING*2;
return thiz;
}
+/******************************************************************************
+ * FUNCTION: ac_automata_casecmp
+ * Case-insensitive comparison mode
+ * PARAMS:
+ * AC_AUTOMATA_t * thiz: the pointer to the automata
+ * lc: 1 for case-insensitive comparison mode
+ * RETUERN VALUE: AC_ERROR_t
+ * the return value indicates the success or failure of changes
+ ******************************************************************************/
+AC_ERROR_t ac_automata_feature (AC_AUTOMATA_t * thiz, unsigned int feature)
+{
+ if(!thiz) return ACERR_ERROR;
+ if(thiz->all_nodes_num || thiz->total_patterns) return ACERR_ERROR;
+ thiz->to_lc = (feature & AC_FEATURE_LC) != 0;
+ thiz->no_root_range = (feature & AC_FEATURE_NO_ROOT_RANGE) != 0;
+ return ACERR_SUCCESS;
+}
/******************************************************************************
* FUNCTION: ac_automata_add
@@ -87,41 +216,124 @@ AC_ERROR_t ac_automata_add (AC_AUTOMATA_t * thiz, AC_PATTERN_t * patt)
if (patt->length > AC_PATTRN_MAX_LENGTH)
return ACERR_LONG_PATTERN;
- for (i=0; i<patt->length; i++)
- {
- alpha = patt->astring[i];
- if ((next = node_find_next(n, alpha)))
- {
- n = next;
- continue;
- }
- else
- {
- next = node_create_next(n, alpha);
- next->depth = n->depth + 1;
+ for (i=0; i<patt->length; i++) {
+ alpha = patt->astring[i];
+ if(thiz->to_lc)
+ alpha = (AC_ALPHABET_t)aho_lc[(uint8_t)alpha];
+
+ if((next = node_find_next(n, alpha)) != 0) {
+ n = next;
+ continue;
+ }
+ if(!(next = node_create_next(n, alpha)))
+ return ACERR_ERROR;
+ next->id = ++thiz->id;
+ thiz->all_nodes_num++;
n = next;
- ac_automata_register_nodeptr(thiz, n);
}
- }
+ if(thiz->max_str_len < patt->length)
+ thiz->max_str_len = patt->length;
if(n->final) {
-#if 0
- /* Original code */
+ patt->rep.number = n->matched_patterns->patterns[0].rep.number;
return ACERR_DUPLICATE_PATTERN;
-#else
- /* ntop */
- memcpy(&n->matched_patterns->rep, &patt->rep, sizeof(AC_REP_t));
- return ACERR_DUPLICATE_PATTERN; /* Caller might need to free patt->astring */
-#endif
}
-
- n->final = 1;
- node_register_matchstr(n, patt, 0);
+
+ if(node_register_matchstr(n, patt, 0))
+ return ACERR_ERROR;
+
thiz->total_patterns++;
return ACERR_SUCCESS;
}
+AC_ERROR_t ac_automata_walk(AC_AUTOMATA_t * thiz,
+ NODE_CALLBACK_f node_cb, ALPHA_CALLBACK_f alpha_cb, void *data)
+{
+ unsigned int ip;
+ AC_NODE_t *next, *n;
+ struct ac_path * path = thiz->ac_path;
+ AC_ERROR_t r;
+
+ ip = 1;
+ path[1].n = thiz->root;
+ path[1].idx = 0;
+
+ while(ip) {
+ unsigned int i,last;
+ n = path[ip].n;
+ i = path[ip].idx;
+ last = !n->outgoing || (n->one && i > 0) || (!n->one && i >= n->outgoing->degree);
+ if(node_cb && (!i || last)) {
+ r = node_cb(thiz, n, i, data);
+ if(r != ACERR_SUCCESS) return r;
+ }
+ if(last) {
+ ip--; continue;
+ }
+ next = NULL;
+ if(n->one) {
+ next = (AC_NODE_t *)n->outgoing;
+ } else {
+ while(i < n->outgoing->degree) {
+ next = n->outgoing->next[i];
+ if(next) break;
+ i++;
+ }
+ }
+ if(!next) {
+ if(!n->range || i >= n->outgoing->degree) {
+ r = node_cb ? node_cb(thiz, n, i, data):ACERR_SUCCESS;
+ if(r != ACERR_SUCCESS) return r;
+ }
+ ip--; continue;
+ }
+
+ if(n->depth < AC_PATTRN_MAX_LENGTH) {
+ path[n->depth].l = n->one ? n->one_alpha:
+ edge_get_alpha(n->outgoing)[i];
+ if(alpha_cb)
+ alpha_cb(thiz, n, next, i, data);
+ }
+
+ path[ip].idx = i+1;
+ if(ip >= AC_PATTRN_MAX_LENGTH)
+ continue;
+
+ ip++;
+
+ path[ip].n = next;
+ path[ip].idx = 0;
+
+ }
+ return ACERR_SUCCESS;
+}
+
+
+static AC_ERROR_t ac_finalize_node(AC_AUTOMATA_t * thiz,AC_NODE_t * n, int idx, void *data) {
+ if(!n->ff) {
+ n->id = ++(thiz->id);
+ n->ff = 1;
+ if(ac_automata_union_matchstrs (n))
+ return ACERR_ERROR;
+ if(n->use) {
+ if(!n->one) {
+ if(node_range_edges (thiz,n)) {
+ node_sort_edges (n);
+ thiz->n_range++;
+ } else
+ thiz->n_find++;
+ } else
+ thiz->n_oc++;
+ }
+ }
+ if(!n->a_ptr && n->outgoing && !n->one) {
+ n->a_ptr = edge_get_alpha(n->outgoing);
+ }
+ return ACERR_SUCCESS;
+}
+
+
/******************************************************************************
* FUNCTION: ac_automata_finalize
* Locate the failure node for all nodes and collect all matched pattern for
@@ -131,24 +343,51 @@ AC_ERROR_t ac_automata_add (AC_AUTOMATA_t * thiz, AC_PATTERN_t * patt)
* PARAMS:
* AC_AUTOMATA_t * thiz: the pointer to the automata
******************************************************************************/
-void ac_automata_finalize (AC_AUTOMATA_t * thiz)
-{
- unsigned int i;
- AC_ALPHABET_t *alphas;
- AC_NODE_t * node;
- if((alphas = ndpi_malloc(AC_PATTRN_MAX_LENGTH)) != NULL) {
- ac_automata_traverse_setfailure (thiz, thiz->root, alphas);
+AC_ERROR_t ac_automata_finalize (AC_AUTOMATA_t * thiz) {
- for (i=0; i < thiz->all_nodes_num; i++)
- {
- node = thiz->all_nodes[i];
- ac_automata_union_matchstrs (node);
- node_sort_edges (node);
+ AC_ERROR_t r = ACERR_SUCCESS;
+ if(!thiz->automata_open) return r;
+
+ ac_automata_traverse_setfailure (thiz);
+ thiz->id=0;
+ thiz->n_oc = 0;
+ thiz->n_range = 0;
+ thiz->n_find = 0;
+ r = ac_automata_walk(thiz,ac_finalize_node,NULL,NULL);
+ if(r == ACERR_SUCCESS)
+ thiz->automata_open = 0;
+ return r;
+}
+
+int ac_automata_exact_match(AC_PATTERNS_t *mp,int pos, AC_TEXT_t *txt) {
+ AC_PATTERN_t *patterns = mp->patterns;
+ AC_PATTERN_t **matched = txt->match.matched;
+ int i;
+ for(i=0; i < mp->num; i++,patterns++) {
+ do {
+ if(patterns->rep.from_start && patterns->rep.at_end) {
+ if(pos == txt->length && patterns->length == pos)
+ matched[0] = patterns;
+ break;
+ }
+ if(patterns->rep.from_start) {
+ if(patterns->length == pos)
+ if(!matched[1] || patterns->length > matched[1]->length)
+ matched[1] = patterns;
+ break;
+ }
+ if(patterns->rep.at_end) {
+ if(pos == txt->length)
+ if(!matched[2] || patterns->length > matched[2]->length)
+ matched[2] = patterns;
+ break;
+ }
+ if(!matched[3] || patterns->length > matched[3]->length)
+ matched[3] = patterns;
+ } while(0);
}
- thiz->automata_open = 0; /* do not accept patterns any more */
- ndpi_free(alphas);
- }
+ return 0;
}
/******************************************************************************
@@ -167,49 +406,69 @@ void ac_automata_finalize (AC_AUTOMATA_t * thiz)
* 0: success; continue searching; call-back sent me a 0 value
* 1: success; stop searching; call-back sent me a non-0 value
******************************************************************************/
-int ac_automata_search (AC_AUTOMATA_t * thiz, AC_TEXT_t * txt, AC_REP_t * param) {
+int ac_automata_search (AC_AUTOMATA_t * thiz,
+ AC_TEXT_t * txt, AC_REP_t * param)
+{
+ uint8_t alpha;
unsigned long position;
+ int icase = 0,i;
+ AC_MATCH_t *match;
AC_NODE_t *curr;
AC_NODE_t *next;
- AC_SEARCH_t s;
-
+ AC_ALPHABET_t *apos;
+
if(thiz->automata_open)
/* you must call ac_automata_locate_failure() first */
return -1;
-
- /* Reset search */
- s.current_node = thiz->root;
- s.base_position = 0;
-
position = 0;
- curr = s.current_node;
+ curr = thiz->root;
+ apos = txt->astring;
+ match = &txt->match;
+ memset((char*)match,0,sizeof(*match));
+ icase = !thiz->to_lc;
+ /* The 'txt->ignore_case' option is checked
+ * separately otherwise clang will detect
+ * uninitialized memory usage much later. */
+ if(txt->ignore_case == 1) icase = 1;
/* This is the main search loop.
* it must be keep as lightweight as possible. */
while (position < txt->length) {
- if(!(next = node_findbs_next(curr, txt->astring[position]))) {
- if(curr->failure_node /* we are not in the root node */)
- curr = curr->failure_node;
- else
- position++;
- } else {
- curr = next;
- position++;
- }
-
- if(curr->final && next) {
- /* We check 'next' to find out if we came here after a alphabet
- * transition or due to a fail. in second case we should not report
- * matching because it was reported in previous node */
- thiz->match.position = position + s.base_position;
- thiz->match.match_num = curr->matched_patterns_num;
- thiz->match.patterns = curr->matched_patterns;
- /* we found a match! do call-back */
- if (thiz->match_callback(&thiz->match, txt, param))
- return 1;
- }
+ alpha = (uint8_t)apos[position];
+ if(thiz->to_lc) alpha = aho_lc[alpha];
+ if(!(next = node_findbs_next_ac(curr, (uint8_t)alpha, icase))) {
+ if(curr->failure_node) /* we are not in the root node */
+ curr = curr->failure_node;
+ else
+ position++;
+ } else {
+ curr = next;
+ position++;
+ if(curr->final) {
+ match->match_counter++; /* we have a matching */
+ /* select best match */
+ ac_automata_exact_match(curr->matched_patterns,position,txt);
+ if(thiz->match_handler) {
+ /* We check 'next' to find out if we came here after a alphabet
+ * transition or due to a fail. in second case we should not report
+ * matching because it was reported in previous node */
+ match->position = position;
+ match->match_num = curr->matched_patterns->num;
+ match->patterns = curr->matched_patterns->patterns;
+ if (thiz->match_handler(match, txt, param))
+ return 1;
+ }
+ }
+ }
}
+ if(thiz->match_handler)
+ return match->match_counter > 0 ? 1:0;
+ for(i = 0; i < 4; i++)
+ if(txt->match.matched[i]) {
+ *param = (txt->match.matched[i])->rep;
+ return 1;
+ }
return 0;
}
@@ -218,112 +477,182 @@ int ac_automata_search (AC_AUTOMATA_t * thiz, AC_TEXT_t * txt, AC_REP_t * param)
* Release all allocated memories to the automata
* PARAMS:
* AC_AUTOMATA_t * thiz: the pointer to the automata
- * uint8_t free_pattern: if true, deallocate the patterns strings
+ * free_pattern:
+ * 0 - free all struct w/o free pattern
+ * 1 - free all struct and pattern
+ * 2 - clean struct w/o free pattern
******************************************************************************/
-void ac_automata_release (AC_AUTOMATA_t * thiz, u_int8_t free_pattern)
-{
- unsigned int i;
- AC_NODE_t * n;
- for (i=0; i < thiz->all_nodes_num; i++)
- {
- n = thiz->all_nodes[i];
- node_release(n, free_pattern);
- }
- ndpi_free(thiz->all_nodes);
- ndpi_free(thiz);
+static AC_ERROR_t ac_automata_release_node(AC_AUTOMATA_t * thiz,
+ AC_NODE_t *n, int idx, void *data) {
+
+ if(!n->outgoing || idx) {
+ if(n->outgoing) {
+ if(n->one) thiz->n_oc--;
+ else if(n->range) thiz->n_range--;
+ else thiz->n_find--;
+ }
+ node_release(n,data != NULL);
+ }
+
+ return ACERR_SUCCESS;
+}
+void ac_automata_release (AC_AUTOMATA_t * thiz, uint8_t free_pattern) {
+
+ ac_automata_walk(thiz,ac_automata_release_node,NULL,free_pattern ? (void *)1:NULL);
+
+ if(free_pattern <= 1) {
+ node_release(thiz->root,free_pattern | 0x4);
+ thiz->root = NULL;
+ acho_free(thiz);
+ } else {
+ AC_NODE_t *n;
+ thiz->all_nodes_num = 0;
+ thiz->total_patterns = 0;
+ thiz->max_str_len = 0;
+ thiz->automata_open = 1;
+
+ n = thiz->root;
+ n->failure_node = NULL;
+ n->id = 0;
+ n->final = 0;
+ n->depth = 0;
+ if(n->outgoing) {
+ acho_free(n->outgoing);
+ n->outgoing = NULL;
+ }
+ if(n->matched_patterns) {
+ acho_free(n->matched_patterns);
+ n->matched_patterns=NULL;
+ }
+ n->use = 0;
+ n->one = 0;
+ }
+}
+
+#ifndef __KERNEL__
+
+static void dump_node_header(AC_NODE_t * n, struct aho_dump_info *ai) {
+ char *c;
+ int i;
+ printf("%04d: ",n->id);
+ if(n->failure_node) printf(" failure %04d:",n->failure_node->id);
+ printf(" d:%d %c",n->depth, n->use ? '+':'-');
+ ai->memcnt += sizeof(*n);
+ if(n->matched_patterns) {
+ ai->memcnt += sizeof(n->matched_patterns) +
+ n->matched_patterns->max*sizeof(n->matched_patterns->patterns[0]);
+ }
+ if(!n->use) { printf("\n"); return; }
+ if(n->one) {
+ (ai->node_oc)++;
+ printf(" '%c' next->%d\n",n->one_alpha,
+ n->outgoing ? ((AC_NODE_t *)n->outgoing)->id : -1);
+ return;
+ }
+ if(!n->outgoing) {
+ printf(" BUG! !outgoing\n");
+ return;
+ }
+ printf("%s\n",n->range ? " RANGE":"");
+ c = (char *)edge_get_alpha(n->outgoing);
+ if(n->outgoing->degree <= 8)
+ (ai->node_8c)++;
+ else
+ (ai->node_xc)++;
+ if(n->range)
+ (ai->node_xr)++;
+ for(i=0; i < n->outgoing->degree; i++) {
+ printf(" %d: \"%c\" -> %d\n",i,c[i],
+ n->outgoing->next[i] ? n->outgoing->next[i]->id:-1);
+ }
+ ai->memcnt += sizeof(n->outgoing) + edge_data_size(n->outgoing->max);
+}
+
+static AC_ERROR_t dump_node_common(AC_AUTOMATA_t * thiz,
+ AC_NODE_t * n, int idx, void *data) {
+ struct aho_dump_info *ai = (struct aho_dump_info *)data;
+ char *rstr = ai->bufstr;
+
+ if(idx) return ACERR_SUCCESS;
+ dump_node_header(n,ai);
+ if (n->matched_patterns && n->matched_patterns->num && n->final) {
+ char lbuf[300];
+ int nl = 0,j;
+
+ nl = snprintf(lbuf,sizeof(lbuf),"'%.100s' N:%d{",rstr,n->matched_patterns->num);
+ for (j=0; j<n->matched_patterns->num; j++) {
+ AC_PATTERN_t *sid = &n->matched_patterns->patterns[j];
+ if(j) nl += snprintf(&lbuf[nl],sizeof(lbuf)-nl-1,", ");
+ nl += snprintf(&lbuf[nl],sizeof(lbuf)-nl-1,"%d %c%.100s%c",
+ sid->rep.number & 0x3fff,
+ sid->rep.number & 0x8000 ? '^':' ',
+ sid->astring,
+ sid->rep.number & 0x4000 ? '$':' ');
+ }
+ printf("%s}\n",lbuf);
+ }
+ return ACERR_SUCCESS;
+}
+static void dump_node_str(AC_AUTOMATA_t * thiz, AC_NODE_t * node,
+ AC_NODE_t * next, int idx, void *data) {
+ struct aho_dump_info *ai = (struct aho_dump_info *)data;
+ ai->bufstr[node->depth] = thiz->ac_path[node->depth].l;
+ ai->bufstr[node->depth+1] = 0;
}
/******************************************************************************
- * FUNCTION: ac_automata_display
+ * FUNCTION: ac_automata_dump
* Prints the automata to output in human readable form. it is useful for
* debugging purpose.
* PARAMS:
* AC_AUTOMATA_t * thiz: the pointer to the automata
+ * rstr: char[] buffer
+ * rstr_size: size of rstr buffser
* char repcast: 'n': print AC_REP_t as number, 's': print AC_REP_t as string
******************************************************************************/
-void ac_automata_display (AC_AUTOMATA_t * thiz, char repcast)
-{
- unsigned int i, j;
- AC_NODE_t * n;
- struct edge * e;
- AC_PATTERN_t sid;
-
- printf("---------------------------------\n");
-
- for (i=0; i<thiz->all_nodes_num; i++)
- {
- n = thiz->all_nodes[i];
- printf("NODE(%3d)/----fail----> NODE(%3d)\n",
- n->id, (n->failure_node)?n->failure_node->id:1);
- for (j=0; j<n->outgoing_degree; j++)
- {
- e = &n->outgoing[j];
- printf(" |----(");
- if(isgraph(e->alpha))
- printf("%c)---", e->alpha);
- else
- printf("0x%x)", e->alpha);
- printf("--> NODE(%3d)\n", e->next->id);
- }
- if (n->matched_patterns_num) {
- printf("Accepted patterns: {");
- for (j=0; j<n->matched_patterns_num; j++)
- {
- sid = n->matched_patterns[j];
- if(j) printf(", ");
- switch (repcast)
- {
- case 'n':
- printf("%u/%u/%u",
- sid.rep.number,
- sid.rep.category,
- sid.rep.breed);
- break;
- }
- }
- printf("}\n");
- }
- printf("---------------------------------\n");
- }
-}
-/******************************************************************************
- * FUNCTION: ac_automata_register_nodeptr
- * Adds the node pointer to all_nodes.
- ******************************************************************************/
-static void ac_automata_register_nodeptr (AC_AUTOMATA_t * thiz, AC_NODE_t * node)
-{
- if(thiz->all_nodes_num >= thiz->all_nodes_max)
- {
- thiz->all_nodes = ndpi_realloc(thiz->all_nodes,
- thiz->all_nodes_max*sizeof(AC_NODE_t *),
- (REALLOC_CHUNK_ALLNODES+thiz->all_nodes_max)*sizeof(AC_NODE_t *)
- );
- thiz->all_nodes_max += REALLOC_CHUNK_ALLNODES;
- }
- thiz->all_nodes[thiz->all_nodes_num++] = node;
+void ac_automata_dump(AC_AUTOMATA_t * thiz, char *rstr, size_t rstr_size, char repcast) {
+ struct aho_dump_info ai;
+
+ memset((char *)&ai,0,sizeof(ai));
+
+ printf("---DUMP- all nodes %u - max strlen %u -%s---\n",
+ (unsigned int)thiz->all_nodes_num,
+ (unsigned int)thiz->max_str_len,
+ thiz->automata_open ? "open":"ready");
+ printf("root: %px\n",thiz->root);
+ *rstr = '\0';
+ ai.bufstr = rstr;
+ ai.bufstr_len = rstr_size;
+
+ ac_automata_walk(thiz,dump_node_common,dump_node_str,(void *)&ai);
+ printf("---\n mem size %zu avg node size %d, node one char %d, <=8c %d, >8c %d, range %d\n---DUMP-END-\n",
+ ai.memcnt,(int)ai.memcnt/(thiz->all_nodes_num+1),(int)ai.node_oc,(int)ai.node_8c,(int)ai.node_xc,(int)ai.node_xr);
}
+#endif
/******************************************************************************
* FUNCTION: ac_automata_union_matchstrs
* Collect accepted patterns of the node. the accepted patterns consist of the
* node's own accepted pattern plus accepted patterns of its failure node.
******************************************************************************/
-static void ac_automata_union_matchstrs (AC_NODE_t * node)
+static int ac_automata_union_matchstrs (AC_NODE_t * node)
{
unsigned int i;
- AC_NODE_t * m = node;
+ AC_NODE_t * m;
- while ((m = m->failure_node))
- {
- for (i=0; i < m->matched_patterns_num; i++)
- node_register_matchstr(node, &(m->matched_patterns[i]), 1 /* this is an existing node */);
+ for (m = node; m; m = m->failure_node) {
+ if(!m->matched_patterns) continue;
- if (m->final)
- node->final = 1;
- }
- // TODO : sort matched_patterns? is that necessary? I don't think so.
+ for (i=0; i < m->matched_patterns->num; i++)
+ if(node_register_matchstr(node, &(m->matched_patterns->patterns[i]), 1))
+ return 1;
+
+ if (m->final)
+ node->final = 1;
+ }
+ return 0;
}
/******************************************************************************
@@ -331,24 +660,24 @@ static void ac_automata_union_matchstrs (AC_NODE_t * node)
* find failure node for the given node.
******************************************************************************/
static void ac_automata_set_failure
-(AC_AUTOMATA_t * thiz, AC_NODE_t * node, AC_ALPHABET_t * alphas)
+(AC_AUTOMATA_t * thiz, AC_NODE_t * node, AC_NODE_t * next, int idx, void *data)
{
unsigned int i, j;
AC_NODE_t * m;
+ struct ac_path * path = thiz->ac_path;
- for (i=1; i < node->depth; i++)
- {
- m = thiz->root;
- for (j=i; j < node->depth && m; j++)
- m = node_find_next (m, alphas[j]);
- if (m)
- {
- node->failure_node = m;
- break;
- }
+ for (i=1; i < next->depth; i++) {
+ m = thiz->root;
+ for (j=i; j < next->depth && m; j++) {
+ m = node_find_next (m, path[j].l);
+ }
+ if (m) {
+ next->failure_node = m;
+ break;
+ }
}
- if (!node->failure_node)
- node->failure_node = thiz->root;
+ if (!next->failure_node)
+ next->failure_node = thiz->root;
}
/******************************************************************************
@@ -358,20 +687,509 @@ static void ac_automata_set_failure
* called after adding last pattern to automata. i.e. after calling this you
* can not add further pattern to automata.
******************************************************************************/
-static void ac_automata_traverse_setfailure
-(AC_AUTOMATA_t * thiz, AC_NODE_t * node, AC_ALPHABET_t * alphas)
+static inline void ac_automata_traverse_setfailure (AC_AUTOMATA_t * thiz)
+{
+ ac_automata_walk(thiz,NULL,ac_automata_set_failure,NULL);
+}
+
+/******************************************************************************
+ * FUNCTION: node_create
+ * Create the node
+ ******************************************************************************/
+static inline AC_NODE_t * node_create(void)
+{
+ return (AC_NODE_t *) acho_calloc (1,sizeof(AC_NODE_t));
+}
+
+
+static void node_release_pattern(AC_NODE_t * thiz)
+{
+ int i;
+ AC_PATTERN_t * str;
+
+ if(!thiz->matched_patterns) return;
+ str = thiz->matched_patterns->patterns;
+
+ for (i=0; i < thiz->matched_patterns->num; str++,i++)
+ {
+ if(!str->is_existing && str->astring) {
+ acho_free(str->astring);
+ str->astring = NULL;
+ }
+ }
+}
+
+
+/******************************************************************************
+ * FUNCTION: node_release
+ * Release node
+ ******************************************************************************/
+static void node_release(AC_NODE_t * thiz, int free_pattern)
+{
+ if(thiz->root && (free_pattern & 0x4) == 0) return;
+
+ if(free_pattern & 1) node_release_pattern(thiz);
+
+ if(thiz->matched_patterns) {
+ acho_free(thiz->matched_patterns);
+ thiz->matched_patterns = NULL;
+ }
+ if(!thiz->one && thiz->outgoing) {
+ acho_free(thiz->outgoing);
+ }
+ thiz->outgoing = NULL;
+ acho_free(thiz);
+}
+
+/* Nonzero if X is not aligned on a "long" boundary. */
+#define UNALIGNED(X) ((long)X & (__SIZEOF_LONG__ - 1))
+#define LBLOCKSIZE __SIZEOF_LONG__
+
+#if __SIZEOF_LONG__ == 4
+#define DETECTNULL(X) (((X) - 0x01010101UL) & ~(X) & 0x80808080UL)
+#define DUPC 0x01010101UL
+
+static inline size_t bsf(uint32_t bits)
+{
+#ifdef __GNUC__
+ return __builtin_ctz(bits);
+#else
+ size_t i=0;
+ if(!bits) return i;
+ if((bits & 0xffff)bits == 0) { i+=16; bits >>=16; }
+ if((bits & 0xff) == 0) i+=8;
+ return i;
+#endif
+}
+
+#else
+#define DETECTNULL(X) (((X) - 0x0101010101010101ULL) & ~(X) & 0x8080808080808080ULL)
+#define DUPC 0x0101010101010101UL
+
+static inline size_t bsf(uint64_t bits)
+{
+#ifdef __GNUC__
+ return __builtin_ctzll(bits);
+#else
+ size_t i=0;
+ if(!bits) return i;
+ if((bits & 0xffffffff) == 0) { i+=32; bits >>=32; }
+ if((bits & 0xffff) == 0) { i+=16; bits >>=16; }
+ if((bits & 0xff) == 0) i+=8;
+ return i;
+#endif
+}
+#endif
+
+static inline char *
+xmemchr(char *s, char i,int n)
+{
+ unsigned char c = (unsigned char)i;
+
+ while(n > 0) {
+ if (n >= LBLOCKSIZE && !UNALIGNED (s)) {
+ unsigned long int mask;
+ mask = c * DUPC;
+
+ while (n >= LBLOCKSIZE) {
+ unsigned long int nc = DETECTNULL((*(unsigned long int *)s) ^ mask);
+ if(nc)
+ return s + (bsf(nc) >> 3);
+ s += LBLOCKSIZE;
+ n -= LBLOCKSIZE;
+ }
+ if(!n) return NULL;
+ }
+ if (*s == c) return s;
+ s++;
+ n--;
+ }
+ return NULL;
+}
+
+
+/******************************************************************************
+ * FUNCTION: node_find_next
+ * Find out the next node for a given Alpha to move. this function is used in
+ * the pre-processing stage in which edge array is not sorted. so it uses
+ * linear search.
+ ******************************************************************************/
+static AC_NODE_t * node_find_next(AC_NODE_t * thiz, AC_ALPHABET_t alpha)
+{
+ AC_ALPHABET_t *alphas, *fc;
+
+ if(thiz->one) return alpha == thiz->one_alpha ? (AC_NODE_t *)thiz->outgoing:NULL;
+ if(!thiz->outgoing) return NULL;
+
+ alphas = edge_get_alpha(thiz->outgoing);
+ fc = xmemchr((char *)alphas,(char)alpha,thiz->outgoing->degree);
+ return fc ? thiz->outgoing->next[fc-alphas] : NULL;
+}
+
+
+/******************************************************************************
+ * FUNCTION: node_findbs_next
+ * Find out the next node for a given Alpha. this function is used after the
+ * pre-processing stage in which we sort edges. so it uses Binary Search.
+ ******************************************************************************/
+
+static inline AC_NODE_t *node_findbs_next (AC_NODE_t * thiz, uint8_t alpha)
+{
+
+ if(thiz->one)
+ return alpha == thiz->one_alpha ? (AC_NODE_t *)thiz->outgoing:NULL;
+
+ if(!(thiz->outgoing->cmap[(uint8_t)alpha >> 5] & (1u << (alpha & 0x1f))))
+ return NULL;
+
+ if(thiz->range)
+ return thiz->outgoing->next[(uint8_t)alpha - (uint8_t)thiz->one_alpha];
+
+ return thiz->outgoing->next[
+ xmemchr((char *)thiz->a_ptr,(char)alpha,thiz->outgoing->degree) - thiz->a_ptr];
+}
+
+static AC_NODE_t *node_findbs_next_ac (AC_NODE_t * thiz, uint8_t alpha,int icase) {
+ AC_NODE_t *next;
+ uint8_t alpha_c;
+
+ if(!thiz->outgoing) return NULL;
+
+ next = node_findbs_next(thiz,alpha);
+ if(next || !icase) return next;
+
+ alpha_c = aho_xc[alpha];
+ if(!alpha_c) return NULL;
+ return node_findbs_next(thiz, alpha ^ alpha_c);
+}
+
+/******************************************************************************
+ * FUNCTION: node_has_matchstr
+ * Determine if a final node contains a pattern in its accepted pattern list
+ * or not. return values: 1 = it has, 0 = it hasn't
+ ******************************************************************************/
+static int node_has_matchstr (AC_NODE_t * thiz, AC_PATTERN_t * newstr)
+{
+ int i;
+ AC_PATTERN_t * str;
+ if(!thiz->matched_patterns) return 0;
+ str = thiz->matched_patterns->patterns;
+
+ for (i=0; i < thiz->matched_patterns->num; str++,i++)
+ {
+ if (str->length != newstr->length)
+ continue;
+
+ if(!memcmp(str->astring,newstr->astring,str->length))
+ return 1;
+
+ }
+ return 0;
+}
+
+/******************************************************************************
+ * FUNCTION: node_create_next
+ * Create the next node for the given alpha.
+ ******************************************************************************/
+static AC_NODE_t * node_create_next (AC_NODE_t * thiz, AC_ALPHABET_t alpha)
{
- unsigned int i;
AC_NODE_t * next;
+ next = node_find_next (thiz, alpha);
+ if (next)
+ /* The edge already exists */
+ return NULL;
+ /* Otherwise register new edge */
+ next = node_create ();
+ if(next) {
+ if(node_register_outgoing(thiz, next, alpha)) {
+ node_release(next,0);
+ return NULL;
+ }
+ next->depth = thiz->depth+1;
+ }
+
+ return next;
+}
+
+static inline int mp_data_size(int n) {
+ return sizeof(AC_PATTERNS_t) + n*sizeof(AC_PATTERN_t);
+}
- for (i=0; i < node->outgoing_degree; i++) {
- alphas[node->depth] = node->outgoing[i].alpha;
- next = node->outgoing[i].next;
+static AC_PATTERNS_t * node_resize_mp(AC_PATTERNS_t *m) {
+AC_PATTERNS_t *new_m;
- /* At every node look for its failure node */
- ac_automata_set_failure (thiz, next, alphas);
+ if(!m) {
+ m = acho_calloc(1,mp_data_size(REALLOC_CHUNK_MATCHSTR));
+ if(!m) return m;
+ m->max = REALLOC_CHUNK_MATCHSTR;
+ return m;
+ }
+ new_m = acho_malloc(mp_data_size(m->max+REALLOC_CHUNK_MATCHSTR));
+ if(!new_m) return new_m;
+ memcpy((char *)new_m,(char *)m,mp_data_size(m->max));
+ new_m->max += REALLOC_CHUNK_MATCHSTR;
+ acho_free(m);
+ return new_m;
+}
+
+/******************************************************************************
+ * FUNCTION: node_register_matchstr
+ * Adds the pattern to the list of accepted pattern.
+ ******************************************************************************/
+static int node_register_matchstr (AC_NODE_t * thiz, AC_PATTERN_t * str,int is_existing)
+{
+ AC_PATTERN_t *l;
+
+ if(!is_existing)
+ thiz->final = 1;
+ /* Check if the new pattern already exists in the node list */
+ if (thiz->matched_patterns && node_has_matchstr(thiz, str))
+ return 0;
+
+ if(!thiz->matched_patterns)
+ thiz->matched_patterns = node_resize_mp(thiz->matched_patterns);
- /* Recursively call itself to traverse all nodes */
- ac_automata_traverse_setfailure (thiz, next, alphas);
+ /* Manage memory */
+ if (thiz->matched_patterns->num >= thiz->matched_patterns->max) {
+ AC_PATTERNS_t *new_mp = node_resize_mp(thiz->matched_patterns);
+ if(!new_mp) return 1;
+ thiz->matched_patterns = new_mp;
+ }
+ l = &thiz->matched_patterns->patterns[thiz->matched_patterns->num];
+ l->astring = str->astring;
+ l->length = str->length;
+ l->is_existing = is_existing;
+ l->rep = str->rep;
+ thiz->matched_patterns->num++;
+ return 0;
+}
+
+static struct edge *node_resize_outgoing(struct edge * e,size_t added) {
+struct edge *new_e;
+int ds;
+
+ if(!added) added = REALLOC_CHUNK_OUTGOING;
+ if(!e) {
+ e = acho_calloc(1,sizeof(struct edge) + edge_data_size(REALLOC_CHUNK_OUTGOING));
+ if(!e) return e;
+ e->max = REALLOC_CHUNK_OUTGOING;
+ return e;
+ }
+ ds = edge_data_size(e->max + added);
+ new_e = acho_calloc(1,sizeof(struct edge) + ds);
+ if(!new_e) return new_e;
+ memcpy(new_e,e,sizeof(struct edge) + sizeof(AC_NODE_t *)*e->max);
+ new_e->max += added;
+
+ if(e->degree)
+ memcpy(edge_get_alpha(new_e),edge_get_alpha(e),e->degree);
+
+ acho_free(e);
+ return new_e;
+}
+
+/******************************************************************************
+ * FUNCTION: node_register_outgoing
+ * Establish an edge between two nodes
+ ******************************************************************************/
+static int node_register_outgoing
+(AC_NODE_t * thiz, AC_NODE_t * next, AC_ALPHABET_t alpha)
+{
+ struct edge *o;
+ if(!thiz->use) {
+ thiz->use = 1;
+ thiz->one = 1;
+ thiz->one_alpha = alpha;
+ thiz->outgoing = (struct edge *)next;
+ return 0;
}
+ if(thiz->one) {
+ o = node_resize_outgoing(NULL,0);
+ if(!o) return 1;
+ o->next[0] = (AC_NODE_t *)thiz->outgoing;
+ *edge_get_alpha(o) = thiz->one_alpha;
+ o->degree = 1;
+ thiz->one = 0;
+ thiz->one_alpha = 0;
+ thiz->outgoing = o;
+ } else
+ o = thiz->outgoing;
+
+ if(!o) return 1;
+
+ if(o->degree >= o->max)
+ {
+ struct edge *new_o = node_resize_outgoing(thiz->outgoing,0);
+ if(!new_o) return 1;
+
+ thiz->outgoing = new_o;
+ o = new_o;
+ }
+ edge_get_alpha(o)[o->degree] = alpha;
+ o->next[o->degree] = next;
+ o->degree++;
+ return 0;
}
+
+/******************************************************************************
+ * FUNCTION: node_edge_compare
+ * Comparison function for qsort. see man qsort.
+ ******************************************************************************/
+static int node_edge_compare (struct edge * e, int a, int b) {
+ AC_ALPHABET_t *c = edge_get_alpha(e);
+ return c[a] >= c[b] ? 1:0;
+}
+
+static void node_edge_swap (struct edge * e, int a, int b)
+{
+AC_ALPHABET_t *c,tc;
+AC_NODE_t *tn;
+ c = edge_get_alpha(e);
+ tc = c[a]; c[a] = c[b]; c[b] = tc;
+ tn = e->next[a]; e->next[a] = e->next[b]; e->next[b] = tn;
+}
+
+/******************************************************************************
+ * FUNCTION: acho_2range
+ * Adds missing characters in the range low - high
+ ******************************************************************************/
+static void acho_2range(AC_NODE_t * thiz,uint8_t low, uint8_t high) {
+ struct edge *e;
+ int i;
+ uint8_t *c = (uint8_t *)edge_get_alpha(thiz->outgoing);
+
+ thiz->range = 1;
+ thiz->one_alpha = (AC_ALPHABET_t)low;
+ e = thiz->outgoing;
+ for (i=0; low <= high && i < e->max; i++,low++) {
+ if(e->cmap[(low >> 5) & 0x7] & (1u << (low & 0x1f))) continue;
+ c[e->degree] = low;
+ e->next[e->degree] = NULL;
+ e->degree++;
+ }
+}
+
+/******************************************************************************
+ * FUNCTION: node_range_edges
+ * Converts to a range if possible.
+ ******************************************************************************/
+static int node_range_edges (AC_AUTOMATA_t *thiz, AC_NODE_t * node)
+{
+ struct edge *e = node->outgoing;
+ uint8_t *c = (uint8_t *)edge_get_alpha(node->outgoing);
+ uint8_t low = 0xff,high = 0;
+ int i;
+
+ memset((char *)&e->cmap,0,sizeof(e->cmap));
+ for(i = 0; i < e->degree; i++) {
+ uint8_t cc = c[i];
+ if(cc < low) low = cc;
+ if(cc > high) high = cc;
+ e->cmap[(cc >> 5) & 0x7] |= 1u << (cc & 0x1f);
+ }
+ if(high - low + 1 == e->degree) {
+ node->range = 1;
+ node->one_alpha = (AC_ALPHABET_t)low;
+ return 1;
+ }
+ if(high - low + 1 < e->max) {
+ acho_2range(node,low,high);
+ return 1;
+ }
+
+// if(e->degree < __SIZEOF_LONG__) return 0;
+
+ i = (high - low)/8;
+ if (i < thiz->add_to_range) i = thiz->add_to_range;
+ i += REALLOC_CHUNK_OUTGOING-1;
+ i -= i % REALLOC_CHUNK_OUTGOING;
+
+ if(high - low + 1 < e->max + i) {
+ int added = (high - low + 1) - e->max;
+ struct edge *new_o = node_resize_outgoing(node->outgoing,added);
+ if(new_o) {
+ node->outgoing = new_o;
+ acho_2range(node,low,high);
+ return 1;
+ }
+ return 0;
+ }
+
+ if(node->root && !thiz->no_root_range) {
+ struct edge *new_o;
+ int added = (high - low + 1) - e->max;
+ new_o = node_resize_outgoing(node->outgoing,added);
+ if(new_o) {
+ node->outgoing = new_o;
+ acho_2range(node,low,high);
+ return 1;
+ }
+ }
+ return 0;
+}
+/******************************************************************************
+ * FUNCTION: node_sort_edges
+ * sorts edges alphabets.
+ ******************************************************************************/
+static inline void node_sort_edges (AC_NODE_t * thiz)
+{
+
+ acho_sort (thiz->outgoing, thiz->outgoing->degree,
+ node_edge_compare, node_edge_swap);
+}
+
+/**
+ * sort - sort an array of elements
+ * @base: pointer to data to sort
+ * @num: number of elements
+ * @size: size of each element
+ * @cmp_func: pointer to comparison function
+ * @swap_func: pointer to swap function or NULL
+ *
+ * This function does a heapsort on the given array. You may provide a
+ * swap_func function optimized to your element type.
+ *
+ * Sorting time is O(n log n) both on average and worst-case. While
+ * qsort is about 20% faster on average, it suffers from exploitable
+ * O(n*n) worst-case behavior and extra memory requirements that make
+ * it less suitable for kernel use.
+ */
+
+ void acho_sort(struct edge *e, size_t num,
+ int (*cmp_func)(struct edge *e, int a, int b),
+ void (*swap_func)(struct edge *e, int a, int b))
+{
+ /* pre-scale counters for performance */
+ int i = (num/2 - 1) , n = num, c, r;
+
+ if (!swap_func) return;
+ if (!cmp_func) return;
+
+ /* heapify */
+ for ( ; i >= 0; i -= 1) {
+ for (r = i; r * 2 + 1 < n; r = c) {
+ c = r * 2 + 1;
+ if (c < n - 1 && cmp_func(e, c, c + 1) == 0)
+ c += 1;
+ if (cmp_func(e, r, c) != 0)
+ break;
+ swap_func(e, r, c);
+ }
+ }
+
+ /* sort */
+ for (i = n - 1; i > 0; i -= 1) {
+ swap_func(e,0,i);
+ for (r = 0; r * 2 + 1 < i; r = c) {
+ c = r * 2 + 1;
+ if (c < i - 1 && cmp_func(e, c, c + 1) == 0)
+ c += 1;
+ if (cmp_func(e, r, c) != 0)
+ break;
+ swap_func(e, r, c);
+ }
+ }
+}
+
+/* vim: set ts=4 sw=4 et : */
+
diff --git a/src/lib/third_party/src/node.c b/src/lib/third_party/src/node.c
deleted file mode 100644
index 10a434308..000000000
--- a/src/lib/third_party/src/node.c
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * node.c: implementation of automata node
- * This file is part of multifast.
- *
- Copyright 2010-2012 Kamiar Kanani <kamiar.kanani@gmail.com>
-
- multifast is free software: you can redistribute it and/or modify
- it under the terms of the GNU Lesser General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- multifast is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public License
- along with multifast. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include "ndpi_api.h"
-#include "../include/node.h"
-#include "sort.h"
-
-/* reallocation step for AC_NODE_t.matched_patterns */
-#define REALLOC_CHUNK_MATCHSTR 8
-
-/* reallocation step for AC_NODE_t.outgoing array */
-#define REALLOC_CHUNK_OUTGOING 8
-/* TODO: For different depth of node, number of outgoing edges differs
- considerably, It is efficient to use different chunk size for
- different depths */
-
-/* Private function prototype */
-void node_init (AC_NODE_t * thiz);
-int node_edge_compare (const void * l, const void * r);
-int node_has_matchstr (AC_NODE_t * thiz, AC_PATTERN_t * newstr);
-
-
-/******************************************************************************
- * FUNCTION: node_create
- * Create the node
- ******************************************************************************/
-AC_NODE_t * node_create(void)
-{
- AC_NODE_t * thiz = (AC_NODE_t *) ndpi_malloc (sizeof(AC_NODE_t));
- node_init(thiz);
- node_assign_id(thiz);
- return thiz;
-}
-
-/******************************************************************************
- * FUNCTION: node_init
- * Initialize node
- ******************************************************************************/
-void node_init(AC_NODE_t * thiz)
-{
- memset(thiz, 0, sizeof(AC_NODE_t));
-
- thiz->outgoing_max = REALLOC_CHUNK_OUTGOING;
- thiz->outgoing = (struct edge *) ndpi_malloc
- (thiz->outgoing_max*sizeof(struct edge));
-
- thiz->matched_patterns_max = REALLOC_CHUNK_MATCHSTR;
- thiz->matched_patterns = (AC_PATTERN_t *) ndpi_malloc
- (thiz->matched_patterns_max*sizeof(AC_PATTERN_t));
-}
-
-/******************************************************************************
- * FUNCTION: node_release
- * Release node
- ******************************************************************************/
-void node_release(AC_NODE_t * thiz, u_int8_t free_pattern)
-{
- if(free_pattern) {
- for(int i=0; i<thiz->matched_patterns_num; i++) {
- if(!thiz->matched_patterns[i].is_existing)
- ndpi_free(thiz->matched_patterns[i].astring);
- }
- }
-
- ndpi_free(thiz->matched_patterns);
- ndpi_free(thiz->outgoing);
- ndpi_free(thiz);
-}
-
-/******************************************************************************
- * FUNCTION: node_find_next
- * Find out the next node for a given Alpha to move. this function is used in
- * the pre-processing stage in which edge array is not sorted. so it uses
- * linear search.
- ******************************************************************************/
-AC_NODE_t * node_find_next(AC_NODE_t * thiz, AC_ALPHABET_t alpha)
-{
- int i;
-
- for (i=0; i < thiz->outgoing_degree; i++)
- {
- if(thiz->outgoing[i].alpha == alpha)
- return (thiz->outgoing[i].next);
- }
- return NULL;
-}
-
-/******************************************************************************
- * FUNCTION: node_findbs_next
- * Find out the next node for a given Alpha. this function is used after the
- * pre-processing stage in which we sort edges. so it uses Binary Search.
- ******************************************************************************/
-AC_NODE_t * node_findbs_next (AC_NODE_t * thiz, AC_ALPHABET_t alpha)
-{
- int min, max, mid;
- AC_ALPHABET_t amid;
-
- min = 0;
- max = thiz->outgoing_degree - 1;
-
- while (min <= max)
- {
- mid = (min+max) >> 1;
- amid = thiz->outgoing[mid].alpha;
- if (alpha > amid)
- min = mid + 1;
- else if (alpha < amid)
- max = mid - 1;
- else
- return (thiz->outgoing[mid].next);
- }
- return NULL;
-}
-
-/******************************************************************************
- * FUNCTION: node_has_matchstr
- * Determine if a final node contains a pattern in its accepted pattern list
- * or not. return values: 1 = it has, 0 = it hasn't
- ******************************************************************************/
-int node_has_matchstr (AC_NODE_t * thiz, AC_PATTERN_t * newstr)
-{
- int i;
-#if 0
- int j;
-#endif
- AC_PATTERN_t * str;
-
- for (i=0; i < thiz->matched_patterns_num; i++)
- {
- str = &thiz->matched_patterns[i];
-
- if (str->length != newstr->length)
- continue;
-
-#if 0
- /* Original code */
- for (j=0; j<(int)str->length; j++)
- if(str->astring[j] != newstr->astring[j])
- continue;
-
- if (j == str->length)
- return 1;
-#else
- /* https://github.com/ntop/nDPI/issues/837 */
- if (strncmp(str->astring, newstr->astring, str->length) == 0)
- return 1;
-#endif
- }
- return 0;
-}
-
-/******************************************************************************
- * FUNCTION: node_create_next
- * Create the next node for the given alpha.
- ******************************************************************************/
-AC_NODE_t * node_create_next (AC_NODE_t * thiz, AC_ALPHABET_t alpha)
-{
- AC_NODE_t * next;
- next = node_find_next (thiz, alpha);
- if (next)
- /* The edge already exists */
- return NULL;
- /* Otherwise register new edge */
- next = node_create ();
- node_register_outgoing(thiz, next, alpha);
-
- return next;
-}
-
-/******************************************************************************
- * FUNCTION: node_register_matchstr
- * Adds the pattern to the list of accepted pattern.
- ******************************************************************************/
-void node_register_matchstr (AC_NODE_t * thiz, AC_PATTERN_t * str, u_int8_t is_existing)
-{
- /* Check if the new pattern already exists in the node list */
- if (node_has_matchstr(thiz, str))
- return;
-
- /* Manage memory */
- if (thiz->matched_patterns_num >= thiz->matched_patterns_max)
- {
- thiz->matched_patterns = (AC_PATTERN_t *) ndpi_realloc
- (thiz->matched_patterns, thiz->matched_patterns_max*sizeof(AC_PATTERN_t),
- (REALLOC_CHUNK_MATCHSTR+thiz->matched_patterns_max)*sizeof(AC_PATTERN_t));
-
- thiz->matched_patterns_max += REALLOC_CHUNK_MATCHSTR;
- }
-
- thiz->matched_patterns[thiz->matched_patterns_num].astring = str->astring;
- thiz->matched_patterns[thiz->matched_patterns_num].length = str->length;
- thiz->matched_patterns[thiz->matched_patterns_num].is_existing = is_existing;
- memcpy(&thiz->matched_patterns[thiz->matched_patterns_num].rep, &str->rep, sizeof(AC_REP_t));
- thiz->matched_patterns_num++;
-}
-
-/******************************************************************************
- * FUNCTION: node_register_outgoing
- * Establish an edge between two nodes
- ******************************************************************************/
-void node_register_outgoing
-(AC_NODE_t * thiz, AC_NODE_t * next, AC_ALPHABET_t alpha)
-{
- if(thiz->outgoing_degree >= thiz->outgoing_max)
- {
- thiz->outgoing = (struct edge *) ndpi_realloc
- (thiz->outgoing, thiz->outgoing_max*sizeof(struct edge),
- (REALLOC_CHUNK_OUTGOING+thiz->outgoing_max)*sizeof(struct edge));
- thiz->outgoing_max += REALLOC_CHUNK_OUTGOING;
- }
-
- thiz->outgoing[thiz->outgoing_degree].alpha = alpha;
- thiz->outgoing[thiz->outgoing_degree++].next = next;
-}
-
-/******************************************************************************
- * FUNCTION: node_assign_id
- * assign a unique ID to the node (used for debugging purpose).
- ******************************************************************************/
-void node_assign_id (AC_NODE_t * thiz)
-{
- static int unique_id = 1;
- thiz->id = unique_id ++;
-}
-
-/******************************************************************************
- * FUNCTION: node_edge_compare
- * Comparison function for qsort. see man qsort.
- ******************************************************************************/
-int node_edge_compare (const void * l, const void * r)
-{
- /* According to man page:
- * The comparison function must return an integer less than, equal to, or
- * greater than zero if the first argument is considered to be
- * respectively less than, equal to, or greater than the second. if two
- * members compare as equal, their order in the sorted array is undefined.
- *
- * NOTE: Because edge alphabets are unique in every node we ignore
- * equivalence case.
- **/
- if ( ((struct edge *)l)->alpha >= ((struct edge *)r)->alpha )
- return 1;
- else
- return -1;
-}
-
-/******************************************************************************
- * FUNCTION: node_sort_edges
- * sorts edges alphabets.
- ******************************************************************************/
-void node_sort_edges (AC_NODE_t * thiz)
-{
- sort ((void *)thiz->outgoing, thiz->outgoing_degree, sizeof(struct edge), node_edge_compare, NULL);
-}
diff --git a/src/lib/third_party/src/sort.c b/src/lib/third_party/src/sort.c
deleted file mode 100644
index 35c8e9fdf..000000000
--- a/src/lib/third_party/src/sort.c
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * A fast, small, non-recursive O(nlog n) sort for the Linux kernel
- *
- * Jan 23 2005 Matt Mackall <mpm@selenic.com>
- */
-
-#ifdef WIN32
-#include <stdint.h>
-typedef uint32_t u_int32_t;
-#endif
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <sys/types.h>
-
-/* This is a function ported from the Linux kernel lib/sort.c */
-
-static void u_int32_t_swap(void *a, void *b, int size)
-{
- u_int32_t t = *(u_int32_t *)a;
- *(u_int32_t *)a = *(u_int32_t *)b;
- *(u_int32_t *)b = t;
-}
-
-static void generic_swap(void *_a, void *_b, int size)
-{
- char t;
- char *a = (char*)_a;
- char *b = (char*)_b;
-
- do {
- t = *a;
- *a++ = *b;
- *b++ = t;
- } while (--size > 0);
-}
-
-/**
- * sort - sort an array of elements
- * @base: pointer to data to sort
- * @num: number of elements
- * @size: size of each element
- * @cmp_func: pointer to comparison function
- * @swap_func: pointer to swap function or NULL
- *
- * This function does a heapsort on the given array. You may provide a
- * swap_func function optimized to your element type.
- *
- * Sorting time is O(n log n) both on average and worst-case. While
- * qsort is about 20% faster on average, it suffers from exploitable
- * O(n*n) worst-case behavior and extra memory requirements that make
- * it less suitable for kernel use.
- */
-
-void sort(void *_base, size_t num, size_t size,
- int (*cmp_func)(const void *, const void *),
- void (*swap_func)(void *, void *, int size))
-{
- /* pre-scale counters for performance */
- int i = (num/2 - 1) * size, n = num * size, c, r;
- char *base = (char*)_base;
-
- if (!swap_func)
- swap_func = (size == 4 ? u_int32_t_swap : generic_swap);
-
- /* heapify */
- for ( ; i >= 0; i -= size) {
- for (r = i; r * 2 + size < n; r = c) {
- c = r * 2 + size;
- if (c < n - size &&
- cmp_func(base + c, base + c + size) < 0)
- c += size;
- if (cmp_func(base + r, base + c) >= 0)
- break;
- swap_func(base + r, base + c, size);
- }
- }
-
- /* sort */
- for (i = n - size; i > 0; i -= size) {
- swap_func(base, base + i, size);
- for (r = 0; r * 2 + size < i; r = c) {
- c = r * 2 + size;
- if (c < i - size &&
- cmp_func(base + c, base + c + size) < 0)
- c += size;
- if (cmp_func(base + r, base + c) >= 0)
- break;
- swap_func(base + r, base + c, size);
- }
- }
-}