aboutsummaryrefslogtreecommitdiff
path: root/src/lib/third_party/include/ahocorasick.h
blob: 5efbc05f2652a285c5ae211760a0300899f181c4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
/*
 * ahocorasick.h: Includes all types of ahocorasick library
 * This file is part of multifast.
 *
 Copyright 2010-2012 Kamiar Kanani <kamiar.kanani@gmail.com>

 multifast is free software: you can redistribute it and/or modify
 it under the terms of the GNU Lesser General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 multifast is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU Lesser General Public License for more details.

 You should have received a copy of the GNU Lesser General Public License
 along with multifast.  If not, see <http://www.gnu.org/licenses/>.
*/

#ifndef _AC_TYPES_H_
#define _AC_TYPES_H_

#define AC_AHOCORASICK_NEW
#define AC_PATTRN_MAX_LENGTH 256

/* reallocation step for AC_NODE_t.matched_patterns */
#define REALLOC_CHUNK_MATCHSTR 8

/* reallocation step for AC_NODE_t.outgoing array */
/* Must be a multiple of __SIZEOF_LONG__ ! */
#define REALLOC_CHUNK_OUTGOING 8

/* AC_ALPHABET_t:
 * defines the alphabet type.
 * Actually defining AC_ALPHABET_t as a char will work, but sometimes we deal
 * with streams of other (bigger) types e.g. integers, specific enum, objects.
 * Although they consists of string of bytes (chars), but using their specific
 * types for AC_ALPHABET_t will lead to a better performance. so instead of
 * dealing with strings of chars, we assume dealing with strings of
 * AC_ALPHABET_t and leave it optional for other developers to define their
 * own alphabets.
 **/
typedef char AC_ALPHABET_t;

/* AC_REP_t:
 * Provides a more readable representative for a pattern.
 * because patterns themselves are not always suitable for displaying
 * (e.g. for hex patterns), we offer this type to improve intelligibility
 * of output. furthermore, sometimes it is useful, for example while
 * retrieving patterns from a database, to maintain their identifiers in the
 * automata for further reference. we provisioned two possible types as a
 * union for this purpose. you can add your desired type in it.
 **/
typedef struct {
  uint32_t number; /* Often used to store procotolId */
  uint16_t breed, category;
  uint16_t        level,        /* Domain level for comparison */
                  from_start:1, /* match from start of string */
                  at_end:1,     /* match at end of string */
                  dot:1;        /* is domain name */
} AC_REP_t;

/* AC_PATTERN_t:
 * This is the pattern type that must be fed into AC automata.
 * the 'astring' field is not null-terminated, due to it can contain zero
 * value bytes. the 'length' field determines the number of AC_ALPHABET_t it
 * carries. the 'representative' field is described in AC_REP_t. despite
 * 'astring', 'representative' can have duplicate values for different given
 * AC_PATTERN_t. it is an optional field and you can just fill it with 0.
 * CAUTION:
 * Not always the 'astring' points to the correct position in memory.
 * it is the responsibility of your program to maintain a permanent allocation
 * for astring field of the added pattern to automata.
 **/

typedef struct
{
  AC_ALPHABET_t * astring; /* String of alphabets */
  uint16_t        length,  /* Length of pattern */
	          is_existing; /* not union_matchstr */
  AC_REP_t rep;   /* Representative string (optional) */
} AC_PATTERN_t;

typedef struct {
  unsigned short num; /* Number of matched patterns at this node */
  unsigned short max; /* Max capacity of allocated memory for matched_patterns */
  AC_PATTERN_t	patterns[];
} AC_PATTERNS_t;


/* AC_MATCH_t:
 * Provides the structure for reporting a match event.
 * a match event occurs when the automata reaches a final node. any final
 * node can match one or more pattern at a position in a text. the
 * 'patterns' field holds these matched patterns. obviously these
 * matched patterns have same end-position in the text. there is a relationship
 * between matched patterns: the shorter one is a factor (tail) of the longer
 * one. the 'position' maintains the end position of matched patterns. the
 * start position of patterns could be found by knowing their 'length' in
 * AC_PATTERN_t. e.g. suppose "recent" and "cent" are matched at
 * position 40 in the text, then the start position of them are 34 and 36
 * respectively. finally the field 'match_num' maintains the number of
 * matched patterns.
 **/

typedef struct
{
  AC_PATTERN_t *matched[4],   /* for ac_automata_exact_match() */
	       *last;         /* for callback              */
  AC_PATTERN_t *patterns;     /* Array of matched pattern  */
  unsigned int  match_map;    /* Matched patterns (bitmap) */
  unsigned int  position;     /* The end position of matching pattern(s) in the text */
  unsigned short int match_num;     /* Number of matched patterns */
  unsigned short int match_counter; /* Counter of found matches */
} AC_MATCH_t;

/* AC_TEXT_t:
 * The input text type that is fed to ac_automata_search() to be searched.
 * it is similar to AC_PATTERN_t. actually we could use AC_PATTERN_t as input
 * text, but for the purpose of being more readable, we defined this new type.
 **/
typedef struct
{
  AC_MATCH_t match;
  AC_ALPHABET_t * astring;    /* String of alphabets */
  unsigned short int  length, /* Length of string */
	              option; /* AC_FEATURE_LC | AC_FEATURE_DEBUG */;
} AC_TEXT_t;


/* AC_ERROR_t:
 * Error that may occur while adding a pattern to the automata.
 * it is returned by ac_automata_add().
 **/
typedef enum
  {
    ACERR_SUCCESS = 0, /* No error occurred */
    ACERR_DUPLICATE_PATTERN, /* Duplicate patterns */
    ACERR_LONG_PATTERN, /* Pattern length is longer than AC_PATTRN_MAX_LENGTH */
    ACERR_ZERO_PATTERN, /* Empty pattern (zero length) */
    ACERR_AUTOMATA_CLOSED, /* Automata is closed. after calling
			      ac_automata_finalize() you can not add new patterns to the automata. */
    ACERR_ERROR, /* common error */
  } AC_ERROR_t;

/* MATCH_CALLBACK_t:
 * This is the call-back function type that must be given to automata at
 * initialization to report match occurrence to the caller.
 * at a match event, the automata will reach you using this function and sends
 * you a pointer to AC_MATCH_t. using that pointer you can handle
 * matches. you can send parameters to the call-back function when you call
 * ac_automata_search(). at call-back, the automata will sent you those
 * parameters as the second parameter (void *) of MATCH_CALLBACK_t. inside
 * the call-back function you can cast it to whatever you want.
 * If you return 0 from MATCH_CALLBACK_t function to the automata, it will
 * continue searching, otherwise it will return from ac_automata_search()
 * to your calling function.
 **/
typedef int (*MATCH_CALLBACK_f)(AC_MATCH_t *, AC_TEXT_t *, AC_REP_t *);

/* AC_PATTRN_MAX_LENGTH:
 * Maximum acceptable pattern length in AC_PATTERN_t.length
 **/

/* Forward Declaration */
struct edge;

/*
 * automata node
 * 4 pointers + 8 bytes : 40/24 bytes for 64/32 bit
 */
typedef struct ac_node
{
  int id;                              /* Node ID : set after finalize(), only for ac_automata_dump */
  AC_ALPHABET_t  one_alpha,
		 one:1,                /* one_char: yes/no */
		 range:1,              /* range symbols start from one_alpha */
		 root:1,               /* is root node */
	  	 final:1,	       /* 0: no ; 1: yes, it is a final node */
		 use:1,	               /* use: yes/no */ 
		 ff:1;		       /* finalized node */
  unsigned short depth;                /* depth: distance between this node and the root */

  AC_PATTERNS_t  *matched_patterns;   /* Array of matched patterns */
  struct edge    *outgoing;           /* Array of outgoing edges */

  struct ac_node *failure_node;       /* The failure node of this node */
  AC_ALPHABET_t  *a_ptr;
} AC_NODE_t;

struct edge {
  unsigned short degree;      /* Number of outgoing edges */
  unsigned short max;         /* Max capacity of allocated memory for outgoing */
  uint32_t       cmap[8];      /* 256 bit */
  AC_NODE_t	 *next[];
 /*
  * first N elements used for 'next' pointers +
  * M elements used for symbols storage
  * M = (max + sizeof(void*)-1) & ( ~(sizeof(void*)-1))
  *
  * if sizeof(void*)==8
  * for max = 8  we must alloc next[9];
  * for max = 16 we must alloc next[18];
  *
  */
};

struct ac_path {
  AC_NODE_t * n;
  unsigned short int idx,l;
};

typedef struct
{
  /* The root of the Aho-Corasick trie */
  AC_NODE_t * root;
  MATCH_CALLBACK_f match_handler;
  unsigned int all_nodes_num; /* Number of all nodes in the automata */

  /* this flag indicates that if automata is finalized by
   * ac_automata_finalize() or not. 1 means finalized and 0
   * means not finalized (is open). after finalizing automata you can not
   * add pattern to automata anymore. */
  unsigned short automata_open,
		 to_lc:1, no_root_range:1,debug:1; /* lowercase match */

  /* Statistic Variables */
  unsigned long total_patterns; /* Total patterns in the automata */

  unsigned long max_str_len; /* largest pattern length. Update by ac_automata_finalize() */

  struct ac_path ac_path[AC_PATTRN_MAX_LENGTH+4]; /* for ac_automata_walk */
  int id;	/* node id */
  int add_to_range; /* for convert to range */
  int n_oc,n_range,n_find; /* statistics */
  char name[32]; /* if debug != 0 */
} AC_AUTOMATA_t;

typedef AC_ERROR_t (*NODE_CALLBACK_f)(AC_AUTOMATA_t *, AC_NODE_t *,int idx, void *);

typedef void (*ALPHA_CALLBACK_f)(AC_AUTOMATA_t *, AC_NODE_t *,AC_NODE_t *,int ,void *);

#define AC_FEATURE_DEBUG 1 
#define AC_FEATURE_LC 2
#define AC_FEATURE_NO_ROOT_RANGE 4

AC_AUTOMATA_t * ac_automata_init     (MATCH_CALLBACK_f mc);
AC_ERROR_t      ac_automata_feature  (AC_AUTOMATA_t * thiz, unsigned int feature);
AC_ERROR_t      ac_automata_name     (AC_AUTOMATA_t * thiz, char *name, int debug);
AC_ERROR_t      ac_automata_add      (AC_AUTOMATA_t * thiz, AC_PATTERN_t * str);
AC_ERROR_t      ac_automata_finalize (AC_AUTOMATA_t * thiz);
AC_ERROR_t      ac_automata_walk     (AC_AUTOMATA_t * thiz, NODE_CALLBACK_f node_cb,
						ALPHA_CALLBACK_f alpha_cb, void *);

int             ac_automata_search   (AC_AUTOMATA_t * thiz,
						AC_TEXT_t * str, 
						AC_REP_t * param);
int             ac_automata_exact_match(AC_PATTERNS_t *mp,int pos, AC_TEXT_t *);
void            ac_automata_clean    (AC_AUTOMATA_t * thiz);
void            ac_automata_release  (AC_AUTOMATA_t * thiz, uint8_t free_pattern);
#ifndef __KERNEL__
/* Global debug control. */
void            ac_automata_enable_debug (int debug);
/* See man open_memstream() for get result as string */
void            ac_automata_dump     (AC_AUTOMATA_t * thiz, FILE *);
#endif
#endif