From 21da53d3a03cad32dffa8447d9c4ae5bae62a3a2 Mon Sep 17 00:00:00 2001 From: Ivan Nardi <12729895+IvanNardi@users.noreply.github.com> Date: Wed, 6 Mar 2024 19:25:59 +0100 Subject: ahocorasick: improve matching with subdomains (#2331) The basic idea is to have the following logic: * pattern "DOMAIN" matches the domain itself (i.e. exact match) *and* any subdomains (i.e. "ANYTHING.DOMAIN") * pattern "DOMAIN." matches *also* any string for which it is a prefix [please note that this kind of match is handy but quite dangerous...] * pattern "-DOMAIN" matches *also* any string for which it is a postfix Examples: * pattern "wikipedia.it": * "wikipedia.it" -> OK * "foo.wikipedia.it" -> OK * "foowikipedia.it" -> NO MATCH * "wikipedia.it.com" -> NO MATCH * pattern "wikipedia.": * "wikipedia.it" -> OK * "foo.wikipedia.it" -> OK * "foowikipedia.it" -> NO MATCH * "wikipedia.it.com" -> OK * pattern "-wikipedia.it": * "wikipedia.it" -> NO MATCH * "foo.wikipedia.it" -> NO MATCH * "0001-wikipedia.it" -> OK * "foo.0001-wikipedia.it" -> OK Bottom line: * exact match * prefix with "." (always, implicit) * prefix with "-" (only if explicitly set) * postfix with "." (only if explicitly set) That means that the patterns cannot start with '.' anymore. 
Close #2330 --- example/ndpiReader.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'example/ndpiReader.c') diff --git a/example/ndpiReader.c b/example/ndpiReader.c index 361cc36a7..ef5307934 100644 --- a/example/ndpiReader.c +++ b/example/ndpiReader.c @@ -5005,6 +5005,48 @@ void automataUnitTest() { ndpi_free_automa(automa); } +/* *********************************************** */ + +void automataDomainsUnitTest() { /* Unit test for the domain automaton. Each of the three sections below creates an automaton with ndpi_init_automa_domain(), adds a single pattern, finalizes it, asserts the expected ndpi_match_string() results and frees the automaton. NOTE(review): the set-up calls are wrapped in assert(), so they would be compiled out if NDEBUG were defined - confirm the test build never defines it. */ + void *automa = ndpi_init_automa_domain(); + + assert(automa); + assert(ndpi_add_string_to_automa(automa, ndpi_strdup("wikipedia.it")) == 0); /* plain pattern: only the domain itself and a dot-separated subdomain are expected to match */ + ndpi_finalize_automa(automa); + assert(ndpi_match_string(automa, "wikipedia.it") == 1); + assert(ndpi_match_string(automa, "foo.wikipedia.it") == 1); + assert(ndpi_match_string(automa, "foowikipedia.it") == 0); + assert(ndpi_match_string(automa, "foowikipedia") == 0); + assert(ndpi_match_string(automa, "-wikipedia.it") == 0); + assert(ndpi_match_string(automa, "foo-wikipedia.it") == 0); + assert(ndpi_match_string(automa, "wikipedia.it.com") == 0); + ndpi_free_automa(automa); + + automa = ndpi_init_automa_domain(); + assert(automa); + assert(ndpi_add_string_to_automa(automa, ndpi_strdup("wikipedia.")) == 0); /* pattern with a trailing dot: names that continue after the dot are expected to match as well, still only at a dot boundary on the left */ + ndpi_finalize_automa(automa); + assert(ndpi_match_string(automa, "wikipedia.it") == 1); + assert(ndpi_match_string(automa, "foo.wikipedia.it") == 1); + assert(ndpi_match_string(automa, "foowikipedia.it") == 0); + assert(ndpi_match_string(automa, "foowikipedia") == 0); + assert(ndpi_match_string(automa, "-wikipedia.it") == 0); + assert(ndpi_match_string(automa, "foo-wikipedia.it") == 0); + assert(ndpi_match_string(automa, "wikipediafoo") == 0); + assert(ndpi_match_string(automa, "wikipedia.it.com") == 1); + ndpi_free_automa(automa); + + automa = ndpi_init_automa_domain(); + assert(automa); + assert(ndpi_add_string_to_automa(automa, ndpi_strdup("-buy.itunes.apple.com")) == 0); /* pattern with a leading dash: the bare domain must not match, while names ending with the dash-prefixed pattern must */ + ndpi_finalize_automa(automa); + assert(ndpi_match_string(automa, 
"buy.itunes.apple.com") == 0); + assert(ndpi_match_string(automa, "p53-buy.itunes.apple.com") == 1); + assert(ndpi_match_string(automa, "p53buy.itunes.apple.com") == 0); + assert(ndpi_match_string(automa, "foo.p53-buy.itunes.apple.com") == 1); + ndpi_free_automa(automa); +} + #endif /* *********************************************** */ @@ -5927,6 +5969,7 @@ int main(int argc, char **argv) { bitmapUnitTest(); filterUnitTest(); automataUnitTest(); + automataDomainsUnitTest(); analyzeUnitTest(); ndpi_self_check_host_match(stderr); analysisUnitTest(); -- cgit v1.2.3