aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Luca Deri <deri@ntop.org> 2022-02-17 17:20:52 +0100
committer Luca Deri <deri@ntop.org> 2022-02-17 17:20:52 +0100
commit a2878af1eed26db8380bf8c29e5bb64a0181f935 (patch)
tree a341c52e76f170f799a24bca3f7a3bc57071ca5d
parent 8a2a47e62a0d7b1bc8815dc4f09c35b73393454e (diff)
Added new flow risk NDPI_HTTP_CRAWLER_BOT
-rw-r--r-- doc/flow_risks.rst 6
-rw-r--r-- example/ndpiReader.c 10
-rw-r--r-- python/ndpi.py 1
-rw-r--r-- src/include/ndpi_typedefs.h 1
-rw-r--r-- src/lib/ndpi_main.c 1
-rw-r--r-- src/lib/ndpi_utils.c 4
-rw-r--r-- src/lib/protocols/http.c 50
-rw-r--r-- tests/pcap/bot.pcap bin 0 -> 437580 bytes
-rw-r--r-- tests/result/bot.pcap.out 8
-rw-r--r-- wireshark/ndpi.lua 1
10 files changed, 65 insertions, 17 deletions
diff --git a/doc/flow_risks.rst b/doc/flow_risks.rst
index 95001098c..4b363365b 100644
--- a/doc/flow_risks.rst
+++ b/doc/flow_risks.rst
@@ -266,4 +266,10 @@ NDPI_ERROR_CODE_DETECTED
===================================
The risk is set whenever an error code is detected in the underlying protocol (e.g. HTTP and DNS).
+.. _Risk 044:
+
+NDPI_HTTP_CRAWLER_BOT
+===================================
+The risk is set whenever a crawler/bot/robot has been detected in the HTTP User-Agent.
+
diff --git a/example/ndpiReader.c b/example/ndpiReader.c
index f21fbc86a..5f7f5d2b2 100644
--- a/example/ndpiReader.c
+++ b/example/ndpiReader.c
@@ -735,7 +735,7 @@ void printCSVHeader() {
#if 0
fprintf(csv_fp, "tls_issuerDN,tls_subjectDN,");
#endif
- fprintf(csv_fp, "ssh_client_hassh,ssh_server_hassh,flow_info,plen_bins");
+ fprintf(csv_fp, "ssh_client_hassh,ssh_server_hassh,flow_info,plen_bins,http_user_agent");
if(enable_flow_stats) {
fprintf(csv_fp, ",byte_dist_mean,byte_dist_std,entropy,total_entropy");
@@ -1230,9 +1230,11 @@ static void printFlow(u_int32_t id, struct ndpi_flow_info *flow, u_int16_t threa
/* TCP flags */
fprintf(csv_fp, "%d,%d,%d,%d,%d,%d,%d,%d,", flow->cwr_count, flow->ece_count, flow->urg_count, flow->ack_count, flow->psh_count, flow->rst_count, flow->syn_count, flow->fin_count);
- fprintf(csv_fp, "%d,%d,%d,%d,%d,%d,%d,%d,", flow->src2dst_cwr_count, flow->src2dst_ece_count, flow->src2dst_urg_count, flow->src2dst_ack_count, flow->src2dst_psh_count, flow->src2dst_rst_count, flow->src2dst_syn_count, flow->src2dst_fin_count);
+ fprintf(csv_fp, "%d,%d,%d,%d,%d,%d,%d,%d,", flow->src2dst_cwr_count, flow->src2dst_ece_count, flow->src2dst_urg_count, flow->src2dst_ack_count,
+ flow->src2dst_psh_count, flow->src2dst_rst_count, flow->src2dst_syn_count, flow->src2dst_fin_count);
- fprintf(csv_fp, "%d,%d,%d,%d,%d,%d,%d,%d,", flow->dst2src_cwr_count, flow->ece_count, flow->urg_count, flow->ack_count, flow->psh_count, flow->rst_count, flow->syn_count, flow->fin_count);
+ fprintf(csv_fp, "%d,%d,%d,%d,%d,%d,%d,%d,", flow->dst2src_cwr_count, flow->ece_count, flow->urg_count, flow->ack_count,
+ flow->psh_count, flow->rst_count, flow->syn_count, flow->fin_count);
/* TCP window */
fprintf(csv_fp, "%u,%u,", flow->c_to_s_init_win, flow->s_to_c_init_win);
@@ -1269,6 +1271,8 @@ static void printFlow(u_int32_t id, struct ndpi_flow_info *flow, u_int16_t threa
#ifndef DIRECTION_BINS
print_bin(csv_fp, NULL, &flow->payload_len_bin);
#endif
+
+ fprintf(csv_fp, ",%s", flow->http.user_agent);
}
if((verbose != 1) && (verbose != 2)) {
diff --git a/python/ndpi.py b/python/ndpi.py
index d296844e7..039f222ae 100644
--- a/python/ndpi.py
+++ b/python/ndpi.py
@@ -335,6 +335,7 @@ typedef enum {
NDPI_TLS_CERTIFICATE_ABOUT_TO_EXPIRE,
NDPI_PUNYCODE_IDN,
NDPI_ERROR_CODE_DETECTED,
+ NDPI_HTTP_CRAWLER_BOT,
/* Leave this as last member */
NDPI_MAX_RISK
diff --git a/src/include/ndpi_typedefs.h b/src/include/ndpi_typedefs.h
index 0b798c530..a86fa79da 100644
--- a/src/include/ndpi_typedefs.h
+++ b/src/include/ndpi_typedefs.h
@@ -116,6 +116,7 @@ typedef enum {
NDPI_TLS_CERTIFICATE_ABOUT_TO_EXPIRE,
NDPI_PUNYCODE_IDN, /* https://en.wikipedia.org/wiki/Punycode */
NDPI_ERROR_CODE_DETECTED,
+ NDPI_HTTP_CRAWLER_BOT,
/* Leave this as last member */
NDPI_MAX_RISK /* must be <= 63 due to (**) */
diff --git a/src/lib/ndpi_main.c b/src/lib/ndpi_main.c
index 7c0e8f3b1..2740ec8bb 100644
--- a/src/lib/ndpi_main.c
+++ b/src/lib/ndpi_main.c
@@ -125,6 +125,7 @@ static ndpi_risk_info ndpi_known_risks[] = {
{ NDPI_TLS_CERTIFICATE_ABOUT_TO_EXPIRE, NDPI_RISK_MEDIUM, CLIENT_LOW_RISK_PERCENTAGE },
{ NDPI_PUNYCODE_IDN, NDPI_RISK_LOW, CLIENT_LOW_RISK_PERCENTAGE },
{ NDPI_ERROR_CODE_DETECTED, NDPI_RISK_LOW, CLIENT_LOW_RISK_PERCENTAGE },
+ { NDPI_HTTP_CRAWLER_BOT, NDPI_RISK_LOW, CLIENT_LOW_RISK_PERCENTAGE },
/* Leave this as last member */
{ NDPI_MAX_RISK, NDPI_RISK_LOW, CLIENT_FAIR_RISK_PERCENTAGE }
diff --git a/src/lib/ndpi_utils.c b/src/lib/ndpi_utils.c
index fabc4db2a..29cb94695 100644
--- a/src/lib/ndpi_utils.c
+++ b/src/lib/ndpi_utils.c
@@ -1852,6 +1852,10 @@ const char* ndpi_risk2str(ndpi_risk_enum risk) {
return("Error Code Detected");
break;
+ case NDPI_HTTP_CRAWLER_BOT:
+ return("Crawler/Bot Detected");
+ break;
+
default:
snprintf(buf, sizeof(buf), "%d", (int)risk);
return(buf);
diff --git a/src/lib/protocols/http.c b/src/lib/protocols/http.c
index cf1e6282b..b34206271 100644
--- a/src/lib/protocols/http.c
+++ b/src/lib/protocols/http.c
@@ -422,31 +422,53 @@ static void ndpi_check_user_agent(struct ndpi_detection_module_struct *ndpi_stru
struct ndpi_flow_struct *flow,
char *ua) {
u_int len;
-
+ char *double_slash;
+
if((!ua) || (ua[0] == '\0'))
return;
else
len = strlen(ua);
- if(
- (!strncmp(ua, "<?", 2))
- || strchr(ua, '$')
- || strstr(ua, "://") // || (!strncmp(ua, "jndi:ldap://", 12)) /* Log4J */
- // || ndpi_check_dga_name(ndpi_struct, NULL, ua, 0)
- // || ndpi_match_bigram(ndpi_struct, &ndpi_struct->impossible_bigrams_automa, ua)
- ) {
+ if((!strncmp(ua, "<?", 2))
+ || strchr(ua, '$')
+ // || ndpi_check_dga_name(ndpi_struct, NULL, ua, 0)
+ // || ndpi_match_bigram(ndpi_struct, &ndpi_struct->impossible_bigrams_automa, ua)
+ )
ndpi_set_risk(ndpi_struct, flow, NDPI_HTTP_SUSPICIOUS_USER_AGENT);
+ if((double_slash = strstr(ua, "://")) != NULL) {
+ if(double_slash != ua) /* We're not at the beginning of the user agent */{
+ if((double_slash[-1] != 'p') /* http:// */
+ && (double_slash[-1] != 's') /* https:// */)
+ ndpi_set_risk(ndpi_struct, flow, NDPI_HTTP_SUSPICIOUS_USER_AGENT);
+ }
+ }
+
+ /* no else */
+ if(!strncmp(ua, "jndi:ldap://", 12)) /* Log4J */ {
ndpi_set_risk(ndpi_struct, flow, NDPI_POSSIBLE_EXPLOIT);
} else if(
- (len < 4) /* Too short */
- || (len > 256) /* Too long */
- || (!strncmp(ua, "test", 4))
- || strchr(ua, '{')
- || strchr(ua, '}')
- ) {
+ (len < 4) /* Too short */
+ || (len > 256) /* Too long */
+ || (!strncmp(ua, "test", 4))
+ || strchr(ua, '{')
+ || strchr(ua, '}')
+ ) {
ndpi_set_risk(ndpi_struct, flow, NDPI_HTTP_SUSPICIOUS_USER_AGENT);
}
+
+ /*
+ Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)
+ Amazon-Route53-Health-Check-Service (ref 68784dad-be98-49e4-a63c-9fbbe2816d7c; report http://amzn.to/1vsZADi)
+ Anonymous Crawler/1.0 (Webcrawler developed with StormCrawler; http://example.com/; webcrawler@example.com)
+ */
+ if((strstr(ua, "+http") != NULL)
+ || (strstr(ua, " http") != NULL)
+ || strcasestr(ua, "Crawler")
+ || strcasestr(ua, "Bot") /* bot/robot */
+ ) {
+ ndpi_set_risk(ndpi_struct, flow, NDPI_HTTP_CRAWLER_BOT);
+ }
}
/* ************************************************************* */
diff --git a/tests/pcap/bot.pcap b/tests/pcap/bot.pcap
new file mode 100644
index 000000000..016c71859
--- /dev/null
+++ b/tests/pcap/bot.pcap
Binary files differ
diff --git a/tests/result/bot.pcap.out b/tests/result/bot.pcap.out
new file mode 100644
index 000000000..2c3b2cf00
--- /dev/null
+++ b/tests/result/bot.pcap.out
@@ -0,0 +1,8 @@
+Guessed flow protos: 0
+
+DPI Packets (TCP): 6 (6.00 pkts/flow)
+Confidence DPI : 1 (flows)
+
+Azure 402 431124 1
+
+ 1 TCP 40.77.167.36:64768 <-> 89.31.72.220:80 [VLAN: 77][proto: 7.276/HTTP.Azure][ClearText][Confidence: DPI][cat: Cloud/13][115 pkts/7672 bytes <-> 287 pkts/423452 bytes][Goodput ratio: 4/96][5.66 sec][Hostname/SNI: atlanteditorino.it][bytes ratio: -0.964 (Download)][IAT c2s/s2c min/avg/max/stddev: 0/0 58/3 4532/106 489/16][Pkt Len c2s/s2c min/avg/max/stddev: 64/64 67/1475 374/1498 29/171][URL: atlanteditorino.it/quartieri/img/S.Donato_M.Vittoria1930_B.jpg][StatusCode: 200][Content-Type: image/jpeg][User-Agent: Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)][Risk: ** Crawler/Bot Detected **][Risk Score: 10][PLAIN TEXT (GET /quartieri/im)][Plen Bins: 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,100,0,0]
diff --git a/wireshark/ndpi.lua b/wireshark/ndpi.lua
index 68b71e9b4..28a1c6506 100644
--- a/wireshark/ndpi.lua
+++ b/wireshark/ndpi.lua
@@ -82,6 +82,7 @@ flow_risks[40] = ProtoField.bool("ndpi.flow_risk.possible_exploit", "Possible Ex
flow_risks[41] = ProtoField.bool("ndpi.flow_risk.cert_about_to_expire", "TLS cert about to expire", num_bits_flow_risks, nil, bit(9), "nDPI Flow Risk: TLS certificate about to expire")
flow_risks[42] = ProtoField.bool("ndpi.flow_risk.punycode_idn", "IDN Domain Name", num_bits_flow_risks, nil, bit(10), "nDPI Flow Risk: IDN Domain Name")
flow_risks[43] = ProtoField.bool("ndpi.flow_risk.error_code_detected", "Error Code Detected", num_bits_flow_risks, nil, bit(11), "nDPI Flow Risk: Error Code Detected")
+flow_risks[44] = ProtoField.bool("ndpi.flow_risk.crawler_bot", "Crawler/Bot Detected", num_bits_flow_risks, nil, bit(12), "nDPI Flow Risk: Crawler/Bot Detected")
-- Last one: keep in sync the bitmask when adding new risks!!
flow_risks[64] = ProtoField.new("Unused", "ndpi.flow_risk.unused", ftypes.UINT32, nil, base.HEX, bit(32) - bit(10))