diff options
author | Ivan Nardi <12729895+IvanNardi@users.noreply.github.com> | 2024-07-03 16:16:54 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-07-03 16:16:54 +0200 |
commit | d42f0e6ab35ee0a196ff6a0fff76cfe8ea00afb9 (patch) | |
tree | 8723873d00add5642ddcfac0fa08ba3ebee81a32 /utils | |
parent | dab8d3056ec4571a0343bd7fc3cdce9c4d944719 (diff) |
Add detection of Twitter bot (#2487)
Update the global list of crawlers ips
Diffstat (limited to 'utils')
-rwxr-xr-x | utils/crawlers_ip_addresses_download.sh | 12 |
1 file changed, 11 insertions, 1 deletion
```diff
diff --git a/utils/crawlers_ip_addresses_download.sh b/utils/crawlers_ip_addresses_download.sh
index 45dbdcd4a..d15c6e4c1 100755
--- a/utils/crawlers_ip_addresses_download.sh
+++ b/utils/crawlers_ip_addresses_download.sh
@@ -11,6 +11,7 @@ TMP2=/tmp/bot_google_c2.json
 TMP3=/tmp/bot_google_c3.json
 TMP_BING=/tmp/bot_bing.json
 TMP_FB=/tmp/bot_fb.list
+TMP_TW=/tmp/bot_tw.list
 LIST=/tmp/bot.list
 LIST6=/tmp/bot.list6
 LIST_MERGED=/tmp/bot.list_m
@@ -24,6 +25,8 @@ ORIGIN3="https://developers.google.com/static/search/apis/ipranges/user-triggere
 #Bing Bot
 ORIGIN_BING="https://www.bing.com/toolbox/bingbot.json"
 #Facebook Bot: https://developers.facebook.com/docs/sharing/webmasters/crawler/
+#TwitterBot
+ORIGIN_TW="https://developer.x.com/en/docs/twitter-for-websites/cards/guides/troubleshooting-cards"
 
 echo "(1) Downloading file... ${ORIGIN1}"
 http_response=$(curl -s -o $TMP1 -w "%{http_code}" ${ORIGIN1})
@@ -49,6 +52,12 @@ echo "(1) Downloading FB crawlers routes... "
 whois -h whois.radb.net -- '-i origin AS32934' | grep ^route > $TMP_FB
 is_file_empty "${TMP_FB}"
 
+echo "(1) Downloading page... ${ORIGIN_TW}"
+http_response=$(curl -s -o $TMP_TW -w "%{http_code}" ${ORIGIN_TW})
+check_http_response "${http_response}"
+is_file_empty "${TMP_TW}"
+
+
 echo "(2) Processing IP addresses..."
 {
     jq -r '.prefixes | .[].ipv4Prefix | select( . != null )' $TMP1
@@ -56,6 +65,7 @@ echo "(2) Processing IP addresses..."
     jq -r '.prefixes | .[].ipv4Prefix | select( . != null )' $TMP3
     jq -r '.prefixes | .[].ipv4Prefix | select( . != null )' $TMP_BING
     grep -v route6 $TMP_FB | tr -d 'route:^ '
+    grep "IP ranges are" $TMP_TW | grep -E -o "[^^][0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}/[0-9]{1,2}" | tr -d ' ' # TODO: ipv4 only
 } > $LIST
 is_file_empty "${LIST}"
 ./mergeipaddrlist.py "${LIST}" > "${LIST_MERGED}"
@@ -72,7 +82,7 @@ is_file_empty "${LIST6}"
 is_file_empty "${LIST6_MERGED}"
 ./ipaddr2list.py $LIST_MERGED NDPI_HTTP_CRAWLER_BOT $LIST6_MERGED > $DEST
 is_file_empty "${DEST}"
-rm -f $TMP1 $TMP2 $TMP3 $TMP_BING $TMP_FB $LIST $LIST6 $LIST_MERGED $LIST6_MERGED
+rm -f $TMP1 $TMP2 $TMP3 $TMP_BING $TMP_FB $TMP_TW $LIST $LIST6 $LIST_MERGED $LIST6_MERGED
 
 echo "(3) Crawlers IPs are available in $DEST"
 exit 0
```