about | summary | refs | log | tree | commit | diff
path: root/utils
diff options
context:
space:
mode:
author Ivan Nardi <12729895+IvanNardi@users.noreply.github.com> 2024-07-03 16:16:54 +0200
committer GitHub <noreply@github.com> 2024-07-03 16:16:54 +0200
commit d42f0e6ab35ee0a196ff6a0fff76cfe8ea00afb9 (patch)
tree 8723873d00add5642ddcfac0fa08ba3ebee81a32 /utils
parent dab8d3056ec4571a0343bd7fc3cdce9c4d944719 (diff)
Add detection of Twitter bot (#2487)
Update the global list of crawlers ips
Diffstat (limited to 'utils')
-rwxr-xr-x utils/crawlers_ip_addresses_download.sh 12
1 files changed, 11 insertions, 1 deletions
diff --git a/utils/crawlers_ip_addresses_download.sh b/utils/crawlers_ip_addresses_download.sh
index 45dbdcd4a..d15c6e4c1 100755
--- a/utils/crawlers_ip_addresses_download.sh
+++ b/utils/crawlers_ip_addresses_download.sh
@@ -11,6 +11,7 @@ TMP2=/tmp/bot_google_c2.json
TMP3=/tmp/bot_google_c3.json
TMP_BING=/tmp/bot_bing.json
TMP_FB=/tmp/bot_fb.list
+TMP_TW=/tmp/bot_tw.list
LIST=/tmp/bot.list
LIST6=/tmp/bot.list6
LIST_MERGED=/tmp/bot.list_m
@@ -24,6 +25,8 @@ ORIGIN3="https://developers.google.com/static/search/apis/ipranges/user-triggere
#Bing Bot
ORIGIN_BING="https://www.bing.com/toolbox/bingbot.json"
#Facebook Bot: https://developers.facebook.com/docs/sharing/webmasters/crawler/
+#TwitterBot
+ORIGIN_TW="https://developer.x.com/en/docs/twitter-for-websites/cards/guides/troubleshooting-cards"
echo "(1) Downloading file... ${ORIGIN1}"
http_response=$(curl -s -o $TMP1 -w "%{http_code}" ${ORIGIN1})
@@ -49,6 +52,12 @@ echo "(1) Downloading FB crawlers routes... "
whois -h whois.radb.net -- '-i origin AS32934' | grep ^route > $TMP_FB
is_file_empty "${TMP_FB}"
+echo "(1) Downloading page... ${ORIGIN_TW}"
+http_response=$(curl -s -o $TMP_TW -w "%{http_code}" ${ORIGIN_TW})
+check_http_response "${http_response}"
+is_file_empty "${TMP_TW}"
+
+
echo "(2) Processing IP addresses..."
{
jq -r '.prefixes | .[].ipv4Prefix | select( . != null )' $TMP1
@@ -56,6 +65,7 @@ echo "(2) Processing IP addresses..."
jq -r '.prefixes | .[].ipv4Prefix | select( . != null )' $TMP3
jq -r '.prefixes | .[].ipv4Prefix | select( . != null )' $TMP_BING
grep -v route6 $TMP_FB | tr -d 'route:^ '
+ grep "IP ranges are" $TMP_TW | grep -E -o "[^^][0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}.[0-9]{1,3}/[0-9]{1,2}" | tr -d ' ' # TODO: ipv4 only
} > $LIST
is_file_empty "${LIST}"
./mergeipaddrlist.py "${LIST}" > "${LIST_MERGED}"
@@ -72,7 +82,7 @@ is_file_empty "${LIST6}"
is_file_empty "${LIST6_MERGED}"
./ipaddr2list.py $LIST_MERGED NDPI_HTTP_CRAWLER_BOT $LIST6_MERGED > $DEST
is_file_empty "${DEST}"
-rm -f $TMP1 $TMP2 $TMP3 $TMP_BING $TMP_FB $LIST $LIST6 $LIST_MERGED $LIST6_MERGED
+rm -f $TMP1 $TMP2 $TMP3 $TMP_BING $TMP_FB $TMP_TW $LIST $LIST6 $LIST_MERGED $LIST6_MERGED
echo "(3) Crawlers IPs are available in $DEST"
exit 0