diff options
author | Ivan Nardi <12729895+IvanNardi@users.noreply.github.com> | 2023-05-09 16:42:29 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-05-09 16:42:29 +0200 |
commit | 684e041998406532c6ef1e899ebc94ca5049d938 (patch) | |
tree | ac4a642aca47fc0821f0d669bb63370903d29b56 /utils | |
parent | 79c1dbe0b23cbfd006a6203572495f6be063debc (diff) |
Improve detection of crawlers/bots (#1968)
Add support for Facebook crawler
Diffstat (limited to 'utils')
-rwxr-xr-x | utils/crawlers_ip_addresses_download.sh | 9 |
1 files changed, 7 insertions, 2 deletions
```diff
diff --git a/utils/crawlers_ip_addresses_download.sh b/utils/crawlers_ip_addresses_download.sh
index c4f4daa0b..77e70c61b 100755
--- a/utils/crawlers_ip_addresses_download.sh
+++ b/utils/crawlers_ip_addresses_download.sh
@@ -9,6 +9,7 @@ TMP1=/tmp/bot_google_c1.json
 TMP2=/tmp/bot_google_c2.json
 TMP3=/tmp/bot_google_c3.json
 TMP_BING=/tmp/bot_bing.json
+TMP_FB=/tmp/bot_fb.list
 LIST=/tmp/bot.list
 #Google Common crawlers
 ORIGIN1="https://developers.google.com/static/search/apis/ipranges/googlebot.json"
@@ -18,7 +19,7 @@ ORIGIN2="https://developers.google.com/static/search/apis/ipranges/special-crawl
 ORIGIN3="https://developers.google.com/static/search/apis/ipranges/user-triggered-fetchers.json"
 #Bing Bot
 ORIGIN_BING="https://www.bing.com/toolbox/bingbot.json"
-
+#Facebook Bot: https://developers.facebook.com/docs/sharing/webmasters/crawler/
 
 echo "(1) Downloading file... ${ORIGIN1}"
 http_response=$(curl -s -o $TMP1 -w "%{http_code}" ${ORIGIN1})
@@ -48,15 +49,19 @@ if [ "$http_response" != "200" ]; then
     exit 1
 fi
 
+echo "(1) Downloading FB crawlers routes... "
+whois -h whois.radb.net -- '-i origin AS32934' | grep ^route > $TMP_FB
+
 echo "(2) Processing IP addresses..."
 {
     jq -r '.prefixes | .[].ipv4Prefix | select( . != null )' $TMP1 # TODO: ipv6
     jq -r '.prefixes | .[].ipv4Prefix | select( . != null )' $TMP2 # TODO: ipv6
     jq -r '.prefixes | .[].ipv4Prefix | select( . != null )' $TMP3 # TODO: ipv6
     jq -r '.prefixes | .[].ipv4Prefix | select( . != null )' $TMP_BING # TODO: ipv6
+    grep -v route6 $TMP_FB | tr -d 'route:^ ' # TODO: ipv6
 } > $LIST
 ./ipaddr2list.py $LIST NDPI_HTTP_CRAWLER_BOT > $DEST
-rm -f $TMP1 $TMP2 $TMP3 $TMP_BING $LIST
+rm -f $TMP1 $TMP2 $TMP3 $TMP_BING $TMP_FB $LIST
 
 echo "(3) Crawlers IPs are available in $DEST"
 exit 0
```