Diffstat (limited to 'utils')
-rwxr-xr-x  utils/crawlers_ip_addresses_download.sh   62
-rwxr-xr-x  utils/update_every_lists.sh                2
2 files changed, 64 insertions, 0 deletions
diff --git a/utils/crawlers_ip_addresses_download.sh b/utils/crawlers_ip_addresses_download.sh
new file mode 100755
index 000000000..c4f4daa0b
--- /dev/null
+++ b/utils/crawlers_ip_addresses_download.sh
@@ -0,0 +1,62 @@
+#!/bin/sh
+
+set -e
+
+cd "$(dirname "${0}")" || exit 1
+
+DEST=../src/lib/inc_generated/ndpi_crawlers_match.c.inc
+TMP1=/tmp/bot_google_c1.json
+TMP2=/tmp/bot_google_c2.json
+TMP3=/tmp/bot_google_c3.json
+TMP_BING=/tmp/bot_bing.json
+LIST=/tmp/bot.list
+#Google Common crawlers
+ORIGIN1="https://developers.google.com/static/search/apis/ipranges/googlebot.json"
+#Google Special-case crawlers
+ORIGIN2="https://developers.google.com/static/search/apis/ipranges/special-crawlers.json"
+#Google User-triggered fetchers
+ORIGIN3="https://developers.google.com/static/search/apis/ipranges/user-triggered-fetchers.json"
+#Bing Bot
+ORIGIN_BING="https://www.bing.com/toolbox/bingbot.json"
+
+
+echo "(1) Downloading file... ${ORIGIN1}"
+http_response=$(curl -s -o "$TMP1" -w "%{http_code}" "${ORIGIN1}")
+if [ "$http_response" != "200" ]; then
+  echo "Error $http_response: you probably need to update the list URL!"
+  exit 1
+fi
+
+echo "(1) Downloading file... ${ORIGIN2}"
+http_response=$(curl -s -o "$TMP2" -w "%{http_code}" "${ORIGIN2}")
+if [ "$http_response" != "200" ]; then
+  echo "Error $http_response: you probably need to update the list URL!"
+  exit 1
+fi
+
+echo "(1) Downloading file... ${ORIGIN3}"
+http_response=$(curl -s -o "$TMP3" -w "%{http_code}" "${ORIGIN3}")
+if [ "$http_response" != "200" ]; then
+  echo "Error $http_response: you probably need to update the list URL!"
+  exit 1
+fi
+
+echo "(1) Downloading file... ${ORIGIN_BING}"
+http_response=$(curl -s -o "$TMP_BING" -w "%{http_code}" "${ORIGIN_BING}")
+if [ "$http_response" != "200" ]; then
+  echo "Error $http_response: you probably need to update the list URL!"
+  exit 1
+fi
+
+echo "(2) Processing IP addresses..."
+{
+  jq -r '.prefixes | .[].ipv4Prefix | select( . != null )' "$TMP1" # TODO: ipv6
+  jq -r '.prefixes | .[].ipv4Prefix | select( . != null )' "$TMP2" # TODO: ipv6
+  jq -r '.prefixes | .[].ipv4Prefix | select( . != null )' "$TMP3" # TODO: ipv6
+  jq -r '.prefixes | .[].ipv4Prefix | select( . != null )' "$TMP_BING" # TODO: ipv6
+} > "$LIST"
+./ipaddr2list.py "$LIST" NDPI_HTTP_CRAWLER_BOT > "$DEST"
+rm -f "$TMP1" "$TMP2" "$TMP3" "$TMP_BING" "$LIST"
+
+echo "(3) Crawlers IPs are available in $DEST"
+exit 0
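
Note on step (2): the jq filter assumes the Google and Bing feeds expose a top-level "prefixes" array whose entries carry either an "ipv4Prefix" or an "ipv6Prefix" key, which is why select( . != null ) drops the IPv6-only entries for now. The snippet below is a minimal, self-contained sketch of that extraction against made-up sample data; the file name and prefix values are illustrative, not taken from the real feeds.

    # Illustrative sample only; the real feeds contain many more entries.
    printf '%s\n' '{ "prefixes": [ { "ipv4Prefix": "192.0.2.0/24" }, { "ipv6Prefix": "2001:db8::/32" } ] }' > /tmp/sample_prefixes.json
    jq -r '.prefixes | .[].ipv4Prefix | select( . != null )' /tmp/sample_prefixes.json
    # Expected output (the IPv6 entry is skipped, matching the TODO above):
    # 192.0.2.0/24
    rm -f /tmp/sample_prefixes.json

The flat list of CIDR blocks produced this way is what ipaddr2list.py (already present in utils/) turns into the generated C include tagged with NDPI_HTTP_CRAWLER_BOT; the exact layout of ndpi_crawlers_match.c.inc is whatever that helper emits and is not reproduced here.
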
diff --git a/utils/update_every_lists.sh b/utils/update_every_lists.sh
index 514a2999a..e9702f1b1 100755
--- a/utils/update_every_lists.sh
+++ b/utils/update_every_lists.sh
@@ -28,6 +28,8 @@ RETVAL=$(( RETVAL + $? ))
RETVAL=$(( RETVAL + $? ))
./icloud_private_relay_ip_addresses_download.sh
RETVAL=$(( RETVAL + $? ))
+./crawlers_ip_addresses_download.sh
+RETVAL=$(( RETVAL + $? ))
./asn_update.sh
RETVAL=$(( RETVAL + $? ))
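
For context, update_every_lists.sh runs each download helper in turn and accumulates the exit statuses in RETVAL, so a single failing list marks the whole run as failed; the new crawlers script simply joins that chain. Below is a minimal sketch of the same idiom with placeholder script names rather than the real list, assuming RETVAL starts at 0 and is used as the final exit status, as the hunk above suggests.

    RETVAL=0
    ./first_download_helper.sh         # placeholder name
    RETVAL=$(( RETVAL + $? ))
    ./second_download_helper.sh        # placeholder name
    RETVAL=$(( RETVAL + $? ))
    exit $RETVAL                       # non-zero if any helper failed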