From abee1a2a6f1d8375831901e49ace85eaea0650e3 Mon Sep 17 00:00:00 2001
From: snicket2100 <57048005+snicket2100@users.noreply.github.com>
Date: Fri, 14 Jul 2023 09:55:46 +0200
Subject: Included Gambling website data from the Polish `hazard.mf.gov.pl` list (#2041)

* Refreshed the Belgium Gambling Site list data

Unfortunately some hostnames have been removed from that list, which means they are disappearing from the `ndpi_gambling_match.c.inc` file as well.

* build: added `libxml2-utils` (for `xmllint`)

* Included Gambling website data from the Polish `hazard.mf.gov.pl` list

The list contains over 30k gambling website hostnames as of today.
---
 utils/gambling_sites_download.sh | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'utils')

diff --git a/utils/gambling_sites_download.sh b/utils/gambling_sites_download.sh
index 3340cf237..f80db68f6 100755
--- a/utils/gambling_sites_download.sh
+++ b/utils/gambling_sites_download.sh
@@ -12,8 +12,12 @@ printf '(1) %s\n' "Scraping Illegal Gambling Sites (Belgium)"
 DOMAINS="$(curl -s 'https://www.gamingcommission.be/en/gaming-commission/illegal-games-of-chance/list-of-illegal-gambling-sites' | sed -n 's/^]\+>\(.\+\.[a-zA-Z0-9]\+\)\(\|\/.*[^<]*\)<\/td>/\1/gp' || exit 1)"
 is_str_empty "${DOMAINS}" "Please check gambling sites URL and sed REGEX."
 
+printf '(1) %s\n' "Downloading Gambling Sites (Poland)"
+DOMAINS_PL="$(curl -s https://hazard.mf.gov.pl/api/Register | xmllint --xpath "/*[local-name(.)='Rejestr']/*[local-name(.)='PozycjaRejestru']/*[local-name(.)='AdresDomeny']/text()" -)"
+is_str_empty "${DOMAINS_PL}" "Please check gambling sites URL and XPath."
+
 printf '(2) %s\n' "Processing IP addresses..."
-echo "${DOMAINS}" >${LIST}
+echo "${DOMAINS}" "${DOMAINS_PL}" | sort | uniq >${LIST}
 ./hostname2list.py "${LIST}" "Gambling" NDPI_PROTOCOL_GAMBLING NDPI_PROTOCOL_CATEGORY_WEB NDPI_PROTOCOL_UNSAFE >${DEST}
 rm -f "${LIST}"
 is_file_empty "${DEST}"
-- 
cgit v1.2.3