<?php

/**
 * webtrees: online genealogy
 * Copyright (C) 2022 webtrees development team
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

declare(strict_types=1);

namespace Fisharebest\Webtrees\Http\Middleware;

use Fig\Http\Message\StatusCodeInterface;
use Fisharebest\Webtrees\Registry;
use Fisharebest\Webtrees\Validator;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\GuzzleException;
use Iodev\Whois\Loaders\CurlLoader;
use Iodev\Whois\Modules\Asn\AsnRouteInfo;
use Iodev\Whois\Whois;
use IPLib\Address\AddressInterface;
use IPLib\Factory as IPFactory;
use IPLib\Range\RangeInterface;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\ServerRequestInterface;
use Psr\Http\Server\MiddlewareInterface;
use Psr\Http\Server\RequestHandlerInterface;
use Throwable;

use function array_filter;
use function array_map;
use function assert;
use function gethostbyaddr;
use function gethostbyname;
use function preg_match_all;
use function random_int;
use function response;
use function str_contains;
use function str_ends_with;

/**
 * Middleware to block bad robots before they waste our valuable CPU cycles.
 *
 * Requests are rejected when the user-agent names a known bad robot, or when it
 * names a known good robot but the client address cannot be verified as
 * belonging to that robot's operator (by DNS, IP range or autonomous system).
 */
class BadBotBlocker implements MiddlewareInterface
{
    // Building blocks for a regular expression that extracts dotted-quad IPv4 addresses from text.
    private const REGEX_OCTET = '(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)';
    private const REGEX_IPV4  = '/\\b' . self::REGEX_OCTET . '(?:\\.' . self::REGEX_OCTET . '){3}\\b/';

    // Cache whois requests.  Try to avoid all caches expiring at the same time.
    private const WHOIS_TTL_MIN = 28 * 86400;
    private const WHOIS_TTL_MAX = 35 * 86400;
    private const WHOIS_TIMEOUT = 5;

    // Bad robots - SEO optimisers, advertisers, etc.  This list is shared with robots.txt.
    public const BAD_ROBOTS = [
        'admantx',
        'Adsbot',
        'AhrefsBot',
        'Amazonbot', // Until it understands crawl-delay and noindex / nofollow
        'AspiegelBot',
        'Barkrowler',
        'BLEXBot',
        'DataForSEO',
        'DataForSeoBot', // https://dataforseo.com/dataforseo-bot
        'DotBot',
        'Grapeshot',
        'Honolulu-bot', // Aggressive crawler, no info available
        'ia_archiver',
        'linabot', // Aggressive crawler, no info available
        'Linguee',
        'MJ12bot',
        'netEstate NE',
        'panscient',
        'PetalBot',
        'proximic',
        'SemrushBot',
        'serpstatbot',
        'SEOkicks',
        'SiteKiosk',
        'Turnitin',
        'XoviBot',
        'ZoominfoBot',
    ];

    /**
     * Some search engines use reverse/forward DNS to verify the IP address.
     *
     * @see https://developer.amazon.com/support/amazonbot
     * @see https://support.google.com/webmasters/answer/80553?hl=en
     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
     * @see https://www.mojeek.com/bot.html
     * @see https://support.apple.com/en-gb/HT204683
     */
    private const ROBOT_REV_FWD_DNS = [
        'Amazonbot'   => ['.crawl.amazon.com'],
        'Applebot'    => ['.applebot.apple.com'],
        'bingbot'     => ['.search.msn.com'],
        'BingPreview' => ['.search.msn.com'],
        'Google'      => ['.google.com', '.googlebot.com'],
        'MojeekBot'   => ['.mojeek.com'],
        'Mail.RU_Bot' => ['.mail.ru'],
        'msnbot'      => ['.search.msn.com'],
        'Qwantify'    => ['.search.qwant.com'],
        'Sogou'       => ['.crawl.sogou.com'],
        'Yahoo'       => ['.crawl.yahoo.net'],
        'Yandex'      => ['.yandex.ru', '.yandex.net', '.yandex.com'],
    ];

    /**
     * Some search engines only use reverse DNS to verify the IP address.
     *
     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
     * @see https://napoveda.seznam.cz/en/full-text-search/seznambot-crawler
     * @see https://www.ionos.de/terms-gtc/faq-crawler
     */
    private const ROBOT_REV_ONLY_DNS = [
        'Baiduspider' => ['.baidu.com', '.baidu.jp'],
        'FreshBot'    => ['.seznam.cz'],
        'IonCrawl'    => ['.1und1.org'],
        'Neevabot'    => ['.neeva.com'],
    ];

    /**
     * Some search engines operate from designated IP addresses.
     *
     * @see https://www.apple.com/go/applebot
     * @see https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
     */
    private const ROBOT_IPS = [
        'AppleBot'    => [
            '17.0.0.0/8',
        ],
        'Ask Jeeves'  => [
            '65.214.45.143',
            '65.214.45.148',
            '66.235.124.192',
            '66.235.124.7',
            '66.235.124.101',
            '66.235.124.193',
            '66.235.124.73',
            '66.235.124.196',
            '66.235.124.74',
            '63.123.238.8',
            '202.143.148.61',
        ],
        'DuckDuckBot' => [
            '23.21.227.69',
            '50.16.241.113',
            '50.16.241.114',
            '50.16.241.117',
            '50.16.247.234',
            '52.204.97.54',
            '52.5.190.19',
            '54.197.234.188',
            '54.208.100.253',
            '54.208.102.37',
            '107.21.1.8',
        ],
    ];

    /**
     * Some search engines publish their designated IP addresses in a remote file.
     *
     * @see https://bot.seekport.com/
     */
    private const ROBOT_IP_FILES = [
        'SeekportBot' => 'https://bot.seekport.com/seekportbot_ips.txt',
    ];

    /**
     * Some search engines operate from within a designated autonomous system.
     *
     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
     * @see https://www.facebook.com/peering/
     */
    private const ROBOT_ASNS = [
        'facebook' => ['AS32934', 'AS63293'],
        'twitter'  => ['AS13414'],
    ];

    /**
     * Reject requests from bad robots, from clients impersonating good robots,
     * and from networks listed in the "block_asn" request attribute.
     * Everything else is passed to the next handler.
     *
     * @param ServerRequestInterface  $request
     * @param RequestHandlerInterface $handler
     *
     * @return ResponseInterface
     */
    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
    {
        $ua      = Validator::serverParams($request)->string('HTTP_USER_AGENT', '');
        $ip      = Validator::attributes($request)->string('client-ip');
        $address = IPFactory::parseAddressString($ip);
        assert($address instanceof AddressInterface);

        // Known bad robots are rejected outright.
        foreach (self::BAD_ROBOTS as $robot) {
            if (str_contains($ua, $robot)) {
                return $this->response();
            }
        }

        // Robots that can be verified by a reverse lookup followed by a forward lookup.
        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, false)) {
                return $this->response();
            }
        }

        // Robots that can only be verified by a reverse lookup.
        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, true)) {
                return $this->response();
            }
        }

        // Robots that operate from a fixed list of addresses/ranges.
        foreach (self::ROBOT_IPS as $robot => $valid_ip_ranges) {
            if (str_contains($ua, $robot) && !$this->addressInRanges($address, $valid_ip_ranges)) {
                return $this->response();
            }
        }

        // Robots that publish their addresses in a remote file.  The file is only fetched when the UA matches.
        foreach (self::ROBOT_IP_FILES as $robot => $url) {
            if (str_contains($ua, $robot) && !$this->addressInRanges($address, $this->fetchIpRangesForUrl($robot, $url))) {
                return $this->response();
            }
        }

        // Robots that operate from one or more autonomous systems.  An address announced by
        // ANY of the operator's ASNs is genuine - it does not need to appear in all of them.
        foreach (self::ROBOT_ASNS as $robot => $asns) {
            if (str_contains($ua, $robot) && !$this->addressInAsns($address, $asns)) {
                return $this->response();
            }
        }

        // Allow sites to block access from entire networks.
        $block_asn = Validator::attributes($request)->string('block_asn', '');
        preg_match_all('/(AS\d+)/', $block_asn, $matches);

        if ($this->addressInAsns($address, $matches[1])) {
            return $this->response();
        }

        return $handler->handle($request);
    }

    /**
     * Is an address contained in any of a list of textual IP addresses/ranges?
     * Entries that cannot be parsed as a range are ignored.
     *
     * @param AddressInterface $address
     * @param array<string>    $ip_ranges
     *
     * @return bool
     */
    private function addressInRanges(AddressInterface $address, array $ip_ranges): bool
    {
        foreach ($ip_ranges as $ip_range) {
            $range = IPFactory::parseRangeString($ip_range);

            if ($range instanceof RangeInterface && $range->contains($address)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Is an address contained in a range belonging to any of a list of autonomous systems?
     *
     * @param AddressInterface $address
     * @param array<string>    $asns
     *
     * @return bool
     */
    private function addressInAsns(AddressInterface $address, array $asns): bool
    {
        foreach ($asns as $asn) {
            foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                if ($range->contains($address)) {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * Check that an IP address belongs to a robot operator using a forward/reverse DNS lookup.
     *
     * @param string        $ip
     * @param array<string> $valid_domains
     * @param bool          $reverse_only
     *
     * @return bool
     */
    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
    {
        $host = gethostbyaddr($ip);

        if ($host === false) {
            return false;
        }

        foreach ($valid_domains as $domain) {
            if (str_ends_with($host, $domain)) {
                // The first matching domain decides the outcome.
                // NOTE(review): gethostbyname() is documented as returning an IPv4 address, so an
                // IPv6 client can presumably never pass the forward check - confirm this is intended.
                return $reverse_only || $ip === gethostbyname($host);
            }
        }

        return false;
    }

    /**
     * Perform a whois search for an ASN.
     * The result - including the empty list produced when the lookup fails - is cached
     * for a random period between WHOIS_TTL_MIN and WHOIS_TTL_MAX seconds.
     *
     * @param string $asn - The autonomous system number to query
     *
     * @return array<RangeInterface>
     */
    private function fetchIpRangesForAsn(string $asn): array
    {
        return Registry::cache()->file()->remember('whois-asn-' . $asn, static function () use ($asn): array {
            // Each route has either an IPv4 route or an IPv6 route.
            $mapper = static fn (AsnRouteInfo $route_info): ?RangeInterface => IPFactory::parseRangeString($route_info->route ?: $route_info->route6);

            try {
                $loader = new CurlLoader(self::WHOIS_TIMEOUT);
                $whois  = new Whois($loader);
                $info   = $whois->loadAsnInfo($asn);
                $routes = $info->routes;
                $ranges = array_map($mapper, $routes);

                // Discard routes that could not be parsed.
                return array_filter($ranges);
            } catch (Throwable) {
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * Fetch a list of IP addresses from a remote file.
     * Only IPv4 addresses are extracted.  The result - including the empty list produced
     * when the download fails - is cached using the same TTL as whois lookups.
     *
     * @param string $ua  - Used to build the cache key
     * @param string $url - The location of the file
     *
     * @return array<string>
     */
    private function fetchIpRangesForUrl(string $ua, string $url): array
    {
        return Registry::cache()->file()->remember('url-ip-list-' . $ua, static function () use ($url): array {
            try {
                $client   = new Client();
                $response = $client->get($url, ['timeout' => 5]);
                $contents = $response->getBody()->getContents();

                preg_match_all(self::REGEX_IPV4, $contents, $matches);

                return $matches[0];
            } catch (GuzzleException) {
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * The response sent to blocked clients.
     *
     * @return ResponseInterface
     */
    private function response(): ResponseInterface
    {
        return response('Not acceptable', StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
    }
}