<?php

/**
 * webtrees: online genealogy
 * Copyright (C) 2023 webtrees development team
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

declare(strict_types=1);

namespace Fisharebest\Webtrees\Http\Middleware;

use Fig\Http\Message\StatusCodeInterface;
use Fisharebest\Webtrees\Registry;
use Fisharebest\Webtrees\Validator;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\GuzzleException;
use Iodev\Whois\Loaders\CurlLoader;
use Iodev\Whois\Modules\Asn\AsnRouteInfo;
use Iodev\Whois\Whois;
use IPLib\Address\AddressInterface;
use IPLib\Factory as IPFactory;
use IPLib\Range\RangeInterface;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\ServerRequestInterface;
use Psr\Http\Server\MiddlewareInterface;
use Psr\Http\Server\RequestHandlerInterface;
use Throwable;

use function array_filter;
use function array_map;
use function assert;
use function gethostbyaddr;
use function gethostbyname;
use function preg_match_all;
use function random_int;
use function response;
use function str_contains;
use function str_ends_with;

/**
 * Middleware to block bad robots before they waste our valuable CPU cycles.
 *
 * A request is rejected when its user-agent names a known bad robot, or when
 * it claims to be a well-known robot but its IP address cannot be verified
 * against that robot's DNS domains, IP ranges or autonomous systems.
 */
class BadBotBlocker implements MiddlewareInterface
{
    private const REGEX_OCTET = '(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)';
    private const REGEX_IPV4  = '/\\b' . self::REGEX_OCTET . '(?:\\.' . self::REGEX_OCTET . '){3}\\b/';

    // Cache whois requests.  Try to avoid all caches expiring at the same time.
    private const WHOIS_TTL_MIN = 28 * 86400;
    private const WHOIS_TTL_MAX = 35 * 86400;
    private const WHOIS_TIMEOUT = 5;

    // Bad robots - SEO optimisers, advertisers, etc.  This list is shared with robots.txt.
    public const BAD_ROBOTS = [
        'admantx',
        'Adsbot',
        'AhrefsBot',
        'Amazonbot', // Until it understands crawl-delay and noindex / nofollow
        'AntBot', // Aggressive crawler
        'AspiegelBot',
        'Awario', // Brand management
        'Barkrowler', // Crawler for babbar.tech
        'BLEXBot',
        'Bytespider', // Aggressive crawler from Bytedance/TikTok
        'CCBot', // Used to train a number of LLMs
        'CensysInspect', // Vulnerability scanner
        'ChatGPT-User', // Used by ChatGPT during operation
        'ClaudeBot', // Collects training data for LLMs
        'DataForSeoBot', // https://dataforseo.com/dataforseo-bot
        'DotBot',
        'Expanse', // Another pointless crawler
        'FacebookBot', // Collects training data for Facebook's LLM translator.
        'fidget-spinner-bot', // Aggressive crawler
        'Foregenix', // Vulnerability scanner
        'Go-http-client', // Crawler library used by many bots
        'Google-Extended', // Collects training data for Google Bard
        'GPTBot', // Collects training data for ChatGPT
        'Grapeshot',
        'Honolulu-bot', // Aggressive crawler, no info available
        'ia_archiver',
        'internet-measurement', // Driftnet
        'IonCrawl',
        'Java', // Crawler library used by many bots
        'linabot', // Aggressive crawler, no info available
        'Linguee',
        'MegaIndex.ru',
        'MJ12bot',
        'netEstate NE',
        'Omgilibot', // Collects training data for LLMs
        'panscient',
        'PetalBot',
        'proximic',
        'python-requests', // Crawler library used by many bots
        'Scrapy', // Scraping tool
        'SeekportBot', // Pretends to be a search engine - but isn't
        'SemrushBot',
        'serpstatbot',
        'SEOkicks',
        'SiteKiosk',
        'test-bot', // Aggressive crawler
        'TinyTestBot',
        'Turnitin',
        'wp_is_mobile', // Nothing to do with wordpress
        'XoviBot',
        'YisouSpider',
        'ZoominfoBot',
    ];

    /**
     * Some search engines use reverse/forward DNS to verify the IP address.
     *
     * @see https://developer.amazon.com/support/amazonbot
     * @see https://support.google.com/webmasters/answer/80553?hl=en
     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
     * @see https://www.mojeek.com/bot.html
     * @see https://support.apple.com/en-gb/HT204683
     */
    private const ROBOT_REV_FWD_DNS = [
        'Amazonbot'        => ['.crawl.amazon.com'],
        'Applebot'         => ['.applebot.apple.com'],
        'BingPreview'      => ['.search.msn.com'],
        'Google'           => ['.google.com', '.googlebot.com'],
        'Mail.RU_Bot'      => ['.mail.ru'],
        'MicrosoftPreview' => ['.search.msn.com'],
        'MojeekBot'        => ['.mojeek.com'],
        'Qwantify'         => ['.qwant.com'],
        'Sogou'            => ['.crawl.sogou.com'],
        'Yahoo'            => ['.crawl.yahoo.net'],
        'Yandex'           => ['.yandex.ru', '.yandex.net', '.yandex.com'],
        'bingbot'          => ['.search.msn.com'],
        'msnbot'           => ['.search.msn.com'],
    ];

    /**
     * Some search engines only use reverse DNS to verify the IP address.
     *
     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
     * @see https://napoveda.seznam.cz/en/full-text-search/seznambot-crawler
     * @see https://www.ionos.de/terms-gtc/faq-crawler
     */
    private const ROBOT_REV_ONLY_DNS = [
        'Baiduspider' => ['.baidu.com', '.baidu.jp'],
        'FreshBot'    => ['.seznam.cz'],
        'IonCrawl'    => ['.1und1.org'],
        'Neevabot'    => ['.neeva.com'],
        'SeznamBot'   => ['.seznam.cz'],
    ];

    /**
     * Some search engines operate from designated IP addresses.
     *
     * @see https://www.apple.com/go/applebot
     * @see https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
     */
    private const ROBOT_IPS = [
        'AppleBot'    => [
            '17.0.0.0/8',
        ],
        'Ask Jeeves'  => [
            '65.214.45.143',
            '65.214.45.148',
            '66.235.124.192',
            '66.235.124.7',
            '66.235.124.101',
            '66.235.124.193',
            '66.235.124.73',
            '66.235.124.196',
            '66.235.124.74',
            '63.123.238.8',
            '202.143.148.61',
        ],
        'DuckDuckBot' => [
            '23.21.227.69',
            '50.16.241.113',
            '50.16.241.114',
            '50.16.241.117',
            '50.16.247.234',
            '52.204.97.54',
            '52.5.190.19',
            '54.197.234.188',
            '54.208.100.253',
            '54.208.102.37',
            '107.21.1.8',
        ],
    ];

    /**
     * Some search engines publish their designated IP addresses in a remote file.
     *
     * @see https://bot.seekport.com/
     */
    private const ROBOT_IP_FILES = [
        'SeekportBot' => 'https://bot.seekport.com/seekportbot_ips.txt',
    ];

    /**
     * Some search engines operate from within a designated autonomous system.
     * An operator may own several; an address in any one of them is valid.
     *
     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
     * @see https://www.facebook.com/peering/
     */
    private const ROBOT_ASNS = [
        'facebook' => ['AS32934', 'AS63293'],
        'twitter'  => ['AS13414'],
    ];

    /**
     * Reject requests from bad robots, from impostors of good robots and from
     * blocked networks.  Everything else is passed to the next handler.
     *
     * The checks run in order of cost: plain user-agent matching first, then
     * DNS lookups, then IP range lists and finally (cached) whois queries.
     * Remote lookups are only made when the user-agent names the robot.
     *
     * @param ServerRequestInterface  $request
     * @param RequestHandlerInterface $handler
     *
     * @return ResponseInterface
     */
    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
    {
        $ua      = Validator::serverParams($request)->string('HTTP_USER_AGENT', '');
        $ip      = Validator::attributes($request)->string('client-ip');
        $address = IPFactory::parseAddressString($ip);
        assert($address instanceof AddressInterface);

        foreach (self::BAD_ROBOTS as $robot) {
            if (str_contains($ua, $robot)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, false)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, true)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_IPS as $robot => $valid_ip_ranges) {
            if (str_contains($ua, $robot) && !$this->isAddressInRangeStrings($address, $valid_ip_ranges)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_IP_FILES as $robot => $url) {
            // The && short-circuit ensures the remote list is only fetched for matching user-agents.
            if (str_contains($ua, $robot) && !$this->isAddressInRangeStrings($address, $this->fetchIpRangesForUrl($robot, $url))) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_ASNS as $robot => $asns) {
            // The address must belong to at least one of the operator's ASNs - not to every one.
            if (str_contains($ua, $robot) && !$this->isAddressInAsns($address, $asns)) {
                return $this->response();
            }
        }

        // Allow sites to block access from entire networks.
        $block_asn = Validator::attributes($request)->string('block_asn', '');
        preg_match_all('/(AS\d+)/', $block_asn, $matches);

        if ($this->isAddressInAsns($address, $matches[1])) {
            return $this->response();
        }

        return $handler->handle($request);
    }

    /**
     * Is an address contained in any of a list of IP addresses/ranges?
     * Entries that cannot be parsed as a range are ignored.
     *
     * @param AddressInterface $address
     * @param array<string>    $ip_ranges Addresses or ranges, e.g. "17.0.0.0/8"
     *
     * @return bool
     */
    private function isAddressInRangeStrings(AddressInterface $address, array $ip_ranges): bool
    {
        foreach ($ip_ranges as $ip_range) {
            $range = IPFactory::parseRangeString($ip_range);

            if ($range instanceof RangeInterface && $range->contains($address)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Is an address routed by any of a list of autonomous systems?
     * Stops at the first match, so later ASNs are not looked up needlessly.
     *
     * @param AddressInterface $address
     * @param array<string>    $asns Autonomous system numbers, e.g. "AS32934"
     *
     * @return bool
     */
    private function isAddressInAsns(AddressInterface $address, array $asns): bool
    {
        foreach ($asns as $asn) {
            foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                if ($range->contains($address)) {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * Check that an IP address belongs to a robot operator using a forward/reverse DNS lookup.
     *
     * The reverse lookup must give a host name ending in one of the valid
     * domains.  Unless $reverse_only is set, that host name must also resolve
     * back to the original IP address.
     *
     * @param string        $ip
     * @param array<string> $valid_domains Domain suffixes, with a leading dot
     * @param bool          $reverse_only  Skip the forward lookup
     *
     * @return bool
     */
    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
    {
        // On a failed lookup, gethostbyaddr() returns the IP address unchanged, which matches no domain below.
        $host = gethostbyaddr($ip);

        if ($host === false) {
            return false;
        }

        foreach ($valid_domains as $domain) {
            if (str_ends_with($host, $domain)) {
                // NOTE(review): gethostbyname() resolves IPv4 only, so an IPv6 client
                // can never pass the forward check - confirm this is intended.
                return $reverse_only || $ip === gethostbyname($host);
            }
        }

        return false;
    }

    /**
     * Perform a whois search for an ASN.
     *
     * Results are cached, with a randomised lifetime.  A failed lookup gives
     * (and caches) an empty list.
     *
     * @param string $asn The autonomous system number to query
     *
     * @return array<RangeInterface>
     */
    private function fetchIpRangesForAsn(string $asn): array
    {
        return Registry::cache()->file()->remember('whois-asn-' . $asn, static function () use ($asn): array {
            // Each route has either an IPv4 or an IPv6 prefix.
            $mapper = static fn (AsnRouteInfo $route_info): ?RangeInterface => IPFactory::parseRangeString($route_info->route ?: $route_info->route6);

            try {
                $loader = new CurlLoader(self::WHOIS_TIMEOUT);
                $whois  = new Whois($loader);
                $info   = $whois->loadAsnInfo($asn);
                $routes = $info->routes;
                $ranges = array_map($mapper, $routes);

                // Discard routes that could not be parsed.
                return array_filter($ranges);
            } catch (Throwable) {
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * Fetch a list of IP addresses from a remote file.
     *
     * Every IPv4 address found in the file is returned.  Results are cached,
     * with a randomised lifetime.  A failed download gives (and caches) an
     * empty list.
     *
     * @param string $ua  The robot name, used to build the cache key
     * @param string $url
     *
     * @return array<string>
     */
    private function fetchIpRangesForUrl(string $ua, string $url): array
    {
        return Registry::cache()->file()->remember('url-ip-list-' . $ua, static function () use ($url): array {
            try {
                $client   = new Client();
                $response = $client->get($url, ['timeout' => 5]);
                $contents = $response->getBody()->getContents();

                preg_match_all(self::REGEX_IPV4, $contents, $matches);

                return $matches[0];
            } catch (GuzzleException) {
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * The response sent to blocked robots.
     *
     * @return ResponseInterface
     */
    private function response(): ResponseInterface
    {
        return response('Not acceptable', StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
    }
}