<?php

/**
 * webtrees: online genealogy
 * Copyright (C) 2023 webtrees development team
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

declare(strict_types=1);

namespace Fisharebest\Webtrees\Http\Middleware;

use Fig\Http\Message\StatusCodeInterface;
use Fisharebest\Webtrees\Registry;
use Fisharebest\Webtrees\Validator;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\GuzzleException;
use Iodev\Whois\Loaders\CurlLoader;
use Iodev\Whois\Modules\Asn\AsnRouteInfo;
use Iodev\Whois\Whois;
use IPLib\Address\AddressInterface;
use IPLib\Factory as IPFactory;
use IPLib\Range\RangeInterface;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\ServerRequestInterface;
use Psr\Http\Server\MiddlewareInterface;
use Psr\Http\Server\RequestHandlerInterface;
use Throwable;

use function array_filter;
use function array_map;
use function assert;
use function gethostbyaddr;
use function gethostbyname;
use function preg_match_all;
use function random_int;
use function response;
use function str_contains;
use function str_ends_with;

/**
 * Middleware to block bad robots before they waste our valuable CPU cycles.
 */
class BadBotBlocker implements MiddlewareInterface
{
    private const REGEX_OCTET = '(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)';
    private const REGEX_IPV4  = '/\\b' . self::REGEX_OCTET . '(?:\\.' . self::REGEX_OCTET . '){3}\\b/';

    // Cache whois requests.  Try to avoid all caches expiring at the same time.
    private const WHOIS_TTL_MIN = 28 * 86400;
    private const WHOIS_TTL_MAX = 35 * 86400;
    private const WHOIS_TIMEOUT = 5;

    // Bad robots - SEO optimisers, advertisers, etc.  This list is shared with robots.txt.
    public const BAD_ROBOTS = [
        'admantx',
        'Adsbot',
        'AhrefsBot',
        'Amazonbot', // Until it understands crawl-delay and noindex / nofollow
        'AntBot', // Aggressive crawler
        'AspiegelBot',
        'Awario', // Brand management
        'Barkrowler',
        'BLEXBot',
        'Bytespider',
        'CCBot', // Used to train a number of LLMs
        'ChatGPT-User', // Used by ChatGPT during operation
        'DataForSeoBot', // https://dataforseo.com/dataforseo-bot
        'DotBot',
        'FacebookBot', // Collects training data for Facebook's LLM translator.
        'fidget-spinner-bot', // Aggressive crawler
        'Google-Extended', // Collects training data for Google Bard
        'GPTBot', // Collects training data for ChatGPT
        'Grapeshot',
        'Honolulu-bot', // Aggressive crawler, no info available
        'ia_archiver',
        'linabot', // Aggressive crawler, no info available
        'Linguee',
        'MegaIndex.ru',
        'MJ12bot',
        'netEstate NE',
        'Omgilibot', // Collects training data for LLMs
        'panscient',
        'PetalBot',
        'proximic',
        'SeekportBot', // Pretends to be a search engine - but isn't
        'SemrushBot',
        'serpstatbot',
        'SEOkicks',
        'SiteKiosk',
        'test-bot', // Aggressive crawler
        'TinyTestBot',
        'Turnitin',
        'wp_is_mobile', // Nothing to do with wordpress
        'XoviBot',
        'YisouSpider',
        'ZoominfoBot',
    ];

    /**
     * Some search engines use reverse/forward DNS to verify the IP address.
     *
     * @see https://developer.amazon.com/support/amazonbot
     * @see https://support.google.com/webmasters/answer/80553?hl=en
     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
     * @see https://www.mojeek.com/bot.html
     * @see https://support.apple.com/en-gb/HT204683
     */
    private const ROBOT_REV_FWD_DNS = [
        'Amazonbot'        => ['.crawl.amazon.com'],
        'Applebot'         => ['.applebot.apple.com'],
        'BingPreview'      => ['.search.msn.com'],
        'Google'           => ['.google.com', '.googlebot.com'],
        'Mail.RU_Bot'      => ['.mail.ru'],
        'MicrosoftPreview' => ['.search.msn.com'],
        'MojeekBot'        => ['.mojeek.com'],
        'Qwantify'         => ['.search.qwant.com'],
        'Sogou'            => ['.crawl.sogou.com'],
        'Yahoo'            => ['.crawl.yahoo.net'],
        'Yandex'           => ['.yandex.ru', '.yandex.net', '.yandex.com'],
        'bingbot'          => ['.search.msn.com'],
        'msnbot'           => ['.search.msn.com'],
    ];

    /**
     * Some search engines only use reverse DNS to verify the IP address.
     *
     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
     * @see https://napoveda.seznam.cz/en/full-text-search/seznambot-crawler
     * @see https://www.ionos.de/terms-gtc/faq-crawler
     */
    private const ROBOT_REV_ONLY_DNS = [
        'Baiduspider' => ['.baidu.com', '.baidu.jp'],
        'FreshBot'    => ['.seznam.cz'],
        'IonCrawl'    => ['.1und1.org'],
        'Neevabot'    => ['.neeva.com'],
        'SeznamBot'   => ['.seznam.cz'],
    ];

    /**
     * Some search engines operate from designated IP addresses.
     *
     * NOTE(review): user-agent matching uses str_contains(), which is case-sensitive, so the
     * 'AppleBot' key below does not match the 'Applebot' spelling used in ROBOT_REV_FWD_DNS.
     * Confirm which spelling is intended before relying on this entry.
     *
     * @see https://www.apple.com/go/applebot
     * @see https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
     */
    private const ROBOT_IPS = [
        'AppleBot'    => [
            '17.0.0.0/8',
        ],
        'Ask Jeeves'  => [
            '65.214.45.143',
            '65.214.45.148',
            '66.235.124.192',
            '66.235.124.7',
            '66.235.124.101',
            '66.235.124.193',
            '66.235.124.73',
            '66.235.124.196',
            '66.235.124.74',
            '63.123.238.8',
            '202.143.148.61',
        ],
        'DuckDuckBot' => [
            '23.21.227.69',
            '50.16.241.113',
            '50.16.241.114',
            '50.16.241.117',
            '50.16.247.234',
            '52.204.97.54',
            '52.5.190.19',
            '54.197.234.188',
            '54.208.100.253',
            '54.208.102.37',
            '107.21.1.8',
        ],
    ];

    /**
     * Some search engines list their IP addresses in a file that we download from a URL.
     *
     * @see https://bot.seekport.com/
     */
    private const ROBOT_IP_FILES = [
        'SeekportBot' => 'https://bot.seekport.com/seekportbot_ips.txt',
    ];

    /**
     * Some search engines operate from within a designated autonomous system.
     * A robot may use several autonomous systems; an address in any one of them is accepted.
     *
     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
     * @see https://www.facebook.com/peering/
     */
    private const ROBOT_ASNS = [
        'facebook' => ['AS32934', 'AS63293'],
        'twitter'  => ['AS13414'],
    ];

    /**
     * Reject requests from unwanted robots, and from clients that claim to be a
     * well-known robot but do not come from that robot's network.
     *
     * The checks run in this order, and the first failure produces the "blocked" response:
     *  1. user-agent contains a BAD_ROBOTS name
     *  2. user-agent names a robot verified by reverse+forward DNS, and the lookup fails
     *  3. user-agent names a robot verified by reverse DNS only, and the lookup fails
     *  4. user-agent names a robot with a fixed IP list, and the client is not in it
     *  5. user-agent names a robot with a downloadable IP list, and the client is not in it
     *  6. user-agent names a robot with known ASNs, and the client is in none of them
     *  7. the client address is inside one of the ASNs in the "block_asn" request attribute
     *
     * @param ServerRequestInterface  $request
     * @param RequestHandlerInterface $handler
     *
     * @return ResponseInterface
     */
    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
    {
        $ua      = Validator::serverParams($request)->string('HTTP_USER_AGENT', '');
        $ip      = Validator::attributes($request)->string('client-ip');
        $address = IPFactory::parseAddressString($ip);
        assert($address instanceof AddressInterface);

        foreach (self::BAD_ROBOTS as $robot) {
            if (str_contains($ua, $robot)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, false)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, true)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_IPS as $robot => $valid_ip_ranges) {
            if (str_contains($ua, $robot) && !$this->addressInRanges($address, $valid_ip_ranges)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_IP_FILES as $robot => $url) {
            // Only download/read the list when the user-agent actually claims to be this robot.
            if (str_contains($ua, $robot)) {
                $valid_ip_ranges = $this->fetchIpRangesForUrl($robot, $url);

                if (!$this->addressInRanges($address, $valid_ip_ranges)) {
                    return $this->response();
                }
            }
        }

        foreach (self::ROBOT_ASNS as $robot => $asns) {
            // The client only needs to be in ONE of the robot's autonomous systems.
            // (Previously a "continue 2" here moved on to the next ASN rather than the next
            // robot, so the client had to be inside every listed ASN to be accepted.)
            if (str_contains($ua, $robot) && !$this->addressInAsns($address, $asns)) {
                return $this->response();
            }
        }

        // Allow sites to block access from entire networks.
        $block_asn = Validator::attributes($request)->string('block_asn', '');
        preg_match_all('/(AS\d+)/', $block_asn, $matches);

        if ($this->addressInAsns($address, $matches[1])) {
            return $this->response();
        }

        return $handler->handle($request);
    }

    /**
     * Is an address inside any of a list of IP addresses/ranges given as strings?
     * Strings that cannot be parsed as a range are ignored.
     *
     * @param AddressInterface $address
     * @param array<string>    $ip_ranges
     *
     * @return bool
     */
    private function addressInRanges(AddressInterface $address, array $ip_ranges): bool
    {
        foreach ($ip_ranges as $ip_range) {
            $range = IPFactory::parseRangeString($ip_range);

            if ($range instanceof RangeInterface && $range->contains($address)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Is an address inside any route announced by any of a list of autonomous systems?
     *
     * @param AddressInterface $address
     * @param array<string>    $asns    Autonomous system numbers, e.g. "AS32934"
     *
     * @return bool
     */
    private function addressInAsns(AddressInterface $address, array $asns): bool
    {
        foreach ($asns as $asn) {
            foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                if ($range->contains($address)) {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * Check that an IP address belongs to a robot operator using a forward/reverse DNS lookup.
     *
     * The reverse lookup must give a host name ending in one of the valid domains.
     * Unless $reverse_only is set, the host name must also resolve back to the same IP address.
     *
     * NOTE(review): gethostbyname() is documented as returning an IPv4 address, so the forward
     * confirmation presumably cannot succeed for IPv6 clients - confirm whether that is acceptable.
     *
     * @param string        $ip
     * @param array<string> $valid_domains
     * @param bool          $reverse_only
     *
     * @return bool
     */
    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
    {
        $host = gethostbyaddr($ip);

        if ($host === false) {
            return false;
        }

        foreach ($valid_domains as $domain) {
            if (str_ends_with($host, $domain)) {
                // Only the first matching domain is considered.
                return $reverse_only || $ip === gethostbyname($host);
            }
        }

        return false;
    }

    /**
     * Perform a whois search for an ASN.
     *
     * The result is requested through the file cache, with a randomised time-to-live.
     * Any error during the lookup gives an empty list.
     *
     * NOTE(review): the empty list from a failed lookup is returned from the cache callback
     * like any other result - confirm whether a transient failure is remembered for the full TTL.
     *
     * @param string $asn The autonomous system number to query
     *
     * @return array<RangeInterface>
     */
    private function fetchIpRangesForAsn(string $asn): array
    {
        return Registry::cache()->file()->remember('whois-asn-' . $asn, static function () use ($asn): array {
            // Prefer the IPv4 route, falling back to the IPv6 route when it is empty.
            $mapper = static fn (AsnRouteInfo $route_info): ?RangeInterface => IPFactory::parseRangeString($route_info->route ?: $route_info->route6);

            try {
                $loader = new CurlLoader(self::WHOIS_TIMEOUT);
                $whois  = new Whois($loader);
                $info   = $whois->loadAsnInfo($asn);
                $routes = $info->routes;
                $ranges = array_map($mapper, $routes);

                // Discard routes that could not be parsed.
                return array_filter($ranges);
            } catch (Throwable) {
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * Fetch a list of IP addresses from a remote file.
     *
     * Every IPv4 address found in the downloaded text is returned.
     * The result is requested through the file cache, with a randomised time-to-live.
     * An HTTP client error gives an empty list.
     *
     * @param string $ua  Name of the robot - used to build the cache key
     * @param string $url
     *
     * @return array<string>
     */
    private function fetchIpRangesForUrl(string $ua, string $url): array
    {
        return Registry::cache()->file()->remember('url-ip-list-' . $ua, static function () use ($url): array {
            try {
                $client   = new Client();
                $response = $client->get($url, ['timeout' => 5]);
                $contents = $response->getBody()->getContents();

                preg_match_all(self::REGEX_IPV4, $contents, $matches);

                return $matches[0];
            } catch (GuzzleException) {
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * The response sent to blocked clients.
     *
     * @return ResponseInterface
     */
    private function response(): ResponseInterface
    {
        return response('Not acceptable', StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
    }
}