<?php

/**
 * webtrees: online genealogy
 * Copyright (C) 2023 webtrees development team
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

declare(strict_types=1);

namespace Fisharebest\Webtrees\Http\Middleware;

use Fig\Http\Message\StatusCodeInterface;
use Fisharebest\Webtrees\Registry;
use Fisharebest\Webtrees\Validator;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\GuzzleException;
use Iodev\Whois\Loaders\CurlLoader;
use Iodev\Whois\Modules\Asn\AsnRouteInfo;
use Iodev\Whois\Whois;
use IPLib\Address\AddressInterface;
use IPLib\Factory as IPFactory;
use IPLib\Range\RangeInterface;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\ServerRequestInterface;
use Psr\Http\Server\MiddlewareInterface;
use Psr\Http\Server\RequestHandlerInterface;
use Throwable;

use function array_filter;
use function array_map;
use function assert;
use function gethostbyaddr;
use function gethostbyname;
use function preg_match_all;
use function random_int;
use function response;
use function str_contains;
use function str_ends_with;

/**
 * Middleware to block bad robots before they waste our valuable CPU cycles.
 *
 * Requests are rejected when the user-agent names a known bad robot, or when it
 * names a well-known crawler but the client IP address cannot be verified as
 * belonging to that crawler (by DNS, by IP list, or by autonomous system).
 */
class BadBotBlocker implements MiddlewareInterface
{
    // Patterns used to extract IPv4 addresses from a downloaded text file.
    private const REGEX_OCTET = '(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)';
    private const REGEX_IPV4  = '/\\b' . self::REGEX_OCTET . '(?:\\.' . self::REGEX_OCTET . '){3}\\b/';

    // Cache whois requests.  Try to avoid all caches expiring at the same time.
    private const WHOIS_TTL_MIN = 28 * 86400;
    private const WHOIS_TTL_MAX = 35 * 86400;
    private const WHOIS_TIMEOUT = 5;

    // Bad robots - SEO optimisers, advertisers, etc.  This list is shared with robots.txt.
    public const BAD_ROBOTS = [
        'admantx',
        'Adsbot',
        'AhrefsBot',
        'Amazonbot', // Until it understands crawl-delay and noindex / nofollow
        'AspiegelBot',
        'Awario', // Brand management
        'Barkrowler',
        'BLEXBot',
        'Bytespider',
        'DataForSEO',
        'DataForSeoBot', // https://dataforseo.com/dataforseo-bot
        'DotBot',
        'Grapeshot',
        'Honolulu-bot', // Aggressive crawler, no info available
        'ia_archiver',
        'linabot', // Aggressive crawler, no info available
        'Linguee',
        'MegaIndex.ru',
        'MJ12bot',
        'netEstate NE',
        'panscient',
        'PetalBot',
        'proximic',
        'SeekportBot', // Pretends to be a search engine - but isn't
        'SemrushBot',
        'serpstatbot',
        'SEOkicks',
        'SiteKiosk',
        'Turnitin',
        'wp_is_mobile', // Nothing to do with wordpress
        'XoviBot',
        'ZoominfoBot',
    ];

    /**
     * Some search engines use reverse/forward DNS to verify the IP address.
     *
     * Keys are substrings of the user-agent; values are the permitted host-name suffixes.
     *
     * @see https://developer.amazon.com/support/amazonbot
     * @see https://support.google.com/webmasters/answer/80553?hl=en
     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
     * @see https://www.mojeek.com/bot.html
     * @see https://support.apple.com/en-gb/HT204683
     */
    private const ROBOT_REV_FWD_DNS = [
        'Amazonbot'        => ['.crawl.amazon.com'],
        'Applebot'         => ['.applebot.apple.com'],
        'BingPreview'      => ['.search.msn.com'],
        'Google'           => ['.google.com', '.googlebot.com'],
        'Mail.RU_Bot'      => ['.mail.ru'],
        'MicrosoftPreview' => ['.search.msn.com'],
        'MojeekBot'        => ['.mojeek.com'],
        'Qwantify'         => ['.search.qwant.com'],
        'Sogou'            => ['.crawl.sogou.com'],
        'Yahoo'            => ['.crawl.yahoo.net'],
        'Yandex'           => ['.yandex.ru', '.yandex.net', '.yandex.com'],
        'bingbot'          => ['.search.msn.com'],
        'msnbot'           => ['.search.msn.com'],
    ];

    /**
     * Some search engines only use reverse DNS to verify the IP address.
     *
     * Keys are substrings of the user-agent; values are the permitted host-name suffixes.
     *
     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
     * @see https://napoveda.seznam.cz/en/full-text-search/seznambot-crawler
     * @see https://www.ionos.de/terms-gtc/faq-crawler
     */
    private const ROBOT_REV_ONLY_DNS = [
        'Baiduspider' => ['.baidu.com', '.baidu.jp'],
        'FreshBot'    => ['.seznam.cz'],
        'IonCrawl'    => ['.1und1.org'],
        'Neevabot'    => ['.neeva.com'],
        'SeznamBot'   => ['.seznam.cz'],
    ];

    /**
     * Some search engines operate from designated IP addresses.
     *
     * Keys are substrings of the user-agent; values are the permitted addresses/ranges.
     *
     * @see https://www.apple.com/go/applebot
     * @see https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
     */
    private const ROBOT_IPS = [
        'AppleBot'    => [
            '17.0.0.0/8',
        ],
        'Ask Jeeves'  => [
            '65.214.45.143',
            '65.214.45.148',
            '66.235.124.192',
            '66.235.124.7',
            '66.235.124.101',
            '66.235.124.193',
            '66.235.124.73',
            '66.235.124.196',
            '66.235.124.74',
            '63.123.238.8',
            '202.143.148.61',
        ],
        'DuckDuckBot' => [
            '23.21.227.69',
            '50.16.241.113',
            '50.16.241.114',
            '50.16.241.117',
            '50.16.247.234',
            '52.204.97.54',
            '52.5.190.19',
            '54.197.234.188',
            '54.208.100.253',
            '54.208.102.37',
            '107.21.1.8',
        ],
    ];

    /**
     * Some search engines publish their designated IP addresses in a remote file.
     *
     * Keys are substrings of the user-agent; values are the URL of the address list.
     *
     * @see https://bot.seekport.com/
     */
    private const ROBOT_IP_FILES = [
        'SeekportBot' => 'https://bot.seekport.com/seekportbot_ips.txt',
    ];

    /**
     * Some search engines operate from within a designated autonomous system.
     *
     * Keys are substrings of the user-agent; values are the permitted AS numbers.
     *
     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
     * @see https://www.facebook.com/peering/
     */
    private const ROBOT_ASNS = [
        'facebook' => ['AS32934', 'AS63293'],
        'twitter'  => ['AS13414'],
    ];

    /**
     * Reject requests from bad or unverifiable robots; pass everything else to the next handler.
     *
     * @param ServerRequestInterface  $request
     * @param RequestHandlerInterface $handler
     *
     * @return ResponseInterface
     */
    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
    {
        $ua      = Validator::serverParams($request)->string('HTTP_USER_AGENT', '');
        $ip      = Validator::attributes($request)->string('client-ip');
        $address = IPFactory::parseAddressString($ip);
        assert($address instanceof AddressInterface);

        // Known bad robots are rejected outright.
        foreach (self::BAD_ROBOTS as $robot) {
            if (str_contains($ua, $robot)) {
                return $this->response();
            }
        }

        // Robots that can be verified using a reverse lookup followed by a forward lookup.
        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, false)) {
                return $this->response();
            }
        }

        // Robots that can be verified using a reverse lookup only.
        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, true)) {
                return $this->response();
            }
        }

        // Robots that operate from a fixed list of addresses.
        foreach (self::ROBOT_IPS as $robot => $valid_ip_ranges) {
            if (str_contains($ua, $robot)) {
                foreach ($valid_ip_ranges as $ip_range) {
                    $range = IPFactory::parseRangeString($ip_range);

                    if ($range instanceof RangeInterface && $range->contains($address)) {
                        // Verified - move on to the next robot.
                        continue 2;
                    }
                }

                return $this->response();
            }
        }

        // Robots that publish their addresses in a remote file.
        foreach (self::ROBOT_IP_FILES as $robot => $url) {
            if (str_contains($ua, $robot)) {
                $valid_ip_ranges = $this->fetchIpRangesForUrl($robot, $url);

                foreach ($valid_ip_ranges as $ip_range) {
                    $range = IPFactory::parseRangeString($ip_range);

                    if ($range instanceof RangeInterface && $range->contains($address)) {
                        // Verified - move on to the next robot.
                        continue 2;
                    }
                }

                return $this->response();
            }
        }

        // Robots that operate from within one or more autonomous systems.
        foreach (self::ROBOT_ASNS as $robot => $asns) {
            if (str_contains($ua, $robot)) {
                foreach ($asns as $asn) {
                    foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                        if ($range->contains($address)) {
                            // An address inside ANY ONE of the robot's ASNs is sufficient.
                            // Three loops are nested here (robots / ASNs / ranges), so we
                            // need "continue 3" to move on to the next robot.
                            continue 3;
                        }
                    }
                }

                // The address is not in any of the ASNs for this robot.
                return $this->response();
            }
        }

        // Allow sites to block access from entire networks.
        $block_asn = Validator::attributes($request)->string('block_asn', '');
        preg_match_all('/(AS\d+)/', $block_asn, $matches);

        foreach ($matches[1] as $asn) {
            foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                if ($range->contains($address)) {
                    return $this->response();
                }
            }
        }

        return $handler->handle($request);
    }

    /**
     * Check that an IP address belongs to a robot operator using a forward/reverse DNS lookup.
     *
     * The reverse lookup must give a host name ending with one of the valid domains.
     * Unless $reverse_only is set, a forward lookup of that host name must also give
     * the original IP address.
     *
     * NOTE(review): gethostbyname() is documented as returning an IPv4 address, so the
     * forward check presumably cannot succeed for IPv6 clients - confirm.
     *
     * @param string        $ip
     * @param array<string> $valid_domains
     * @param bool          $reverse_only
     *
     * @return bool
     */
    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
    {
        $host = gethostbyaddr($ip);

        if ($host === false) {
            return false;
        }

        foreach ($valid_domains as $domain) {
            if (str_ends_with($host, $domain)) {
                return $reverse_only || $ip === gethostbyname($host);
            }
        }

        return false;
    }

    /**
     * Perform a whois search for an ASN.
     *
     * The result is stored in the file cache, under a key derived from the ASN,
     * with a random lifetime between WHOIS_TTL_MIN and WHOIS_TTL_MAX.
     *
     * NOTE(review): a failed lookup yields an empty list, which appears to be
     * cached in the same way as a successful one - confirm this is intended.
     *
     * @param string $asn The autonomous system number to query
     *
     * @return array<RangeInterface>
     */
    private function fetchIpRangesForAsn(string $asn): array
    {
        return Registry::cache()->file()->remember('whois-asn-' . $asn, static function () use ($asn): array {
            // Each route has either an IPv4 route or an IPv6 route.
            $mapper = static fn (AsnRouteInfo $route_info): ?RangeInterface => IPFactory::parseRangeString($route_info->route ?: $route_info->route6);

            try {
                $loader = new CurlLoader(self::WHOIS_TIMEOUT);
                $whois  = new Whois($loader);
                $info   = $whois->loadAsnInfo($asn);
                $routes = $info->routes;
                $ranges = array_map($mapper, $routes);

                // Remove any routes that could not be parsed.
                return array_filter($ranges);
            } catch (Throwable) {
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * Fetch a list of IP addresses from a remote file.
     *
     * Every IPv4 address found in the response body is returned.  The result is
     * stored in the file cache, with a random lifetime between WHOIS_TTL_MIN and
     * WHOIS_TTL_MAX.
     *
     * NOTE(review): a failed download yields an empty list, which appears to be
     * cached in the same way as a successful one - confirm this is intended.
     *
     * @param string $ua  The name of the robot; used to create the cache key
     * @param string $url The location of the file
     *
     * @return array<string>
     */
    private function fetchIpRangesForUrl(string $ua, string $url): array
    {
        return Registry::cache()->file()->remember('url-ip-list-' . $ua, static function () use ($url): array {
            try {
                $client   = new Client();
                $response = $client->get($url, ['timeout' => 5]);
                $contents = $response->getBody()->getContents();

                preg_match_all(self::REGEX_IPV4, $contents, $matches);

                return $matches[0];
            } catch (GuzzleException) {
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * The response sent to blocked robots.
     *
     * @return ResponseInterface
     */
    private function response(): ResponseInterface
    {
        return response('Not acceptable', StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
    }
}