<?php

/**
 * webtrees: online genealogy
 * Copyright (C) 2021 webtrees development team
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

declare(strict_types=1);

namespace Fisharebest\Webtrees\Http\Middleware;

use Fig\Http\Message\StatusCodeInterface;
use Fisharebest\Webtrees\Registry;
use Fisharebest\Webtrees\Validator;
use Iodev\Whois\Loaders\CurlLoader;
use Iodev\Whois\Modules\Asn\AsnRouteInfo;
use Iodev\Whois\Whois;
use IPLib\Address\AddressInterface;
use IPLib\Factory as IPFactory;
use IPLib\Range\RangeInterface;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\ServerRequestInterface;
use Psr\Http\Server\MiddlewareInterface;
use Psr\Http\Server\RequestHandlerInterface;
use Throwable;

use function array_filter;
use function array_map;
use function assert;
use function gethostbyaddr;
use function gethostbyname;
use function preg_match_all;
use function random_int;
use function response;
use function str_contains;
use function str_ends_with;

/**
 * Middleware to block bad robots before they waste our valuable CPU cycles.
 *
 * A request is rejected when its user-agent names a known bad robot, or when
 * it names a well-known robot but the client address cannot be verified as
 * belonging to that robot's operator (by DNS, by IP address or by ASN).
 */
class BadBotBlocker implements MiddlewareInterface
{
    // Cache whois requests.  Try to avoid all caches expiring at the same time.
    // A random TTL between these two bounds is chosen for each lookup
    // (28 to 35 days, assuming the cache TTL is expressed in seconds).
    private const WHOIS_TTL_MIN = 28 * 86400;
    private const WHOIS_TTL_MAX = 35 * 86400;

    // Passed to the whois loader as its timeout (presumably seconds).
    private const WHOIS_TIMEOUT = 5;

    // Bad robots - SEO optimisers, advertisers, etc.  This list is shared with robots.txt.
    public const BAD_ROBOTS = [
        'admantx',
        'Adsbot',
        'AhrefsBot',
        'AspiegelBot',
        'Barkrowler',
        'BLEXBot',
        'DotBot',
        'Grapeshot',
        'ia_archiver',
        'Linguee',
        'MJ12bot',
        'panscient',
        'PetalBot',
        'proximic',
        'SemrushBot',
        'Turnitin',
        'XoviBot',
        'ZoominfoBot',
    ];

    /**
     * Some search engines use reverse/forward DNS to verify the IP address.
     *
     * Keys are substrings of the user-agent; values are the host-name suffixes
     * that the reverse DNS lookup must end with.  Every suffix starts with a
     * dot so that only sub-domains match ("notmail.ru" must not pass for
     * "mail.ru").
     *
     * @see https://support.google.com/webmasters/answer/80553?hl=en
     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
     */
    private const ROBOT_REV_FWD_DNS = [
        'bingbot'     => ['.search.msn.com'],
        'BingPreview' => ['.search.msn.com'],
        'Google'      => ['.google.com', '.googlebot.com'],
        'Mail.RU_Bot' => ['.mail.ru'],
        'msnbot'      => ['.search.msn.com'],
        'Qwantify'    => ['.search.qwant.com'],
        'Sogou'       => ['.crawl.sogou.com'],
        'Yahoo'       => ['.crawl.yahoo.net'],
        'Yandex'      => ['.yandex.ru', '.yandex.net', '.yandex.com'],
    ];

    /**
     * Some search engines only use reverse DNS to verify the IP address.
     *
     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
     * @see https://napoveda.seznam.cz/en/full-text-search/seznambot-crawler
     */
    private const ROBOT_REV_ONLY_DNS = [
        'Baiduspider' => ['.baidu.com', '.baidu.jp'],
        'FreshBot'    => ['.seznam.cz'],
        'Seznam'      => ['.seznam.cz'],
    ];

    /**
     * Some search engines operate from designated IP addresses.
     *
     * Each entry is either a single address or a range in CIDR notation.
     *
     * @see https://www.apple.com/go/applebot
     * @see https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
     */
    private const ROBOT_IPS = [
        'AppleBot'    => [
            '17.0.0.0/8',
        ],
        'Ask Jeeves'  => [
            '65.214.45.143',
            '65.214.45.148',
            '66.235.124.192',
            '66.235.124.7',
            '66.235.124.101',
            '66.235.124.193',
            '66.235.124.73',
            '66.235.124.196',
            '66.235.124.74',
            '63.123.238.8',
            '202.143.148.61',
        ],
        'DuckDuckBot' => [
            '23.21.227.69',
            '50.16.241.113',
            '50.16.241.114',
            '50.16.241.117',
            '50.16.247.234',
            '52.204.97.54',
            '52.5.190.19',
            '54.197.234.188',
            '54.208.100.253',
            '54.208.102.37',
            '107.21.1.8',
        ],
    ];

    /**
     * Some search engines operate from within a designated autonomous system.
     *
     * A robot may use several autonomous systems; an address in any one of
     * them is accepted.
     *
     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
     * @see https://www.facebook.com/peering/
     */
    private const ROBOT_ASNS = [
        'facebook' => ['AS32934', 'AS63293'],
        'twitter'  => ['AS13414'],
    ];

    /**
     * Reject requests from bad robots and from clients that impersonate good ones.
     *
     * The checks run in this order, and the first failure produces the
     * "Not acceptable" response without calling the next handler:
     *  1. user-agent contains a name from BAD_ROBOTS;
     *  2. user-agent names a robot verified by reverse + forward DNS;
     *  3. user-agent names a robot verified by reverse DNS only;
     *  4. user-agent names a robot that uses a fixed list of IP addresses;
     *  5. user-agent names a robot that operates from known ASNs;
     *  6. the client address lies in an ASN listed in the "block_asn" request attribute.
     *
     * All user-agent matching is a case-sensitive substring test.
     *
     * @param ServerRequestInterface  $request
     * @param RequestHandlerInterface $handler
     *
     * @return ResponseInterface
     */
    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
    {
        $ua      = Validator::serverParams($request)->string('HTTP_USER_AGENT', '');
        $ip      = Validator::attributes($request)->string('client-ip');
        $address = IPFactory::parseAddressString($ip);
        assert($address instanceof AddressInterface);

        foreach (self::BAD_ROBOTS as $robot) {
            if (str_contains($ua, $robot)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, false)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, true)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_IPS as $robot => $valid_ips) {
            if (!str_contains($ua, $robot)) {
                continue;
            }

            // Use a dedicated loop variable: $ip holds the client address and must not be overwritten.
            foreach ($valid_ips as $valid_ip) {
                $range = IPFactory::parseRangeString($valid_ip);

                if ($range instanceof RangeInterface && $range->contains($address)) {
                    // Genuine robot - go on to the next robot.
                    continue 2;
                }
            }

            return $this->response();
        }

        foreach (self::ROBOT_ASNS as $robot => $asns) {
            if (!str_contains($ua, $robot)) {
                continue;
            }

            foreach ($asns as $asn) {
                foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                    if ($range->contains($address)) {
                        // The address belongs to one of the robot's ASNs, which is enough.
                        // Leave the range loop and the ASN loop, and go on to the next robot.
                        continue 3;
                    }
                }
            }

            // The address is not in any of the ASNs used by this robot.
            return $this->response();
        }

        // Allow sites to block access from entire networks.
        $block_asn = Validator::attributes($request)->string('block_asn', '');
        preg_match_all('/(AS\d+)/', $block_asn, $matches);

        foreach ($matches[1] as $asn) {
            foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                if ($range->contains($address)) {
                    return $this->response();
                }
            }
        }

        return $handler->handle($request);
    }

    /**
     * Check that an IP address belongs to a robot operator using a forward/reverse DNS lookup.
     *
     * The reverse lookup must give a host name ending with one of the valid
     * domains.  Unless $reverse_only is set, the forward lookup of that host
     * name must also give back the original IP address.
     *
     * @param string        $ip
     * @param array<string> $valid_domains Host-name suffixes, e.g. ".googlebot.com"
     * @param bool          $reverse_only  Skip the forward lookup
     *
     * @return bool
     */
    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
    {
        // gethostbyaddr() returns false for malformed input, and the unmodified
        // address when the lookup fails.  The latter cannot end with a domain
        // name, so it is rejected by the loop below.
        $host = gethostbyaddr($ip);

        if ($host === false) {
            return false;
        }

        foreach ($valid_domains as $domain) {
            if (str_ends_with($host, $domain)) {
                return $reverse_only || $ip === gethostbyname($host);
            }
        }

        return false;
    }

    /**
     * Perform a whois search for an ASN.
     *
     * The result is stored in the file cache under the key "whois-asn-<ASN>",
     * with a random lifetime between WHOIS_TTL_MIN and WHOIS_TTL_MAX.
     *
     * NOTE(review): any error during the lookup gives an empty list, and that
     * empty list appears to be cached for the full lifetime - confirm against
     * the cache implementation.
     *
     * @param string $asn - The autonomous system number to query
     *
     * @return array<RangeInterface>
     */
    private function fetchIpRangesForAsn(string $asn): array
    {
        return Registry::cache()->file()->remember('whois-asn-' . $asn, static function () use ($asn): array {
            // Prefer the "route" field of each record and fall back to "route6" when it is empty.
            $mapper = static fn (AsnRouteInfo $route_info): ?RangeInterface => IPFactory::parseRangeString($route_info->route ?: $route_info->route6);

            try {
                $loader = new CurlLoader(self::WHOIS_TIMEOUT);
                $whois  = new Whois($loader);
                $info   = $whois->loadAsnInfo($asn);
                $routes = $info->routes;
                $ranges = array_map($mapper, $routes);

                // Discard routes that could not be parsed (null).
                return array_filter($ranges);
            } catch (Throwable $ex) {
                // Best effort: a failed lookup is treated as "no known ranges".
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * The response sent to every blocked request.
     *
     * @return ResponseInterface
     */
    private function response(): ResponseInterface
    {
        return response('Not acceptable', StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
    }
}