<?php

/**
 * webtrees: online genealogy
 * Copyright (C) 2021 webtrees development team
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

declare(strict_types=1);

namespace Fisharebest\Webtrees\Http\Middleware;

use Fig\Http\Message\StatusCodeInterface;
use Fisharebest\Webtrees\Registry;
use Iodev\Whois\Loaders\CurlLoader;
use Iodev\Whois\Modules\Asn\AsnRouteInfo;
use Iodev\Whois\Whois;
use IPLib\Address\AddressInterface;
use IPLib\Factory as IPFactory;
use IPLib\Range\RangeInterface;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\ServerRequestInterface;
use Psr\Http\Server\MiddlewareInterface;
use Psr\Http\Server\RequestHandlerInterface;
use Throwable;

use function array_filter;
use function array_map;
use function assert;
use function gethostbyaddr;
use function gethostbyname;
use function preg_match_all;
use function random_int;
use function response;
use function str_contains;
use function str_ends_with;

/**
 * Middleware to block bad robots before they waste our valuable CPU cycles.
 *
 * Requests are rejected when the user-agent names a known bad robot, or when
 * it claims to be a well-known crawler but the client IP address cannot be
 * verified as belonging to that crawler's operator.
 */
class BadBotBlocker implements MiddlewareInterface
{
    // Cache whois requests.  Try to avoid all caches expiring at the same time.
    private const WHOIS_TTL_MIN = 28 * 86400;
    private const WHOIS_TTL_MAX = 35 * 86400;
    private const WHOIS_TIMEOUT = 5;

    // Bad robots - SEO optimisers, advertisers, etc.
    // Matched as case-sensitive substrings of the user-agent.
    private const BAD_ROBOTS = [
        'admantx',
        'Adsbot',
        'AhrefsBot',
        'AspiegelBot',
        'Barkrowler',
        'DotBot',
        'Grapeshot',
        'ia_archiver',
        'MJ12bot',
        'panscient',
        'PetalBot',
        'proximic',
        'SemrushBot',
        'Turnitin',
        'XoviBot',
    ];

    /**
     * Some search engines use reverse/forward DNS to verify the IP address.
     *
     * Each domain is matched as a suffix of the reverse-DNS host name, so it
     * must start with a dot.  Otherwise "mail.ru" would also match "evilmail.ru".
     *
     * @see https://support.google.com/webmasters/answer/80553?hl=en
     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
     */
    private const ROBOT_REV_FWD_DNS = [
        'bingbot'     => ['.search.msn.com'],
        'BingPreview' => ['.search.msn.com'],
        'Google'      => ['.google.com', '.googlebot.com'],
        'Mail.ru'     => ['.mail.ru'],
        'msnbot'      => ['.search.msn.com'],
        'Qwantify'    => ['.search.qwant.com'],
        'Sogou'       => ['.crawl.sogou.com'],
        'Yahoo'       => ['.crawl.yahoo.net'],
        'Yandex'      => ['.yandex.ru', '.yandex.net', '.yandex.com'],
    ];

    /**
     * Some search engines only use reverse DNS to verify the IP address.
     *
     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
     */
    private const ROBOT_REV_ONLY_DNS = [
        'Baiduspider' => ['.baidu.com', '.baidu.jp'],
    ];

    /**
     * Some search engines operate from designated IP addresses.
     *
     * Each entry is a single address or a CIDR range.
     *
     * @see http://www.apple.com/go/applebot
     * @see https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
     */
    private const ROBOT_IPS = [
        'AppleBot'    => [
            '17.0.0.0/8',
        ],
        'Ask Jeeves'  => [
            '65.214.45.143',
            '65.214.45.148',
            '66.235.124.192',
            '66.235.124.7',
            '66.235.124.101',
            '66.235.124.193',
            '66.235.124.73',
            '66.235.124.196',
            '66.235.124.74',
            '63.123.238.8',
            '202.143.148.61',
        ],
        'DuckDuckBot' => [
            '23.21.227.69',
            '50.16.241.113',
            '50.16.241.114',
            '50.16.241.117',
            '50.16.247.234',
            '52.204.97.54',
            '52.5.190.19',
            '54.197.234.188',
            '54.208.100.253',
            '54.208.102.37',
            '107.21.1.8',
        ],
    ];

    /**
     * Some search engines operate from within a designated autonomous system.
     *
     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
     */
    private const ROBOT_ASN = [
        'facebook' => 'AS32934',
        'twitter'  => 'AS13414',
    ];

    /**
     * Reject the request if it comes from a bad or unverifiable robot,
     * otherwise pass it on to the next handler.
     *
     * The checks run in this order, and the first failure produces the
     * "Not acceptable" response:
     *  1. the user-agent contains a name from BAD_ROBOTS;
     *  2. the user-agent names a crawler verified by reverse + forward DNS;
     *  3. the user-agent names a crawler verified by reverse DNS only;
     *  4. the user-agent names a crawler with a fixed list of IP addresses;
     *  5. the user-agent names a crawler that operates from a known ASN;
     *  6. the client address belongs to an ASN listed in the "block_asn"
     *     request attribute.
     *
     * @param ServerRequestInterface  $request
     * @param RequestHandlerInterface $handler
     *
     * @return ResponseInterface
     */
    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
    {
        $ua      = $request->getServerParams()['HTTP_USER_AGENT'] ?? '';
        $ip      = $request->getAttribute('client-ip');
        $address = IPFactory::addressFromString($ip);
        assert($address instanceof AddressInterface);

        foreach (self::BAD_ROBOTS as $robot) {
            if (str_contains($ua, $robot)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, false)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, true)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_IPS as $robot => $valid_ips) {
            if (str_contains($ua, $robot)) {
                // Use a dedicated loop variable, so that we do not overwrite the client's $ip.
                foreach ($valid_ips as $valid_ip) {
                    $range = IPFactory::rangeFromString($valid_ip);

                    if ($range instanceof RangeInterface && $range->contains($address)) {
                        // Genuine robot.  Move on to the next robot name.
                        continue 2;
                    }
                }

                return $this->response();
            }
        }

        foreach (self::ROBOT_ASN as $robot => $asn) {
            if (str_contains($ua, $robot)) {
                foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                    if ($range->contains($address)) {
                        // Genuine robot.  Move on to the next robot name.
                        continue 2;
                    }
                }

                return $this->response();
            }
        }

        // Allow sites to block access from entire networks.
        preg_match_all('/(AS\d+)/', $request->getAttribute('block_asn', ''), $matches);

        foreach ($matches[1] as $asn) {
            foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                if ($range->contains($address)) {
                    return $this->response();
                }
            }
        }

        return $handler->handle($request);
    }

    /**
     * Check that an IP address belongs to a robot operator using a forward/reverse DNS lookup.
     *
     * The reverse lookup must give a host name ending with one of the valid
     * domains.  Unless $reverse_only is set, the host name must also resolve
     * back to the same IP address.
     *
     * Note: gethostbyname() only resolves IPv4 addresses, so an IPv6 client
     * cannot pass the forward lookup.
     *
     * @param string        $ip
     * @param array<string> $valid_domains
     * @param bool          $reverse_only
     *
     * @return bool
     */
    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
    {
        $host = gethostbyaddr($ip);

        if ($host === false) {
            return false;
        }

        foreach ($valid_domains as $domain) {
            if (str_ends_with($host, $domain)) {
                return $reverse_only || $ip === gethostbyname($host);
            }
        }

        return false;
    }

    /**
     * Perform a whois search for an ASN.
     *
     * The lookup is wrapped in the file cache, with a random lifetime between
     * WHOIS_TTL_MIN and WHOIS_TTL_MAX.
     *
     * NOTE(review): a failed lookup returns an empty list from inside the
     * cached callback, so the failure is presumably cached for the full
     * lifetime as well - confirm against the cache implementation.
     *
     * @param string $asn - The autonomous system number to query
     *
     * @return array<RangeInterface>
     */
    private function fetchIpRangesForAsn(string $asn): array
    {
        return Registry::cache()->file()->remember('whois-asn-' . $asn, static function () use ($asn): array {
            try {
                $loader = new CurlLoader(self::WHOIS_TIMEOUT);
                $whois  = new Whois($loader);
                $info   = $whois->loadAsnInfo($asn);
                $routes = $info->getRoutes();
                $ranges = array_map(static function (AsnRouteInfo $route_info): ?RangeInterface {
                    // Use the IPv4 route if there is one, otherwise the IPv6 route.
                    return IPFactory::rangeFromString($route_info->getRoute() ?: $route_info->getRoute6());
                }, $routes);

                // Discard any routes that could not be parsed.
                return array_filter($ranges);
            } catch (Throwable $ex) {
                // Best effort: a failed lookup gives no ranges.
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * The response sent to rejected robots.
     *
     * @return ResponseInterface
     */
    private function response(): ResponseInterface
    {
        return response('Not acceptable', StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
    }
}