<?php

/**
 * webtrees: online genealogy
 * Copyright (C) 2019 webtrees development team
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

declare(strict_types=1);

namespace Fisharebest\Webtrees\Http\Middleware;

use Fig\Http\Message\StatusCodeInterface;
use Fisharebest\Webtrees\Cache;
use Illuminate\Support\Str;
use Iodev\Whois\Loaders\CurlLoader;
use Iodev\Whois\Modules\Asn\AsnRouteInfo;
use Iodev\Whois\Whois;
use IPLib\Address\AddressInterface;
use IPLib\Factory;
use IPLib\Range\RangeInterface;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\ServerRequestInterface;
use Psr\Http\Server\MiddlewareInterface;
use Psr\Http\Server\RequestHandlerInterface;
use Throwable;

use function app;
use function array_filter;
use function array_map;
use function assert;
use function dns_get_record;
use function filter_var;
use function gethostbyaddr;
use function gethostbyname;
use function in_array;
use function inet_pton;
use function is_array;
use function preg_match_all;
use function random_int;
use function response;

/**
 * Middleware to block bad robots before they waste our valuable CPU cycles.
 *
 * A request is rejected when its user-agent names a known bad robot, when it
 * claims to be a known search engine but its IP address cannot be verified
 * (by DNS, by a fixed list of addresses, or by autonomous system), or when its
 * IP address belongs to a network listed in the "block_asn" request attribute.
 */
class BadBotBlocker implements MiddlewareInterface
{
    // Cache whois requests.  Try to avoid all caches expiring at the same time.
    private const WHOIS_TTL_MIN = 28 * 86400;
    private const WHOIS_TTL_MAX = 35 * 86400;
    private const WHOIS_TIMEOUT = 5;

    // Bad robots - SEO optimisers, advertisers, etc
    private const BAD_ROBOTS = [
        'admantx',
        'AhrefsBot',
        'AspiegelBot',
        'DotBot',
        'Grapeshot',
        'ia_archiver',
        'MJ12bot',
        'panscient',
        'proximic',
        'SemrushBot',
        'XoviBot',
    ];

    /**
     * Some search engines use reverse/forward DNS to verify the IP address.
     *
     * @see https://support.google.com/webmasters/answer/80553?hl=en
     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
     */
    private const ROBOT_REV_FWD_DNS = [
        'bingbot'     => ['.search.msn.com'],
        'BingPreview' => ['.search.msn.com'],
        'Google'      => ['.google.com', '.googlebot.com'],
        'msnbot'      => ['.search.msn.com'],
        'Qwantify'    => ['.search.qwant.com'],
        'Sogou'       => ['.crawl.sogou.com'],
        'Yahoo'       => ['.crawl.yahoo.net'],
        'Yandex'      => ['.yandex.ru', '.yandex.net', '.yandex.com'],
    ];

    /**
     * Some search engines only use reverse DNS to verify the IP address.
     *
     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
     */
    private const ROBOT_REV_ONLY_DNS = [
        'Baiduspider' => ['.baidu.com', '.baidu.jp'],
    ];

    /**
     * Some search engines operate from designated IP addresses.
     *
     * @see http://www.apple.com/go/applebot
     * @see https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
     */
    private const ROBOT_IPS = [
        'AppleBot'    => [
            '17.0.0.0/8',
        ],
        'Ask Jeeves'  => [
            '65.214.45.143',
            '65.214.45.148',
            '66.235.124.192',
            '66.235.124.7',
            '66.235.124.101',
            '66.235.124.193',
            '66.235.124.73',
            '66.235.124.196',
            '66.235.124.74',
            '63.123.238.8',
            '202.143.148.61',
        ],
        'DuckDuckBot' => [
            '23.21.227.69',
            '50.16.241.113',
            '50.16.241.114',
            '50.16.241.117',
            '50.16.247.234',
            '52.204.97.54',
            '52.5.190.19',
            '54.197.234.188',
            '54.208.100.253',
            '54.208.102.37',
            '107.21.1.8',
        ],
    ];

    /**
     * Some search engines operate from within a designated autonomous system.
     *
     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
     */
    private const ROBOT_ASN = [
        'facebook' => 'AS32934',
        'twitter'  => 'AS13414',
    ];

    /**
     * Reject requests from bad robots, and from clients that claim to be a
     * well-known robot but do not come from that robot's network.
     *
     * User-agent matching is a case-sensitive substring match.
     *
     * @param ServerRequestInterface  $request
     * @param RequestHandlerInterface $handler
     *
     * @return ResponseInterface
     */
    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
    {
        $ua      = $request->getServerParams()['HTTP_USER_AGENT'] ?? '';
        $ip      = $request->getAttribute('client-ip');
        $address = Factory::addressFromString($ip);
        assert($address instanceof AddressInterface);

        if (Str::contains($ua, self::BAD_ROBOTS)) {
            return $this->response();
        }

        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
            if (Str::contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, false)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
            if (Str::contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, true)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_IPS as $robot => $valid_ips) {
            if (Str::contains($ua, $robot)) {
                // Use a dedicated loop variable, so that the client's address in $ip is not overwritten.
                foreach ($valid_ips as $valid_ip) {
                    $range = Factory::rangeFromString($valid_ip);

                    if ($range instanceof RangeInterface && $range->contains($address)) {
                        // Genuine robot - move on to the next robot in the list.
                        continue 2;
                    }
                }

                return $this->response();
            }
        }

        foreach (self::ROBOT_ASN as $robot => $asn) {
            if (Str::contains($ua, $robot)) {
                foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                    if ($range->contains($address)) {
                        // Genuine robot - move on to the next robot in the list.
                        continue 2;
                    }
                }

                return $this->response();
            }
        }

        // Allow sites to block access from entire networks.
        preg_match_all('/(AS\d+)/', $request->getAttribute('block_asn', ''), $matches);

        foreach ($matches[1] as $asn) {
            foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                if ($range->contains($address)) {
                    return $this->response();
                }
            }
        }

        return $handler->handle($request);
    }

    /**
     * Check that an IP address belongs to a robot operator using a forward/reverse DNS lookup.
     *
     * The reverse lookup must give a host name ending in one of the valid domains.
     * Unless $reverse_only is set, that host name must also resolve back to the
     * same IP address.
     *
     * @param string        $ip
     * @param array<string> $valid_domains
     * @param bool          $reverse_only
     *
     * @return bool
     */
    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
    {
        $host = gethostbyaddr($ip);

        if ($host === false || !Str::endsWith($host, $valid_domains)) {
            return false;
        }

        return $reverse_only || $this->hostResolvesToIp($host, $ip);
    }

    /**
     * Forward DNS check: does a host name resolve to the given IP address?
     *
     * gethostbyname() can only return IPv4 addresses, so IPv6 clients need
     * a separate AAAA lookup.
     *
     * @param string $host
     * @param string $ip
     *
     * @return bool
     */
    private function hostResolvesToIp(string $host, string $ip): bool
    {
        if (filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV6) === false) {
            // Not IPv6 - keep the simple IPv4 lookup.
            return $ip === gethostbyname($host);
        }

        try {
            $records = dns_get_record($host, DNS_AAAA);
        } catch (Throwable $ex) {
            // An error handler may convert lookup warnings to exceptions.  Treat as "not verified".
            $records = false;
        }

        if (!is_array($records)) {
            return false;
        }

        // Compare binary forms, as the same IPv6 address has many textual representations.
        $packed_ip = inet_pton($ip);

        foreach ($records as $record) {
            if (isset($record['ipv6']) && inet_pton($record['ipv6']) === $packed_ip) {
                return true;
            }
        }

        return false;
    }

    /**
     * Perform a whois search for an ASN.
     *
     * Results are stored in the file cache for a random period between
     * WHOIS_TTL_MIN and WHOIS_TTL_MAX.  Any error during the lookup gives an
     * empty list of ranges.
     *
     * NOTE(review): presumably the empty list from a failed lookup is cached for
     * the same period as a successful one - confirm against Cache::remember().
     *
     * @param string $asn - The autonomous system number to query
     *
     * @return array<RangeInterface>
     */
    private function fetchIpRangesForAsn(string $asn): array
    {
        $cache = app('cache.files');
        assert($cache instanceof Cache);

        return $cache->remember('whois-asn-' . $asn, static function () use ($asn): array {
            try {
                $loader = new CurlLoader(self::WHOIS_TIMEOUT);
                $whois  = new Whois($loader);
                $info   = $whois->loadAsnInfo($asn);
                $routes = $info->getRoutes();
                $ranges = array_map(static function (AsnRouteInfo $route_info): ?RangeInterface {
                    // Use the IPv4 route if there is one, otherwise the IPv6 route.
                    return Factory::rangeFromString($route_info->getRoute() ?: $route_info->getRoute6());
                }, $routes);

                // Discard any routes that could not be parsed.
                return array_filter($ranges);
            } catch (Throwable $ex) {
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * The response sent to blocked robots.
     *
     * @return ResponseInterface
     */
    private function response(): ResponseInterface
    {
        return response('Not acceptable', StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
    }
}