<?php

/**
 * webtrees: online genealogy
 * Copyright (C) 2021 webtrees development team
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

declare(strict_types=1);

namespace Fisharebest\Webtrees\Http\Middleware;

use Fig\Http\Message\StatusCodeInterface;
use Fisharebest\Webtrees\Registry;
use Iodev\Whois\Loaders\CurlLoader;
use Iodev\Whois\Modules\Asn\AsnRouteInfo;
use Iodev\Whois\Whois;
use IPLib\Address\AddressInterface;
use IPLib\Factory as IPFactory;
use IPLib\Range\RangeInterface;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\ServerRequestInterface;
use Psr\Http\Server\MiddlewareInterface;
use Psr\Http\Server\RequestHandlerInterface;
use Throwable;

use function array_filter;
use function array_map;
use function assert;
use function dns_get_record;
use function filter_var;
use function gethostbyaddr;
use function gethostbyname;
use function inet_pton;
use function preg_match_all;
use function random_int;
use function response;
use function str_contains;
use function str_ends_with;

use const DNS_AAAA;
use const FILTER_FLAG_IPV6;
use const FILTER_VALIDATE_IP;

/**
 * Middleware to block bad robots before they waste our valuable CPU cycles.
 *
 * A request is rejected when its user-agent names a known bad robot, when it
 * claims to be a well-known crawler but its IP address cannot be verified
 * (by DNS, by a list of IP addresses, or by autonomous system), or when its
 * IP address belongs to an autonomous system listed in the "block_asn"
 * request attribute.
 */
class BadBotBlocker implements MiddlewareInterface
{
    // Cache whois requests.  Try to avoid all caches expiring at the same time.
    private const WHOIS_TTL_MIN = 28 * 86400;
    private const WHOIS_TTL_MAX = 35 * 86400;
    private const WHOIS_TIMEOUT = 5;

    // Bad robots - SEO optimisers, advertisers, etc.
    // Matched as case-sensitive substrings of the user-agent.
    private const BAD_ROBOTS = [
        'admantx',
        'Adsbot',
        'AhrefsBot',
        'AspiegelBot',
        'DotBot',
        'Grapeshot',
        'ia_archiver',
        'MJ12bot',
        'panscient',
        'PetalBot',
        'proximic',
        'SemrushBot',
        'Turnitin',
        'XoviBot',
    ];

    /**
     * Some search engines use reverse/forward DNS to verify the IP address.
     *
     * Keys are user-agent substrings, values are the permitted hostname suffixes.
     *
     * @see https://support.google.com/webmasters/answer/80553?hl=en
     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
     */
    private const ROBOT_REV_FWD_DNS = [
        'bingbot'     => ['.search.msn.com'],
        'BingPreview' => ['.search.msn.com'],
        'Google'      => ['.google.com', '.googlebot.com'],
        'msnbot'      => ['.search.msn.com'],
        'Qwantify'    => ['.search.qwant.com'],
        'Sogou'       => ['.crawl.sogou.com'],
        'Yahoo'       => ['.crawl.yahoo.net'],
        'Yandex'      => ['.yandex.ru', '.yandex.net', '.yandex.com'],
    ];

    /**
     * Some search engines only use reverse DNS to verify the IP address.
     *
     * Keys are user-agent substrings, values are the permitted hostname suffixes.
     *
     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
     */
    private const ROBOT_REV_ONLY_DNS = [
        'Baiduspider' => ['.baidu.com', '.baidu.jp'],
    ];

    /**
     * Some search engines operate from designated IP addresses.
     *
     * Keys are user-agent substrings, values are IP addresses or CIDR ranges.
     *
     * @see http://www.apple.com/go/applebot
     * @see https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
     */
    private const ROBOT_IPS = [
        'AppleBot'    => [
            '17.0.0.0/8',
        ],
        'Ask Jeeves'  => [
            '65.214.45.143',
            '65.214.45.148',
            '66.235.124.192',
            '66.235.124.7',
            '66.235.124.101',
            '66.235.124.193',
            '66.235.124.73',
            '66.235.124.196',
            '66.235.124.74',
            '63.123.238.8',
            '202.143.148.61',
        ],
        'DuckDuckBot' => [
            '23.21.227.69',
            '50.16.241.113',
            '50.16.241.114',
            '50.16.241.117',
            '50.16.247.234',
            '52.204.97.54',
            '52.5.190.19',
            '54.197.234.188',
            '54.208.100.253',
            '54.208.102.37',
            '107.21.1.8',
        ],
    ];

    /**
     * Some search engines operate from within a designated autonomous system.
     *
     * Keys are user-agent substrings, values are autonomous system numbers.
     *
     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
     */
    private const ROBOT_ASN = [
        'facebook' => 'AS32934',
        'twitter'  => 'AS13414',
    ];

    /**
     * Reject requests from bad or unverifiable robots; pass everything else to the next handler.
     *
     * @param ServerRequestInterface  $request
     * @param RequestHandlerInterface $handler
     *
     * @return ResponseInterface
     */
    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
    {
        $ua      = $request->getServerParams()['HTTP_USER_AGENT'] ?? '';
        $ip      = $request->getAttribute('client-ip');
        $address = IPFactory::addressFromString($ip);
        assert($address instanceof AddressInterface);

        // Known bad robots are rejected outright.
        foreach (self::BAD_ROBOTS as $robot) {
            if (str_contains($ua, $robot)) {
                return $this->response();
            }
        }

        // Robots that can be verified by a reverse DNS lookup, confirmed by a forward lookup.
        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, false)) {
                return $this->response();
            }
        }

        // Robots that can only be verified by a reverse DNS lookup.
        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, true)) {
                return $this->response();
            }
        }

        // Robots that operate from a published list of IP addresses.
        foreach (self::ROBOT_IPS as $robot => $valid_ips) {
            if (str_contains($ua, $robot) && !$this->addressInRanges($address, $valid_ips)) {
                return $this->response();
            }
        }

        // Robots that operate from within a known autonomous system.
        foreach (self::ROBOT_ASN as $robot => $asn) {
            if (str_contains($ua, $robot) && !$this->addressInAsn($address, $asn)) {
                return $this->response();
            }
        }

        // Allow sites to block access from entire networks.
        preg_match_all('/(AS\d+)/', $request->getAttribute('block_asn', ''), $matches);

        foreach ($matches[1] as $asn) {
            if ($this->addressInAsn($address, $asn)) {
                return $this->response();
            }
        }

        return $handler->handle($request);
    }

    /**
     * Is an address covered by any of a list of IP addresses / CIDR ranges?
     *
     * @param AddressInterface $address
     * @param array<string>    $valid_ips
     *
     * @return bool
     */
    private function addressInRanges(AddressInterface $address, array $valid_ips): bool
    {
        foreach ($valid_ips as $valid_ip) {
            $range = IPFactory::rangeFromString($valid_ip);

            // Entries that cannot be parsed as a range are skipped.
            if ($range instanceof RangeInterface && $range->contains($address)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Is an address covered by any of the routes found for an autonomous system?
     *
     * @param AddressInterface $address
     * @param string           $asn
     *
     * @return bool
     */
    private function addressInAsn(AddressInterface $address, string $asn): bool
    {
        foreach ($this->fetchIpRangesForAsn($asn) as $range) {
            if ($range->contains($address)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Check that an IP address belongs to a robot operator using a forward/reverse DNS lookup.
     *
     * @param string        $ip
     * @param array<string> $valid_domains
     * @param bool          $reverse_only
     *
     * @return bool
     */
    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
    {
        $host = gethostbyaddr($ip);

        if ($host === false) {
            return false;
        }

        foreach ($valid_domains as $domain) {
            if (str_ends_with($host, $domain)) {
                return $reverse_only || $this->hostResolvesTo($host, $ip);
            }
        }

        return false;
    }

    /**
     * Forward DNS check: does a hostname resolve to the given IP address?
     *
     * @param string $host
     * @param string $ip
     *
     * @return bool
     */
    private function hostResolvesTo(string $host, string $ip): bool
    {
        // IPv4 (and anything that is not a valid IPv6 address) uses the original comparison.
        if (filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV6) === false) {
            return $ip === gethostbyname($host);
        }

        // gethostbyname() only returns IPv4 addresses, so IPv6 clients need an AAAA lookup.
        try {
            $records = dns_get_record($host, DNS_AAAA);

            if ($records === false) {
                return false;
            }

            // Compare packed addresses, as the same IPv6 address has many textual forms.
            $packed_ip = inet_pton($ip);

            foreach ($records as $record) {
                if (isset($record['ipv6']) && inet_pton($record['ipv6']) === $packed_ip) {
                    return true;
                }
            }
        } catch (Throwable $ex) {
            // A failed lookup may raise a warning, which an error handler could turn into
            // an exception.  Either way, the address is unverified.
            return false;
        }

        return false;
    }

    /**
     * Perform a whois search for an ASN.
     *
     * @param string $asn - The autonomous system number to query
     *
     * @return array<RangeInterface>
     */
    private function fetchIpRangesForAsn(string $asn): array
    {
        return Registry::cache()->file()->remember('whois-asn-' . $asn, static function () use ($asn): array {
            try {
                $loader = new CurlLoader(self::WHOIS_TIMEOUT);
                $whois  = new Whois($loader);
                $info   = $whois->loadAsnInfo($asn);
                $routes = $info->getRoutes();
                $ranges = array_map(static function (AsnRouteInfo $route_info): ?RangeInterface {
                    // Use the IPv4 route when there is one, otherwise the IPv6 route.
                    return IPFactory::rangeFromString($route_info->getRoute() ?: $route_info->getRoute6());
                }, $routes);

                // Discard routes that could not be parsed.
                return array_filter($ranges);
            } catch (Throwable $ex) {
                // NOTE(review): the empty list is returned from inside the cached callback, so a
                // failed lookup is presumably cached for the full TTL - confirm this is intended.
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * The response sent to blocked robots.
     *
     * @return ResponseInterface
     */
    private function response(): ResponseInterface
    {
        return response('Not acceptable', StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
    }
}