<?php

/**
 * webtrees: online genealogy
 * Copyright (C) 2022 webtrees development team
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

declare(strict_types=1);

namespace Fisharebest\Webtrees\Http\Middleware;

use Fig\Http\Message\StatusCodeInterface;
use Fisharebest\Webtrees\Registry;
use Fisharebest\Webtrees\Validator;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\GuzzleException;
use Iodev\Whois\Loaders\CurlLoader;
use Iodev\Whois\Modules\Asn\AsnRouteInfo;
use Iodev\Whois\Whois;
use IPLib\Address\AddressInterface;
use IPLib\Factory as IPFactory;
use IPLib\Range\RangeInterface;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\ServerRequestInterface;
use Psr\Http\Server\MiddlewareInterface;
use Psr\Http\Server\RequestHandlerInterface;
use Throwable;

use function array_filter;
use function array_map;
use function assert;
use function gethostbyaddr;
use function gethostbyname;
use function preg_match_all;
use function random_int;
use function response;
use function str_contains;
use function str_ends_with;

/**
 * Middleware to block bad robots before they waste our valuable CPU cycles.
 *
 * A request is rejected when its user-agent names a known bad robot, or when
 * it claims to be a known search engine but its IP address cannot be verified
 * by the method that search engine uses (DNS, fixed IP list, published IP
 * file, or autonomous system).
 */
class BadBotBlocker implements MiddlewareInterface
{
    private const REGEX_OCTET = '(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)';
    private const REGEX_IPV4  = '/\\b' . self::REGEX_OCTET . '(?:\\.' . self::REGEX_OCTET . '){3}\\b/';

    // Cache whois requests.  Try to avoid all caches expiring at the same time.
    private const WHOIS_TTL_MIN = 28 * 86400;
    private const WHOIS_TTL_MAX = 35 * 86400;
    private const WHOIS_TIMEOUT = 5;

    // Bad robots - SEO optimisers, advertisers, etc.  This list is shared with robots.txt.
    public const BAD_ROBOTS = [
        'admantx',
        'Adsbot',
        'AhrefsBot',
        'Amazonbot', // Until it understands crawl-delay and noindex / nofollow
        'AspiegelBot',
        'Barkrowler',
        'BLEXBot',
        'DataForSEO',
        'DotBot',
        'Grapeshot',
        'Honolulu-bot', // Aggressive crawler, no info available
        'ia_archiver',
        'linabot', // Aggressive crawler, no info available
        'Linguee',
        'MJ12bot',
        'netEstate NE',
        'panscient',
        'PetalBot',
        'proximic',
        'SemrushBot',
        'serpstatbot',
        'SEOkicks',
        'SiteKiosk',
        'Turnitin',
        'XoviBot',
        'ZoominfoBot',
    ];

    /**
     * Some search engines use reverse/forward DNS to verify the IP address.
     *
     * @see https://developer.amazon.com/support/amazonbot
     * @see https://support.google.com/webmasters/answer/80553?hl=en
     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
     * @see https://www.mojeek.com/bot.html
     * @see https://support.apple.com/en-gb/HT204683
     */
    private const ROBOT_REV_FWD_DNS = [
        'Amazonbot'   => ['.crawl.amazon.com'],
        'Applebot'    => ['.applebot.apple.com'],
        'bingbot'     => ['.search.msn.com'],
        'BingPreview' => ['.search.msn.com'],
        'Google'      => ['.google.com', '.googlebot.com'],
        'MojeekBot'   => ['.mojeek.com'],
        'Mail.RU_Bot' => ['.mail.ru'],
        'msnbot'      => ['.search.msn.com'],
        'Qwantify'    => ['.search.qwant.com'],
        'Sogou'       => ['.crawl.sogou.com'],
        'Yahoo'       => ['.crawl.yahoo.net'],
        'Yandex'      => ['.yandex.ru', '.yandex.net', '.yandex.com'],
    ];

    /**
     * Some search engines only use reverse DNS to verify the IP address.
     *
     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
     * @see https://napoveda.seznam.cz/en/full-text-search/seznambot-crawler
     * @see https://www.ionos.de/terms-gtc/faq-crawler
     */
    private const ROBOT_REV_ONLY_DNS = [
        'Baiduspider' => ['.baidu.com', '.baidu.jp'],
        'FreshBot'    => ['.seznam.cz'],
        'IonCrawl'    => ['.1und1.org'],
        'Neevabot'    => ['.neeva.com'],
    ];

    /**
     * Some search engines operate from designated IP addresses.
     *
     * NOTE(review): user-agent matching is case-sensitive, and the DNS list
     * above spells the Apple crawler "Applebot" while this list uses
     * "AppleBot".  Confirm which spelling the crawler actually sends.
     *
     * @see https://www.apple.com/go/applebot
     * @see https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
     */
    private const ROBOT_IPS = [
        'AppleBot'    => [
            '17.0.0.0/8',
        ],
        'Ask Jeeves'  => [
            '65.214.45.143',
            '65.214.45.148',
            '66.235.124.192',
            '66.235.124.7',
            '66.235.124.101',
            '66.235.124.193',
            '66.235.124.73',
            '66.235.124.196',
            '66.235.124.74',
            '63.123.238.8',
            '202.143.148.61',
        ],
        'DuckDuckBot' => [
            '23.21.227.69',
            '50.16.241.113',
            '50.16.241.114',
            '50.16.241.117',
            '50.16.247.234',
            '52.204.97.54',
            '52.5.190.19',
            '54.197.234.188',
            '54.208.100.253',
            '54.208.102.37',
            '107.21.1.8',
        ],
    ];

    /**
     * Some search engines publish their designated IP addresses in a remote file.
     *
     * @see https://bot.seekport.com/
     */
    private const ROBOT_IP_FILES = [
        'SeekportBot' => 'https://bot.seekport.com/seekportbot_ips.txt',
    ];

    /**
     * Some search engines operate from within a designated autonomous system.
     *
     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
     * @see https://www.facebook.com/peering/
     */
    private const ROBOT_ASNS = [
        'facebook' => ['AS32934', 'AS63293'],
        'twitter'  => ['AS13414'],
    ];

    /**
     * Reject requests from bad robots and from clients that impersonate good ones.
     *
     * @param ServerRequestInterface  $request
     * @param RequestHandlerInterface $handler
     *
     * @return ResponseInterface The "not acceptable" response for blocked clients,
     *                           otherwise whatever the next handler returns.
     */
    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
    {
        $ua      = Validator::serverParams($request)->string('HTTP_USER_AGENT', '');
        $ip      = Validator::attributes($request)->string('client-ip');
        $address = IPFactory::parseAddressString($ip);
        assert($address instanceof AddressInterface);

        // Known bad robots are rejected outright.
        foreach (self::BAD_ROBOTS as $robot) {
            if (str_contains($ua, $robot)) {
                return $this->response();
            }
        }

        // Robots verified by reverse DNS followed by forward DNS.
        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, false)) {
                return $this->response();
            }
        }

        // Robots verified by reverse DNS only.
        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, true)) {
                return $this->response();
            }
        }

        // Robots verified against a fixed list of IP addresses/ranges.
        foreach (self::ROBOT_IPS as $robot => $valid_ip_ranges) {
            if (str_contains($ua, $robot) && !$this->addressInRanges($address, $valid_ip_ranges)) {
                return $this->response();
            }
        }

        // Robots verified against a list of IP addresses downloaded from the operator.
        foreach (self::ROBOT_IP_FILES as $robot => $url) {
            // The list is only fetched when the user-agent actually names the robot.
            if (str_contains($ua, $robot) && !$this->addressInRanges($address, $this->fetchIpRangesForUrl($robot, $url))) {
                return $this->response();
            }
        }

        // Robots verified by autonomous system.  The address is genuine if ANY
        // of the robot's ASNs announces it - it does not need to be in all of them.
        foreach (self::ROBOT_ASNS as $robot => $asns) {
            if (str_contains($ua, $robot) && !$this->addressInAsns($address, $asns)) {
                return $this->response();
            }
        }

        // Allow sites to block access from entire networks.
        $block_asn = Validator::attributes($request)->string('block_asn', '');
        preg_match_all('/(AS\d+)/', $block_asn, $matches);

        if ($this->addressInAsns($address, $matches[1])) {
            return $this->response();
        }

        return $handler->handle($request);
    }

    /**
     * Is an address contained in any of a list of IP addresses/ranges?
     *
     * Entries that cannot be parsed as a range are ignored.
     *
     * @param AddressInterface $address
     * @param array<string>    $ip_ranges
     *
     * @return bool
     */
    private function addressInRanges(AddressInterface $address, array $ip_ranges): bool
    {
        foreach ($ip_ranges as $ip_range) {
            $range = IPFactory::parseRangeString($ip_range);

            if ($range instanceof RangeInterface && $range->contains($address)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Is an address contained in a range belonging to any of a list of autonomous systems?
     *
     * Stops at the first match, so later ASNs are not looked up unnecessarily.
     *
     * @param AddressInterface $address
     * @param array<string>    $asns
     *
     * @return bool
     */
    private function addressInAsns(AddressInterface $address, array $asns): bool
    {
        foreach ($asns as $asn) {
            foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                if ($range->contains($address)) {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * Check that an IP address belongs to a robot operator using a forward/reverse DNS lookup.
     *
     * The reverse lookup must give a host name ending in one of the valid domains.
     * Unless $reverse_only is set, a forward lookup of that host name must also
     * return the original IP address.
     *
     * @param string        $ip
     * @param array<string> $valid_domains
     * @param bool          $reverse_only
     *
     * @return bool
     */
    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
    {
        $host = gethostbyaddr($ip);

        if ($host === false) {
            return false;
        }

        foreach ($valid_domains as $domain) {
            if (str_ends_with($host, $domain)) {
                return $reverse_only || $ip === gethostbyname($host);
            }
        }

        return false;
    }

    /**
     * Perform a whois search for an ASN.
     *
     * The result is stored in the file cache under a per-ASN key, with a lifetime
     * chosen at random between WHOIS_TTL_MIN and WHOIS_TTL_MAX.
     *
     * NOTE(review): a failed lookup returns an empty list from inside the cached
     * callback, so it appears the empty list is cached as well - confirm.
     *
     * @param string $asn - The autonomous system number to query
     *
     * @return array<RangeInterface>
     */
    private function fetchIpRangesForAsn(string $asn): array
    {
        return Registry::cache()->file()->remember('whois-asn-' . $asn, static function () use ($asn): array {
            // Prefer the IPv4 route; fall back to the IPv6 route when it is empty.
            $mapper = static fn (AsnRouteInfo $route_info): ?RangeInterface => IPFactory::parseRangeString($route_info->route ?: $route_info->route6);

            try {
                $loader = new CurlLoader(self::WHOIS_TIMEOUT);
                $whois  = new Whois($loader);
                $info   = $whois->loadAsnInfo($asn);
                $routes = $info->routes;
                $ranges = array_map($mapper, $routes);

                // Discard routes that could not be parsed.
                return array_filter($ranges);
            } catch (Throwable) {
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * Fetch a list of IP addresses from a remote file.
     *
     * Every IPv4 address found in the downloaded text is returned.  The result
     * is stored in the file cache, with a lifetime chosen at random between
     * WHOIS_TTL_MIN and WHOIS_TTL_MAX.
     *
     * @param string $ua  - The robot name; only used to build the cache key
     * @param string $url - The location of the file
     *
     * @return array<string>
     */
    private function fetchIpRangesForUrl(string $ua, string $url): array
    {
        return Registry::cache()->file()->remember('url-ip-list-' . $ua, static function () use ($url): array {
            try {
                $client   = new Client();
                $response = $client->get($url, ['timeout' => 5]);
                $contents = $response->getBody()->getContents();

                preg_match_all(self::REGEX_IPV4, $contents, $matches);

                return $matches[0];
            } catch (GuzzleException) {
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * The response sent to blocked clients.
     *
     * @return ResponseInterface
     */
    private function response(): ResponseInterface
    {
        return response('Not acceptable', StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
    }
}