<?php

/**
 * webtrees: online genealogy
 * Copyright (C) 2022 webtrees development team
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

declare(strict_types=1);

namespace Fisharebest\Webtrees\Http\Middleware;

use Fig\Http\Message\StatusCodeInterface;
use Fisharebest\Webtrees\Registry;
use Fisharebest\Webtrees\Validator;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\GuzzleException;
use Iodev\Whois\Loaders\CurlLoader;
use Iodev\Whois\Modules\Asn\AsnRouteInfo;
use Iodev\Whois\Whois;
use IPLib\Address\AddressInterface;
use IPLib\Factory as IPFactory;
use IPLib\Range\RangeInterface;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\ServerRequestInterface;
use Psr\Http\Server\MiddlewareInterface;
use Psr\Http\Server\RequestHandlerInterface;
use Throwable;

use function array_filter;
use function array_map;
use function assert;
use function gethostbyaddr;
use function gethostbyname;
use function preg_match_all;
use function random_int;
use function response;
use function str_contains;
use function str_ends_with;

/**
 * Middleware to block bad robots before they waste our valuable CPU cycles.
 *
 * A request is rejected when its user-agent names a known bad robot, or when
 * it claims to be a well-known robot but its IP address cannot be verified
 * by the method configured for that robot (DNS, fixed IP list, remote IP
 * list, or autonomous system).
 */
class BadBotBlocker implements MiddlewareInterface
{
    private const REGEX_OCTET = '(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)';
    private const REGEX_IPV4  = '/\\b' . self::REGEX_OCTET . '(?:\\.' . self::REGEX_OCTET . '){3}\\b/';

    // Cache whois requests.  Try to avoid all caches expiring at the same time.
    private const WHOIS_TTL_MIN = 28 * 86400;
    private const WHOIS_TTL_MAX = 35 * 86400;
    private const WHOIS_TIMEOUT = 5;

    // Bad robots - SEO optimisers, advertisers, etc.  This list is shared with robots.txt.
    public const BAD_ROBOTS = [
        'admantx',
        'Adsbot',
        'AhrefsBot',
        'Amazonbot', // Until it understands crawl-delay and noindex / nofollow
        'AspiegelBot',
        'Barkrowler',
        'BLEXBot',
        'DataForSEO',
        'DotBot',
        'Grapeshot',
        'Honolulu-bot', // Aggressive crawler, no info available
        'ia_archiver',
        'linabot', // Aggressive crawler, no info available
        'Linguee',
        'MJ12bot',
        'netEstate NE',
        'panscient',
        'PetalBot',
        'proximic',
        'SemrushBot',
        'SEOkicks',
        'SiteKiosk',
        'Turnitin',
        'XoviBot',
        'ZoominfoBot',
    ];

    /**
     * Some search engines use reverse/forward DNS to verify the IP address.
     *
     * @see https://developer.amazon.com/support/amazonbot
     * @see https://support.google.com/webmasters/answer/80553?hl=en
     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
     * @see https://www.mojeek.com/bot.html
     * @see https://support.apple.com/en-gb/HT204683
     */
    private const ROBOT_REV_FWD_DNS = [
        'Amazonbot'   => ['.crawl.amazon.com'],
        'Applebot'    => ['.applebot.apple.com'],
        'bingbot'     => ['.search.msn.com'],
        'BingPreview' => ['.search.msn.com'],
        'Google'      => ['.google.com', '.googlebot.com'],
        'MojeekBot'   => ['.mojeek.com'],
        'Mail.RU_Bot' => ['.mail.ru'],
        'msnbot'      => ['.search.msn.com'],
        'Qwantify'    => ['.search.qwant.com'],
        'Sogou'       => ['.crawl.sogou.com'],
        'Yahoo'       => ['.crawl.yahoo.net'],
        'Yandex'      => ['.yandex.ru', '.yandex.net', '.yandex.com'],
    ];

    /**
     * Some search engines only use reverse DNS to verify the IP address.
     *
     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
     * @see https://napoveda.seznam.cz/en/full-text-search/seznambot-crawler
     * @see https://www.ionos.de/terms-gtc/faq-crawler
     */
    private const ROBOT_REV_ONLY_DNS = [
        'Baiduspider' => ['.baidu.com', '.baidu.jp'],
        'FreshBot'    => ['.seznam.cz'],
        'IonCrawl'    => ['.1und1.org'],
        'Neevabot'    => ['.neeva.com'],
    ];

    /**
     * Some search engines operate from designated IP addresses.
     *
     * NOTE(review): the user-agent match is case-sensitive, and the key 'AppleBot' here
     * differs in case from the 'Applebot' key used in ROBOT_REV_FWD_DNS - confirm which
     * spelling is intended before relying on this entry.
     *
     * @see https://www.apple.com/go/applebot
     * @see https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
     */
    private const ROBOT_IPS = [
        'AppleBot'    => [
            '17.0.0.0/8',
        ],
        'Ask Jeeves'  => [
            '65.214.45.143',
            '65.214.45.148',
            '66.235.124.192',
            '66.235.124.7',
            '66.235.124.101',
            '66.235.124.193',
            '66.235.124.73',
            '66.235.124.196',
            '66.235.124.74',
            '63.123.238.8',
            '202.143.148.61',
        ],
        'DuckDuckBot' => [
            '23.21.227.69',
            '50.16.241.113',
            '50.16.241.114',
            '50.16.241.117',
            '50.16.247.234',
            '52.204.97.54',
            '52.5.190.19',
            '54.197.234.188',
            '54.208.100.253',
            '54.208.102.37',
            '107.21.1.8',
        ],
    ];

    /**
     * Some search engines publish their designated IP addresses in a remote file.
     *
     * @see https://bot.seekport.com/
     */
    private const ROBOT_IP_FILES = [
        'SeekportBot' => 'https://bot.seekport.com/seekportbot_ips.txt',
    ];

    /**
     * Some search engines operate from within a designated autonomous system.
     *
     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
     * @see https://www.facebook.com/peering/
     */
    private const ROBOT_ASNS = [
        'facebook' => ['AS32934', 'AS63293'],
        'twitter'  => ['AS13414'],
    ];

    /**
     * Reject requests from bad robots, from clients impersonating known robots,
     * and from networks listed in the "block_asn" request attribute.
     * Everything else is passed to the next handler.
     *
     * @param ServerRequestInterface  $request
     * @param RequestHandlerInterface $handler
     *
     * @return ResponseInterface
     */
    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
    {
        $ua      = Validator::serverParams($request)->string('HTTP_USER_AGENT', '');
        $ip      = Validator::attributes($request)->string('client-ip');
        $address = IPFactory::parseAddressString($ip);
        assert($address instanceof AddressInterface);

        foreach (self::BAD_ROBOTS as $robot) {
            if (str_contains($ua, $robot)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, false)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, true)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_IPS as $robot => $valid_ip_ranges) {
            if (str_contains($ua, $robot)) {
                foreach ($valid_ip_ranges as $ip_range) {
                    $range = IPFactory::parseRangeString($ip_range);

                    if ($range instanceof RangeInterface && $range->contains($address)) {
                        // Verified - move on to the next robot.
                        continue 2;
                    }
                }

                return $this->response();
            }
        }

        foreach (self::ROBOT_IP_FILES as $robot => $url) {
            if (str_contains($ua, $robot)) {
                $valid_ip_ranges = $this->fetchIpRangesForUrl($robot, $url);

                foreach ($valid_ip_ranges as $ip_range) {
                    $range = IPFactory::parseRangeString($ip_range);

                    if ($range instanceof RangeInterface && $range->contains($address)) {
                        // Verified - move on to the next robot.
                        continue 2;
                    }
                }

                return $this->response();
            }
        }

        foreach (self::ROBOT_ASNS as $robot => $asns) {
            if (str_contains($ua, $robot)) {
                // The address only needs to belong to ONE of the robot's autonomous systems,
                // so every ASN must be searched before the request can be rejected.
                foreach ($asns as $asn) {
                    foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                        if ($range->contains($address)) {
                            // Verified - leave both inner loops and move on to the next robot.
                            continue 3;
                        }
                    }
                }

                return $this->response();
            }
        }

        // Allow sites to block access from entire networks.
        $block_asn = Validator::attributes($request)->string('block_asn', '');
        preg_match_all('/(AS\d+)/', $block_asn, $matches);

        foreach ($matches[1] as $asn) {
            foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                if ($range->contains($address)) {
                    return $this->response();
                }
            }
        }

        return $handler->handle($request);
    }

    /**
     * Check that an IP address belongs to a robot operator using a forward/reverse DNS lookup.
     *
     * The reverse lookup must give a host name ending in one of the valid domains.
     * Unless $reverse_only is set, a forward lookup of that host name must also
     * give back the original IP address.
     *
     * @param string        $ip
     * @param array<string> $valid_domains
     * @param bool          $reverse_only
     *
     * @return bool
     */
    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
    {
        $host = gethostbyaddr($ip);

        // false means that the address was malformed.
        if ($host === false) {
            return false;
        }

        foreach ($valid_domains as $domain) {
            if (str_ends_with($host, $domain)) {
                // NOTE(review): gethostbyname() only returns IPv4 addresses, so an IPv6
                // client cannot pass the forward check - confirm whether this is intended.
                return $reverse_only || $ip === gethostbyname($host);
            }
        }

        return false;
    }

    /**
     * Perform a whois search for an ASN.
     *
     * The result is stored in the file cache for a random period between
     * WHOIS_TTL_MIN and WHOIS_TTL_MAX seconds.  Any failure during the lookup
     * gives an empty list, which is passed to the cache in the same way.
     *
     * @param string $asn - The autonomous system number to query
     *
     * @return array<RangeInterface>
     */
    private function fetchIpRangesForAsn(string $asn): array
    {
        return Registry::cache()->file()->remember('whois-asn-' . $asn, static function () use ($asn): array {
            // Prefer the IPv4 route; fall back to the IPv6 route when it is empty.
            $mapper = static fn (AsnRouteInfo $route_info): ?RangeInterface => IPFactory::parseRangeString($route_info->route ?: $route_info->route6);

            try {
                $loader = new CurlLoader(self::WHOIS_TIMEOUT);
                $whois  = new Whois($loader);
                $info   = $whois->loadAsnInfo($asn);
                $routes = $info->routes;
                $ranges = array_map($mapper, $routes);

                // Discard routes that could not be parsed.
                return array_filter($ranges);
            } catch (Throwable) {
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * Fetch a list of IP addresses from a remote file.
     *
     * Every IPv4 address found in the downloaded text is returned.  The result is
     * stored in the file cache, keyed by $ua, for a random period between
     * WHOIS_TTL_MIN and WHOIS_TTL_MAX seconds.  A failed download gives an empty list.
     *
     * @param string $ua  - Used to build the cache key.  process() passes the robot name.
     * @param string $url - The location of the file containing the IP addresses
     *
     * @return array<string>
     */
    private function fetchIpRangesForUrl(string $ua, string $url): array
    {
        return Registry::cache()->file()->remember('url-ip-list-' . $ua, static function () use ($url): array {
            try {
                $client   = new Client();
                $response = $client->get($url, ['timeout' => 5]);
                $contents = $response->getBody()->getContents();

                preg_match_all(self::REGEX_IPV4, $contents, $matches);

                return $matches[0];
            } catch (GuzzleException) {
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * The response sent to blocked robots.
     *
     * @return ResponseInterface
     */
    private function response(): ResponseInterface
    {
        return response('Not acceptable', StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
    }
}