xref: /webtrees/app/Http/Middleware/BadBotBlocker.php (revision c8614595b95401791dfee90ebff831b2b3b41fc4)
1<?php
2
3/**
4 * webtrees: online genealogy
5 * Copyright (C) 2022 webtrees development team
6 * This program is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 */
17
18declare(strict_types=1);
19
20namespace Fisharebest\Webtrees\Http\Middleware;
21
22use Fig\Http\Message\StatusCodeInterface;
23use Fisharebest\Webtrees\Registry;
24use Fisharebest\Webtrees\Validator;
25use GuzzleHttp\Client;
26use GuzzleHttp\Exception\GuzzleException;
27use Iodev\Whois\Loaders\CurlLoader;
28use Iodev\Whois\Modules\Asn\AsnRouteInfo;
29use Iodev\Whois\Whois;
30use IPLib\Address\AddressInterface;
31use IPLib\Factory as IPFactory;
32use IPLib\Range\RangeInterface;
33use Psr\Http\Message\ResponseInterface;
34use Psr\Http\Message\ServerRequestInterface;
35use Psr\Http\Server\MiddlewareInterface;
36use Psr\Http\Server\RequestHandlerInterface;
37use Throwable;
38
39use function array_filter;
40use function array_map;
41use function assert;
42use function gethostbyaddr;
43use function gethostbyname;
44use function preg_match_all;
45use function random_int;
46use function response;
47use function str_contains;
48use function str_ends_with;
49
/**
 * Middleware to block bad robots before they waste our valuable CPU cycles.
 *
 * A request is rejected with a "406 Not acceptable" response when:
 *  - the user-agent contains the name of a known bad robot, or
 *  - the user-agent contains the name of a well-known crawler, but the client
 *    IP address cannot be verified (by DNS, by a list of IP ranges, or by the
 *    autonomous system of the operator), or
 *  - the client IP address belongs to an autonomous system listed in the
 *    "block_asn" request attribute.
 *
 * All user-agent matching is a case-sensitive substring match.
 */
class BadBotBlocker implements MiddlewareInterface
{
    private const REGEX_OCTET = '(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)';
    private const REGEX_IPV4  = '/\\b' . self::REGEX_OCTET . '(?:\\.' . self::REGEX_OCTET . '){3}\\b/';

    // Cache whois requests.  Try to avoid all caches expiring at the same time.
    private const WHOIS_TTL_MIN = 28 * 86400;
    private const WHOIS_TTL_MAX = 35 * 86400;
    private const WHOIS_TIMEOUT = 5;

    // Bad robots - SEO optimisers, advertisers, etc.  This list is shared with robots.txt.
    public const BAD_ROBOTS = [
        'admantx',
        'Adsbot',
        'AhrefsBot',
        'Amazonbot', // Until it understands crawl-delay and noindex / nofollow
        'AspiegelBot',
        'Barkrowler',
        'BLEXBot',
        'DataForSEO',
        'DotBot',
        'Grapeshot',
        'Honolulu-bot', // Aggressive crawler, no info available
        'ia_archiver',
        'linabot', // Aggressive crawler, no info available
        'Linguee',
        'MJ12bot',
        'netEstate NE',
        'panscient',
        'PetalBot',
        'proximic',
        'SemrushBot',
        'SEOkicks',
        'SiteKiosk',
        'Turnitin',
        'XoviBot',
        'ZoominfoBot',
    ];

    /**
     * Some search engines use reverse/forward DNS to verify the IP address.
     *
     * @see https://developer.amazon.com/support/amazonbot
     * @see https://support.google.com/webmasters/answer/80553?hl=en
     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
     * @see https://www.mojeek.com/bot.html
     * @see https://support.apple.com/en-gb/HT204683
     */
    private const ROBOT_REV_FWD_DNS = [
        'Amazonbot'   => ['.crawl.amazon.com'],
        'Applebot'    => ['.applebot.apple.com'],
        'bingbot'     => ['.search.msn.com'],
        'BingPreview' => ['.search.msn.com'],
        'Google'      => ['.google.com', '.googlebot.com'],
        'MojeekBot'   => ['.mojeek.com'],
        'Mail.RU_Bot' => ['.mail.ru'],
        'msnbot'      => ['.search.msn.com'],
        'Qwantify'    => ['.search.qwant.com'],
        'Sogou'       => ['.crawl.sogou.com'],
        'Yahoo'       => ['.crawl.yahoo.net'],
        'Yandex'      => ['.yandex.ru', '.yandex.net', '.yandex.com'],
    ];

    /**
     * Some search engines only use reverse DNS to verify the IP address.
     *
     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
     * @see https://napoveda.seznam.cz/en/full-text-search/seznambot-crawler
     * @see https://www.ionos.de/terms-gtc/faq-crawler
     */
    private const ROBOT_REV_ONLY_DNS = [
        'Baiduspider' => ['.baidu.com', '.baidu.jp'],
        'FreshBot'    => ['.seznam.cz'],
        'IonCrawl'    => ['.1und1.org'],
        'Neevabot'    => ['.neeva.com'],
    ];

    /**
     * Some search engines operate from designated IP addresses.
     *
     * @see https://www.apple.com/go/applebot
     * @see https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
     */
    private const ROBOT_IPS = [
        'AppleBot'    => [
            '17.0.0.0/8',
        ],
        'Ask Jeeves'  => [
            '65.214.45.143',
            '65.214.45.148',
            '66.235.124.192',
            '66.235.124.7',
            '66.235.124.101',
            '66.235.124.193',
            '66.235.124.73',
            '66.235.124.196',
            '66.235.124.74',
            '63.123.238.8',
            '202.143.148.61',
        ],
        'DuckDuckBot' => [
            '23.21.227.69',
            '50.16.241.113',
            '50.16.241.114',
            '50.16.241.117',
            '50.16.247.234',
            '52.204.97.54',
            '52.5.190.19',
            '54.197.234.188',
            '54.208.100.253',
            '54.208.102.37',
            '107.21.1.8',
        ],
    ];

    /**
     * Some search engines publish their designated IP addresses in a remote file.
     *
     * @see https://bot.seekport.com/
     */
    private const ROBOT_IP_FILES = [
        'SeekportBot' => 'https://bot.seekport.com/seekportbot_ips.txt',
    ];

    /**
     * Some search engines operate from within a designated autonomous system.
     * A robot is accepted when its address belongs to ANY of the listed systems.
     *
     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
     * @see https://www.facebook.com/peering/
     */
    private const ROBOT_ASNS = [
        'facebook' => ['AS32934', 'AS63293'],
        'twitter'  => ['AS13414'],
    ];

    /**
     * Reject requests from bad robots and from clients that impersonate good robots.
     * Everything else is passed on to the next handler.
     *
     * @param ServerRequestInterface  $request
     * @param RequestHandlerInterface $handler
     *
     * @return ResponseInterface
     */
    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
    {
        $ua      = Validator::serverParams($request)->string('HTTP_USER_AGENT', '');
        $ip      = Validator::attributes($request)->string('client-ip');
        $address = IPFactory::parseAddressString($ip);
        assert($address instanceof AddressInterface);

        // Robots that are never welcome.
        foreach (self::BAD_ROBOTS as $robot) {
            if (str_contains($ua, $robot)) {
                return $this->response();
            }
        }

        // Robots verified by a reverse DNS lookup, confirmed by a forward lookup.
        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, false)) {
                return $this->response();
            }
        }

        // Robots verified by a reverse DNS lookup only.
        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, true)) {
                return $this->response();
            }
        }

        // Robots verified against a fixed list of IP addresses/ranges.
        foreach (self::ROBOT_IPS as $robot => $valid_ip_ranges) {
            if (str_contains($ua, $robot) && !$this->addressInRanges($address, $valid_ip_ranges)) {
                return $this->response();
            }
        }

        // Robots verified against a downloaded list of IP addresses.
        // The list is only fetched when the user-agent actually matches.
        foreach (self::ROBOT_IP_FILES as $robot => $url) {
            if (str_contains($ua, $robot) && !$this->addressInRanges($address, $this->fetchIpRangesForUrl($robot, $url))) {
                return $this->response();
            }
        }

        // Robots verified against the routes of their operator's autonomous systems.
        // Membership of any ONE of the listed systems is sufficient.
        foreach (self::ROBOT_ASNS as $robot => $asns) {
            if (str_contains($ua, $robot) && !$this->addressInAsns($address, $asns)) {
                return $this->response();
            }
        }

        // Allow sites to block access from entire networks.
        $block_asn = Validator::attributes($request)->string('block_asn', '');
        preg_match_all('/(AS\d+)/', $block_asn, $matches);

        if ($this->addressInAsns($address, $matches[1])) {
            return $this->response();
        }

        return $handler->handle($request);
    }

    /**
     * Is an address contained in at least one of a list of IP ranges?
     *
     * @param AddressInterface             $address
     * @param array<RangeInterface|string> $ranges  Range objects, or strings to be parsed.
     *                                              Strings that cannot be parsed are ignored.
     *
     * @return bool
     */
    private function addressInRanges(AddressInterface $address, array $ranges): bool
    {
        foreach ($ranges as $range) {
            $parsed = $range instanceof RangeInterface ? $range : IPFactory::parseRangeString($range);

            if ($parsed instanceof RangeInterface && $parsed->contains($address)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Is an address routed by at least one of a list of autonomous systems?
     *
     * @param AddressInterface $address
     * @param array<string>    $asns    Autonomous system numbers, e.g. "AS12345"
     *
     * @return bool
     */
    private function addressInAsns(AddressInterface $address, array $asns): bool
    {
        foreach ($asns as $asn) {
            if ($this->addressInRanges($address, $this->fetchIpRangesForAsn($asn))) {
                return true;
            }
        }

        return false;
    }

    /**
     * Check that an IP address belongs to a robot operator using a forward/reverse DNS lookup.
     *
     * @param string        $ip
     * @param array<string> $valid_domains Acceptable host-name suffixes, e.g. ".googlebot.com"
     * @param bool          $reverse_only  Skip the confirming forward lookup
     *
     * @return bool
     */
    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
    {
        $host = gethostbyaddr($ip);

        if ($host === false) {
            return false;
        }

        foreach ($valid_domains as $domain) {
            if (str_ends_with($host, $domain)) {
                // NOTE(review): gethostbyname() is documented as returning an IPv4 address only,
                // so the forward confirmation cannot succeed for IPv6 clients - confirm whether
                // IPv6 crawlers need to be supported here.
                return $reverse_only || $ip === gethostbyname($host);
            }
        }

        return false;
    }

    /**
     * Perform a whois search for an ASN.
     *
     * The result is stored in the file cache under a key derived from the ASN, with a
     * randomised lifetime.  A failed lookup yields (and caches) an empty list.
     *
     * @param string $asn - The autonomous system number to query
     *
     * @return array<RangeInterface>
     */
    private function fetchIpRangesForAsn(string $asn): array
    {
        return Registry::cache()->file()->remember('whois-asn-' . $asn, static function () use ($asn): array {
            // Each route has either an IPv4 route or an IPv6 route.
            $mapper = static fn (AsnRouteInfo $route_info): ?RangeInterface => IPFactory::parseRangeString($route_info->route ?: $route_info->route6);

            try {
                $loader = new CurlLoader(self::WHOIS_TIMEOUT);
                $whois  = new Whois($loader);
                $info   = $whois->loadAsnInfo($asn);
                $routes = $info->routes;
                $ranges = array_map($mapper, $routes);

                // Discard routes that could not be parsed.
                return array_filter($ranges);
            } catch (Throwable) {
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * Fetch a list of IP addresses from a remote file.
     *
     * Every IPv4 address found in the downloaded text is returned.  The result is stored
     * in the file cache under a key derived from the robot name, with a randomised
     * lifetime.  A failed download yields (and caches) an empty list.
     *
     * @param string $ua  The robot name, used to build the cache key
     * @param string $url
     *
     * @return array<string>
     */
    private function fetchIpRangesForUrl(string $ua, string $url): array
    {
        return Registry::cache()->file()->remember('url-ip-list-' . $ua, static function () use ($url): array {
            try {
                $client   = new Client();
                $response = $client->get($url, ['timeout' => 5]);
                $contents = $response->getBody()->getContents();

                preg_match_all(self::REGEX_IPV4, $contents, $matches);

                return $matches[0];
            } catch (GuzzleException) {
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * The response sent to every blocked request.
     *
     * @return ResponseInterface
     */
    private function response(): ResponseInterface
    {
        return response('Not acceptable', StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
    }
}
365