xref: /webtrees/app/Http/Middleware/BadBotBlocker.php (revision 3a3594e9af1303d3b17ba9aa7e8c31b403b7b9c8)
1<?php
2
3/**
4 * webtrees: online genealogy
5 * Copyright (C) 2022 webtrees development team
6 * This program is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 */
17
18declare(strict_types=1);
19
20namespace Fisharebest\Webtrees\Http\Middleware;
21
22use Fig\Http\Message\StatusCodeInterface;
23use Fisharebest\Webtrees\Registry;
24use Fisharebest\Webtrees\Validator;
25use GuzzleHttp\Client;
26use GuzzleHttp\Exception\GuzzleException;
27use Iodev\Whois\Loaders\CurlLoader;
28use Iodev\Whois\Modules\Asn\AsnRouteInfo;
29use Iodev\Whois\Whois;
30use IPLib\Address\AddressInterface;
31use IPLib\Factory as IPFactory;
32use IPLib\Range\RangeInterface;
33use Psr\Http\Message\ResponseInterface;
34use Psr\Http\Message\ServerRequestInterface;
35use Psr\Http\Server\MiddlewareInterface;
36use Psr\Http\Server\RequestHandlerInterface;
37use Throwable;
38
use function array_filter;
use function array_map;
use function assert;
use function dns_get_record;
use function gethostbyaddr;
use function gethostbyname;
use function gethostbynamel;
use function in_array;
use function inet_pton;
use function preg_match_all;
use function random_int;
use function response;
use function str_contains;
use function str_ends_with;

use const DNS_AAAA;
49
50/**
51 * Middleware to block bad robots before they waste our valuable CPU cycles.
52 */
53class BadBotBlocker implements MiddlewareInterface
54{
55    private const REGEX_OCTET = '(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)';
56    private const REGEX_IPV4  = '/\\b' . self::REGEX_OCTET . '(?:\\.' . self::REGEX_OCTET . '){3}\\b/';
57
58    // Cache whois requests.  Try to avoid all caches expiring at the same time.
59    private const WHOIS_TTL_MIN = 28 * 86400;
60    private const WHOIS_TTL_MAX = 35 * 86400;
61    private const WHOIS_TIMEOUT = 5;
62
63    // Bad robots - SEO optimisers, advertisers, etc.  A request is rejected when its user-agent contains any of these strings (case-sensitive).  This list is shared with robots.txt.
64    public const BAD_ROBOTS = [
65        'admantx',
66        'Adsbot',
67        'AhrefsBot',
68        'Amazonbot', // Until it understands crawl-delay and noindex / nofollow
69        'AspiegelBot',
70        'Barkrowler',
71        'BLEXBot',
72        'DataForSEO',
73        'DataForSeoBot', // https://dataforseo.com/dataforseo-bot
74        'DotBot',
75        'Grapeshot',
76        'Honolulu-bot', // Aggressive crawler, no info available
77        'ia_archiver',
78        'linabot', // Aggressive crawler, no info available
79        'Linguee',
80        'MJ12bot',
81        'netEstate NE',
82        'panscient',
83        'PetalBot',
84        'proximic',
85        'SemrushBot',
86        'serpstatbot',
87        'SEOkicks',
88        'SiteKiosk',
89        'Turnitin',
90        'XoviBot',
91        'ZoominfoBot',
92    ];
93
94    /**
95     * Some search engines use reverse/forward DNS to verify the IP address.
96     *
97     * @see https://developer.amazon.com/support/amazonbot
98     * @see https://support.google.com/webmasters/answer/80553?hl=en
99     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
100     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
101     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
102     * @see https://www.mojeek.com/bot.html
103     * @see https://support.apple.com/en-gb/HT204683
104     */
105    private const ROBOT_REV_FWD_DNS = [
106        'Amazonbot'   => ['.crawl.amazon.com'],
107        'Applebot'    => ['.applebot.apple.com'],
108        'bingbot'     => ['.search.msn.com'],
109        'BingPreview' => ['.search.msn.com'],
110        'Google'      => ['.google.com', '.googlebot.com'],
111        'MojeekBot'   => ['.mojeek.com'],
112        'Mail.RU_Bot' => ['.mail.ru'],
113        'msnbot'      => ['.search.msn.com'],
114        'Qwantify'    => ['.search.qwant.com'],
115        'Sogou'       => ['.crawl.sogou.com'],
116        'Yahoo'       => ['.crawl.yahoo.net'],
117        'Yandex'      => ['.yandex.ru', '.yandex.net', '.yandex.com'],
118    ];
119
120    /**
121     * Some search engines only use reverse DNS to verify the IP address.
122     *
123     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
124     * @see https://napoveda.seznam.cz/en/full-text-search/seznambot-crawler
125     * @see https://www.ionos.de/terms-gtc/faq-crawler
126     */
127    private const ROBOT_REV_ONLY_DNS = [
128        'Baiduspider' => ['.baidu.com', '.baidu.jp'],
129        'FreshBot'    => ['.seznam.cz'],
130        'IonCrawl'    => ['.1und1.org'],
131        'Neevabot'    => ['.neeva.com'],
132    ];
133
134    /**
135     * Some search engines operate from designated IP addresses.
136     *
137     * @see https://www.apple.com/go/applebot
138     * @see https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
139     */
140    private const ROBOT_IPS = [
141        'AppleBot'    => [
142            '17.0.0.0/8',
143        ],
144        'Ask Jeeves'  => [
145            '65.214.45.143',
146            '65.214.45.148',
147            '66.235.124.192',
148            '66.235.124.7',
149            '66.235.124.101',
150            '66.235.124.193',
151            '66.235.124.73',
152            '66.235.124.196',
153            '66.235.124.74',
154            '63.123.238.8',
155            '202.143.148.61',
156        ],
157        'DuckDuckBot' => [
158            '23.21.227.69',
159            '50.16.241.113',
160            '50.16.241.114',
161            '50.16.241.117',
162            '50.16.247.234',
163            '52.204.97.54',
164            '52.5.190.19',
165            '54.197.234.188',
166            '54.208.100.253',
167            '54.208.102.37',
168            '107.21.1.8',
169        ],
170    ];
171
172    /**
173     * Some search engines operate from designated IP addresses.
174     *
175     * @see https://bot.seekport.com/
176     */
177    private const ROBOT_IP_FILES = [
178        'SeekportBot' => 'https://bot.seekport.com/seekportbot_ips.txt',
179    ];
180
181    /**
182     * Some search engines operate from within a designated autonomous system.
183     *
184     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
185     * @see https://www.facebook.com/peering/
186     */
187    private const ROBOT_ASNS = [
188        'facebook' => ['AS32934', 'AS63293'],
189        'twitter'  => ['AS13414'],
190    ];
191
192    /**
193     * @param ServerRequestInterface  $request
194     * @param RequestHandlerInterface $handler
195     *
196     * @return ResponseInterface
197     */
198    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
199    {
200        $ua      = Validator::serverParams($request)->string('HTTP_USER_AGENT', '');
201        $ip      = Validator::attributes($request)->string('client-ip');
202        $address = IPFactory::parseAddressString($ip);
203        assert($address instanceof AddressInterface);
204
205        foreach (self::BAD_ROBOTS as $robot) {
206            if (str_contains($ua, $robot)) {
207                return $this->response();
208            }
209        }
210
211        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
212            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, false)) {
213                return $this->response();
214            }
215        }
216
217        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
218            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, true)) {
219                return $this->response();
220            }
221        }
222
223        foreach (self::ROBOT_IPS as $robot => $valid_ip_ranges) {
224            if (str_contains($ua, $robot)) {
225                foreach ($valid_ip_ranges as $ip_range) {
226                    $range = IPFactory::parseRangeString($ip_range);
227
228                    if ($range instanceof RangeInterface && $range->contains($address)) {
229                        continue 2;
230                    }
231                }
232
233                return $this->response();
234            }
235        }
236
237        foreach (self::ROBOT_IP_FILES as $robot => $url) {
238            if (str_contains($ua, $robot)) {
239                $valid_ip_ranges = $this->fetchIpRangesForUrl($robot, $url);
240
241                foreach ($valid_ip_ranges as $ip_range) {
242                    $range = IPFactory::parseRangeString($ip_range);
243
244                    if ($range instanceof RangeInterface && $range->contains($address)) {
245                        continue 2;
246                    }
247                }
248
249                return $this->response();
250            }
251        }
252
253        foreach (self::ROBOT_ASNS as $robot => $asns) {
254            foreach ($asns as $asn) {
255                if (str_contains($ua, $robot)) {
256                    foreach ($this->fetchIpRangesForAsn($asn) as $range) {
257                        if ($range->contains($address)) {
258                            continue 2;
259                        }
260                    }
261
262                    return $this->response();
263                }
264            }
265        }
266
267        // Allow sites to block access from entire networks.
268        $block_asn = Validator::attributes($request)->string('block_asn', '');
269        preg_match_all('/(AS\d+)/', $block_asn, $matches);
270
271        foreach ($matches[1] as $asn) {
272            foreach ($this->fetchIpRangesForAsn($asn) as $range) {
273                if ($range->contains($address)) {
274                    return $this->response();
275                }
276            }
277        }
278
279        return $handler->handle($request);
280    }
281
282    /**
283     * Check that an IP address belongs to a robot operator using a forward/reverse DNS lookup.
284     *
285     * @param string        $ip
286     * @param array<string> $valid_domains
287     * @param bool          $reverse_only
288     *
289     * @return bool
290     */
291    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
292    {
293        $host = gethostbyaddr($ip);
294
295        if ($host === false) {
296            return false;
297        }
298
299        foreach ($valid_domains as $domain) {
300            if (str_ends_with($host, $domain)) {
301                return $reverse_only || $ip === gethostbyname($host);
302            }
303        }
304
305        return false;
306    }
307
308    /**
309     * Perform a whois search for an ASN.
310     *
311     * @param string $asn - The autonomous system number to query
312     *
313     * @return array<RangeInterface>
314     */
315    private function fetchIpRangesForAsn(string $asn): array
316    {
317        return Registry::cache()->file()->remember('whois-asn-' . $asn, static function () use ($asn): array {
318            $mapper = static fn (AsnRouteInfo $route_info): ?RangeInterface => IPFactory::parseRangeString($route_info->route ?: $route_info->route6);
319
320            try {
321                $loader = new CurlLoader(self::WHOIS_TIMEOUT);
322                $whois  = new Whois($loader);
323                $info   = $whois->loadAsnInfo($asn);
324                $routes = $info->routes;
325                $ranges = array_map($mapper, $routes);
326
327                return array_filter($ranges);
328            } catch (Throwable) {
329                return [];
330            }
331        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
332    }
333
334    /**
335     * Fetch a list of IP addresses from a remote file.
336     *
337     * @param string $ua
338     * @param string $url
339     *
340     * @return array<string>
341     */
342    private function fetchIpRangesForUrl(string $ua, string $url): array
343    {
344        return Registry::cache()->file()->remember('url-ip-list-' . $ua, static function () use ($url): array {
345            try {
346                $client   = new Client();
347                $response = $client->get($url, ['timeout' => 5]);
348                $contents = $response->getBody()->getContents();
349
350                preg_match_all(self::REGEX_IPV4, $contents, $matches);
351
352                return $matches[0];
353            } catch (GuzzleException) {
354                return [];
355            }
356        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
357    }
358
359    /**
360     * @return ResponseInterface
361     */
362    private function response(): ResponseInterface
363    {
364        return response('Not acceptable', StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
365    }
366}
367