xref: /webtrees/app/Http/Middleware/BadBotBlocker.php (revision f44cff08401967b50f32ffd49743fd70fb5ec0b9)
1<?php
2
3/**
4 * webtrees: online genealogy
5 * Copyright (C) 2022 webtrees development team
6 * This program is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <https://www.gnu.org/licenses/>.
16 */
17
18declare(strict_types=1);
19
20namespace Fisharebest\Webtrees\Http\Middleware;
21
22use Fig\Http\Message\StatusCodeInterface;
23use Fisharebest\Webtrees\Registry;
24use Fisharebest\Webtrees\Validator;
25use GuzzleHttp\Client;
26use GuzzleHttp\Exception\GuzzleException;
27use Iodev\Whois\Loaders\CurlLoader;
28use Iodev\Whois\Modules\Asn\AsnRouteInfo;
29use Iodev\Whois\Whois;
30use IPLib\Address\AddressInterface;
31use IPLib\Factory as IPFactory;
32use IPLib\Range\RangeInterface;
33use Psr\Http\Message\ResponseInterface;
34use Psr\Http\Message\ServerRequestInterface;
35use Psr\Http\Server\MiddlewareInterface;
36use Psr\Http\Server\RequestHandlerInterface;
37use Throwable;
38
39use function array_filter;
40use function array_map;
41use function assert;
42use function gethostbyaddr;
43use function gethostbyname;
44use function preg_match_all;
45use function random_int;
46use function response;
47use function str_contains;
48use function str_ends_with;
49
/**
 * Middleware to block bad robots before they waste our valuable CPU cycles.
 *
 * A request is rejected when its user-agent names a known bad robot, or when it
 * claims to be a well-known crawler but the client IP address cannot be verified
 * (via DNS, a list of IP ranges, or an autonomous system) as belonging to that
 * crawler.  Sites can also block whole networks with the "block_asn" request attribute.
 */
class BadBotBlocker implements MiddlewareInterface
{
    private const REGEX_OCTET = '(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)';
    private const REGEX_IPV4  = '/\\b' . self::REGEX_OCTET . '(?:\\.' . self::REGEX_OCTET . '){3}\\b/';

    // Cache whois requests.  Try to avoid all caches expiring at the same time.
    private const WHOIS_TTL_MIN = 28 * 86400;
    private const WHOIS_TTL_MAX = 35 * 86400;
    private const WHOIS_TIMEOUT = 5;

    // Bad robots - SEO optimisers, advertisers, etc.  This list is shared with robots.txt.
    public const BAD_ROBOTS = [
        'admantx',
        'Adsbot',
        'AhrefsBot',
        'Amazonbot', // Until it understands crawl-delay and noindex / nofollow
        'AspiegelBot',
        'Barkrowler',
        'BLEXBot',
        'DataForSEO',
        'DotBot',
        'Grapeshot',
        'Honolulu-bot', // Aggressive crawler, no info available
        'ia_archiver',
        'linabot', // Aggressive crawler, no info available
        'Linguee',
        'MJ12bot',
        'netEstate NE',
        'panscient',
        'PetalBot',
        'proximic',
        'SemrushBot',
        'serpstatbot',
        'SEOkicks',
        'SiteKiosk',
        'Turnitin',
        'XoviBot',
        'ZoominfoBot',
    ];

    /**
     * Some search engines use reverse/forward DNS to verify the IP address.
     *
     * @see https://developer.amazon.com/support/amazonbot
     * @see https://support.google.com/webmasters/answer/80553?hl=en
     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
     * @see https://www.mojeek.com/bot.html
     * @see https://support.apple.com/en-gb/HT204683
     */
    private const ROBOT_REV_FWD_DNS = [
        'Amazonbot'   => ['.crawl.amazon.com'],
        'Applebot'    => ['.applebot.apple.com'],
        'bingbot'     => ['.search.msn.com'],
        'BingPreview' => ['.search.msn.com'],
        'Google'      => ['.google.com', '.googlebot.com'],
        'MojeekBot'   => ['.mojeek.com'],
        'Mail.RU_Bot' => ['.mail.ru'],
        'msnbot'      => ['.search.msn.com'],
        'Qwantify'    => ['.search.qwant.com'],
        'Sogou'       => ['.crawl.sogou.com'],
        'Yahoo'       => ['.crawl.yahoo.net'],
        'Yandex'      => ['.yandex.ru', '.yandex.net', '.yandex.com'],
    ];

    /**
     * Some search engines only use reverse DNS to verify the IP address.
     *
     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
     * @see https://napoveda.seznam.cz/en/full-text-search/seznambot-crawler
     * @see https://www.ionos.de/terms-gtc/faq-crawler
     */
    private const ROBOT_REV_ONLY_DNS = [
        'Baiduspider' => ['.baidu.com', '.baidu.jp'],
        'FreshBot'    => ['.seznam.cz'],
        'IonCrawl'    => ['.1und1.org'],
        'Neevabot'    => ['.neeva.com'],
    ];

    /**
     * Some search engines operate from designated IP addresses.
     *
     * @see https://www.apple.com/go/applebot
     * @see https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
     */
    private const ROBOT_IPS = [
        'AppleBot'    => [
            '17.0.0.0/8',
        ],
        'Ask Jeeves'  => [
            '65.214.45.143',
            '65.214.45.148',
            '66.235.124.192',
            '66.235.124.7',
            '66.235.124.101',
            '66.235.124.193',
            '66.235.124.73',
            '66.235.124.196',
            '66.235.124.74',
            '63.123.238.8',
            '202.143.148.61',
        ],
        'DuckDuckBot' => [
            '23.21.227.69',
            '50.16.241.113',
            '50.16.241.114',
            '50.16.241.117',
            '50.16.247.234',
            '52.204.97.54',
            '52.5.190.19',
            '54.197.234.188',
            '54.208.100.253',
            '54.208.102.37',
            '107.21.1.8',
        ],
    ];

    /**
     * Some search engines publish their designated IP addresses in a remote file.
     *
     * @see https://bot.seekport.com/
     */
    private const ROBOT_IP_FILES = [
        'SeekportBot' => 'https://bot.seekport.com/seekportbot_ips.txt',
    ];

    /**
     * Some search engines operate from within a designated autonomous system.
     * A robot may be listed with several ASNs; an address in any one of them is accepted.
     *
     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
     * @see https://www.facebook.com/peering/
     */
    private const ROBOT_ASNS = [
        'facebook' => ['AS32934', 'AS63293'],
        'twitter'  => ['AS13414'],
    ];

    /**
     * Reject requests from bad robots, from crawlers that fail verification and from
     * blocked networks.  All other requests are passed on to the next handler.
     *
     * All user-agent tests are case-sensitive substring matches.
     *
     * @param ServerRequestInterface  $request
     * @param RequestHandlerInterface $handler
     *
     * @return ResponseInterface
     */
    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
    {
        $ua      = Validator::serverParams($request)->string('HTTP_USER_AGENT', '');
        $ip      = Validator::attributes($request)->string('client-ip');
        $address = IPFactory::parseAddressString($ip);
        assert($address instanceof AddressInterface);

        // Robots that are never welcome, wherever they come from.
        foreach (self::BAD_ROBOTS as $robot) {
            if (str_contains($ua, $robot)) {
                return $this->response();
            }
        }

        // Robots that must pass a reverse DNS lookup followed by a forward DNS lookup.
        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, false)) {
                return $this->response();
            }
        }

        // Robots that only need to pass a reverse DNS lookup.
        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, true)) {
                return $this->response();
            }
        }

        // Robots that must come from a hard-coded list of IP addresses/ranges.
        foreach (self::ROBOT_IPS as $robot => $valid_ip_ranges) {
            if (str_contains($ua, $robot) && !$this->addressInAnyRange($address, $valid_ip_ranges)) {
                return $this->response();
            }
        }

        // Robots that must come from a list of IP addresses published at a URL.
        // The list is only fetched when the user-agent actually claims to be that robot.
        foreach (self::ROBOT_IP_FILES as $robot => $url) {
            if (str_contains($ua, $robot)) {
                $valid_ip_ranges = $this->fetchIpRangesForUrl($robot, $url);

                if (!$this->addressInAnyRange($address, $valid_ip_ranges)) {
                    return $this->response();
                }
            }
        }

        // Robots that must come from one of their operator's autonomous systems.
        // The address only needs to belong to ONE of the listed ASNs.  (Previously a
        // match in one ASN merely moved on to the next ASN, so robots with more than
        // one ASN were always rejected.)
        foreach (self::ROBOT_ASNS as $robot => $asns) {
            if (str_contains($ua, $robot) && !$this->addressInAnyAsn($address, $asns)) {
                return $this->response();
            }
        }

        // Allow sites to block access from entire networks.
        $block_asn = Validator::attributes($request)->string('block_asn', '');
        preg_match_all('/(AS\d+)/', $block_asn, $matches);

        if ($this->addressInAnyAsn($address, $matches[1])) {
            return $this->response();
        }

        return $handler->handle($request);
    }

    /**
     * Is an address covered by at least one of a list of IP addresses/ranges?
     *
     * Strings that cannot be parsed as a range are ignored.
     *
     * @param AddressInterface $address
     * @param array<string>    $ip_ranges - Addresses or ranges, e.g. "17.0.0.0/8"
     *
     * @return bool
     */
    private function addressInAnyRange(AddressInterface $address, array $ip_ranges): bool
    {
        foreach ($ip_ranges as $ip_range) {
            $range = IPFactory::parseRangeString($ip_range);

            if ($range instanceof RangeInterface && $range->contains($address)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Is an address routed by at least one of a list of autonomous systems?
     *
     * ASNs are looked up one at a time, so later lookups are skipped once a match is found.
     *
     * @param AddressInterface $address
     * @param array<string>    $asns - Autonomous system numbers, e.g. "AS32934"
     *
     * @return bool
     */
    private function addressInAnyAsn(AddressInterface $address, array $asns): bool
    {
        foreach ($asns as $asn) {
            foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                if ($range->contains($address)) {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * Check that an IP address belongs to a robot operator using a forward/reverse DNS lookup.
     *
     * The reverse lookup must give a hostname ending in one of the valid domains.  Unless
     * $reverse_only is set, a forward lookup of that hostname must then give the same IP address.
     *
     * NOTE(review): the PHP manual documents gethostbyname() as returning an IPv4 address,
     * so IPv6 clients would seem unable to pass the forward check - confirm.
     *
     * @param string        $ip
     * @param array<string> $valid_domains
     * @param bool          $reverse_only
     *
     * @return bool
     */
    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
    {
        $host = gethostbyaddr($ip);

        if ($host === false) {
            return false;
        }

        foreach ($valid_domains as $domain) {
            if (str_ends_with($host, $domain)) {
                return $reverse_only || $ip === gethostbyname($host);
            }
        }

        return false;
    }

    /**
     * Perform a whois search for an ASN.
     *
     * The result is stored in the file cache under a key derived from the ASN, with a
     * randomised lifetime so that cached entries do not all expire together.  Any error
     * during the lookup gives an empty list.
     *
     * NOTE(review): an empty list from a failed lookup appears to be cached for the full
     * lifetime as well - confirm against the cache implementation.
     *
     * @param string $asn - The autonomous system number to query
     *
     * @return array<RangeInterface>
     */
    private function fetchIpRangesForAsn(string $asn): array
    {
        return Registry::cache()->file()->remember('whois-asn-' . $asn, static function () use ($asn): array {
            // Each route has either an IPv4 route or an IPv6 route.
            $mapper = static fn (AsnRouteInfo $route_info): ?RangeInterface => IPFactory::parseRangeString($route_info->route ?: $route_info->route6);

            try {
                $loader = new CurlLoader(self::WHOIS_TIMEOUT);
                $whois  = new Whois($loader);
                $info   = $whois->loadAsnInfo($asn);
                $routes = $info->routes;
                $ranges = array_map($mapper, $routes);

                // Discard any routes that could not be parsed.
                return array_filter($ranges);
            } catch (Throwable) {
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * Fetch a list of IP addresses from a remote file.
     *
     * Every IPv4 address found in the downloaded text is returned.  The result is stored
     * in the file cache under a key derived from $ua, with the same randomised lifetime
     * as whois lookups.  A failed download gives an empty list.
     *
     * @param string $ua  - The robot name, used to build the cache key
     * @param string $url - Where to download the list of IP addresses
     *
     * @return array<string>
     */
    private function fetchIpRangesForUrl(string $ua, string $url): array
    {
        return Registry::cache()->file()->remember('url-ip-list-' . $ua, static function () use ($url): array {
            try {
                $client   = new Client();
                $response = $client->get($url, ['timeout' => 5]);
                $contents = $response->getBody()->getContents();

                preg_match_all(self::REGEX_IPV4, $contents, $matches);

                return $matches[0];
            } catch (GuzzleException) {
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * The response sent to blocked robots.
     *
     * @return ResponseInterface
     */
    private function response(): ResponseInterface
    {
        return response('Not acceptable', StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
    }
}
366