xref: /webtrees/app/Http/Middleware/BadBotBlocker.php (revision e5766395c1a71e715ebaadcf2d63d036d60fb649)
1089dadacSGreg Roach<?php
2089dadacSGreg Roach
3089dadacSGreg Roach/**
4089dadacSGreg Roach * webtrees: online genealogy
5d11be702SGreg Roach * Copyright (C) 2023 webtrees development team
6089dadacSGreg Roach * This program is free software: you can redistribute it and/or modify
7089dadacSGreg Roach * it under the terms of the GNU General Public License as published by
8089dadacSGreg Roach * the Free Software Foundation, either version 3 of the License, or
9089dadacSGreg Roach * (at your option) any later version.
10089dadacSGreg Roach * This program is distributed in the hope that it will be useful,
11089dadacSGreg Roach * but WITHOUT ANY WARRANTY; without even the implied warranty of
12089dadacSGreg Roach * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13089dadacSGreg Roach * GNU General Public License for more details.
14089dadacSGreg Roach * You should have received a copy of the GNU General Public License
1589f7189bSGreg Roach * along with this program. If not, see <https://www.gnu.org/licenses/>.
16089dadacSGreg Roach */
17089dadacSGreg Roach
18089dadacSGreg Roachdeclare(strict_types=1);
19089dadacSGreg Roach
20089dadacSGreg Roachnamespace Fisharebest\Webtrees\Http\Middleware;
21089dadacSGreg Roach
use Fig\Http\Message\StatusCodeInterface;
use Fisharebest\Webtrees\Registry;
use Fisharebest\Webtrees\Validator;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\GuzzleException;
use Iodev\Whois\Loaders\CurlLoader;
use Iodev\Whois\Modules\Asn\AsnRouteInfo;
use Iodev\Whois\Whois;
use IPLib\Address\AddressInterface;
use IPLib\Factory as IPFactory;
use IPLib\Range\RangeInterface;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\ServerRequestInterface;
use Psr\Http\Server\MiddlewareInterface;
use Psr\Http\Server\RequestHandlerInterface;
use Throwable;

use function array_filter;
use function array_map;
use function assert;
use function dns_get_record;
use function gethostbyaddr;
use function gethostbyname;
use function inet_pton;
use function preg_match_all;
use function random_int;
use function response;
use function str_contains;
use function str_ends_with;

use const DNS_AAAA;
49089dadacSGreg Roach
/**
 * Middleware to block bad robots before they waste our valuable CPU cycles.
 *
 * A request is rejected (HTTP 406) when its user-agent either names a known
 * bad robot, or claims to be a well-known crawler but the client IP address
 * cannot be verified as belonging to that crawler's operator.
 */
class BadBotBlocker implements MiddlewareInterface
{
    private const REGEX_OCTET = '(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)';
    private const REGEX_IPV4  = '/\\b' . self::REGEX_OCTET . '(?:\\.' . self::REGEX_OCTET . '){3}\\b/';

    // Cache whois requests.  Try to avoid all caches expiring at the same time.
    private const WHOIS_TTL_MIN = 28 * 86400;
    private const WHOIS_TTL_MAX = 35 * 86400;
    private const WHOIS_TIMEOUT = 5;

    // Bad robots - SEO optimisers, advertisers, etc.  This list is shared with robots.txt.
    public const BAD_ROBOTS = [
        'admantx',
        'Adsbot',
        'AhrefsBot',
        'Amazonbot', // Until it understands crawl-delay and noindex / nofollow
        'AspiegelBot',
        'Awario', // Brand management
        'Barkrowler',
        'BLEXBot',
        'DataForSEO',
        'DataForSeoBot', // https://dataforseo.com/dataforseo-bot
        'DotBot',
        'Grapeshot',
        'Honolulu-bot', // Aggressive crawler, no info available
        'ia_archiver',
        'linabot', // Aggressive crawler, no info available
        'Linguee',
        'MegaIndex.ru',
        'MJ12bot',
        'netEstate NE',
        'panscient',
        'PetalBot',
        'proximic',
        'SeekportBot', // Pretends to be a search engine - but isn't
        'SemrushBot',
        'serpstatbot',
        'SEOkicks',
        'SiteKiosk',
        'Turnitin',
        'wp_is_mobile', // Nothing to do with wordpress
        'XoviBot',
        'ZoominfoBot',
    ];

    /**
     * Some search engines use reverse/forward DNS to verify the IP address.
     *
     * @see https://developer.amazon.com/support/amazonbot
     * @see https://support.google.com/webmasters/answer/80553?hl=en
     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
     * @see https://www.mojeek.com/bot.html
     * @see https://support.apple.com/en-gb/HT204683
     */
    private const ROBOT_REV_FWD_DNS = [
        'Amazonbot'        => ['.crawl.amazon.com'],
        'Applebot'         => ['.applebot.apple.com'],
        'BingPreview'      => ['.search.msn.com'],
        'Google'           => ['.google.com', '.googlebot.com'],
        'Mail.RU_Bot'      => ['.mail.ru'],
        'MicrosoftPreview' => ['.search.msn.com'],
        'MojeekBot'        => ['.mojeek.com'],
        'Qwantify'         => ['.search.qwant.com'],
        'Sogou'            => ['.crawl.sogou.com'],
        'Yahoo'            => ['.crawl.yahoo.net'],
        'Yandex'           => ['.yandex.ru', '.yandex.net', '.yandex.com'],
        'bingbot'          => ['.search.msn.com'],
        'msnbot'           => ['.search.msn.com'],
    ];

    /**
     * Some search engines only use reverse DNS to verify the IP address.
     *
     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
     * @see https://napoveda.seznam.cz/en/full-text-search/seznambot-crawler
     * @see https://www.ionos.de/terms-gtc/faq-crawler
     */
    private const ROBOT_REV_ONLY_DNS = [
        'Baiduspider' => ['.baidu.com', '.baidu.jp'],
        'FreshBot'    => ['.seznam.cz'],
        'IonCrawl'    => ['.1und1.org'],
        'Neevabot'    => ['.neeva.com'],
        'SeznamBot'   => ['.seznam.cz'],
    ];

    /**
     * Some search engines operate from designated IP addresses.
     *
     * @see https://www.apple.com/go/applebot
     * @see https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
     */
    private const ROBOT_IPS = [
        'AppleBot'    => [
            '17.0.0.0/8',
        ],
        'Ask Jeeves'  => [
            '65.214.45.143',
            '65.214.45.148',
            '66.235.124.192',
            '66.235.124.7',
            '66.235.124.101',
            '66.235.124.193',
            '66.235.124.73',
            '66.235.124.196',
            '66.235.124.74',
            '63.123.238.8',
            '202.143.148.61',
        ],
        'DuckDuckBot' => [
            '23.21.227.69',
            '50.16.241.113',
            '50.16.241.114',
            '50.16.241.117',
            '50.16.247.234',
            '52.204.97.54',
            '52.5.190.19',
            '54.197.234.188',
            '54.208.100.253',
            '54.208.102.37',
            '107.21.1.8',
        ],
    ];

    /**
     * Some search engines publish their designated IP addresses in a remote file.
     *
     * NOTE(review): SeekportBot is also listed in BAD_ROBOTS, which process()
     * checks first, so this entry is currently never reached.
     *
     * @see https://bot.seekport.com/
     */
    private const ROBOT_IP_FILES = [
        'SeekportBot' => 'https://bot.seekport.com/seekportbot_ips.txt',
    ];

    /**
     * Some search engines operate from within a designated autonomous system.
     *
     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
     * @see https://www.facebook.com/peering/
     */
    private const ROBOT_ASNS = [
        'facebook' => ['AS32934', 'AS63293'],
        'twitter'  => ['AS13414'],
    ];

    /**
     * Reject requests from bad robots, from impostors of well-known robots,
     * and from networks that the site has chosen to block.
     *
     * The user-agent tests are case-sensitive substring matches.
     *
     * @param ServerRequestInterface  $request
     * @param RequestHandlerInterface $handler
     *
     * @return ResponseInterface
     */
    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
    {
        $ua      = Validator::serverParams($request)->string('HTTP_USER_AGENT', '');
        $ip      = Validator::attributes($request)->string('client-ip');
        $address = IPFactory::parseAddressString($ip);
        assert($address instanceof AddressInterface);

        foreach (self::BAD_ROBOTS as $robot) {
            if (str_contains($ua, $robot)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, false)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, true)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_IPS as $robot => $valid_ip_ranges) {
            if (str_contains($ua, $robot) && !$this->addressInRangeStrings($address, $valid_ip_ranges)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_IP_FILES as $robot => $url) {
            // The remote list is only fetched when the user-agent actually claims to be this robot.
            if (str_contains($ua, $robot) && !$this->addressInRangeStrings($address, $this->fetchIpRangesForUrl($robot, $url))) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_ASNS as $robot => $asns) {
            // A robot may operate from several autonomous systems: belonging to ANY one of them is sufficient.
            if (str_contains($ua, $robot) && !$this->addressInAnyAsn($address, $asns)) {
                return $this->response();
            }
        }

        // Allow sites to block access from entire networks.
        $block_asn = Validator::attributes($request)->string('block_asn', '');
        preg_match_all('/(AS\d+)/', $block_asn, $matches);

        if ($this->addressInAnyAsn($address, $matches[1])) {
            return $this->response();
        }

        return $handler->handle($request);
    }

    /**
     * Does an address fall within any of the given IP addresses / ranges?
     * Strings that cannot be parsed as a range are ignored.
     *
     * @param AddressInterface $address
     * @param array<string>    $ip_ranges
     *
     * @return bool
     */
    private function addressInRangeStrings(AddressInterface $address, array $ip_ranges): bool
    {
        foreach ($ip_ranges as $ip_range) {
            $range = IPFactory::parseRangeString($ip_range);

            if ($range instanceof RangeInterface && $range->contains($address)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Does an address fall within a route announced by any of the given autonomous systems?
     * Lookups stop at the first match, so later ASNs are only queried when needed.
     *
     * @param AddressInterface $address
     * @param array<string>    $asns    e.g. ['AS32934', 'AS63293']
     *
     * @return bool
     */
    private function addressInAnyAsn(AddressInterface $address, array $asns): bool
    {
        foreach ($asns as $asn) {
            foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                if ($range->contains($address)) {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * Check that an IP address belongs to a robot operator using a forward/reverse DNS lookup.
     *
     * @param string        $ip
     * @param array<string> $valid_domains Domain suffixes (with leading dot) that the reverse lookup must end with
     * @param bool          $reverse_only  When true, skip the forward confirmation of the host name
     *
     * @return bool
     */
    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
    {
        $host = gethostbyaddr($ip);

        if ($host === false) {
            return false;
        }

        foreach ($valid_domains as $domain) {
            if (str_ends_with($host, $domain)) {
                return $reverse_only || $this->hostResolvesTo($host, $ip);
            }
        }

        return false;
    }

    /**
     * Forward-confirm a reverse DNS result: does the host name resolve back to the IP address?
     *
     * @param string $host
     * @param string $ip
     *
     * @return bool
     */
    private function hostResolvesTo(string $host, string $ip): bool
    {
        if (!str_contains($ip, ':')) {
            // IPv4
            return $ip === gethostbyname($host);
        }

        // IPv6 - gethostbyname() only returns IPv4 addresses, so query the AAAA records instead.
        $packed = inet_pton($ip);

        if ($packed === false) {
            return false;
        }

        try {
            $records = dns_get_record($host, DNS_AAAA);
        } catch (Throwable) {
            // A failed lookup may raise a warning, which an error handler could convert to an exception.
            return false;
        }

        if ($records === false) {
            return false;
        }

        foreach ($records as $record) {
            // Compare the binary forms, as the same IPv6 address can be written in several ways.
            if (isset($record['ipv6']) && inet_pton($record['ipv6']) === $packed) {
                return true;
            }
        }

        return false;
    }

    /**
     * Perform a whois search for an ASN.
     *
     * The result is cached for a random period between WHOIS_TTL_MIN and WHOIS_TTL_MAX.
     * A failed lookup yields an empty list.
     *
     * @param string $asn The autonomous system number to query
     *
     * @return array<RangeInterface>
     */
    private function fetchIpRangesForAsn(string $asn): array
    {
        return Registry::cache()->file()->remember('whois-asn-' . $asn, static function () use ($asn): array {
            // Each route has either an IPv4 route or an IPv6 route.
            $mapper = static fn (AsnRouteInfo $route_info): ?RangeInterface => IPFactory::parseRangeString($route_info->route ?: $route_info->route6);

            try {
                $loader = new CurlLoader(self::WHOIS_TIMEOUT);
                $whois  = new Whois($loader);
                $info   = $whois->loadAsnInfo($asn);
                $routes = $info->routes;
                $ranges = array_map($mapper, $routes);

                // Discard routes that could not be parsed.
                return array_filter($ranges);
            } catch (Throwable) {
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * Fetch a list of IP addresses from a remote file.
     *
     * Only IPv4 addresses are extracted from the file.  The result is cached for a
     * random period between WHOIS_TTL_MIN and WHOIS_TTL_MAX.  A failed download
     * yields an empty list.
     *
     * @param string $ua  The robot name - used only to build the cache key
     * @param string $url
     *
     * @return array<string>
     */
    private function fetchIpRangesForUrl(string $ua, string $url): array
    {
        return Registry::cache()->file()->remember('url-ip-list-' . $ua, static function () use ($url): array {
            try {
                $client   = new Client();
                $response = $client->get($url, ['timeout' => 5]);
                $contents = $response->getBody()->getContents();

                preg_match_all(self::REGEX_IPV4, $contents, $matches);

                return $matches[0];
            } catch (GuzzleException) {
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * The response sent to blocked robots.
     *
     * @return ResponseInterface
     */
    private function response(): ResponseInterface
    {
        return response('Not acceptable', StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
    }
}
373