xref: /webtrees/app/Http/Middleware/BadBotBlocker.php (revision 1dc9522fdcf3e2d9a2344f57cec6274a46efc3d5)
1089dadacSGreg Roach<?php
2089dadacSGreg Roach
3089dadacSGreg Roach/**
4089dadacSGreg Roach * webtrees: online genealogy
5d11be702SGreg Roach * Copyright (C) 2023 webtrees development team
6089dadacSGreg Roach * This program is free software: you can redistribute it and/or modify
7089dadacSGreg Roach * it under the terms of the GNU General Public License as published by
8089dadacSGreg Roach * the Free Software Foundation, either version 3 of the License, or
9089dadacSGreg Roach * (at your option) any later version.
10089dadacSGreg Roach * This program is distributed in the hope that it will be useful,
11089dadacSGreg Roach * but WITHOUT ANY WARRANTY; without even the implied warranty of
12089dadacSGreg Roach * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13089dadacSGreg Roach * GNU General Public License for more details.
14089dadacSGreg Roach * You should have received a copy of the GNU General Public License
1589f7189bSGreg Roach * along with this program. If not, see <https://www.gnu.org/licenses/>.
16089dadacSGreg Roach */
17089dadacSGreg Roach
18089dadacSGreg Roachdeclare(strict_types=1);
19089dadacSGreg Roach
20089dadacSGreg Roachnamespace Fisharebest\Webtrees\Http\Middleware;
21089dadacSGreg Roach
22089dadacSGreg Roachuse Fig\Http\Message\StatusCodeInterface;
236b9cb339SGreg Roachuse Fisharebest\Webtrees\Registry;
24b55cbc6bSGreg Roachuse Fisharebest\Webtrees\Validator;
25d2d58874SGreg Roachuse GuzzleHttp\Client;
26d2d58874SGreg Roachuse GuzzleHttp\Exception\GuzzleException;
27089dadacSGreg Roachuse Iodev\Whois\Loaders\CurlLoader;
28089dadacSGreg Roachuse Iodev\Whois\Modules\Asn\AsnRouteInfo;
29089dadacSGreg Roachuse Iodev\Whois\Whois;
30089dadacSGreg Roachuse IPLib\Address\AddressInterface;
3169675509SGreg Roachuse IPLib\Factory as IPFactory;
32089dadacSGreg Roachuse IPLib\Range\RangeInterface;
33089dadacSGreg Roachuse Psr\Http\Message\ResponseInterface;
34089dadacSGreg Roachuse Psr\Http\Message\ServerRequestInterface;
35089dadacSGreg Roachuse Psr\Http\Server\MiddlewareInterface;
36089dadacSGreg Roachuse Psr\Http\Server\RequestHandlerInterface;
37089dadacSGreg Roachuse Throwable;
38089dadacSGreg Roach
39b7e8616fSGreg Roachuse function array_filter;
40089dadacSGreg Roachuse function array_map;
41089dadacSGreg Roachuse function assert;
42089dadacSGreg Roachuse function gethostbyaddr;
43089dadacSGreg Roachuse function gethostbyname;
44b7e8616fSGreg Roachuse function preg_match_all;
45b7e8616fSGreg Roachuse function random_int;
46089dadacSGreg Roachuse function response;
47dec352c1SGreg Roachuse function str_contains;
48dec352c1SGreg Roachuse function str_ends_with;
49089dadacSGreg Roach
50089dadacSGreg Roach/**
51089dadacSGreg Roach * Middleware to block bad robots before they waste our valuable CPU cycles.
52089dadacSGreg Roach */
53089dadacSGreg Roachclass BadBotBlocker implements MiddlewareInterface
54089dadacSGreg Roach{
55d2d58874SGreg Roach    private const REGEX_OCTET = '(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)';
56d2d58874SGreg Roach    private const REGEX_IPV4  = '/\\b' . self::REGEX_OCTET . '(?:\\.' . self::REGEX_OCTET . '){3}\\b/';
57d2d58874SGreg Roach
58089dadacSGreg Roach    // Cache whois requests.  Try to avoid all caches expiring at the same time.
59089dadacSGreg Roach    private const WHOIS_TTL_MIN = 28 * 86400;
60089dadacSGreg Roach    private const WHOIS_TTL_MAX = 35 * 86400;
61089dadacSGreg Roach    private const WHOIS_TIMEOUT = 5;
62089dadacSGreg Roach
63ffa287a1SGreg Roach    // Bad robots - SEO optimisers, advertisers, etc.  This list is shared with robots.txt.
64ffa287a1SGreg Roach    public const BAD_ROBOTS = [
65089dadacSGreg Roach        'admantx',
66be5f8e6aSGreg Roach        'Adsbot',
67089dadacSGreg Roach        'AhrefsBot',
687fa18cfdSGreg Roach        'Amazonbot', // Until it understands crawl-delay and noindex / nofollow
69*1dc9522fSGreg Roach        'AntBot', // Aggressive crawler
70227c6666SGreg Roach        'AspiegelBot',
710036e960SGreg Roach        'Awario', // Brand management
7261e93e26SGreg Roach        'Barkrowler',
73a10ff261SGreg Roach        'BLEXBot',
74a6224258SGreg Roach        'Bytespider',
750d515f58SGreg Roach        'CCBot', // Used to train a number of LLMs
760d515f58SGreg Roach        'ChatGPT-User', // Used by ChatGPT during operation
773a3594e9SGreg Roach        'DataForSeoBot', // https://dataforseo.com/dataforseo-bot
78089dadacSGreg Roach        'DotBot',
790d515f58SGreg Roach        'FacebookBot', // Collects training data for Facebook's LLM translator.
80*1dc9522fSGreg Roach        'fidget-spinner-bot', // Agressive crawler
810d515f58SGreg Roach        'Google-Extended', // Collects training data for Google Bard
82970c4733SGreg Roach        'GPTBot', // Collects training data for ChatGPT
83089dadacSGreg Roach        'Grapeshot',
84f3d48b69SGreg Roach        'Honolulu-bot', // Aggressive crawer, no info available
85089dadacSGreg Roach        'ia_archiver',
86c8614595SGreg Roach        'linabot', // Aggressive crawer, no info available
8703bad539SGreg Roach        'Linguee',
8810d27708SGreg Roach        'MegaIndex.ru',
89089dadacSGreg Roach        'MJ12bot',
90d5bb02daSGreg Roach        'netEstate NE',
910d515f58SGreg Roach        'Omgilibot', // Collects training data for LLMs
92227c6666SGreg Roach        'panscient',
93be5f8e6aSGreg Roach        'PetalBot',
94089dadacSGreg Roach        'proximic',
9510d27708SGreg Roach        'SeekportBot', // Pretends to be a search engine - but isn't
96089dadacSGreg Roach        'SemrushBot',
97f4b15485SGreg Roach        'serpstatbot',
98d5bb02daSGreg Roach        'SEOkicks',
99d5bb02daSGreg Roach        'SiteKiosk',
100*1dc9522fSGreg Roach        'test-bot', // Agressive crawler
10145d54b04SGreg Roach        'TinyTestBot',
102be5f8e6aSGreg Roach        'Turnitin',
1037d9d7ecaSGreg Roach        'wp_is_mobile', // Nothing to do with wordpress
104089dadacSGreg Roach        'XoviBot',
10552567a36SGreg Roach        'YisouSpider',
106a10ff261SGreg Roach        'ZoominfoBot',
107089dadacSGreg Roach    ];
108089dadacSGreg Roach
109089dadacSGreg Roach    /**
1105c20d904SGreg Roach     * Some search engines use reverse/forward DNS to verify the IP address.
111089dadacSGreg Roach     *
112891c4176SGreg Roach     * @see https://developer.amazon.com/support/amazonbot
113089dadacSGreg Roach     * @see https://support.google.com/webmasters/answer/80553?hl=en
114089dadacSGreg Roach     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
115089dadacSGreg Roach     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
116089dadacSGreg Roach     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
11777d0194eSGreg Roach     * @see https://www.mojeek.com/bot.html
11877d0194eSGreg Roach     * @see https://support.apple.com/en-gb/HT204683
119089dadacSGreg Roach     */
1205c20d904SGreg Roach    private const ROBOT_REV_FWD_DNS = [
121891c4176SGreg Roach        'Amazonbot'        => ['.crawl.amazon.com'],
12277d0194eSGreg Roach        'Applebot'         => ['.applebot.apple.com'],
123089dadacSGreg Roach        'BingPreview'      => ['.search.msn.com'],
124089dadacSGreg Roach        'Google'           => ['.google.com', '.googlebot.com'],
125d5bb02daSGreg Roach        'Mail.RU_Bot'      => ['.mail.ru'],
126e47c3c91SGreg Roach        'MicrosoftPreview' => ['.search.msn.com'],
127e47c3c91SGreg Roach        'MojeekBot'        => ['.mojeek.com'],
128089dadacSGreg Roach        'Qwantify'         => ['.search.qwant.com'],
129089dadacSGreg Roach        'Sogou'            => ['.crawl.sogou.com'],
130089dadacSGreg Roach        'Yahoo'            => ['.crawl.yahoo.net'],
131089dadacSGreg Roach        'Yandex'           => ['.yandex.ru', '.yandex.net', '.yandex.com'],
132e47c3c91SGreg Roach        'bingbot'          => ['.search.msn.com'],
133e47c3c91SGreg Roach        'msnbot'           => ['.search.msn.com'],
134089dadacSGreg Roach    ];
135089dadacSGreg Roach
136089dadacSGreg Roach    /**
1375c20d904SGreg Roach     * Some search engines only use reverse DNS to verify the IP address.
1385c20d904SGreg Roach     *
1395c20d904SGreg Roach     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
1401ed9b76dSGreg Roach     * @see https://napoveda.seznam.cz/en/full-text-search/seznambot-crawler
141a9d55ce6SGreg Roach     * @see https://www.ionos.de/terms-gtc/faq-crawler
1425c20d904SGreg Roach     */
1435c20d904SGreg Roach    private const ROBOT_REV_ONLY_DNS = [
1446a8ee1d2SGreg Roach        'Baiduspider' => ['.baidu.com', '.baidu.jp'],
1451ed9b76dSGreg Roach        'FreshBot'    => ['.seznam.cz'],
146a9d55ce6SGreg Roach        'IonCrawl'    => ['.1und1.org'],
147d5bb02daSGreg Roach        'Neevabot'    => ['.neeva.com'],
1488e1afc64SGreg Roach        'SeznamBot'   => ['.seznam.cz'],
1495c20d904SGreg Roach    ];
1505c20d904SGreg Roach
1515c20d904SGreg Roach    /**
152089dadacSGreg Roach     * Some search engines operate from designated IP addresses.
153089dadacSGreg Roach     *
154ad3143ccSGreg Roach     * @see https://www.apple.com/go/applebot
155089dadacSGreg Roach     * @see https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
156089dadacSGreg Roach     */
157089dadacSGreg Roach    private const ROBOT_IPS = [
158813eb6c8SGreg Roach        'AppleBot'    => [
159813eb6c8SGreg Roach            '17.0.0.0/8',
160813eb6c8SGreg Roach        ],
161089dadacSGreg Roach        'Ask Jeeves'  => [
162089dadacSGreg Roach            '65.214.45.143',
163089dadacSGreg Roach            '65.214.45.148',
164089dadacSGreg Roach            '66.235.124.192',
165089dadacSGreg Roach            '66.235.124.7',
166089dadacSGreg Roach            '66.235.124.101',
167089dadacSGreg Roach            '66.235.124.193',
168089dadacSGreg Roach            '66.235.124.73',
169089dadacSGreg Roach            '66.235.124.196',
170089dadacSGreg Roach            '66.235.124.74',
171089dadacSGreg Roach            '63.123.238.8',
172089dadacSGreg Roach            '202.143.148.61',
173089dadacSGreg Roach        ],
174089dadacSGreg Roach        'DuckDuckBot' => [
175089dadacSGreg Roach            '23.21.227.69',
176089dadacSGreg Roach            '50.16.241.113',
177089dadacSGreg Roach            '50.16.241.114',
178089dadacSGreg Roach            '50.16.241.117',
179089dadacSGreg Roach            '50.16.247.234',
180089dadacSGreg Roach            '52.204.97.54',
181089dadacSGreg Roach            '52.5.190.19',
182089dadacSGreg Roach            '54.197.234.188',
183089dadacSGreg Roach            '54.208.100.253',
184089dadacSGreg Roach            '54.208.102.37',
185089dadacSGreg Roach            '107.21.1.8',
186089dadacSGreg Roach        ],
187089dadacSGreg Roach    ];
188089dadacSGreg Roach
189089dadacSGreg Roach    /**
190d2d58874SGreg Roach     * Some search engines operate from designated IP addresses.
191d2d58874SGreg Roach     *
192d2d58874SGreg Roach     * @see https://bot.seekport.com/
193d2d58874SGreg Roach     */
194d2d58874SGreg Roach    private const ROBOT_IP_FILES = [
195d2d58874SGreg Roach        'SeekportBot' => 'https://bot.seekport.com/seekportbot_ips.txt',
196d2d58874SGreg Roach    ];
197d2d58874SGreg Roach
198d2d58874SGreg Roach    /**
199089dadacSGreg Roach     * Some search engines operate from within a designated autonomous system.
200089dadacSGreg Roach     *
201089dadacSGreg Roach     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
202cc7171a0SGreg Roach     * @see https://www.facebook.com/peering/
203089dadacSGreg Roach     */
204cc7171a0SGreg Roach    private const ROBOT_ASNS = [
205cc7171a0SGreg Roach        'facebook' => ['AS32934', 'AS63293'],
206cc7171a0SGreg Roach        'twitter'  => ['AS13414'],
207089dadacSGreg Roach    ];
208089dadacSGreg Roach
209089dadacSGreg Roach    /**
210089dadacSGreg Roach     * @param ServerRequestInterface  $request
211089dadacSGreg Roach     * @param RequestHandlerInterface $handler
212089dadacSGreg Roach     *
213089dadacSGreg Roach     * @return ResponseInterface
214089dadacSGreg Roach     */
215089dadacSGreg Roach    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
216089dadacSGreg Roach    {
217b55cbc6bSGreg Roach        $ua      = Validator::serverParams($request)->string('HTTP_USER_AGENT', '');
218b55cbc6bSGreg Roach        $ip      = Validator::attributes($request)->string('client-ip');
2194a8d2484SGreg Roach        $address = IPFactory::parseAddressString($ip);
220089dadacSGreg Roach        assert($address instanceof AddressInterface);
221089dadacSGreg Roach
222dec352c1SGreg Roach        foreach (self::BAD_ROBOTS as $robot) {
223dec352c1SGreg Roach            if (str_contains($ua, $robot)) {
224089dadacSGreg Roach                return $this->response();
225089dadacSGreg Roach            }
226dec352c1SGreg Roach        }
227089dadacSGreg Roach
2285c20d904SGreg Roach        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
229dec352c1SGreg Roach            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, false)) {
2305c20d904SGreg Roach                return $this->response();
2315c20d904SGreg Roach            }
2325c20d904SGreg Roach        }
2335c20d904SGreg Roach
2345c20d904SGreg Roach        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
235dec352c1SGreg Roach            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, true)) {
236089dadacSGreg Roach                return $this->response();
237089dadacSGreg Roach            }
238089dadacSGreg Roach        }
239089dadacSGreg Roach
240d2d58874SGreg Roach        foreach (self::ROBOT_IPS as $robot => $valid_ip_ranges) {
241dec352c1SGreg Roach            if (str_contains($ua, $robot)) {
242d2d58874SGreg Roach                foreach ($valid_ip_ranges as $ip_range) {
243d2d58874SGreg Roach                    $range = IPFactory::parseRangeString($ip_range);
244d2d58874SGreg Roach
245d2d58874SGreg Roach                    if ($range instanceof RangeInterface && $range->contains($address)) {
246d2d58874SGreg Roach                        continue 2;
247d2d58874SGreg Roach                    }
248d2d58874SGreg Roach                }
249d2d58874SGreg Roach
250d2d58874SGreg Roach                return $this->response();
251d2d58874SGreg Roach            }
252d2d58874SGreg Roach        }
253d2d58874SGreg Roach
254d2d58874SGreg Roach        foreach (self::ROBOT_IP_FILES as $robot => $url) {
255d2d58874SGreg Roach            if (str_contains($ua, $robot)) {
256d2d58874SGreg Roach                $valid_ip_ranges = $this->fetchIpRangesForUrl($robot, $url);
257d2d58874SGreg Roach
258d2d58874SGreg Roach                foreach ($valid_ip_ranges as $ip_range) {
259d2d58874SGreg Roach                    $range = IPFactory::parseRangeString($ip_range);
260813eb6c8SGreg Roach
261813eb6c8SGreg Roach                    if ($range instanceof RangeInterface && $range->contains($address)) {
262813eb6c8SGreg Roach                        continue 2;
263813eb6c8SGreg Roach                    }
264813eb6c8SGreg Roach                }
265813eb6c8SGreg Roach
266089dadacSGreg Roach                return $this->response();
267089dadacSGreg Roach            }
268089dadacSGreg Roach        }
269089dadacSGreg Roach
270cc7171a0SGreg Roach        foreach (self::ROBOT_ASNS as $robot => $asns) {
271cc7171a0SGreg Roach            foreach ($asns as $asn) {
272dec352c1SGreg Roach                if (str_contains($ua, $robot)) {
273089dadacSGreg Roach                    foreach ($this->fetchIpRangesForAsn($asn) as $range) {
274089dadacSGreg Roach                        if ($range->contains($address)) {
275089dadacSGreg Roach                            continue 2;
276089dadacSGreg Roach                        }
277089dadacSGreg Roach                    }
278089dadacSGreg Roach
279089dadacSGreg Roach                    return $this->response();
280089dadacSGreg Roach                }
281089dadacSGreg Roach            }
282cc7171a0SGreg Roach        }
283089dadacSGreg Roach
284617057d4SGreg Roach        // Allow sites to block access from entire networks.
285b55cbc6bSGreg Roach        $block_asn = Validator::attributes($request)->string('block_asn', '');
286b55cbc6bSGreg Roach        preg_match_all('/(AS\d+)/', $block_asn, $matches);
287b55cbc6bSGreg Roach
288617057d4SGreg Roach        foreach ($matches[1] as $asn) {
289617057d4SGreg Roach            foreach ($this->fetchIpRangesForAsn($asn) as $range) {
290617057d4SGreg Roach                if ($range->contains($address)) {
291617057d4SGreg Roach                    return $this->response();
292617057d4SGreg Roach                }
293617057d4SGreg Roach            }
294617057d4SGreg Roach        }
295089dadacSGreg Roach
296089dadacSGreg Roach        return $handler->handle($request);
297089dadacSGreg Roach    }
298089dadacSGreg Roach
299089dadacSGreg Roach    /**
300089dadacSGreg Roach     * Check that an IP address belongs to a robot operator using a forward/reverse DNS lookup.
301089dadacSGreg Roach     *
302089dadacSGreg Roach     * @param string        $ip
303089dadacSGreg Roach     * @param array<string> $valid_domains
3045c20d904SGreg Roach     * @param bool          $reverse_only
305089dadacSGreg Roach     *
306089dadacSGreg Roach     * @return bool
307089dadacSGreg Roach     */
3085c20d904SGreg Roach    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
309089dadacSGreg Roach    {
310089dadacSGreg Roach        $host = gethostbyaddr($ip);
311089dadacSGreg Roach
312dec352c1SGreg Roach        if ($host === false) {
313089dadacSGreg Roach            return false;
314089dadacSGreg Roach        }
315089dadacSGreg Roach
316dec352c1SGreg Roach        foreach ($valid_domains as $domain) {
317dec352c1SGreg Roach            if (str_ends_with($host, $domain)) {
3185c20d904SGreg Roach                return $reverse_only || $ip === gethostbyname($host);
319089dadacSGreg Roach            }
320dec352c1SGreg Roach        }
321dec352c1SGreg Roach
322dec352c1SGreg Roach        return false;
323dec352c1SGreg Roach    }
324089dadacSGreg Roach
325089dadacSGreg Roach    /**
326089dadacSGreg Roach     * Perform a whois search for an ASN.
327089dadacSGreg Roach     *
328e5766395SGreg Roach     * @param string $asn The autonomous system number to query
329089dadacSGreg Roach     *
330089dadacSGreg Roach     * @return array<RangeInterface>
331089dadacSGreg Roach     */
332089dadacSGreg Roach    private function fetchIpRangesForAsn(string $asn): array
333089dadacSGreg Roach    {
3346b9cb339SGreg Roach        return Registry::cache()->file()->remember('whois-asn-' . $asn, static function () use ($asn): array {
3354a8d2484SGreg Roach            $mapper = static fn (AsnRouteInfo $route_info): ?RangeInterface => IPFactory::parseRangeString($route_info->route ?: $route_info->route6);
336273a564eSGreg Roach
337089dadacSGreg Roach            try {
338089dadacSGreg Roach                $loader = new CurlLoader(self::WHOIS_TIMEOUT);
339089dadacSGreg Roach                $whois  = new Whois($loader);
340089dadacSGreg Roach                $info   = $whois->loadAsnInfo($asn);
341273a564eSGreg Roach                $routes = $info->routes;
342273a564eSGreg Roach                $ranges = array_map($mapper, $routes);
343089dadacSGreg Roach
344089dadacSGreg Roach                return array_filter($ranges);
34528d026adSGreg Roach            } catch (Throwable) {
346089dadacSGreg Roach                return [];
347089dadacSGreg Roach            }
348089dadacSGreg Roach        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
349089dadacSGreg Roach    }
350089dadacSGreg Roach
351089dadacSGreg Roach    /**
352d2d58874SGreg Roach     * Fetch a list of IP addresses from a remote file.
353d2d58874SGreg Roach     *
354d2d58874SGreg Roach     * @param string $ua
355d2d58874SGreg Roach     * @param string $url
356d2d58874SGreg Roach     *
357d2d58874SGreg Roach     * @return array<string>
358d2d58874SGreg Roach     */
359d2d58874SGreg Roach    private function fetchIpRangesForUrl(string $ua, string $url): array
360d2d58874SGreg Roach    {
361d2d58874SGreg Roach        return Registry::cache()->file()->remember('url-ip-list-' . $ua, static function () use ($url): array {
362d2d58874SGreg Roach            try {
363d2d58874SGreg Roach                $client   = new Client();
364d2d58874SGreg Roach                $response = $client->get($url, ['timeout' => 5]);
365d2d58874SGreg Roach                $contents = $response->getBody()->getContents();
366d2d58874SGreg Roach
367d2d58874SGreg Roach                preg_match_all(self::REGEX_IPV4, $contents, $matches);
368d2d58874SGreg Roach
369d2d58874SGreg Roach                return $matches[0];
370d2d58874SGreg Roach            } catch (GuzzleException) {
371d2d58874SGreg Roach                return [];
372d2d58874SGreg Roach            }
373d2d58874SGreg Roach        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
374d2d58874SGreg Roach    }
375d2d58874SGreg Roach
376d2d58874SGreg Roach    /**
377089dadacSGreg Roach     * @return ResponseInterface
378089dadacSGreg Roach     */
379089dadacSGreg Roach    private function response(): ResponseInterface
380089dadacSGreg Roach    {
381089dadacSGreg Roach        return response('Not acceptable', StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
382089dadacSGreg Roach    }
383089dadacSGreg Roach}
384