xref: /webtrees/app/Http/Middleware/BadBotBlocker.php (revision 52567a3670886985b2d263f3f10cd592bdc97ded)
1089dadacSGreg Roach<?php
2089dadacSGreg Roach
3089dadacSGreg Roach/**
4089dadacSGreg Roach * webtrees: online genealogy
5d11be702SGreg Roach * Copyright (C) 2023 webtrees development team
6089dadacSGreg Roach * This program is free software: you can redistribute it and/or modify
7089dadacSGreg Roach * it under the terms of the GNU General Public License as published by
8089dadacSGreg Roach * the Free Software Foundation, either version 3 of the License, or
9089dadacSGreg Roach * (at your option) any later version.
10089dadacSGreg Roach * This program is distributed in the hope that it will be useful,
11089dadacSGreg Roach * but WITHOUT ANY WARRANTY; without even the implied warranty of
12089dadacSGreg Roach * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13089dadacSGreg Roach * GNU General Public License for more details.
14089dadacSGreg Roach * You should have received a copy of the GNU General Public License
1589f7189bSGreg Roach * along with this program. If not, see <https://www.gnu.org/licenses/>.
16089dadacSGreg Roach */
17089dadacSGreg Roach
18089dadacSGreg Roachdeclare(strict_types=1);
19089dadacSGreg Roach
20089dadacSGreg Roachnamespace Fisharebest\Webtrees\Http\Middleware;
21089dadacSGreg Roach
22089dadacSGreg Roachuse Fig\Http\Message\StatusCodeInterface;
236b9cb339SGreg Roachuse Fisharebest\Webtrees\Registry;
24b55cbc6bSGreg Roachuse Fisharebest\Webtrees\Validator;
25d2d58874SGreg Roachuse GuzzleHttp\Client;
26d2d58874SGreg Roachuse GuzzleHttp\Exception\GuzzleException;
27089dadacSGreg Roachuse Iodev\Whois\Loaders\CurlLoader;
28089dadacSGreg Roachuse Iodev\Whois\Modules\Asn\AsnRouteInfo;
29089dadacSGreg Roachuse Iodev\Whois\Whois;
30089dadacSGreg Roachuse IPLib\Address\AddressInterface;
3169675509SGreg Roachuse IPLib\Factory as IPFactory;
32089dadacSGreg Roachuse IPLib\Range\RangeInterface;
33089dadacSGreg Roachuse Psr\Http\Message\ResponseInterface;
34089dadacSGreg Roachuse Psr\Http\Message\ServerRequestInterface;
35089dadacSGreg Roachuse Psr\Http\Server\MiddlewareInterface;
36089dadacSGreg Roachuse Psr\Http\Server\RequestHandlerInterface;
37089dadacSGreg Roachuse Throwable;
38089dadacSGreg Roach
39b7e8616fSGreg Roachuse function array_filter;
40089dadacSGreg Roachuse function array_map;
41089dadacSGreg Roachuse function assert;
42089dadacSGreg Roachuse function gethostbyaddr;
43089dadacSGreg Roachuse function gethostbyname;
44b7e8616fSGreg Roachuse function preg_match_all;
45b7e8616fSGreg Roachuse function random_int;
46089dadacSGreg Roachuse function response;
47dec352c1SGreg Roachuse function str_contains;
48dec352c1SGreg Roachuse function str_ends_with;
49089dadacSGreg Roach
/**
 * Middleware to block bad robots before they waste our valuable CPU cycles.
 *
 * Requests are rejected (HTTP 406) when the user-agent names a known bad robot,
 * or when it names a well-known search engine but the client IP address cannot
 * be verified as belonging to that search engine (DNS, fixed IP list, published
 * IP list or autonomous system).  Sites may also block entire networks by ASN.
 */
class BadBotBlocker implements MiddlewareInterface
{
    // Used to extract IPv4 addresses from a downloaded text file.
    private const REGEX_OCTET = '(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)';
    private const REGEX_IPV4  = '/\\b' . self::REGEX_OCTET . '(?:\\.' . self::REGEX_OCTET . '){3}\\b/';

    // Cache whois requests.  Try to avoid all caches expiring at the same time.
    private const WHOIS_TTL_MIN = 28 * 86400;
    private const WHOIS_TTL_MAX = 35 * 86400;
    private const WHOIS_TIMEOUT = 5;

    // Bad robots - SEO optimisers, advertisers, etc.  This list is shared with robots.txt.
    public const BAD_ROBOTS = [
        'admantx',
        'Adsbot',
        'AhrefsBot',
        'Amazonbot', // Until it understands crawl-delay and noindex / nofollow
        'AspiegelBot',
        'Awario', // Brand management
        'Barkrowler',
        'BLEXBot',
        'Bytespider',
        'DataForSEO',
        'DataForSeoBot', // https://dataforseo.com/dataforseo-bot
        'DotBot',
        'Grapeshot',
        'Honolulu-bot', // Aggressive crawler, no info available
        'ia_archiver',
        'linabot', // Aggressive crawler, no info available
        'Linguee',
        'MegaIndex.ru',
        'MJ12bot',
        'netEstate NE',
        'panscient',
        'PetalBot',
        'proximic',
        'SeekportBot', // Pretends to be a search engine - but isn't
        'SemrushBot',
        'serpstatbot',
        'SEOkicks',
        'SiteKiosk',
        'Turnitin',
        'wp_is_mobile', // Nothing to do with wordpress
        'XoviBot',
        'YisouSpider',
        'ZoominfoBot',
    ];

    /**
     * Some search engines use reverse/forward DNS to verify the IP address.
     *
     * @see https://developer.amazon.com/support/amazonbot
     * @see https://support.google.com/webmasters/answer/80553?hl=en
     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
     * @see https://www.mojeek.com/bot.html
     * @see https://support.apple.com/en-gb/HT204683
     */
    private const ROBOT_REV_FWD_DNS = [
        'Amazonbot'        => ['.crawl.amazon.com'],
        'Applebot'         => ['.applebot.apple.com'],
        'BingPreview'      => ['.search.msn.com'],
        'Google'           => ['.google.com', '.googlebot.com'],
        'Mail.RU_Bot'      => ['.mail.ru'],
        'MicrosoftPreview' => ['.search.msn.com'],
        'MojeekBot'        => ['.mojeek.com'],
        'Qwantify'         => ['.search.qwant.com'],
        'Sogou'            => ['.crawl.sogou.com'],
        'Yahoo'            => ['.crawl.yahoo.net'],
        'Yandex'           => ['.yandex.ru', '.yandex.net', '.yandex.com'],
        'bingbot'          => ['.search.msn.com'],
        'msnbot'           => ['.search.msn.com'],
    ];

    /**
     * Some search engines only use reverse DNS to verify the IP address.
     *
     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
     * @see https://napoveda.seznam.cz/en/full-text-search/seznambot-crawler
     * @see https://www.ionos.de/terms-gtc/faq-crawler
     */
    private const ROBOT_REV_ONLY_DNS = [
        'Baiduspider' => ['.baidu.com', '.baidu.jp'],
        'FreshBot'    => ['.seznam.cz'],
        'IonCrawl'    => ['.1und1.org'],
        'Neevabot'    => ['.neeva.com'],
        'SeznamBot'   => ['.seznam.cz'],
    ];

    /**
     * Some search engines operate from designated IP addresses.
     *
     * NOTE(review): user-agent matching is case-sensitive, and the DNS list above
     * spells this robot 'Applebot' - confirm which spelling the crawler sends.
     *
     * @see https://www.apple.com/go/applebot
     * @see https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
     */
    private const ROBOT_IPS = [
        'AppleBot'    => [
            '17.0.0.0/8',
        ],
        'Ask Jeeves'  => [
            '65.214.45.143',
            '65.214.45.148',
            '66.235.124.192',
            '66.235.124.7',
            '66.235.124.101',
            '66.235.124.193',
            '66.235.124.73',
            '66.235.124.196',
            '66.235.124.74',
            '63.123.238.8',
            '202.143.148.61',
        ],
        'DuckDuckBot' => [
            '23.21.227.69',
            '50.16.241.113',
            '50.16.241.114',
            '50.16.241.117',
            '50.16.247.234',
            '52.204.97.54',
            '52.5.190.19',
            '54.197.234.188',
            '54.208.100.253',
            '54.208.102.37',
            '107.21.1.8',
        ],
    ];

    /**
     * Some search engines publish their designated IP addresses in a remote file.
     *
     * NOTE(review): 'SeekportBot' is also listed in BAD_ROBOTS, which is checked
     * first, so this entry appears to be unreachable at present.
     *
     * @see https://bot.seekport.com/
     */
    private const ROBOT_IP_FILES = [
        'SeekportBot' => 'https://bot.seekport.com/seekportbot_ips.txt',
    ];

    /**
     * Some search engines operate from within a designated autonomous system.
     *
     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
     * @see https://www.facebook.com/peering/
     */
    private const ROBOT_ASNS = [
        'facebook' => ['AS32934', 'AS63293'],
        'twitter'  => ['AS13414'],
    ];

    /**
     * Reject requests from bad robots and from clients that impersonate good robots.
     *
     * The checks run from cheapest to most expensive: user-agent black-list, DNS,
     * fixed IP ranges, downloaded IP lists, whois/ASN lookups, site-wide ASN blocks.
     *
     * @param ServerRequestInterface  $request
     * @param RequestHandlerInterface $handler
     *
     * @return ResponseInterface
     */
    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
    {
        $ua      = Validator::serverParams($request)->string('HTTP_USER_AGENT', '');
        $ip      = Validator::attributes($request)->string('client-ip');
        $address = IPFactory::parseAddressString($ip);
        assert($address instanceof AddressInterface);

        foreach (self::BAD_ROBOTS as $robot) {
            if (str_contains($ua, $robot)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, false)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, true)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_IPS as $robot => $valid_ip_ranges) {
            if (str_contains($ua, $robot) && !$this->addressInRangeStrings($address, $valid_ip_ranges)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_IP_FILES as $robot => $url) {
            // The short-circuit ensures that we only fetch the list when the user-agent matches.
            if (str_contains($ua, $robot) && !$this->addressInRangeStrings($address, $this->fetchIpRangesForUrl($robot, $url))) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_ASNS as $robot => $asns) {
            // A robot is genuine if its address belongs to ANY of the operator's
            // autonomous systems.  Only block after all of them have been checked.
            if (str_contains($ua, $robot) && !$this->addressInAsns($address, $asns)) {
                return $this->response();
            }
        }

        // Allow sites to block access from entire networks.
        $block_asn = Validator::attributes($request)->string('block_asn', '');
        preg_match_all('/(AS\d+)/', $block_asn, $matches);

        // Nothing to look up when no networks have been configured.
        if ($matches[1] !== [] && $this->addressInAsns($address, $matches[1])) {
            return $this->response();
        }

        return $handler->handle($request);
    }

    /**
     * Is an address contained in any of a list of IP addresses/ranges?
     * Entries that cannot be parsed as a range are ignored.
     *
     * @param AddressInterface $address
     * @param array<string>    $ip_ranges Addresses or ranges, e.g. "1.2.3.4" or "17.0.0.0/8"
     *
     * @return bool
     */
    private function addressInRangeStrings(AddressInterface $address, array $ip_ranges): bool
    {
        foreach ($ip_ranges as $ip_range) {
            $range = IPFactory::parseRangeString($ip_range);

            if ($range instanceof RangeInterface && $range->contains($address)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Is an address contained in any of the routes of any of a list of autonomous systems?
     *
     * @param AddressInterface $address
     * @param array<string>    $asns Autonomous system numbers, e.g. "AS32934"
     *
     * @return bool
     */
    private function addressInAsns(AddressInterface $address, array $asns): bool
    {
        foreach ($asns as $asn) {
            foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                if ($range->contains($address)) {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * Check that an IP address belongs to a robot operator using a forward/reverse DNS lookup.
     *
     * @param string        $ip
     * @param array<string> $valid_domains Domain suffixes, including the leading dot
     * @param bool          $reverse_only  Skip the forward lookup of the host name
     *
     * @return bool
     */
    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
    {
        $host = gethostbyaddr($ip);

        if ($host === false) {
            return false;
        }

        foreach ($valid_domains as $domain) {
            if (str_ends_with($host, $domain)) {
                // NOTE(review): gethostbyname() only resolves IPv4 addresses, so an
                // IPv6 client can never pass the forward check.
                return $reverse_only || $ip === gethostbyname($host);
            }
        }

        return false;
    }

    /**
     * Perform a whois search for an ASN.
     *
     * Any error during the lookup gives an empty list.
     * NOTE(review): the empty list appears to be cached for the full TTL as well - confirm.
     *
     * @param string $asn The autonomous system number to query
     *
     * @return array<RangeInterface>
     */
    private function fetchIpRangesForAsn(string $asn): array
    {
        return Registry::cache()->file()->remember('whois-asn-' . $asn, static function () use ($asn): array {
            $mapper = static fn (AsnRouteInfo $route_info): ?RangeInterface => IPFactory::parseRangeString($route_info->route ?: $route_info->route6);

            try {
                $loader = new CurlLoader(self::WHOIS_TIMEOUT);
                $whois  = new Whois($loader);
                $info   = $whois->loadAsnInfo($asn);
                $routes = $info->routes;
                $ranges = array_map($mapper, $routes);

                // Remove any routes that could not be parsed.
                return array_filter($ranges);
            } catch (Throwable) {
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * Fetch a list of IP addresses from a remote file.
     *
     * Only IPv4 addresses are extracted.  A failed download gives an empty list.
     *
     * @param string $ua  The name of the robot - used to build the cache key
     * @param string $url The file to download
     *
     * @return array<string>
     */
    private function fetchIpRangesForUrl(string $ua, string $url): array
    {
        return Registry::cache()->file()->remember('url-ip-list-' . $ua, static function () use ($url): array {
            try {
                $client   = new Client();
                $response = $client->get($url, ['timeout' => self::WHOIS_TIMEOUT]);
                $contents = $response->getBody()->getContents();

                preg_match_all(self::REGEX_IPV4, $contents, $matches);

                return $matches[0];
            } catch (GuzzleException) {
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * The response sent to blocked clients.
     *
     * @return ResponseInterface
     */
    private function response(): ResponseInterface
    {
        return response('Not acceptable', StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
    }
}
375