xref: /webtrees/app/Http/Middleware/BadBotBlocker.php (revision f3d48b691701b372cb536390c0aa5fb478936015)
1089dadacSGreg Roach<?php
2089dadacSGreg Roach
3089dadacSGreg Roach/**
4089dadacSGreg Roach * webtrees: online genealogy
55bfc6897SGreg Roach * Copyright (C) 2022 webtrees development team
6089dadacSGreg Roach * This program is free software: you can redistribute it and/or modify
7089dadacSGreg Roach * it under the terms of the GNU General Public License as published by
8089dadacSGreg Roach * the Free Software Foundation, either version 3 of the License, or
9089dadacSGreg Roach * (at your option) any later version.
10089dadacSGreg Roach * This program is distributed in the hope that it will be useful,
11089dadacSGreg Roach * but WITHOUT ANY WARRANTY; without even the implied warranty of
12089dadacSGreg Roach * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13089dadacSGreg Roach * GNU General Public License for more details.
14089dadacSGreg Roach * You should have received a copy of the GNU General Public License
1589f7189bSGreg Roach * along with this program. If not, see <https://www.gnu.org/licenses/>.
16089dadacSGreg Roach */
17089dadacSGreg Roach
18089dadacSGreg Roachdeclare(strict_types=1);
19089dadacSGreg Roach
20089dadacSGreg Roachnamespace Fisharebest\Webtrees\Http\Middleware;
21089dadacSGreg Roach
22089dadacSGreg Roachuse Fig\Http\Message\StatusCodeInterface;
236b9cb339SGreg Roachuse Fisharebest\Webtrees\Registry;
24b55cbc6bSGreg Roachuse Fisharebest\Webtrees\Validator;
25d2d58874SGreg Roachuse GuzzleHttp\Client;
26d2d58874SGreg Roachuse GuzzleHttp\Exception\GuzzleException;
27089dadacSGreg Roachuse Iodev\Whois\Loaders\CurlLoader;
28089dadacSGreg Roachuse Iodev\Whois\Modules\Asn\AsnRouteInfo;
29089dadacSGreg Roachuse Iodev\Whois\Whois;
30089dadacSGreg Roachuse IPLib\Address\AddressInterface;
3169675509SGreg Roachuse IPLib\Factory as IPFactory;
32089dadacSGreg Roachuse IPLib\Range\RangeInterface;
33089dadacSGreg Roachuse Psr\Http\Message\ResponseInterface;
34089dadacSGreg Roachuse Psr\Http\Message\ServerRequestInterface;
35089dadacSGreg Roachuse Psr\Http\Server\MiddlewareInterface;
36089dadacSGreg Roachuse Psr\Http\Server\RequestHandlerInterface;
37089dadacSGreg Roachuse Throwable;
38089dadacSGreg Roach
39b7e8616fSGreg Roachuse function array_filter;
40089dadacSGreg Roachuse function array_map;
41089dadacSGreg Roachuse function assert;
42089dadacSGreg Roachuse function gethostbyaddr;
43089dadacSGreg Roachuse function gethostbyname;
44b7e8616fSGreg Roachuse function preg_match_all;
45b7e8616fSGreg Roachuse function random_int;
46089dadacSGreg Roachuse function response;
47dec352c1SGreg Roachuse function str_contains;
48dec352c1SGreg Roachuse function str_ends_with;
49089dadacSGreg Roach
50089dadacSGreg Roach/**
51089dadacSGreg Roach * Middleware to block bad robots before they waste our valuable CPU cycles.
52089dadacSGreg Roach */
class BadBotBlocker implements MiddlewareInterface
{
    // Building blocks for a pattern that finds dotted-quad IPv4 addresses in free text.
    private const REGEX_OCTET = '(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)';
    private const REGEX_IPV4  = '/\\b' . self::REGEX_OCTET . '(?:\\.' . self::REGEX_OCTET . '){3}\\b/';

    // Cache whois requests.  Try to avoid all caches expiring at the same time.
    private const WHOIS_TTL_MIN = 28 * 86400;
    private const WHOIS_TTL_MAX = 35 * 86400;
    private const WHOIS_TIMEOUT = 5;

    // Bad robots - SEO optimisers, advertisers, etc.  This list is shared with robots.txt.
    public const BAD_ROBOTS = [
        'admantx',
        'Adsbot',
        'AhrefsBot',
        'Amazonbot', // Until it understands crawl-delay and noindex / nofollow
        'AspiegelBot',
        'Barkrowler',
        'BLEXBot',
        'DataForSEO',
        'DotBot',
        'Grapeshot',
        'Honolulu-bot', // Aggressive crawler, no info available
        'ia_archiver',
        'Linguee',
        'MJ12bot',
        'netEstate NE',
        'panscient',
        'PetalBot',
        'proximic',
        'SemrushBot',
        'SEOkicks',
        'SiteKiosk',
        'Turnitin',
        'XoviBot',
        'ZoominfoBot',
    ];

    /**
     * Some search engines use reverse/forward DNS to verify the IP address.
     *
     * Keys are substrings of the user-agent; values are the host-name suffixes
     * that the reverse DNS lookup of the client address must end with.
     *
     * @see https://developer.amazon.com/support/amazonbot
     * @see https://support.google.com/webmasters/answer/80553?hl=en
     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
     * @see https://www.mojeek.com/bot.html
     * @see https://support.apple.com/en-gb/HT204683
     */
    private const ROBOT_REV_FWD_DNS = [
        'Amazonbot'   => ['.crawl.amazon.com'],
        'Applebot'    => ['.applebot.apple.com'],
        'bingbot'     => ['.search.msn.com'],
        'BingPreview' => ['.search.msn.com'],
        'Google'      => ['.google.com', '.googlebot.com'],
        'MojeekBot'   => ['.mojeek.com'],
        'Mail.RU_Bot' => ['.mail.ru'],
        'msnbot'      => ['.search.msn.com'],
        'Qwantify'    => ['.search.qwant.com'],
        'Sogou'       => ['.crawl.sogou.com'],
        'Yahoo'       => ['.crawl.yahoo.net'],
        'Yandex'      => ['.yandex.ru', '.yandex.net', '.yandex.com'],
    ];

    /**
     * Some search engines only use reverse DNS to verify the IP address.
     *
     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
     * @see https://napoveda.seznam.cz/en/full-text-search/seznambot-crawler
     * @see https://www.ionos.de/terms-gtc/faq-crawler
     */
    private const ROBOT_REV_ONLY_DNS = [
        'Baiduspider' => ['.baidu.com', '.baidu.jp'],
        'FreshBot'    => ['.seznam.cz'],
        'IonCrawl'    => ['.1und1.org'],
        'Neevabot'    => ['.neeva.com'],
    ];

    /**
     * Some search engines operate from designated IP addresses.
     *
     * NOTE(review): user-agents are matched with the case-sensitive str_contains(), and
     * the 'AppleBot' key below differs in case from the 'Applebot' key in
     * ROBOT_REV_FWD_DNS - confirm which spelling the crawler actually sends.
     *
     * @see https://www.apple.com/go/applebot
     * @see https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
     */
    private const ROBOT_IPS = [
        'AppleBot'    => [
            '17.0.0.0/8',
        ],
        'Ask Jeeves'  => [
            '65.214.45.143',
            '65.214.45.148',
            '66.235.124.192',
            '66.235.124.7',
            '66.235.124.101',
            '66.235.124.193',
            '66.235.124.73',
            '66.235.124.196',
            '66.235.124.74',
            '63.123.238.8',
            '202.143.148.61',
        ],
        'DuckDuckBot' => [
            '23.21.227.69',
            '50.16.241.113',
            '50.16.241.114',
            '50.16.241.117',
            '50.16.247.234',
            '52.204.97.54',
            '52.5.190.19',
            '54.197.234.188',
            '54.208.100.253',
            '54.208.102.37',
            '107.21.1.8',
        ],
    ];

    /**
     * Some search engines publish their designated IP addresses in a remote file.
     *
     * @see https://bot.seekport.com/
     */
    private const ROBOT_IP_FILES = [
        'SeekportBot' => 'https://bot.seekport.com/seekportbot_ips.txt',
    ];

    /**
     * Some search engines operate from within a designated autonomous system.
     *
     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
     * @see https://www.facebook.com/peering/
     */
    private const ROBOT_ASNS = [
        'facebook' => ['AS32934', 'AS63293'],
        'twitter'  => ['AS13414'],
    ];

    /**
     * Reject requests from unwanted robots, and from clients that claim to be a
     * well-known robot but do not originate from that robot's network.
     *
     * Every check that fails returns the "Not acceptable" response immediately;
     * a request that passes all of them is forwarded to the next handler.
     *
     * @param ServerRequestInterface  $request
     * @param RequestHandlerInterface $handler
     *
     * @return ResponseInterface
     */
    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
    {
        $ua      = Validator::serverParams($request)->string('HTTP_USER_AGENT', '');
        $ip      = Validator::attributes($request)->string('client-ip');
        $address = IPFactory::parseAddressString($ip);
        assert($address instanceof AddressInterface);

        // Robots that are never welcome, whatever their origin.
        foreach (self::BAD_ROBOTS as $robot) {
            if (str_contains($ua, $robot)) {
                return $this->response();
            }
        }

        // Robots that can be verified by a reverse, then forward, DNS lookup.
        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, false)) {
                return $this->response();
            }
        }

        // Robots that can only be verified by a reverse DNS lookup.
        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, true)) {
                return $this->response();
            }
        }

        // Robots that operate from a fixed list of addresses.
        foreach (self::ROBOT_IPS as $robot => $valid_ip_ranges) {
            if (str_contains($ua, $robot) && !$this->addressInRanges($address, $valid_ip_ranges)) {
                return $this->response();
            }
        }

        // Robots that publish their list of addresses in a remote file.
        foreach (self::ROBOT_IP_FILES as $robot => $url) {
            if (str_contains($ua, $robot)) {
                $valid_ip_ranges = $this->fetchIpRangesForUrl($robot, $url);

                if (!$this->addressInRanges($address, $valid_ip_ranges)) {
                    return $this->response();
                }
            }
        }

        // Robots that operate from one or more autonomous systems.  The address only
        // needs to belong to ONE of the listed systems, not to every one of them.
        foreach (self::ROBOT_ASNS as $robot => $asns) {
            if (str_contains($ua, $robot) && !$this->addressInAsns($address, $asns)) {
                return $this->response();
            }
        }

        // Allow sites to block access from entire networks.
        $block_asn = Validator::attributes($request)->string('block_asn', '');
        preg_match_all('/(AS\d+)/', $block_asn, $matches);

        if ($this->addressInAsns($address, $matches[1])) {
            return $this->response();
        }

        return $handler->handle($request);
    }

    /**
     * Does an address fall within any of the given IP addresses/ranges?
     *
     * Entries that cannot be parsed as a range are ignored.
     *
     * @param AddressInterface $address
     * @param array<string>    $ip_ranges
     *
     * @return bool
     */
    private function addressInRanges(AddressInterface $address, array $ip_ranges): bool
    {
        foreach ($ip_ranges as $ip_range) {
            $range = IPFactory::parseRangeString($ip_range);

            if ($range instanceof RangeInterface && $range->contains($address)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Does an address fall within the routes of any of the given autonomous systems?
     *
     * Lookups stop at the first match, so later ASNs are only queried when needed.
     *
     * @param AddressInterface $address
     * @param array<string>    $asns
     *
     * @return bool
     */
    private function addressInAsns(AddressInterface $address, array $asns): bool
    {
        foreach ($asns as $asn) {
            foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                if ($range->contains($address)) {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * Check that an IP address belongs to a robot operator using a forward/reverse DNS lookup.
     *
     * NOTE(review): the forward check relies on gethostbyname(), which is documented as
     * returning an IPv4 address - confirm how IPv6 clients are expected to be verified.
     *
     * @param string        $ip
     * @param array<string> $valid_domains
     * @param bool          $reverse_only
     *
     * @return bool
     */
    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
    {
        $host = gethostbyaddr($ip);

        if ($host === false) {
            return false;
        }

        foreach ($valid_domains as $domain) {
            if (str_ends_with($host, $domain)) {
                // The host name is acceptable.  Unless told otherwise, also require
                // that it resolves back to the address we started with.
                return $reverse_only || $ip === gethostbyname($host);
            }
        }

        return false;
    }

    /**
     * Perform a whois search for an ASN.
     *
     * The result is cached on disk for a randomised period between WHOIS_TTL_MIN
     * and WHOIS_TTL_MAX seconds.  A failed lookup yields (and caches) an empty list.
     *
     * @param string $asn - The autonomous system number to query
     *
     * @return array<RangeInterface>
     */
    private function fetchIpRangesForAsn(string $asn): array
    {
        return Registry::cache()->file()->remember('whois-asn-' . $asn, static function () use ($asn): array {
            // Each route is either IPv4 (route) or IPv6 (route6).
            $mapper = static fn (AsnRouteInfo $route_info): ?RangeInterface => IPFactory::parseRangeString($route_info->route ?: $route_info->route6);

            try {
                $loader = new CurlLoader(self::WHOIS_TIMEOUT);
                $whois  = new Whois($loader);
                $info   = $whois->loadAsnInfo($asn);
                $routes = $info->routes;
                $ranges = array_map($mapper, $routes);

                // Discard any routes that could not be parsed.
                return array_filter($ranges);
            } catch (Throwable) {
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * Fetch a list of IP addresses from a remote file.
     *
     * Every IPv4 address found in the downloaded text is returned.  The result is
     * cached on disk, keyed by the robot name, for the same randomised period as
     * the whois lookups.  A failed download yields (and caches) an empty list.
     *
     * @param string $ua
     * @param string $url
     *
     * @return array<string>
     */
    private function fetchIpRangesForUrl(string $ua, string $url): array
    {
        return Registry::cache()->file()->remember('url-ip-list-' . $ua, static function () use ($url): array {
            try {
                $client   = new Client();
                $response = $client->get($url, ['timeout' => 5]);
                $contents = $response->getBody()->getContents();

                preg_match_all(self::REGEX_IPV4, $contents, $matches);

                return $matches[0];
            } catch (GuzzleException) {
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * The response sent to every blocked request.
     *
     * @return ResponseInterface
     */
    private function response(): ResponseInterface
    {
        return response('Not acceptable', StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
    }
}
364