xref: /webtrees/app/Http/Middleware/BadBotBlocker.php (revision 3a3594e9af1303d3b17ba9aa7e8c31b403b7b9c8)
1089dadacSGreg Roach<?php
2089dadacSGreg Roach
3089dadacSGreg Roach/**
4089dadacSGreg Roach * webtrees: online genealogy
55bfc6897SGreg Roach * Copyright (C) 2022 webtrees development team
6089dadacSGreg Roach * This program is free software: you can redistribute it and/or modify
7089dadacSGreg Roach * it under the terms of the GNU General Public License as published by
8089dadacSGreg Roach * the Free Software Foundation, either version 3 of the License, or
9089dadacSGreg Roach * (at your option) any later version.
10089dadacSGreg Roach * This program is distributed in the hope that it will be useful,
11089dadacSGreg Roach * but WITHOUT ANY WARRANTY; without even the implied warranty of
12089dadacSGreg Roach * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13089dadacSGreg Roach * GNU General Public License for more details.
14089dadacSGreg Roach * You should have received a copy of the GNU General Public License
1589f7189bSGreg Roach * along with this program. If not, see <https://www.gnu.org/licenses/>.
16089dadacSGreg Roach */
17089dadacSGreg Roach
18089dadacSGreg Roachdeclare(strict_types=1);
19089dadacSGreg Roach
20089dadacSGreg Roachnamespace Fisharebest\Webtrees\Http\Middleware;
21089dadacSGreg Roach
22089dadacSGreg Roachuse Fig\Http\Message\StatusCodeInterface;
236b9cb339SGreg Roachuse Fisharebest\Webtrees\Registry;
24b55cbc6bSGreg Roachuse Fisharebest\Webtrees\Validator;
25d2d58874SGreg Roachuse GuzzleHttp\Client;
26d2d58874SGreg Roachuse GuzzleHttp\Exception\GuzzleException;
27089dadacSGreg Roachuse Iodev\Whois\Loaders\CurlLoader;
28089dadacSGreg Roachuse Iodev\Whois\Modules\Asn\AsnRouteInfo;
29089dadacSGreg Roachuse Iodev\Whois\Whois;
30089dadacSGreg Roachuse IPLib\Address\AddressInterface;
3169675509SGreg Roachuse IPLib\Factory as IPFactory;
32089dadacSGreg Roachuse IPLib\Range\RangeInterface;
33089dadacSGreg Roachuse Psr\Http\Message\ResponseInterface;
34089dadacSGreg Roachuse Psr\Http\Message\ServerRequestInterface;
35089dadacSGreg Roachuse Psr\Http\Server\MiddlewareInterface;
36089dadacSGreg Roachuse Psr\Http\Server\RequestHandlerInterface;
37089dadacSGreg Roachuse Throwable;
38089dadacSGreg Roach
39b7e8616fSGreg Roachuse function array_filter;
40089dadacSGreg Roachuse function array_map;
41089dadacSGreg Roachuse function assert;
42089dadacSGreg Roachuse function gethostbyaddr;
43089dadacSGreg Roachuse function gethostbyname;
44b7e8616fSGreg Roachuse function preg_match_all;
45b7e8616fSGreg Roachuse function random_int;
46089dadacSGreg Roachuse function response;
47dec352c1SGreg Roachuse function str_contains;
48dec352c1SGreg Roachuse function str_ends_with;
49089dadacSGreg Roach
/**
 * Middleware to block bad robots before they waste our valuable CPU cycles.
 *
 * Requests are rejected when the user-agent names a known bad robot, or when it
 * claims to be a well-known crawler but the client IP address cannot be verified
 * as belonging to that crawler's operator (by DNS, IP list or autonomous system).
 */
class BadBotBlocker implements MiddlewareInterface
{
    private const REGEX_OCTET = '(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)';
    private const REGEX_IPV4  = '/\\b' . self::REGEX_OCTET . '(?:\\.' . self::REGEX_OCTET . '){3}\\b/';

    // Cache whois requests.  Try to avoid all caches expiring at the same time.
    private const WHOIS_TTL_MIN = 28 * 86400;
    private const WHOIS_TTL_MAX = 35 * 86400;
    private const WHOIS_TIMEOUT = 5;

    // Bad robots - SEO optimisers, advertisers, etc.  This list is shared with robots.txt.
    public const BAD_ROBOTS = [
        'admantx',
        'Adsbot',
        'AhrefsBot',
        'Amazonbot', // Until it understands crawl-delay and noindex / nofollow
        'AspiegelBot',
        'Barkrowler',
        'BLEXBot',
        'DataForSEO',
        'DataForSeoBot', // https://dataforseo.com/dataforseo-bot
        'DotBot',
        'Grapeshot',
        'Honolulu-bot', // Aggressive crawler, no info available
        'ia_archiver',
        'linabot', // Aggressive crawler, no info available
        'Linguee',
        'MJ12bot',
        'netEstate NE',
        'panscient',
        'PetalBot',
        'proximic',
        'SemrushBot',
        'serpstatbot',
        'SEOkicks',
        'SiteKiosk',
        'Turnitin',
        'XoviBot',
        'ZoominfoBot',
    ];

    /**
     * Some search engines use reverse/forward DNS to verify the IP address.
     *
     * @see https://developer.amazon.com/support/amazonbot
     * @see https://support.google.com/webmasters/answer/80553?hl=en
     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
     * @see https://www.mojeek.com/bot.html
     * @see https://support.apple.com/en-gb/HT204683
     */
    private const ROBOT_REV_FWD_DNS = [
        'Amazonbot'   => ['.crawl.amazon.com'],
        'Applebot'    => ['.applebot.apple.com'],
        'bingbot'     => ['.search.msn.com'],
        'BingPreview' => ['.search.msn.com'],
        'Google'      => ['.google.com', '.googlebot.com'],
        'MojeekBot'   => ['.mojeek.com'],
        'Mail.RU_Bot' => ['.mail.ru'],
        'msnbot'      => ['.search.msn.com'],
        'Qwantify'    => ['.search.qwant.com'],
        'Sogou'       => ['.crawl.sogou.com'],
        'Yahoo'       => ['.crawl.yahoo.net'],
        'Yandex'      => ['.yandex.ru', '.yandex.net', '.yandex.com'],
    ];

    /**
     * Some search engines only use reverse DNS to verify the IP address.
     *
     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
     * @see https://napoveda.seznam.cz/en/full-text-search/seznambot-crawler
     * @see https://www.ionos.de/terms-gtc/faq-crawler
     */
    private const ROBOT_REV_ONLY_DNS = [
        'Baiduspider' => ['.baidu.com', '.baidu.jp'],
        'FreshBot'    => ['.seznam.cz'],
        'IonCrawl'    => ['.1und1.org'],
        'Neevabot'    => ['.neeva.com'],
    ];

    /**
     * Some search engines operate from designated IP addresses.
     *
     * NOTE(review): user-agent matching is case-sensitive, and the key "AppleBot"
     * differs in case from "Applebot" used in ROBOT_REV_FWD_DNS - confirm which
     * spelling the crawler actually sends.
     *
     * @see https://www.apple.com/go/applebot
     * @see https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
     */
    private const ROBOT_IPS = [
        'AppleBot'    => [
            '17.0.0.0/8',
        ],
        'Ask Jeeves'  => [
            '65.214.45.143',
            '65.214.45.148',
            '66.235.124.192',
            '66.235.124.7',
            '66.235.124.101',
            '66.235.124.193',
            '66.235.124.73',
            '66.235.124.196',
            '66.235.124.74',
            '63.123.238.8',
            '202.143.148.61',
        ],
        'DuckDuckBot' => [
            '23.21.227.69',
            '50.16.241.113',
            '50.16.241.114',
            '50.16.241.117',
            '50.16.247.234',
            '52.204.97.54',
            '52.5.190.19',
            '54.197.234.188',
            '54.208.100.253',
            '54.208.102.37',
            '107.21.1.8',
        ],
    ];

    /**
     * Some search engines publish their designated IP addresses in a remote file.
     *
     * @see https://bot.seekport.com/
     */
    private const ROBOT_IP_FILES = [
        'SeekportBot' => 'https://bot.seekport.com/seekportbot_ips.txt',
    ];

    /**
     * Some search engines operate from within a designated autonomous system.
     * A robot may be listed with several ASNs; an address in ANY of them is valid.
     *
     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
     * @see https://www.facebook.com/peering/
     */
    private const ROBOT_ASNS = [
        'facebook' => ['AS32934', 'AS63293'],
        'twitter'  => ['AS13414'],
    ];

    /**
     * Reject requests from bad robots and from clients that impersonate good robots.
     *
     * The checks run in order: known bad robots, reverse+forward DNS, reverse-only DNS,
     * fixed IP lists, downloaded IP lists, autonomous systems, and finally any
     * site-configured blocked networks (the "block_asn" request attribute).
     *
     * @param ServerRequestInterface  $request
     * @param RequestHandlerInterface $handler
     *
     * @return ResponseInterface The "Not acceptable" response, or the next handler's response.
     */
    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
    {
        $ua      = Validator::serverParams($request)->string('HTTP_USER_AGENT', '');
        $ip      = Validator::attributes($request)->string('client-ip');
        $address = IPFactory::parseAddressString($ip);
        assert($address instanceof AddressInterface);

        foreach (self::BAD_ROBOTS as $robot) {
            if (str_contains($ua, $robot)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, false)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, true)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_IPS as $robot => $valid_ip_ranges) {
            if (str_contains($ua, $robot) && !$this->addressInRangeStrings($address, $valid_ip_ranges)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_IP_FILES as $robot => $url) {
            // Short-circuit: only fetch the remote list when the user-agent claims to be this robot.
            if (
                str_contains($ua, $robot) &&
                !$this->addressInRangeStrings($address, $this->fetchIpRangesForUrl($robot, $url))
            ) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_ASNS as $robot => $asns) {
            // The address only needs to belong to ONE of the robot's autonomous systems.
            // (Previously each ASN was tested in turn and the request was rejected as soon
            // as one of them did not contain the address.)
            if (str_contains($ua, $robot) && !$this->addressInAsns($address, $asns)) {
                return $this->response();
            }
        }

        // Allow sites to block access from entire networks.
        $block_asn = Validator::attributes($request)->string('block_asn', '');
        preg_match_all('/(AS\d+)/', $block_asn, $matches);

        if ($this->addressInAsns($address, $matches[1])) {
            return $this->response();
        }

        return $handler->handle($request);
    }

    /**
     * Is the address contained in any of the given IP addresses / ranges?
     * Strings that cannot be parsed as a range are ignored.
     *
     * @param AddressInterface $address
     * @param array<string>    $ip_ranges
     *
     * @return bool
     */
    private function addressInRangeStrings(AddressInterface $address, array $ip_ranges): bool
    {
        foreach ($ip_ranges as $ip_range) {
            $range = IPFactory::parseRangeString($ip_range);

            if ($range instanceof RangeInterface && $range->contains($address)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Is the address contained in a route of any of the given autonomous systems?
     *
     * @param AddressInterface $address
     * @param array<string>    $asns - e.g. ["AS32934", "AS63293"]
     *
     * @return bool
     */
    private function addressInAsns(AddressInterface $address, array $asns): bool
    {
        foreach ($asns as $asn) {
            foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                if ($range->contains($address)) {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * Check that an IP address belongs to a robot operator using a forward/reverse DNS lookup.
     *
     * @param string        $ip
     * @param array<string> $valid_domains Domain suffixes (with leading dot) that the host name must end with
     * @param bool          $reverse_only  If true, skip the forward lookup of the resolved host name
     *
     * @return bool
     */
    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
    {
        // gethostbyaddr() returns false for malformed input, and the unmodified IP address
        // when the lookup fails.  The latter cannot end with any of our domain suffixes.
        $host = gethostbyaddr($ip);

        if ($host === false) {
            return false;
        }

        foreach ($valid_domains as $domain) {
            if (str_ends_with($host, $domain)) {
                // NOTE(review): gethostbyname() only returns IPv4 addresses, so the forward
                // check cannot succeed for an IPv6 client - confirm whether that is intended.
                return $reverse_only || $ip === gethostbyname($host);
            }
        }

        return false;
    }

    /**
     * Perform a whois search for an ASN.
     *
     * The result is stored in the file cache under a key derived from the ASN, with a
     * randomised lifetime between WHOIS_TTL_MIN and WHOIS_TTL_MAX seconds.
     *
     * @param string $asn - The autonomous system number to query
     *
     * @return array<RangeInterface>
     */
    private function fetchIpRangesForAsn(string $asn): array
    {
        return Registry::cache()->file()->remember('whois-asn-' . $asn, static function () use ($asn): array {
            // Each route has either an IPv4 ("route") or an IPv6 ("route6") prefix.
            $mapper = static fn (AsnRouteInfo $route_info): ?RangeInterface => IPFactory::parseRangeString($route_info->route ?: $route_info->route6);

            try {
                $loader = new CurlLoader(self::WHOIS_TIMEOUT);
                $whois  = new Whois($loader);
                $info   = $whois->loadAsnInfo($asn);
                $routes = $info->routes;
                $ranges = array_map($mapper, $routes);

                // Discard routes that could not be parsed (null).
                return array_filter($ranges);
            } catch (Throwable) {
                // Best effort: any failure yields an empty list.
                // NOTE(review): the empty list is returned from inside the cached callback,
                // so it is presumably cached for the full TTL as well - confirm.
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * Fetch a list of IP addresses from a remote file.
     *
     * Every IPv4 address found in the response body is returned.  The result is stored
     * in the file cache under a key derived from the robot name.
     *
     * @param string $ua  The robot name, used to build the cache key
     * @param string $url The URL of the file that lists the robot's IP addresses
     *
     * @return array<string>
     */
    private function fetchIpRangesForUrl(string $ua, string $url): array
    {
        return Registry::cache()->file()->remember('url-ip-list-' . $ua, static function () use ($url): array {
            try {
                $client   = new Client();
                $response = $client->get($url, ['timeout' => 5]);
                $contents = $response->getBody()->getContents();

                preg_match_all(self::REGEX_IPV4, $contents, $matches);

                return $matches[0];
            } catch (GuzzleException) {
                // Best effort: a failed download yields an empty list.
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * The response sent to blocked clients: HTTP 406 with a short plain message.
     *
     * @return ResponseInterface
     */
    private function response(): ResponseInterface
    {
        return response('Not acceptable', StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
    }
}
367