xref: /webtrees/app/Http/Middleware/BadBotBlocker.php (revision 7fa18cfdad1985939b9339a38adfaf62175c47de)
1089dadacSGreg Roach<?php
2089dadacSGreg Roach
3089dadacSGreg Roach/**
4089dadacSGreg Roach * webtrees: online genealogy
55bfc6897SGreg Roach * Copyright (C) 2022 webtrees development team
6089dadacSGreg Roach * This program is free software: you can redistribute it and/or modify
7089dadacSGreg Roach * it under the terms of the GNU General Public License as published by
8089dadacSGreg Roach * the Free Software Foundation, either version 3 of the License, or
9089dadacSGreg Roach * (at your option) any later version.
10089dadacSGreg Roach * This program is distributed in the hope that it will be useful,
11089dadacSGreg Roach * but WITHOUT ANY WARRANTY; without even the implied warranty of
12089dadacSGreg Roach * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13089dadacSGreg Roach * GNU General Public License for more details.
14089dadacSGreg Roach * You should have received a copy of the GNU General Public License
1589f7189bSGreg Roach * along with this program. If not, see <https://www.gnu.org/licenses/>.
16089dadacSGreg Roach */
17089dadacSGreg Roach
18089dadacSGreg Roachdeclare(strict_types=1);
19089dadacSGreg Roach
20089dadacSGreg Roachnamespace Fisharebest\Webtrees\Http\Middleware;
21089dadacSGreg Roach
22089dadacSGreg Roachuse Fig\Http\Message\StatusCodeInterface;
236b9cb339SGreg Roachuse Fisharebest\Webtrees\Registry;
24b55cbc6bSGreg Roachuse Fisharebest\Webtrees\Validator;
25089dadacSGreg Roachuse Iodev\Whois\Loaders\CurlLoader;
26089dadacSGreg Roachuse Iodev\Whois\Modules\Asn\AsnRouteInfo;
27089dadacSGreg Roachuse Iodev\Whois\Whois;
28089dadacSGreg Roachuse IPLib\Address\AddressInterface;
2969675509SGreg Roachuse IPLib\Factory as IPFactory;
30089dadacSGreg Roachuse IPLib\Range\RangeInterface;
31089dadacSGreg Roachuse Psr\Http\Message\ResponseInterface;
32089dadacSGreg Roachuse Psr\Http\Message\ServerRequestInterface;
33089dadacSGreg Roachuse Psr\Http\Server\MiddlewareInterface;
34089dadacSGreg Roachuse Psr\Http\Server\RequestHandlerInterface;
35089dadacSGreg Roachuse Throwable;
36089dadacSGreg Roach
37b7e8616fSGreg Roachuse function array_filter;
38089dadacSGreg Roachuse function array_map;
39089dadacSGreg Roachuse function assert;
40089dadacSGreg Roachuse function gethostbyaddr;
41089dadacSGreg Roachuse function gethostbyname;
42b7e8616fSGreg Roachuse function preg_match_all;
43b7e8616fSGreg Roachuse function random_int;
44089dadacSGreg Roachuse function response;
45dec352c1SGreg Roachuse function str_contains;
46dec352c1SGreg Roachuse function str_ends_with;
47089dadacSGreg Roach
/**
 * Middleware to block bad robots before they waste our valuable CPU cycles.
 *
 * Requests are rejected (HTTP 406) when the user-agent:
 *  - matches a known bad robot, or
 *  - claims to be a well-known crawler but the client IP address cannot be
 *    verified by DNS, by a fixed list of IP ranges, or by the operator's
 *    autonomous system (ASN), or
 *  - comes from a network that the site has chosen to block ("block_asn").
 */
class BadBotBlocker implements MiddlewareInterface
{
    // Cache whois requests.  Try to avoid all caches expiring at the same time.
    private const WHOIS_TTL_MIN = 28 * 86400;
    private const WHOIS_TTL_MAX = 35 * 86400;
    private const WHOIS_TIMEOUT = 5;

    // Bad robots - SEO optimisers, advertisers, etc.  This list is shared with robots.txt.
    public const BAD_ROBOTS = [
        'admantx',
        'Adsbot',
        'AhrefsBot',
        'Amazonbot', // Until it understands crawl-delay and noindex / nofollow
        'AspiegelBot',
        'Barkrowler',
        'BLEXBot',
        'DataForSEO',
        'DotBot',
        'Grapeshot',
        'ia_archiver',
        'Linguee',
        'MJ12bot',
        'netEstate NE',
        'panscient',
        'PetalBot',
        'proximic',
        'SemrushBot',
        'SEOkicks',
        'SiteKiosk',
        'Turnitin',
        'XoviBot',
        'ZoominfoBot',
    ];

    /**
     * Some search engines use reverse/forward DNS to verify the IP address.
     *
     * Keys are user-agent fragments; values are the permitted hostname suffixes.
     *
     * @see https://developer.amazon.com/support/amazonbot
     * @see https://support.google.com/webmasters/answer/80553?hl=en
     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
     */
    private const ROBOT_REV_FWD_DNS = [
        'Amazonbot'   => ['.crawl.amazon.com'],
        'bingbot'     => ['.search.msn.com'],
        'BingPreview' => ['.search.msn.com'],
        'Google'      => ['.google.com', '.googlebot.com'],
        'Mail.RU_Bot' => ['.mail.ru'],
        'msnbot'      => ['.search.msn.com'],
        'Qwantify'    => ['.search.qwant.com'],
        'Sogou'       => ['.crawl.sogou.com'],
        'Yahoo'       => ['.crawl.yahoo.net'],
        'Yandex'      => ['.yandex.ru', '.yandex.net', '.yandex.com'],
    ];

    /**
     * Some search engines only use reverse DNS to verify the IP address.
     *
     * Keys are user-agent fragments; values are the permitted hostname suffixes.
     *
     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
     * @see https://napoveda.seznam.cz/en/full-text-search/seznambot-crawler
     * @see https://www.ionos.de/terms-gtc/faq-crawler
     */
    private const ROBOT_REV_ONLY_DNS = [
        'Baiduspider' => ['.baidu.com', '.baidu.jp'],
        'FreshBot'    => ['.seznam.cz'],
        'IonCrawl'    => ['.1und1.org'],
        'Neevabot'    => ['.neeva.com'],
    ];

    /**
     * Some search engines operate from designated IP addresses.
     *
     * Keys are user-agent fragments; values are single addresses or CIDR ranges.
     *
     * @see https://www.apple.com/go/applebot
     * @see https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
     */
    private const ROBOT_IPS = [
        'AppleBot'    => [
            '17.0.0.0/8',
        ],
        'Ask Jeeves'  => [
            '65.214.45.143',
            '65.214.45.148',
            '66.235.124.192',
            '66.235.124.7',
            '66.235.124.101',
            '66.235.124.193',
            '66.235.124.73',
            '66.235.124.196',
            '66.235.124.74',
            '63.123.238.8',
            '202.143.148.61',
        ],
        'DuckDuckBot' => [
            '23.21.227.69',
            '50.16.241.113',
            '50.16.241.114',
            '50.16.241.117',
            '50.16.247.234',
            '52.204.97.54',
            '52.5.190.19',
            '54.197.234.188',
            '54.208.100.253',
            '54.208.102.37',
            '107.21.1.8',
        ],
    ];

    /**
     * Some search engines operate from within a designated autonomous system.
     *
     * Keys are user-agent fragments; values list every ASN the operator may use.
     * A request is genuine if its address belongs to ANY of the listed ASNs.
     *
     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
     * @see https://www.facebook.com/peering/
     */
    private const ROBOT_ASNS = [
        'facebook' => ['AS32934', 'AS63293'],
        'twitter'  => ['AS13414'],
    ];

    /**
     * Reject requests from bad robots, and from clients that falsely claim to be good robots.
     *
     * @param ServerRequestInterface  $request
     * @param RequestHandlerInterface $handler
     *
     * @return ResponseInterface
     */
    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
    {
        $ua      = Validator::serverParams($request)->string('HTTP_USER_AGENT', '');
        $ip      = Validator::attributes($request)->string('client-ip');
        $address = IPFactory::parseAddressString($ip);
        assert($address instanceof AddressInterface);

        // Known bad robots are rejected outright.
        foreach (self::BAD_ROBOTS as $robot) {
            if (str_contains($ua, $robot)) {
                return $this->response();
            }
        }

        // Robots that can be verified with a reverse lookup, confirmed by a forward lookup.
        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, false)) {
                return $this->response();
            }
        }

        // Robots that can be verified with a reverse lookup only.
        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, true)) {
                return $this->response();
            }
        }

        // Robots that operate from published IP addresses/ranges.
        foreach (self::ROBOT_IPS as $robot => $valid_ips) {
            if (str_contains($ua, $robot) && !$this->addressInIpRanges($address, $valid_ips)) {
                return $this->response();
            }
        }

        // Robots that operate from one of several autonomous systems.
        // The address only needs to belong to ONE of the robot's ASNs.
        foreach (self::ROBOT_ASNS as $robot => $asns) {
            if (str_contains($ua, $robot) && !$this->addressInAsns($address, $asns)) {
                return $this->response();
            }
        }

        // Allow sites to block access from entire networks.
        $block_asn = Validator::attributes($request)->string('block_asn', '');
        preg_match_all('/(AS\d+)/', $block_asn, $matches);

        if ($this->addressInAsns($address, $matches[1])) {
            return $this->response();
        }

        return $handler->handle($request);
    }

    /**
     * Is an address covered by any of a list of IP addresses or CIDR ranges?
     *
     * @param AddressInterface $address
     * @param array<string>    $ip_ranges Addresses/ranges in string form; unparseable entries are ignored
     *
     * @return bool
     */
    private function addressInIpRanges(AddressInterface $address, array $ip_ranges): bool
    {
        foreach ($ip_ranges as $ip_range) {
            $range = IPFactory::parseRangeString($ip_range);

            if ($range instanceof RangeInterface && $range->contains($address)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Is an address routed by any of a list of autonomous systems?
     *
     * @param AddressInterface $address
     * @param array<string>    $asns Autonomous system numbers, e.g. "AS32934"
     *
     * @return bool
     */
    private function addressInAsns(AddressInterface $address, array $asns): bool
    {
        foreach ($asns as $asn) {
            foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                if ($range->contains($address)) {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * Check that an IP address belongs to a robot operator using a forward/reverse DNS lookup.
     *
     * @param string        $ip
     * @param array<string> $valid_domains Permitted hostname suffixes, e.g. ".googlebot.com"
     * @param bool          $reverse_only  Skip the confirming forward lookup
     *
     * @return bool
     */
    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
    {
        // gethostbyaddr() returns false for malformed input.  When the lookup fails,
        // it returns the IP address unchanged - which will not match any domain below.
        $host = gethostbyaddr($ip);

        if ($host === false) {
            return false;
        }

        foreach ($valid_domains as $domain) {
            if (str_ends_with($host, $domain)) {
                // NOTE(review): gethostbyname() only resolves IPv4 addresses, so the forward
                // check cannot succeed for a robot connecting over IPv6.
                return $reverse_only || $ip === gethostbyname($host);
            }
        }

        return false;
    }

    /**
     * Perform a whois search for an ASN.
     *
     * Results are cached for a randomised period between WHOIS_TTL_MIN and WHOIS_TTL_MAX seconds.
     *
     * @param string $asn - The autonomous system number to query
     *
     * @return array<RangeInterface>
     */
    private function fetchIpRangesForAsn(string $asn): array
    {
        return Registry::cache()->file()->remember('whois-asn-' . $asn, static function () use ($asn): array {
            // Each route is either IPv4 (route) or IPv6 (route6).
            $mapper = static fn (AsnRouteInfo $route_info): ?RangeInterface => IPFactory::parseRangeString($route_info->route ?: $route_info->route6);

            try {
                $loader = new CurlLoader(self::WHOIS_TIMEOUT);
                $whois  = new Whois($loader);
                $info   = $whois->loadAsnInfo($asn);
                $routes = $info->routes;
                $ranges = array_map($mapper, $routes);

                // Discard any routes that could not be parsed.
                return array_filter($ranges);
            } catch (Throwable $ex) {
                // Best effort: a failed whois lookup gives no ranges.
                // NOTE(review): presumably this empty result is cached for the full TTL - confirm.
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * The response sent to blocked robots.
     *
     * @return ResponseInterface
     */
    private function response(): ResponseInterface
    {
        return response('Not acceptable', StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
    }
}
304