xref: /webtrees/app/Http/Middleware/BadBotBlocker.php (revision d5bb02daca7577269d7513d178c10eee5d3f8608)
1089dadacSGreg Roach<?php
2089dadacSGreg Roach
3089dadacSGreg Roach/**
4089dadacSGreg Roach * webtrees: online genealogy
55bfc6897SGreg Roach * Copyright (C) 2022 webtrees development team
6089dadacSGreg Roach * This program is free software: you can redistribute it and/or modify
7089dadacSGreg Roach * it under the terms of the GNU General Public License as published by
8089dadacSGreg Roach * the Free Software Foundation, either version 3 of the License, or
9089dadacSGreg Roach * (at your option) any later version.
10089dadacSGreg Roach * This program is distributed in the hope that it will be useful,
11089dadacSGreg Roach * but WITHOUT ANY WARRANTY; without even the implied warranty of
12089dadacSGreg Roach * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13089dadacSGreg Roach * GNU General Public License for more details.
14089dadacSGreg Roach * You should have received a copy of the GNU General Public License
1589f7189bSGreg Roach * along with this program. If not, see <https://www.gnu.org/licenses/>.
16089dadacSGreg Roach */
17089dadacSGreg Roach
18089dadacSGreg Roachdeclare(strict_types=1);
19089dadacSGreg Roach
20089dadacSGreg Roachnamespace Fisharebest\Webtrees\Http\Middleware;
21089dadacSGreg Roach
22089dadacSGreg Roachuse Fig\Http\Message\StatusCodeInterface;
236b9cb339SGreg Roachuse Fisharebest\Webtrees\Registry;
24b55cbc6bSGreg Roachuse Fisharebest\Webtrees\Validator;
25089dadacSGreg Roachuse Iodev\Whois\Loaders\CurlLoader;
26089dadacSGreg Roachuse Iodev\Whois\Modules\Asn\AsnRouteInfo;
27089dadacSGreg Roachuse Iodev\Whois\Whois;
28089dadacSGreg Roachuse IPLib\Address\AddressInterface;
2969675509SGreg Roachuse IPLib\Factory as IPFactory;
30089dadacSGreg Roachuse IPLib\Range\RangeInterface;
31089dadacSGreg Roachuse Psr\Http\Message\ResponseInterface;
32089dadacSGreg Roachuse Psr\Http\Message\ServerRequestInterface;
33089dadacSGreg Roachuse Psr\Http\Server\MiddlewareInterface;
34089dadacSGreg Roachuse Psr\Http\Server\RequestHandlerInterface;
35089dadacSGreg Roachuse Throwable;
36089dadacSGreg Roach
37b7e8616fSGreg Roachuse function array_filter;
38089dadacSGreg Roachuse function array_map;
39089dadacSGreg Roachuse function assert;
40089dadacSGreg Roachuse function gethostbyaddr;
41089dadacSGreg Roachuse function gethostbyname;
42b7e8616fSGreg Roachuse function preg_match_all;
43b7e8616fSGreg Roachuse function random_int;
44089dadacSGreg Roachuse function response;
45dec352c1SGreg Roachuse function str_contains;
46dec352c1SGreg Roachuse function str_ends_with;
47089dadacSGreg Roach
/**
 * Middleware to block bad robots before they waste our valuable CPU cycles.
 *
 * A request is rejected with a "Not acceptable" response when its user-agent
 * names a known bad robot, or when it claims to be a well-known crawler but
 * its IP address cannot be verified (by DNS, by a list of IP ranges, or by
 * the autonomous system of the crawler's operator).
 */
class BadBotBlocker implements MiddlewareInterface
{
    // Cache whois requests.  Try to avoid all caches expiring at the same time.
    private const WHOIS_TTL_MIN = 28 * 86400;
    private const WHOIS_TTL_MAX = 35 * 86400;
    private const WHOIS_TIMEOUT = 5;

    // Bad robots - SEO optimisers, advertisers, etc.  This list is shared with robots.txt.
    public const BAD_ROBOTS = [
        'admantx',
        'Adsbot',
        'AhrefsBot',
        'AspiegelBot',
        'Barkrowler',
        'BLEXBot',
        'DataForSEO',
        'DotBot',
        'Grapeshot',
        'ia_archiver',
        'Linguee',
        'MJ12bot',
        'netEstate NE',
        'panscient',
        'PetalBot',
        'proximic',
        'SemrushBot',
        'SEOkicks',
        'SiteKiosk',
        'Turnitin',
        'XoviBot',
        'ZoominfoBot',
    ];

    /**
     * Some search engines use reverse/forward DNS to verify the IP address.
     *
     * @see https://developer.amazon.com/support/amazonbot
     * @see https://support.google.com/webmasters/answer/80553?hl=en
     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
     */
    private const ROBOT_REV_FWD_DNS = [
        'Amazonbot'   => ['.crawl.amazon.com'],
        'bingbot'     => ['.search.msn.com'],
        'BingPreview' => ['.search.msn.com'],
        'Google'      => ['.google.com', '.googlebot.com'],
        'Mail.RU_Bot' => ['.mail.ru'],
        'msnbot'      => ['.search.msn.com'],
        'Qwantify'    => ['.search.qwant.com'],
        'Sogou'       => ['.crawl.sogou.com'],
        'Yahoo'       => ['.crawl.yahoo.net'],
        'Yandex'      => ['.yandex.ru', '.yandex.net', '.yandex.com'],
    ];

    /**
     * Some search engines only use reverse DNS to verify the IP address.
     *
     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
     * @see https://napoveda.seznam.cz/en/full-text-search/seznambot-crawler
     * @see https://www.ionos.de/terms-gtc/faq-crawler
     */
    private const ROBOT_REV_ONLY_DNS = [
        'Baiduspider' => ['.baidu.com', '.baidu.jp'],
        'FreshBot'    => ['.seznam.cz'],
        'IonCrawl'    => ['.1und1.org'],
        'Neevabot'    => ['.neeva.com'],
    ];

    /**
     * Some search engines operate from designated IP addresses.
     *
     * @see https://www.apple.com/go/applebot
     * @see https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
     */
    private const ROBOT_IPS = [
        'AppleBot'    => [
            '17.0.0.0/8',
        ],
        'Ask Jeeves'  => [
            '65.214.45.143',
            '65.214.45.148',
            '66.235.124.192',
            '66.235.124.7',
            '66.235.124.101',
            '66.235.124.193',
            '66.235.124.73',
            '66.235.124.196',
            '66.235.124.74',
            '63.123.238.8',
            '202.143.148.61',
        ],
        'DuckDuckBot' => [
            '23.21.227.69',
            '50.16.241.113',
            '50.16.241.114',
            '50.16.241.117',
            '50.16.247.234',
            '52.204.97.54',
            '52.5.190.19',
            '54.197.234.188',
            '54.208.100.253',
            '54.208.102.37',
            '107.21.1.8',
        ],
    ];

    /**
     * Some search engines operate from within a designated autonomous system.
     *
     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
     * @see https://www.facebook.com/peering/
     */
    private const ROBOT_ASNS = [
        'facebook' => ['AS32934', 'AS63293'],
        'twitter'  => ['AS13414'],
    ];

    /**
     * Reject requests from bad robots and from impostors of well-known robots.
     *
     * The checks run in this order, and the first failure ends the request:
     * known bad robots, reverse+forward DNS, reverse-only DNS, fixed IP ranges,
     * autonomous systems, and finally the networks listed in the request's
     * "block_asn" attribute.
     *
     * @param ServerRequestInterface  $request
     * @param RequestHandlerInterface $handler
     *
     * @return ResponseInterface
     */
    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
    {
        $ua      = Validator::serverParams($request)->string('HTTP_USER_AGENT', '');
        $ip      = Validator::attributes($request)->string('client-ip');
        $address = IPFactory::parseAddressString($ip);
        assert($address instanceof AddressInterface);

        foreach (self::BAD_ROBOTS as $robot) {
            if (str_contains($ua, $robot)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, false)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, true)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_IPS as $robot => $valid_ips) {
            if (!str_contains($ua, $robot)) {
                continue;
            }

            // Use a dedicated loop variable, so that the client's IP in $ip is not overwritten.
            foreach ($valid_ips as $valid_ip) {
                $range = IPFactory::parseRangeString($valid_ip);

                if ($range instanceof RangeInterface && $range->contains($address)) {
                    // Genuine robot - carry on with the next robot.
                    continue 2;
                }
            }

            return $this->response();
        }

        foreach (self::ROBOT_ASNS as $robot => $asns) {
            if (!str_contains($ua, $robot)) {
                continue;
            }

            // The address is genuine when it belongs to ANY of the operator's networks.
            foreach ($asns as $asn) {
                if ($this->isAddressInAsn($address, $asn)) {
                    // Genuine robot - carry on with the next robot.
                    continue 2;
                }
            }

            return $this->response();
        }

        // Allow sites to block access from entire networks.
        $block_asn = Validator::attributes($request)->string('block_asn', '');
        preg_match_all('/(AS\d+)/', $block_asn, $matches);

        foreach ($matches[1] as $asn) {
            if ($this->isAddressInAsn($address, $asn)) {
                return $this->response();
            }
        }

        return $handler->handle($request);
    }

    /**
     * Check that an IP address belongs to a robot operator, using DNS.
     *
     * The reverse lookup of the address must give a host name that ends with one
     * of the valid domains.  Unless $reverse_only is set, the forward lookup of
     * that host name must also give back the original address.
     *
     * NOTE(review): gethostbyname() only resolves IPv4 addresses, so an IPv6
     * client can presumably never pass the forward check - confirm whether
     * IPv6 crawlers need to be supported here.
     *
     * @param string        $ip
     * @param array<string> $valid_domains
     * @param bool          $reverse_only
     *
     * @return bool
     */
    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
    {
        $host = gethostbyaddr($ip);

        if ($host === false) {
            return false;
        }

        foreach ($valid_domains as $domain) {
            if (str_ends_with($host, $domain)) {
                return $reverse_only || $ip === gethostbyname($host);
            }
        }

        return false;
    }

    /**
     * Does an IP address fall inside any of the routes of an autonomous system?
     *
     * @param AddressInterface $address
     * @param string           $asn - The autonomous system number to query
     *
     * @return bool
     */
    private function isAddressInAsn(AddressInterface $address, string $asn): bool
    {
        foreach ($this->fetchIpRangesForAsn($asn) as $range) {
            if ($range->contains($address)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Perform a whois search for an ASN.
     *
     * The result is stored in the file cache under a key built from the ASN, with
     * a random lifetime between WHOIS_TTL_MIN and WHOIS_TTL_MAX seconds.
     *
     * NOTE(review): a failed lookup gives an empty list, which is presumably
     * cached for the same lifetime as a successful one - confirm this is intended.
     *
     * @param string $asn - The autonomous system number to query
     *
     * @return array<RangeInterface>
     */
    private function fetchIpRangesForAsn(string $asn): array
    {
        return Registry::cache()->file()->remember('whois-asn-' . $asn, static function () use ($asn): array {
            // Each route has either an IPv4 range or an IPv6 range.  Unparsable ranges become null.
            $mapper = static fn (AsnRouteInfo $route_info): ?RangeInterface => IPFactory::parseRangeString($route_info->route ?: $route_info->route6);

            try {
                $loader = new CurlLoader(self::WHOIS_TIMEOUT);
                $whois  = new Whois($loader);
                $info   = $whois->loadAsnInfo($asn);
                $routes = $info->routes;
                $ranges = array_map($mapper, $routes);

                // Discard the ranges that could not be parsed.
                return array_filter($ranges);
            } catch (Throwable $ex) {
                // Best effort: treat any whois failure as "no known ranges".
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * The response sent to every blocked request.
     *
     * @return ResponseInterface
     */
    private function response(): ResponseInterface
    {
        return response('Not acceptable', StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
    }
}
303