xref: /webtrees/app/Http/Middleware/BadBotBlocker.php (revision b7e8616f5f3bd6f035d5d40dd804283727b5c140)
1089dadacSGreg Roach<?php
2089dadacSGreg Roach
3089dadacSGreg Roach/**
4089dadacSGreg Roach * webtrees: online genealogy
589f7189bSGreg Roach * Copyright (C) 2021 webtrees development team
6089dadacSGreg Roach * This program is free software: you can redistribute it and/or modify
7089dadacSGreg Roach * it under the terms of the GNU General Public License as published by
8089dadacSGreg Roach * the Free Software Foundation, either version 3 of the License, or
9089dadacSGreg Roach * (at your option) any later version.
10089dadacSGreg Roach * This program is distributed in the hope that it will be useful,
11089dadacSGreg Roach * but WITHOUT ANY WARRANTY; without even the implied warranty of
12089dadacSGreg Roach * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13089dadacSGreg Roach * GNU General Public License for more details.
14089dadacSGreg Roach * You should have received a copy of the GNU General Public License
1589f7189bSGreg Roach * along with this program. If not, see <https://www.gnu.org/licenses/>.
16089dadacSGreg Roach */
17089dadacSGreg Roach
18089dadacSGreg Roachdeclare(strict_types=1);
19089dadacSGreg Roach
20089dadacSGreg Roachnamespace Fisharebest\Webtrees\Http\Middleware;
21089dadacSGreg Roach
22089dadacSGreg Roachuse Fig\Http\Message\StatusCodeInterface;
236b9cb339SGreg Roachuse Fisharebest\Webtrees\Registry;
24089dadacSGreg Roachuse Iodev\Whois\Loaders\CurlLoader;
25089dadacSGreg Roachuse Iodev\Whois\Modules\Asn\AsnRouteInfo;
26089dadacSGreg Roachuse Iodev\Whois\Whois;
27089dadacSGreg Roachuse IPLib\Address\AddressInterface;
2869675509SGreg Roachuse IPLib\Factory as IPFactory;
29089dadacSGreg Roachuse IPLib\Range\RangeInterface;
30089dadacSGreg Roachuse Psr\Http\Message\ResponseInterface;
31089dadacSGreg Roachuse Psr\Http\Message\ServerRequestInterface;
32089dadacSGreg Roachuse Psr\Http\Server\MiddlewareInterface;
33089dadacSGreg Roachuse Psr\Http\Server\RequestHandlerInterface;
34089dadacSGreg Roachuse Throwable;
35089dadacSGreg Roach
36*b7e8616fSGreg Roachuse function array_filter;
37089dadacSGreg Roachuse function array_map;
38089dadacSGreg Roachuse function assert;
39089dadacSGreg Roachuse function gethostbyaddr;
40089dadacSGreg Roachuse function gethostbyname;
41*b7e8616fSGreg Roachuse function preg_match_all;
42*b7e8616fSGreg Roachuse function random_int;
43089dadacSGreg Roachuse function response;
44dec352c1SGreg Roachuse function str_contains;
45dec352c1SGreg Roachuse function str_ends_with;
46089dadacSGreg Roach
/**
 * Middleware to block bad robots before they waste our valuable CPU cycles.
 *
 * A request is rejected when its user-agent string names a known bad robot, or
 * when it names a well-known robot but the client IP address cannot be verified
 * as belonging to that robot's operator (via DNS, a fixed list of IP ranges, or
 * the IP ranges announced by an autonomous system).
 */
class BadBotBlocker implements MiddlewareInterface
{
    // Cache whois requests.  Try to avoid all caches expiring at the same time.
    private const WHOIS_TTL_MIN = 28 * 86400;
    private const WHOIS_TTL_MAX = 35 * 86400;
    private const WHOIS_TIMEOUT = 5;

    // Bad robots - SEO optimisers, advertisers, etc.  This list is shared with robots.txt.
    public const BAD_ROBOTS = [
        'admantx',
        'Adsbot',
        'AhrefsBot',
        'AspiegelBot',
        'Barkrowler',
        'BLEXBot',
        'DotBot',
        'Grapeshot',
        'ia_archiver',
        'Linguee',
        'MJ12bot',
        'panscient',
        'PetalBot',
        'proximic',
        'SemrushBot',
        'Turnitin',
        'XoviBot',
        'ZoominfoBot',
    ];

    /**
     * Some search engines use reverse/forward DNS to verify the IP address.
     *
     * Each domain is matched as a suffix of the reverse-DNS host name, so it
     * must start with a dot - otherwise "evilmail.ru" would match "mail.ru".
     *
     * @see https://support.google.com/webmasters/answer/80553?hl=en
     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
     */
    private const ROBOT_REV_FWD_DNS = [
        'bingbot'     => ['.search.msn.com'],
        'BingPreview' => ['.search.msn.com'],
        'Google'      => ['.google.com', '.googlebot.com'],
        'Mail.RU_Bot' => ['.mail.ru'],
        'msnbot'      => ['.search.msn.com'],
        'Qwantify'    => ['.search.qwant.com'],
        'Sogou'       => ['.crawl.sogou.com'],
        'Yahoo'       => ['.crawl.yahoo.net'],
        'Yandex'      => ['.yandex.ru', '.yandex.net', '.yandex.com'],
    ];

    /**
     * Some search engines only use reverse DNS to verify the IP address.
     *
     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
     */
    private const ROBOT_REV_ONLY_DNS = [
        'Baiduspider' => ['.baidu.com', '.baidu.jp'],
    ];

    /**
     * Some search engines operate from designated IP addresses.
     *
     * Each entry is either a single address or a range in CIDR notation.
     *
     * @see https://www.apple.com/go/applebot
     * @see https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
     */
    private const ROBOT_IPS = [
        'AppleBot'    => [
            '17.0.0.0/8',
        ],
        'Ask Jeeves'  => [
            '65.214.45.143',
            '65.214.45.148',
            '66.235.124.192',
            '66.235.124.7',
            '66.235.124.101',
            '66.235.124.193',
            '66.235.124.73',
            '66.235.124.196',
            '66.235.124.74',
            '63.123.238.8',
            '202.143.148.61',
        ],
        'DuckDuckBot' => [
            '23.21.227.69',
            '50.16.241.113',
            '50.16.241.114',
            '50.16.241.117',
            '50.16.247.234',
            '52.204.97.54',
            '52.5.190.19',
            '54.197.234.188',
            '54.208.100.253',
            '54.208.102.37',
            '107.21.1.8',
        ],
    ];

    /**
     * Some search engines operate from within a designated autonomous system.
     *
     * A robot may use several autonomous systems; an address in any one of
     * them is accepted.
     *
     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
     * @see https://www.facebook.com/peering/
     */
    private const ROBOT_ASNS = [
        'facebook' => ['AS32934', 'AS63293'],
        'twitter'  => ['AS13414'],
    ];

    /**
     * Reject requests from bad robots, impostor robots and blocked networks.
     * All other requests are passed to the next handler.
     *
     * The checks run in this order, and the first failure ends the request:
     *  1. the user-agent names a robot in BAD_ROBOTS;
     *  2. the user-agent names a DNS-verified robot, but DNS verification fails;
     *  3. the user-agent names a fixed-IP robot, but the address is not listed;
     *  4. the user-agent names an ASN-verified robot, but the address is in
     *     none of that robot's autonomous systems;
     *  5. the address is inside an autonomous system named in the "block_asn"
     *     request attribute.
     *
     * @param ServerRequestInterface  $request
     * @param RequestHandlerInterface $handler
     *
     * @return ResponseInterface
     */
    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
    {
        $ua      = $request->getServerParams()['HTTP_USER_AGENT'] ?? '';
        $ip      = $request->getAttribute('client-ip');
        $address = IPFactory::parseAddressString($ip);
        assert($address instanceof AddressInterface);

        foreach (self::BAD_ROBOTS as $robot) {
            if (str_contains($ua, $robot)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, false)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, true)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_IPS as $robot => $valid_ips) {
            if (str_contains($ua, $robot)) {
                // Use a distinct loop variable, so that the client's $ip is not overwritten.
                foreach ($valid_ips as $valid_ip) {
                    $range = IPFactory::parseRangeString($valid_ip);

                    if ($range instanceof RangeInterface && $range->contains($address)) {
                        // Genuine robot - continue with the next robot name.
                        continue 2;
                    }
                }

                return $this->response();
            }
        }

        foreach (self::ROBOT_ASNS as $robot => $asns) {
            if (!str_contains($ua, $robot)) {
                continue;
            }

            // The address only needs to be in ONE of the robot's autonomous systems.
            // Check them all before deciding that this is an impostor.
            foreach ($asns as $asn) {
                foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                    if ($range->contains($address)) {
                        // Genuine robot - continue with the next robot name
                        // (3 levels: ranges, ASNs, robots).
                        continue 3;
                    }
                }
            }

            return $this->response();
        }

        // Allow sites to block access from entire networks.
        preg_match_all('/(AS\d+)/', $request->getAttribute('block_asn', ''), $matches);
        foreach ($matches[1] as $asn) {
            foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                if ($range->contains($address)) {
                    return $this->response();
                }
            }
        }

        return $handler->handle($request);
    }

    /**
     * Check that an IP address belongs to a robot operator using a forward/reverse DNS lookup.
     *
     * The reverse lookup must give a host name ending in one of the valid
     * domains.  Unless $reverse_only is set, a forward lookup of that host
     * name must also give back the original IP address.
     *
     * @param string        $ip
     * @param array<string> $valid_domains - Domain suffixes, e.g. ".example.com"
     * @param bool          $reverse_only  - Skip the forward lookup
     *
     * @return bool
     */
    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
    {
        // gethostbyaddr() gives false for malformed input.  A failed lookup gives
        // the IP address unchanged, which cannot match any of the domains below.
        $host = gethostbyaddr($ip);

        if ($host === false) {
            return false;
        }

        foreach ($valid_domains as $domain) {
            if (str_ends_with($host, $domain)) {
                // NOTE(review): gethostbyname() is documented as resolving IPv4 only,
                // so IPv6 clients would appear to fail the forward check - confirm.
                return $reverse_only || $ip === gethostbyname($host);
            }
        }

        return false;
    }

    /**
     * Perform a whois search for an ASN.
     *
     * Results are cached, using a randomised time-to-live between
     * WHOIS_TTL_MIN and WHOIS_TTL_MAX.  Any error during the lookup gives
     * an empty list.
     *
     * @param string $asn - The autonomous system number to query
     *
     * @return array<RangeInterface>
     */
    private function fetchIpRangesForAsn(string $asn): array
    {
        return Registry::cache()->file()->remember('whois-asn-' . $asn, static function () use ($asn): array {
            // Each route has either an IPv4 range (route) or an IPv6 range (route6).
            $mapper = static fn (AsnRouteInfo $route_info): ?RangeInterface => IPFactory::parseRangeString($route_info->route ?: $route_info->route6);

            try {
                $loader = new CurlLoader(self::WHOIS_TIMEOUT);
                $whois  = new Whois($loader);
                $info   = $whois->loadAsnInfo($asn);
                $routes = $info->routes;
                $ranges = array_map($mapper, $routes);

                // Discard any routes that could not be parsed.
                return array_filter($ranges);
            } catch (Throwable $ex) {
                // Best effort.  An unavailable whois server must not break the site.
                // NOTE(review): this empty list appears to be cached like any other
                // result - confirm whether a shorter TTL is wanted after a failure.
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * The response sent to blocked robots.
     *
     * @return ResponseInterface
     */
    private function response(): ResponseInterface
    {
        return response('Not acceptable', StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
    }
}
289