xref: /webtrees/app/Http/Middleware/BadBotBlocker.php (revision 1763aecae76691021ba10c281e767c91c92330f4)
1089dadacSGreg Roach<?php
2089dadacSGreg Roach
3089dadacSGreg Roach/**
4089dadacSGreg Roach * webtrees: online genealogy
55bfc6897SGreg Roach * Copyright (C) 2022 webtrees development team
6089dadacSGreg Roach * This program is free software: you can redistribute it and/or modify
7089dadacSGreg Roach * it under the terms of the GNU General Public License as published by
8089dadacSGreg Roach * the Free Software Foundation, either version 3 of the License, or
9089dadacSGreg Roach * (at your option) any later version.
10089dadacSGreg Roach * This program is distributed in the hope that it will be useful,
11089dadacSGreg Roach * but WITHOUT ANY WARRANTY; without even the implied warranty of
12089dadacSGreg Roach * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13089dadacSGreg Roach * GNU General Public License for more details.
14089dadacSGreg Roach * You should have received a copy of the GNU General Public License
1589f7189bSGreg Roach * along with this program. If not, see <https://www.gnu.org/licenses/>.
16089dadacSGreg Roach */
17089dadacSGreg Roach
18089dadacSGreg Roachdeclare(strict_types=1);
19089dadacSGreg Roach
20089dadacSGreg Roachnamespace Fisharebest\Webtrees\Http\Middleware;
21089dadacSGreg Roach
22089dadacSGreg Roachuse Fig\Http\Message\StatusCodeInterface;
236b9cb339SGreg Roachuse Fisharebest\Webtrees\Registry;
24b55cbc6bSGreg Roachuse Fisharebest\Webtrees\Validator;
25089dadacSGreg Roachuse Iodev\Whois\Loaders\CurlLoader;
26089dadacSGreg Roachuse Iodev\Whois\Modules\Asn\AsnRouteInfo;
27089dadacSGreg Roachuse Iodev\Whois\Whois;
28089dadacSGreg Roachuse IPLib\Address\AddressInterface;
2969675509SGreg Roachuse IPLib\Factory as IPFactory;
30089dadacSGreg Roachuse IPLib\Range\RangeInterface;
31089dadacSGreg Roachuse Psr\Http\Message\ResponseInterface;
32089dadacSGreg Roachuse Psr\Http\Message\ServerRequestInterface;
33089dadacSGreg Roachuse Psr\Http\Server\MiddlewareInterface;
34089dadacSGreg Roachuse Psr\Http\Server\RequestHandlerInterface;
35089dadacSGreg Roachuse Throwable;
36089dadacSGreg Roach
37b7e8616fSGreg Roachuse function array_filter;
38089dadacSGreg Roachuse function array_map;
39089dadacSGreg Roachuse function assert;
40089dadacSGreg Roachuse function gethostbyaddr;
41089dadacSGreg Roachuse function gethostbyname;
42b7e8616fSGreg Roachuse function preg_match_all;
43b7e8616fSGreg Roachuse function random_int;
44089dadacSGreg Roachuse function response;
45dec352c1SGreg Roachuse function str_contains;
46dec352c1SGreg Roachuse function str_ends_with;
47089dadacSGreg Roach
/**
 * Middleware to block bad robots before they waste our valuable CPU cycles.
 *
 * A request is rejected with a "406 Not Acceptable" response when its user-agent:
 *  - contains the name of a known bad robot, or
 *  - claims to be a well-known crawler, but the client IP address cannot be
 *    verified (by DNS, by a list of IP ranges, or by autonomous system), or
 *  - comes from an autonomous system that the site has chosen to block.
 *
 * All user-agent matching uses str_contains(), so it is case-sensitive.
 */
class BadBotBlocker implements MiddlewareInterface
{
    // Cache whois requests.  Try to avoid all caches expiring at the same time.
    // The actual TTL is chosen at random between these two limits.
    private const WHOIS_TTL_MIN = 28 * 86400;
    private const WHOIS_TTL_MAX = 35 * 86400;
    // Passed to the whois CurlLoader - presumably a timeout in seconds (confirm against the library).
    private const WHOIS_TIMEOUT = 5;

    // Bad robots - SEO optimisers, advertisers, etc.  This list is shared with robots.txt.
    public const BAD_ROBOTS = [
        'admantx',
        'Adsbot',
        'AhrefsBot',
        'AspiegelBot',
        'Barkrowler',
        'BLEXBot',
        'DataForSEO',
        'DotBot',
        'Grapeshot',
        'ia_archiver',
        'Linguee',
        'MJ12bot',
        'panscient',
        'PetalBot',
        'proximic',
        'SemrushBot',
        'Turnitin',
        'XoviBot',
        'ZoominfoBot',
    ];

    /**
     * Some search engines use reverse/forward DNS to verify the IP address.
     *
     * Keys are user-agent fragments; values are the accepted host-name suffixes.
     *
     * @see https://support.google.com/webmasters/answer/80553?hl=en
     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
     */
    private const ROBOT_REV_FWD_DNS = [
        'bingbot'     => ['.search.msn.com'],
        'BingPreview' => ['.search.msn.com'],
        'Google'      => ['.google.com', '.googlebot.com'],
        'Mail.RU_Bot' => ['mail.ru'],
        'msnbot'      => ['.search.msn.com'],
        'Qwantify'    => ['.search.qwant.com'],
        'Sogou'       => ['.crawl.sogou.com'],
        'Yahoo'       => ['.crawl.yahoo.net'],
        'Yandex'      => ['.yandex.ru', '.yandex.net', '.yandex.com'],
    ];

    /**
     * Some search engines only use reverse DNS to verify the IP address.
     *
     * Keys are user-agent fragments; values are the accepted host-name suffixes.
     *
     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
     * @see https://napoveda.seznam.cz/en/full-text-search/seznambot-crawler
     */
    private const ROBOT_REV_ONLY_DNS = [
        'Baiduspider' => ['.baidu.com', '.baidu.jp'],
        'FreshBot'    => ['.seznam.cz'],
        'Seznam'      => ['.seznam.cz'],
    ];

    /**
     * Some search engines operate from designated IP addresses.
     *
     * Keys are user-agent fragments; values are single addresses or CIDR ranges.
     *
     * @see https://www.apple.com/go/applebot
     * @see https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
     */
    private const ROBOT_IPS = [
        'AppleBot'    => [
            '17.0.0.0/8',
        ],
        'Ask Jeeves'  => [
            '65.214.45.143',
            '65.214.45.148',
            '66.235.124.192',
            '66.235.124.7',
            '66.235.124.101',
            '66.235.124.193',
            '66.235.124.73',
            '66.235.124.196',
            '66.235.124.74',
            '63.123.238.8',
            '202.143.148.61',
        ],
        'DuckDuckBot' => [
            '23.21.227.69',
            '50.16.241.113',
            '50.16.241.114',
            '50.16.241.117',
            '50.16.247.234',
            '52.204.97.54',
            '52.5.190.19',
            '54.197.234.188',
            '54.208.100.253',
            '54.208.102.37',
            '107.21.1.8',
        ],
    ];

    /**
     * Some search engines operate from within a designated autonomous system.
     *
     * Keys are user-agent fragments; values list every ASN the robot may use.
     * A robot is accepted when its address belongs to ANY of the listed ASNs.
     *
     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
     * @see https://www.facebook.com/peering/
     */
    private const ROBOT_ASNS = [
        'facebook' => ['AS32934', 'AS63293'],
        'twitter'  => ['AS13414'],
    ];

    /**
     * Reject requests from bad/fake robots and blocked networks; pass everything else on.
     *
     * @param ServerRequestInterface  $request
     * @param RequestHandlerInterface $handler
     *
     * @return ResponseInterface
     */
    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
    {
        $ua      = Validator::serverParams($request)->string('HTTP_USER_AGENT', '');
        $ip      = Validator::attributes($request)->string('client-ip');
        $address = IPFactory::parseAddressString($ip);
        assert($address instanceof AddressInterface);

        // Robots that are never welcome.
        foreach (self::BAD_ROBOTS as $robot) {
            if (str_contains($ua, $robot)) {
                return $this->response();
            }
        }

        // Robots that must pass a reverse lookup followed by a forward lookup.
        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, false)) {
                return $this->response();
            }
        }

        // Robots that only need to pass a reverse lookup.
        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, true)) {
                return $this->response();
            }
        }

        // Robots that must connect from one of a fixed set of addresses/ranges.
        foreach (self::ROBOT_IPS as $robot => $valid_ips) {
            if (str_contains($ua, $robot)) {
                // Use a dedicated loop variable, so that the client's $ip is not overwritten.
                foreach ($valid_ips as $valid_ip) {
                    $range = IPFactory::parseRangeString($valid_ip);

                    if ($range instanceof RangeInterface && $range->contains($address)) {
                        // Genuine robot - carry on with the next robot name.
                        continue 2;
                    }
                }

                return $this->response();
            }
        }

        // Robots that must connect from one of their operator's autonomous systems.
        // The address only needs to be in ONE of the listed ASNs, not in all of them.
        foreach (self::ROBOT_ASNS as $robot => $asns) {
            if (str_contains($ua, $robot) && !$this->addressIsInAsns($address, $asns)) {
                return $this->response();
            }
        }

        // Allow sites to block access from entire networks.
        $block_asn = Validator::attributes($request)->string('block_asn', '');
        preg_match_all('/(AS\d+)/', $block_asn, $matches);

        if ($this->addressIsInAsns($address, $matches[1])) {
            return $this->response();
        }

        return $handler->handle($request);
    }

    /**
     * Check that an IP address belongs to a robot operator using a forward/reverse DNS lookup.
     *
     * The reverse lookup must give a host name ending with one of the valid domains.
     * Unless $reverse_only is set, a forward lookup of that host name must then give
     * back exactly the original IP address.
     *
     * NOTE(review): gethostbyname() only resolves IPv4 addresses, so an IPv6 client
     * can never pass the forward check.  Confirm whether this is intended.
     *
     * @param string        $ip
     * @param array<string> $valid_domains
     * @param bool          $reverse_only
     *
     * @return bool
     */
    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
    {
        $host = gethostbyaddr($ip);

        if ($host === false) {
            return false;
        }

        foreach ($valid_domains as $domain) {
            if (str_ends_with($host, $domain)) {
                // Only the first matching domain is considered.
                return $reverse_only || $ip === gethostbyname($host);
            }
        }

        return false;
    }

    /**
     * Does an IP address belong to any of the given autonomous systems?
     *
     * @param AddressInterface $address
     * @param array<string>    $asns - Autonomous system numbers, e.g. "AS12345"
     *
     * @return bool
     */
    private function addressIsInAsns(AddressInterface $address, array $asns): bool
    {
        foreach ($asns as $asn) {
            foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                if ($range->contains($address)) {
                    return true;
                }
            }
        }

        return false;
    }

    /**
     * Perform a whois search for an ASN.
     *
     * Results are stored in the file cache under a per-ASN key, with a randomised TTL.
     *
     * NOTE(review): a failed lookup returns an empty list from inside the cached
     * callback, so the empty list is presumably cached for the full TTL as well.
     * Confirm against the cache implementation.
     *
     * @param string $asn - The autonomous system number to query
     *
     * @return array<RangeInterface>
     */
    private function fetchIpRangesForAsn(string $asn): array
    {
        $ttl = random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX);

        return Registry::cache()->file()->remember('whois-asn-' . $asn, static function () use ($asn): array {
            // Each route has either an IPv4 range (route) or an IPv6 range (route6).
            $mapper = static fn (AsnRouteInfo $route_info): ?RangeInterface =>
                IPFactory::parseRangeString($route_info->route ?: $route_info->route6);

            try {
                $loader = new CurlLoader(self::WHOIS_TIMEOUT);
                $whois  = new Whois($loader);
                $info   = $whois->loadAsnInfo($asn);
                $routes = $info->routes;
                $ranges = array_map($mapper, $routes);

                // Discard any routes that could not be parsed.
                return array_filter($ranges);
            } catch (Throwable $ex) {
                // Best effort - treat any whois failure as "no known ranges".
                return [];
            }
        }, $ttl);
    }

    /**
     * The response sent to every blocked request.
     *
     * @return ResponseInterface
     */
    private function response(): ResponseInterface
    {
        return response('Not acceptable', StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
    }
}
296