xref: /webtrees/app/Http/Middleware/BadBotBlocker.php (revision 03bad5398d62fa232e48502fa908a39a7be0197c)
1089dadacSGreg Roach<?php
2089dadacSGreg Roach
/**
 * webtrees: online genealogy
 * Copyright (C) 2021 webtrees development team
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */
17089dadacSGreg Roach
18089dadacSGreg Roachdeclare(strict_types=1);
19089dadacSGreg Roach
20089dadacSGreg Roachnamespace Fisharebest\Webtrees\Http\Middleware;
21089dadacSGreg Roach
22089dadacSGreg Roachuse Fig\Http\Message\StatusCodeInterface;
236b9cb339SGreg Roachuse Fisharebest\Webtrees\Registry;
24089dadacSGreg Roachuse Iodev\Whois\Loaders\CurlLoader;
25089dadacSGreg Roachuse Iodev\Whois\Modules\Asn\AsnRouteInfo;
26089dadacSGreg Roachuse Iodev\Whois\Whois;
27089dadacSGreg Roachuse IPLib\Address\AddressInterface;
2869675509SGreg Roachuse IPLib\Factory as IPFactory;
29089dadacSGreg Roachuse IPLib\Range\RangeInterface;
30089dadacSGreg Roachuse Psr\Http\Message\ResponseInterface;
31089dadacSGreg Roachuse Psr\Http\Message\ServerRequestInterface;
32089dadacSGreg Roachuse Psr\Http\Server\MiddlewareInterface;
33089dadacSGreg Roachuse Psr\Http\Server\RequestHandlerInterface;
34089dadacSGreg Roachuse Throwable;
35089dadacSGreg Roach
36089dadacSGreg Roachuse function array_map;
37089dadacSGreg Roachuse function assert;
38089dadacSGreg Roachuse function gethostbyaddr;
39089dadacSGreg Roachuse function gethostbyname;
40089dadacSGreg Roachuse function response;
41dec352c1SGreg Roachuse function str_contains;
42dec352c1SGreg Roachuse function str_ends_with;
43089dadacSGreg Roach
/**
 * Middleware to block bad robots before they waste our valuable CPU cycles.
 *
 * A request is rejected with "406 Not Acceptable" when:
 *  - the user-agent contains the name of a known bad robot;
 *  - the user-agent contains the name of a well-known crawler, but the client IP address
 *    cannot be verified (by DNS, by a fixed list of addresses, or by autonomous system);
 *  - the client IP address lies inside a network listed in the "block_asn" request attribute.
 *
 * All user-agent comparisons are case-sensitive substring matches.
 */
class BadBotBlocker implements MiddlewareInterface
{
    // Cache whois requests.  Try to avoid all caches expiring at the same time.
    // The TTL is chosen at random between 28 and 35 days (86400 seconds per day).
    private const WHOIS_TTL_MIN = 28 * 86400;
    private const WHOIS_TTL_MAX = 35 * 86400;
    private const WHOIS_TIMEOUT = 5;

    // Bad robots - SEO optimisers, advertisers, etc.  This list is shared with robots.txt.
    public const BAD_ROBOTS = [
        'admantx',
        'Adsbot',
        'AhrefsBot',
        'AspiegelBot',
        'Barkrowler',
        'DotBot',
        'Grapeshot',
        'ia_archiver',
        'Linguee',
        'MJ12bot',
        'panscient',
        'PetalBot',
        'proximic',
        'SemrushBot',
        'Turnitin',
        'XoviBot',
    ];

    /**
     * Some search engines use reverse/forward DNS to verify the IP address.
     *
     * Each value is a list of host-name suffixes.  Every suffix starts with a dot, so that
     * it can only match on a DNS label boundary ("x.mail.ru" matches ".mail.ru", but
     * "evilmail.ru" does not).
     *
     * @see https://support.google.com/webmasters/answer/80553?hl=en
     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
     */
    private const ROBOT_REV_FWD_DNS = [
        'bingbot'     => ['.search.msn.com'],
        'BingPreview' => ['.search.msn.com'],
        'Google'      => ['.google.com', '.googlebot.com'],
        'Mail.RU_Bot' => ['.mail.ru'],
        'msnbot'      => ['.search.msn.com'],
        'Qwantify'    => ['.search.qwant.com'],
        'Sogou'       => ['.crawl.sogou.com'],
        'Yahoo'       => ['.crawl.yahoo.net'],
        'Yandex'      => ['.yandex.ru', '.yandex.net', '.yandex.com'],
    ];

    /**
     * Some search engines only use reverse DNS to verify the IP address.
     *
     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
     */
    private const ROBOT_REV_ONLY_DNS = [
        'Baiduspider' => ['.baidu.com', '.baidu.jp'],
    ];

    /**
     * Some search engines operate from designated IP addresses.
     *
     * Each entry may be a single address or a range in CIDR notation.
     *
     * @see http://www.apple.com/go/applebot
     * @see https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
     */
    private const ROBOT_IPS = [
        'AppleBot'    => [
            '17.0.0.0/8',
        ],
        'Ask Jeeves'  => [
            '65.214.45.143',
            '65.214.45.148',
            '66.235.124.192',
            '66.235.124.7',
            '66.235.124.101',
            '66.235.124.193',
            '66.235.124.73',
            '66.235.124.196',
            '66.235.124.74',
            '63.123.238.8',
            '202.143.148.61',
        ],
        'DuckDuckBot' => [
            '23.21.227.69',
            '50.16.241.113',
            '50.16.241.114',
            '50.16.241.117',
            '50.16.247.234',
            '52.204.97.54',
            '52.5.190.19',
            '54.197.234.188',
            '54.208.100.253',
            '54.208.102.37',
            '107.21.1.8',
        ],
    ];

    /**
     * Some search engines operate from within a designated autonomous system.
     *
     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
     */
    private const ROBOT_ASN = [
        'facebook' => 'AS32934',
        'twitter'  => 'AS13414',
    ];

    /**
     * Reject the request if it comes from a bad or unverifiable robot, or from a blocked
     * network.  Otherwise pass it to the next handler.
     *
     * @param ServerRequestInterface  $request
     * @param RequestHandlerInterface $handler
     *
     * @return ResponseInterface
     */
    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
    {
        $ua      = $request->getServerParams()['HTTP_USER_AGENT'] ?? '';
        $ip      = $request->getAttribute('client-ip');
        $address = IPFactory::addressFromString($ip);
        // NOTE(review): assertions may be disabled in production.  This relies on earlier
        // middleware always providing a valid "client-ip" attribute - confirm.
        assert($address instanceof AddressInterface);

        // Robots that are never welcome.
        foreach (self::BAD_ROBOTS as $robot) {
            if (str_contains($ua, $robot)) {
                return $this->response();
            }
        }

        // Robots that must pass both a reverse and a forward DNS lookup.
        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, false)) {
                return $this->response();
            }
        }

        // Robots that only need to pass a reverse DNS lookup.
        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
            if (str_contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, true)) {
                return $this->response();
            }
        }

        // Robots that must connect from one of a fixed list of addresses/ranges.
        foreach (self::ROBOT_IPS as $robot => $valid_ips) {
            if (str_contains($ua, $robot)) {
                // Use a separate loop variable, so that the client address in $ip is not overwritten.
                foreach ($valid_ips as $valid_ip) {
                    $range = IPFactory::rangeFromString($valid_ip);

                    if ($range instanceof RangeInterface && $range->contains($address)) {
                        // Verified.  Move on to the next robot name.
                        continue 2;
                    }
                }

                return $this->response();
            }
        }

        // Robots that must connect from within a given autonomous system.
        foreach (self::ROBOT_ASN as $robot => $asn) {
            if (str_contains($ua, $robot)) {
                foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                    if ($range->contains($address)) {
                        // Verified.  Move on to the next robot name.
                        continue 2;
                    }
                }

                return $this->response();
            }
        }

        // Allow sites to block access from entire networks.
        // Every "AS" followed by digits found in the "block_asn" attribute is treated as a blocked network.
        preg_match_all('/(AS\d+)/', $request->getAttribute('block_asn', ''), $matches);
        foreach ($matches[1] as $asn) {
            foreach ($this->fetchIpRangesForAsn($asn) as $range) {
                if ($range->contains($address)) {
                    return $this->response();
                }
            }
        }

        return $handler->handle($request);
    }

    /**
     * Check that an IP address belongs to a robot operator using a forward/reverse DNS lookup.
     *
     * The reverse lookup must give a host name ending with one of the valid domains.  Unless
     * $reverse_only is set, a forward lookup of that host name must also give the original
     * IP address.
     *
     * @param string        $ip            The client IP address
     * @param array<string> $valid_domains Host-name suffixes that identify the robot operator
     * @param bool          $reverse_only  Skip the forward lookup
     *
     * @return bool
     */
    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
    {
        $host = gethostbyaddr($ip);

        // gethostbyaddr() gives false for malformed input.  When the lookup fails, it gives
        // the IP address unchanged, which will not match any of the domain suffixes below.
        if ($host === false) {
            return false;
        }

        foreach ($valid_domains as $domain) {
            if (str_ends_with($host, $domain)) {
                // NOTE(review): gethostbyname() only resolves IPv4 addresses, so the forward
                // check cannot succeed for a client that connects over IPv6.
                return $reverse_only || $ip === gethostbyname($host);
            }
        }

        return false;
    }

    /**
     * Perform a whois search for an ASN.
     *
     * The result is stored in the file cache under the key "whois-asn-<ASN>", with a TTL
     * chosen at random between WHOIS_TTL_MIN and WHOIS_TTL_MAX.
     *
     * NOTE(review): a failed lookup gives an empty list, which is presumably cached for the
     * same period.  While cached, robots verified by ASN would all be rejected - confirm.
     *
     * @param string $asn - The autonomous system number to query
     *
     * @return array<RangeInterface>
     */
    private function fetchIpRangesForAsn(string $asn): array
    {
        return Registry::cache()->file()->remember('whois-asn-' . $asn, static function () use ($asn): array {
            try {
                $loader = new CurlLoader(self::WHOIS_TIMEOUT);
                $whois  = new Whois($loader);
                $info   = $whois->loadAsnInfo($asn);
                $routes = $info->getRoutes();
                // Prefer the IPv4 route.  Fall back to the IPv6 route when it is empty.
                $ranges = array_map(static function (AsnRouteInfo $route_info): ?RangeInterface {
                    return IPFactory::rangeFromString($route_info->getRoute() ?: $route_info->getRoute6());
                }, $routes);

                // Discard routes that could not be parsed into a range.
                return array_filter($ranges);
            } catch (Throwable $ex) {
                // Best effort - any failure is treated as "no known ranges".
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * The response sent to every blocked request.
     *
     * @return ResponseInterface
     */
    private function response(): ResponseInterface
    {
        return response('Not acceptable', StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
    }
}
281