xref: /webtrees/app/Http/Middleware/BadBotBlocker.php (revision 617057d4ae09fbcd822ba635c6ed70514af7603a)
1089dadacSGreg Roach<?php
2089dadacSGreg Roach
3089dadacSGreg Roach/**
4089dadacSGreg Roach * webtrees: online genealogy
5089dadacSGreg Roach * Copyright (C) 2019 webtrees development team
6089dadacSGreg Roach * This program is free software: you can redistribute it and/or modify
7089dadacSGreg Roach * it under the terms of the GNU General Public License as published by
8089dadacSGreg Roach * the Free Software Foundation, either version 3 of the License, or
9089dadacSGreg Roach * (at your option) any later version.
10089dadacSGreg Roach * This program is distributed in the hope that it will be useful,
11089dadacSGreg Roach * but WITHOUT ANY WARRANTY; without even the implied warranty of
12089dadacSGreg Roach * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13089dadacSGreg Roach * GNU General Public License for more details.
14089dadacSGreg Roach * You should have received a copy of the GNU General Public License
15089dadacSGreg Roach * along with this program. If not, see <http://www.gnu.org/licenses/>.
16089dadacSGreg Roach */
17089dadacSGreg Roach
18089dadacSGreg Roachdeclare(strict_types=1);
19089dadacSGreg Roach
20089dadacSGreg Roachnamespace Fisharebest\Webtrees\Http\Middleware;
21089dadacSGreg Roach
use Fig\Http\Message\StatusCodeInterface;
use Fisharebest\Webtrees\Cache;
use Illuminate\Support\Str;
use Iodev\Whois\Loaders\CurlLoader;
use Iodev\Whois\Modules\Asn\AsnRouteInfo;
use Iodev\Whois\Whois;
use IPLib\Address\AddressInterface;
use IPLib\Factory;
use IPLib\Range\RangeInterface;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\ServerRequestInterface;
use Psr\Http\Server\MiddlewareInterface;
use Psr\Http\Server\RequestHandlerInterface;
use Throwable;

use function app;
use function array_map;
use function assert;
use function dns_get_record;
use function gethostbyaddr;
use function gethostbyname;
use function in_array;
use function inet_pton;
use function response;
44089dadacSGreg Roach
45089dadacSGreg Roach/**
46089dadacSGreg Roach * Middleware to block bad robots before they waste our valuable CPU cycles.
47089dadacSGreg Roach */
class BadBotBlocker implements MiddlewareInterface
{
    // Cache whois requests.  The TTL is chosen at random between MIN and MAX,
    // to avoid all caches expiring at the same time.
    // NOTE(review): the values look like seconds (28 and 35 days) - confirm against Cache::remember().
    private const WHOIS_TTL_MIN = 28 * 86400;
    private const WHOIS_TTL_MAX = 35 * 86400;
    // Passed to the whois CurlLoader - presumably a timeout in seconds.
    private const WHOIS_TIMEOUT = 5;

    // Bad robots - SEO optimisers, advertisers, etc
    // A request is rejected when its user-agent contains any of these strings.
    private const BAD_ROBOTS = [
        'admantx',
        'AhrefsBot',
        'AspiegelBot',
        'DotBot',
        'Grapeshot',
        'ia_archiver',
        'MJ12bot',
        'panscient',
        'proximic',
        'SemrushBot',
        'XoviBot',
    ];

    /**
     * Some search engines use reverse/forward DNS to verify the IP address.
     *
     * Keys are user-agent fragments; values are the host-name suffixes that the
     * reverse DNS lookup of the client address must end with.
     *
     * @see https://support.google.com/webmasters/answer/80553?hl=en
     * @see https://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0
     * @see https://www.bing.com/webmaster/help/how-to-verify-bingbot-3905dc26
     * @see https://yandex.com/support/webmaster/robot-workings/check-yandex-robots.html
     */
    private const ROBOT_REV_FWD_DNS = [
        'bingbot'     => ['.search.msn.com'],
        'BingPreview' => ['.search.msn.com'],
        'Google'      => ['.google.com', '.googlebot.com'],
        'msnbot'      => ['.search.msn.com'],
        'Qwantify'    => ['.search.qwant.com'],
        'Sogou'       => ['.crawl.sogou.com'],
        'Yahoo'       => ['.crawl.yahoo.net'],
        'Yandex'      => ['.yandex.ru', '.yandex.net', '.yandex.com'],
    ];

    /**
     * Some search engines only use reverse DNS to verify the IP address.
     *
     * @see https://help.baidu.com/question?prod_id=99&class=0&id=3001
     */
    private const ROBOT_REV_ONLY_DNS = [
        'Baiduspider' => ['.baidu.com', '.baidu.jp'],
    ];

    /**
     * Some search engines operate from designated IP addresses.
     *
     * Keys are user-agent fragments; values are single addresses or CIDR ranges.
     *
     * @see http://www.apple.com/go/applebot
     * @see https://help.duckduckgo.com/duckduckgo-help-pages/results/duckduckbot
     */
    private const ROBOT_IPS = [
        'AppleBot'    => [
            '17.0.0.0/8',
        ],
        'Ask Jeeves'  => [
            '65.214.45.143',
            '65.214.45.148',
            '66.235.124.192',
            '66.235.124.7',
            '66.235.124.101',
            '66.235.124.193',
            '66.235.124.73',
            '66.235.124.196',
            '66.235.124.74',
            '63.123.238.8',
            '202.143.148.61',
        ],
        'DuckDuckBot' => [
            '23.21.227.69',
            '50.16.241.113',
            '50.16.241.114',
            '50.16.241.117',
            '50.16.247.234',
            '52.204.97.54',
            '52.5.190.19',
            '54.197.234.188',
            '54.208.100.253',
            '54.208.102.37',
            '107.21.1.8',
        ],
    ];

    /**
     * Some search engines operate from within a designated autonomous system.
     *
     * @see https://developers.facebook.com/docs/sharing/webmasters/crawler
     */
    private const ROBOT_ASN = [
        'facebook' => 'AS32934',
        'twitter'  => 'AS13414',
    ];

    /**
     * Reject unwanted robots, and clients that claim to be a well-known robot
     * but do not come from that robot's network.  Everything else is passed
     * to the next handler.
     *
     * The checks run in this order, and the first failure ends the request:
     *  1. user-agent contains a BAD_ROBOTS entry;
     *  2. user-agent names a ROBOT_REV_FWD_DNS robot, but reverse + forward DNS do not agree;
     *  3. user-agent names a ROBOT_REV_ONLY_DNS robot, but reverse DNS does not match;
     *  4. user-agent names a ROBOT_IPS robot, but the address is outside its listed ranges;
     *  5. user-agent names a ROBOT_ASN robot, but the address is outside that ASN's routes;
     *  6. the address is inside any ASN listed in the "block_asn" request attribute.
     *
     * @param ServerRequestInterface  $request
     * @param RequestHandlerInterface $handler
     *
     * @return ResponseInterface
     */
    public function process(ServerRequestInterface $request, RequestHandlerInterface $handler): ResponseInterface
    {
        $ua      = $request->getServerParams()['HTTP_USER_AGENT'] ?? '';
        $ip      = $request->getAttribute('client-ip');
        $address = Factory::addressFromString($ip);
        assert($address instanceof AddressInterface);

        if (Str::contains($ua, self::BAD_ROBOTS)) {
            return $this->response();
        }

        foreach (self::ROBOT_REV_FWD_DNS as $robot => $valid_domains) {
            if (Str::contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, false)) {
                return $this->response();
            }
        }

        foreach (self::ROBOT_REV_ONLY_DNS as $robot => $valid_domains) {
            if (Str::contains($ua, $robot) && !$this->checkRobotDNS($ip, $valid_domains, true)) {
                return $this->response();
            }
        }

        // The ranges are only parsed when the user-agent actually names the robot.
        foreach (self::ROBOT_IPS as $robot => $valid_ips) {
            if (Str::contains($ua, $robot) && !$this->addressInRanges($address, $this->parseRanges($valid_ips))) {
                return $this->response();
            }
        }

        // The whois lookup is only made when the user-agent actually names the robot.
        foreach (self::ROBOT_ASN as $robot => $asn) {
            if (Str::contains($ua, $robot) && !$this->addressInRanges($address, $this->fetchIpRangesForAsn($asn))) {
                return $this->response();
            }
        }

        // Allow sites to block access from entire networks.
        // The attribute is scanned for tokens such as "AS12345"; anything else in it is ignored.
        preg_match_all('/(AS\d+)/', (string) $request->getAttribute('block_asn', ''), $matches);

        foreach ($matches[1] as $asn) {
            if ($this->addressInRanges($address, $this->fetchIpRangesForAsn($asn))) {
                return $this->response();
            }
        }

        return $handler->handle($request);
    }

    /**
     * Convert a list of addresses / CIDR strings to range objects, discarding any that cannot be parsed.
     *
     * @param array<string> $specs
     *
     * @return array<RangeInterface>
     */
    private function parseRanges(array $specs): array
    {
        $ranges = [];

        foreach ($specs as $spec) {
            $range = Factory::rangeFromString($spec);

            if ($range instanceof RangeInterface) {
                $ranges[] = $range;
            }
        }

        return $ranges;
    }

    /**
     * Is an address contained in at least one of the given ranges?
     *
     * @param AddressInterface      $address
     * @param array<RangeInterface> $ranges
     *
     * @return bool - false when the list of ranges is empty
     */
    private function addressInRanges(AddressInterface $address, array $ranges): bool
    {
        foreach ($ranges as $range) {
            if ($range->contains($address)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Check that an IP address belongs to a robot operator using a forward/reverse DNS lookup.
     *
     * The reverse lookup must give a host name ending in one of the valid domains.
     * Unless $reverse_only is set, that host name must also resolve back to the
     * same IP address.
     *
     * @param string        $ip
     * @param array<string> $valid_domains
     * @param bool          $reverse_only
     *
     * @return bool
     */
    private function checkRobotDNS(string $ip, array $valid_domains, bool $reverse_only): bool
    {
        $host = gethostbyaddr($ip);

        if ($host === false || !Str::endsWith($host, $valid_domains)) {
            return false;
        }

        if ($reverse_only) {
            return true;
        }

        // gethostbyname() can only ever return an IPv4 address, so comparing its
        // result with an IPv6 client address would reject every genuine robot.
        if (filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV6) !== false) {
            return $this->hostHasIpv6Address($host, $ip);
        }

        return $ip === gethostbyname($host);
    }

    /**
     * Does a host name have an AAAA record for the given IPv6 address?
     *
     * @param string $host
     * @param string $ip - A valid IPv6 address, in any textual notation
     *
     * @return bool
     */
    private function hostHasIpv6Address(string $host, string $ip): bool
    {
        try {
            // Compare the packed form, as the same IPv6 address can be written in several ways.
            $expected = inet_pton($ip);
            $records  = dns_get_record($host, DNS_AAAA);

            if ($expected === false || $records === false) {
                return false;
            }

            foreach ($records as $record) {
                if (isset($record['ipv6']) && inet_pton($record['ipv6']) === $expected) {
                    return true;
                }
            }
        } catch (Throwable $ex) {
            // A failed lookup raises a warning, which an error handler may convert
            // to an exception.  Either way, the address is unverified.
        }

        return false;
    }

    /**
     * Perform a whois search for an ASN.
     *
     * The result is stored in the "cache.files" cache under the key "whois-asn-<ASN>".
     *
     * NOTE(review): when the whois lookup throws, the empty list is returned from
     * inside the cached callback - so a temporary failure is presumably remembered
     * for the full TTL.  Confirm against Cache::remember() whether this is intended.
     *
     * @param string $asn - The autonomous system number to query
     *
     * @return array<RangeInterface>
     */
    private function fetchIpRangesForAsn(string $asn): array
    {
        $cache = app('cache.files');
        assert($cache instanceof Cache);

        return $cache->remember('whois-asn-' . $asn, static function () use ($asn): array {
            try {
                $loader = new CurlLoader(self::WHOIS_TIMEOUT);
                $whois  = new Whois($loader);
                $info   = $whois->loadAsnInfo($asn);
                $routes = $info->getRoutes();
                // Each route has either an IPv4 route or an IPv6 route.
                $ranges = array_map(static function (AsnRouteInfo $route_info): ?RangeInterface {
                    return Factory::rangeFromString($route_info->getRoute() ?: $route_info->getRoute6());
                }, $routes);

                // Discard any routes that could not be parsed.
                return array_filter($ranges);
            } catch (Throwable $ex) {
                return [];
            }
        }, random_int(self::WHOIS_TTL_MIN, self::WHOIS_TTL_MAX));
    }

    /**
     * The response sent to every blocked client.
     *
     * @return ResponseInterface
     */
    private function response(): ResponseInterface
    {
        return response('Not acceptable', StatusCodeInterface::STATUS_NOT_ACCEPTABLE);
    }
}
271