git.mxchange.org Git - friendica.git/commitdiff
Use mattwright/urlresolver for HTTPClient::finalUrl()
author Philipp <admin@philipp.info>
Mon, 23 Aug 2021 12:28:25 +0000 (14:28 +0200)
committer Philipp <admin@philipp.info>
Wed, 25 Aug 2021 12:22:42 +0000 (14:22 +0200)
composer.json
composer.lock
src/Factory/HTTPClientFactory.php
src/Network/HTTPClient.php
src/Network/IHTTPClient.php

index 2dd5dec7b9a3a72b3992ab939f2bd5c858d79d30..bf0559254e45d538f98a1af9a7398ebfdc891602 100644 (file)
@@ -69,7 +69,8 @@
                "npm-asset/perfect-scrollbar": "0.6.16",
                "npm-asset/textcomplete": "^0.18.2",
                "npm-asset/typeahead.js": "^0.11.1",
-               "minishlink/web-push": "^6.0"
+               "minishlink/web-push": "^6.0",
+               "mattwright/urlresolver": "^2.0"
        },
        "repositories": [
                {
index 5e8f1a20aa556e0355b431e3defae42532c36868..906a681e452967cd89436b5d91090b4d556fc0f9 100644 (file)
@@ -4,7 +4,7 @@
         "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
         "This file is @generated automatically"
     ],
-    "content-hash": "7d6dee6e449da931e8fe209e61b2e78e",
+    "content-hash": "c9e0a9eacc23d884012042eeab01cc8b",
     "packages": [
         {
             "name": "asika/simple-console",
             ],
             "time": "2017-07-19T15:11:19+00:00"
         },
+        {
+            "name": "mattwright/urlresolver",
+            "version": "2.0",
+            "source": {
+                "type": "git",
+                "url": "https://github.com/mattwright/URLResolver.php.git",
+                "reference": "416039192cb6d9158bdacd68349bceff8739b857"
+            },
+            "dist": {
+                "type": "zip",
+                "url": "https://api.github.com/repos/mattwright/URLResolver.php/zipball/416039192cb6d9158bdacd68349bceff8739b857",
+                "reference": "416039192cb6d9158bdacd68349bceff8739b857",
+                "shasum": ""
+            },
+            "require": {
+                "ext-curl": "*",
+                "ext-mbstring": "*",
+                "php": ">=5.3"
+            },
+            "type": "library",
+            "autoload": {
+                "psr-4": {
+                    "mattwright\\": "."
+                }
+            },
+            "notification-url": "https://packagist.org/downloads/",
+            "license": [
+                "MIT"
+            ],
+            "authors": [
+                {
+                    "name": "Matt Wright",
+                    "email": "mw@mattwright.com"
+                }
+            ],
+            "description": "PHP class that attempts to resolve URLs to a final, canonical link.",
+            "homepage": "https://github.com/mattwright/URLResolver.php",
+            "keywords": [
+                "canonical",
+                "link",
+                "redirect",
+                "resolve",
+                "url"
+            ],
+            "time": "2019-01-18T00:59:34+00:00"
+        },
         {
             "name": "michelf/php-markdown",
             "version": "1.9.0",
index 636f8a46d988cc3d59bcc325272cb0548f9ca1d1..c1cb47541407d9bb32bc95990a444e61697b39e2 100644 (file)
@@ -10,6 +10,7 @@ use Friendica\Network\IHTTPClient;
 use Friendica\Util\Profiler;
 use GuzzleHttp\Client;
 use GuzzleHttp\RequestOptions;
+use mattwright\URLResolver;
 use Psr\Http\Message\RequestInterface;
 use Psr\Http\Message\ResponseInterface;
 use Psr\Http\Message\UriInterface;
@@ -85,6 +86,13 @@ class HTTPClientFactory extends BaseFactory
                        ],
                ]);
 
-               return new HTTPClient($logger, $this->profiler, $this->config, $userAgent, $guzzle);
+               $resolver = new URLResolver();
+               $resolver->setUserAgent($userAgent);
+               $resolver->setMaxRedirects(10);
+               $resolver->setRequestTimeout(10);
+               // if the file is too large then exit
+               $resolver->setMaxResponseDataSize(1000000);
+
+               return new HTTPClient($logger, $this->profiler, $guzzle, $resolver);
        }
 }
index 000d3c76af34bc69ff15d7dcd9eae4fe6a3b94ca..d83b805df0d278229769228aa58f032d00ef2501 100644 (file)
@@ -21,9 +21,6 @@
 
 namespace Friendica\Network;
 
-use DOMDocument;
-use DomXPath;
-use Friendica\Core\Config\IConfig;
 use Friendica\Core\System;
 use Friendica\Util\Network;
 use Friendica\Util\Profiler;
@@ -32,6 +29,7 @@ use GuzzleHttp\Cookie\FileCookieJar;
 use GuzzleHttp\Exception\RequestException;
 use GuzzleHttp\Exception\TransferException;
 use GuzzleHttp\RequestOptions;
+use mattwright\URLResolver;
 use Psr\Http\Message\ResponseInterface;
 use Psr\Log\LoggerInterface;
 
@@ -44,20 +42,17 @@ class HTTPClient implements IHTTPClient
        private $logger;
        /** @var Profiler */
        private $profiler;
-       /** @var IConfig */
-       private $config;
-       /** @var string */
-       private $userAgent;
        /** @var Client */
        private $client;
+       /** @var URLResolver */
+       private $resolver;
 
-       public function __construct(LoggerInterface $logger, Profiler $profiler, IConfig $config, string $userAgent, Client $client)
+       public function __construct(LoggerInterface $logger, Profiler $profiler, Client $client, URLResolver $resolver)
        {
-               $this->logger    = $logger;
-               $this->profiler  = $profiler;
-               $this->config    = $config;
-               $this->userAgent = $userAgent;
-               $this->client    = $client;
+               $this->logger   = $logger;
+               $this->profiler = $profiler;
+               $this->client   = $client;
+               $this->resolver = $resolver;
        }
 
        /**
@@ -97,6 +92,11 @@ class HTTPClient implements IHTTPClient
                        return CurlResult::createErrorCurl($url);
                }
 
+               if (Network::isRedirectBlocked($url)) {
+                       $this->logger->info('Domain should not be redirected.', ['url' => $url]);
+                       return CurlResult::createErrorCurl($url);
+               }
+
                $conf = [];
 
                if (!empty($opts['cookiejar'])) {
@@ -197,10 +197,12 @@ class HTTPClient implements IHTTPClient
        /**
         * {@inheritDoc}
         */
-       public function finalUrl(string $url, int $depth = 1, bool $fetchbody = false)
+       public function finalUrl(string $url)
        {
+               $this->profiler->startRecording('network');
+
                if (Network::isLocalLink($url)) {
-                       $this->logger->info('Local link', ['url' => $url, 'callstack' => System::callstack(20)]);
+                       $this->logger->debug('Local link', ['url' => $url, 'callstack' => System::callstack(20)]);
                }
 
                if (Network::isUrlBlocked($url)) {
@@ -215,104 +217,19 @@ class HTTPClient implements IHTTPClient
 
                $url = Network::stripTrackingQueryParams($url);
 
-               if ($depth > 10) {
-                       return $url;
-               }
-
                $url = trim($url, "'");
 
-               $this->profiler->startRecording('network');
-
-               $ch = curl_init();
-               curl_setopt($ch, CURLOPT_URL, $url);
-               curl_setopt($ch, CURLOPT_HEADER, 1);
-               curl_setopt($ch, CURLOPT_NOBODY, 1);
-               curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
-               curl_setopt($ch, CURLOPT_TIMEOUT, 10);
-               curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
-               curl_setopt($ch, CURLOPT_USERAGENT, $this->userAgent);
-
-               curl_exec($ch);
-               $curl_info = @curl_getinfo($ch);
-               $http_code = $curl_info['http_code'];
-               curl_close($ch);
-
-               $this->profiler->stopRecording();
-
-               if ($http_code == 0) {
-                       return $url;
-               }
+               // Designate a temporary file that will store cookies during the session.
+               // Some websites test the browser for cookie support, so this enhances results.
+               $this->resolver->setCookieJar(tempnam(get_temppath() , 'url_resolver-'));
 
-               if (in_array($http_code, ['301', '302'])) {
-                       if (!empty($curl_info['redirect_url'])) {
-                               return $this->finalUrl($curl_info['redirect_url'], ++$depth, $fetchbody);
-                       } elseif (!empty($curl_info['location'])) {
-                               return $this->finalUrl($curl_info['location'], ++$depth, $fetchbody);
-                       }
-               }
+               $urlResult = $this->resolver->resolveURL($url);
 
-               // Check for redirects in the meta elements of the body if there are no redirects in the header.
-               if (!$fetchbody) {
-                       return $this->finalUrl($url, ++$depth, true);
-               }
-
-               // if the file is too large then exit
-               if ($curl_info["download_content_length"] > 1000000) {
-                       return $url;
-               }
-
-               // if it isn't a HTML file then exit
-               if (!empty($curl_info["content_type"]) && !strstr(strtolower($curl_info["content_type"]), "html")) {
-                       return $url;
-               }
-
-               $this->profiler->startRecording('network');
-
-               $ch = curl_init();
-               curl_setopt($ch, CURLOPT_URL, $url);
-               curl_setopt($ch, CURLOPT_HEADER, 0);
-               curl_setopt($ch, CURLOPT_NOBODY, 0);
-               curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
-               curl_setopt($ch, CURLOPT_TIMEOUT, 10);
-               curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
-               curl_setopt($ch, CURLOPT_USERAGENT, $this->userAgent);
-
-               $body = curl_exec($ch);
-               curl_close($ch);
-
-               $this->profiler->stopRecording();
-
-               if (trim($body) == "") {
-                       return $url;
-               }
-
-               // Check for redirect in meta elements
-               $doc = new DOMDocument();
-               @$doc->loadHTML($body);
-
-               $xpath = new DomXPath($doc);
-
-               $list = $xpath->query("//meta[@content]");
-               foreach ($list as $node) {
-                       $attr = [];
-                       if ($node->attributes->length) {
-                               foreach ($node->attributes as $attribute) {
-                                       $attr[$attribute->name] = $attribute->value;
-                               }
-                       }
-
-                       if (@$attr["http-equiv"] == 'refresh') {
-                               $path = $attr["content"];
-                               $pathinfo = explode(";", $path);
-                               foreach ($pathinfo as $value) {
-                                       if (substr(strtolower($value), 0, 4) == "url=") {
-                                               return $this->finalUrl(substr($value, 4), ++$depth);
-                                       }
-                               }
-                       }
+               if ($urlResult->didErrorOccur()) {
+                       throw new TransferException($urlResult->getErrorMessageString());
                }
 
-               return $url;
+               return $urlResult->getURL();
        }
 
        /**
index 8fa5285d264156b50afb0c195eeb0da862317b3f..180908eede4dd449115e20f6e42a49468343441d 100644 (file)
@@ -104,14 +104,10 @@ interface IHTTPClient
         * through HTTP code or meta refresh tags. Stops after 10 redirections.
         *
         * @param string $url       A user-submitted URL
-        * @param int    $depth     The current redirection recursion level (internal)
-        * @param bool   $fetchbody Wether to fetch the body or not after the HEAD requests
         *
         * @return string A canonical URL
         * @throws \Friendica\Network\HTTPException\InternalServerErrorException
         * @see   ParseUrl::getSiteinfo
-        *
-        * @todo  Remove the $fetchbody parameter that generates an extraneous HEAD request
         */
-       public function finalUrl(string $url, int $depth = 1, bool $fetchbody = false);
+       public function finalUrl(string $url);
 }