X-Git-Url: https://git.mxchange.org/?a=blobdiff_plain;f=src%2FUtil%2FParseUrl.php;h=4590f39a99cfa83aa813199619ce0323eff8d47f;hb=2a68ad9b26d0a36968128be1627f5b1a427957d7;hp=70c5279cc84a42ce9314a202375cb2f1382c6fd5;hpb=6bc865ff4d68170c83a675cfc75cdc025a192b15;p=friendica.git
diff --git a/src/Util/ParseUrl.php b/src/Util/ParseUrl.php
index 70c5279cc8..13cb55b73e 100644
--- a/src/Util/ParseUrl.php
+++ b/src/Util/ParseUrl.php
@@ -1,76 +1,139 @@
.
+ *
*/
+
namespace Friendica\Util;
use DOMDocument;
use DOMXPath;
use Friendica\Content\OEmbed;
-use Friendica\Core\Addon;
+use Friendica\Core\Hook;
use Friendica\Core\Logger;
+use Friendica\Database\Database;
use Friendica\Database\DBA;
-use Friendica\Object\Image;
-use Friendica\Util\Strings;
+use Friendica\DI;
+use Friendica\Network\HTTPException;
/**
- * @brief Class with methods for extracting certain content from an url
+ * Get information about a given URL
+ *
+ * Class with methods for extracting certain content from an url
*/
class ParseUrl
{
+ const DEFAULT_EXPIRATION_FAILURE = 'now + 1 day';
+ const DEFAULT_EXPIRATION_SUCCESS = 'now + 3 months';
+
+ /**
+ * Maximum number of characters for the description
+ */
+ const MAX_DESC_COUNT = 250;
+
+ /**
+ * Minimum number of characters for the description
+ */
+ const MIN_DESC_COUNT = 100;
+
+ /**
+ * Fetch the content type of the given url
+ *
+ * Sends a HEAD request for the url and reads the "Content-Type" header of
+ * the response. Everything after the first ";" (e.g. a charset parameter)
+ * is dropped and the remaining value is split at "/".
+ *
+ * @param string $url URL of the page
+ * @return array The content type split at "/" (e.g. "text/html; charset=utf-8"
+ *               yields ['text', 'html']), or an empty array when the request
+ *               was not successful or the header is empty
+ */
+ public static function getContentType(string $url)
+ {
+ $curlResult = DI::httpRequest()->head($url);
+ if (!$curlResult->isSuccess()) {
+ return [];
+ }
+
+ $contenttype = $curlResult->getHeader('Content-Type');
+ if (empty($contenttype)) {
+ return [];
+ }
+
+ return explode('/', current(explode(';', $contenttype)));
+ }
+
/**
- * @brief Search for chached embeddable data of an url otherwise fetch it
+ * Search for cached embeddable data of an url otherwise fetch it
*
+ * An empty url is answered directly with ['url' => '', 'type' => 'error'];
+ * neither the cache nor the network is consulted in that case.
+ *
+ * Cache rows live in the "parsed_url" table and are looked up by the sha256
+ * hash of the url together with the $do_oembed flag. The "content" column
+ * holds the PHP-serialized result array.
+ *
+ * On a cache miss the data is fetched via getSiteinfo(). Its 'expires' entry
+ * is removed from the returned array and written to the separate "expires"
+ * column; the row is stored with Database::INSERT_UPDATE.
+ *
+ * NOTE(review): the lookup in this method does not compare "expires" with the
+ * current time - presumably expired rows are purged elsewhere; confirm.
+ * NOTE(review): assumes getSiteinfo() always sets 'expires', since the key is
+ * read without a guard - TODO confirm.
+ *
* @param string $url The url of the page which should be scraped
- * @param bool $no_guessing If true the parse doens't search for
- * preview pictures
- * @param bool $do_oembed The false option is used by the function fetch_oembed()
- * to avoid endless loops
+ * @param bool $do_oembed The false option is used by the function fetch_oembed()
+ * to avoid endless loops
*
* @return array which contains needed data for embedding
- * string 'url' => The url of the parsed page
- * string 'type' => Content type
- * string 'title' => The title of the content
- * string 'text' => The description for the content
- * string 'image' => A preview image of the content (only available
- * if $no_geuessing = false
- * array'images' = Array of preview pictures
- * string 'keywords' => The tags which belong to the content
+ * string 'url' => The url of the parsed page
+ * string 'type' => Content type
+ * string 'title' => (optional) The title of the content
+ * string 'text' => (optional) The description for the content
+ * string 'image' => (optional) A preview image of the content
+ * array 'images' => (optional) Array of preview pictures
+ * string 'keywords' => (optional) The tags which belong to the content
*
- * @see ParseUrl::getSiteinfo() for more information about scraping
+ * @throws HTTPException\InternalServerErrorException
+ * @see ParseUrl::getSiteinfo() for more information about scraping
* embeddable content
*/
- public static function getSiteinfoCached($url, $no_guessing = false, $do_oembed = true)
+ public static function getSiteinfoCached($url, $do_oembed = true): array
{
- if ($url == "") {
- return false;
+ if (empty($url)) {
+ return [
+ 'url' => '',
+ 'type' => 'error',
+ ];
}
+ $urlHash = hash('sha256', $url);
+
$parsed_url = DBA::selectFirst('parsed_url', ['content'],
- ['url' => Strings::normaliseLink($url), 'guessing' => !$no_guessing, 'oembed' => $do_oembed]
+ ['url_hash' => $urlHash, 'oembed' => $do_oembed]
);
if (!empty($parsed_url['content'])) {
$data = unserialize($parsed_url['content']);
return $data;
}
- $data = self::getSiteinfo($url, $no_guessing, $do_oembed);
+ $data = self::getSiteinfo($url, $do_oembed);
+
+ $expires = $data['expires'];
+
+ unset($data['expires']);
- DBA::insert(
+ DI::dba()->insert(
'parsed_url',
[
- 'url' => Strings::normaliseLink($url), 'guessing' => !$no_guessing,
- 'oembed' => $do_oembed, 'content' => serialize($data),
- 'created' => DateTimeFormat::utcNow()
+ 'url_hash' => $urlHash,
+ 'oembed' => $do_oembed,
+ 'url' => $url,
+ 'content' => serialize($data),
+ 'created' => DateTimeFormat::utcNow(),
+ 'expires' => $expires,
],
- true
+ Database::INSERT_UPDATE
);
return $data;
}
+
/**
- * @brief Parse a page for embeddable content information
+ * Parse a page for embeddable content information
*
* This method parses to url for meta data which can be used to embed
* the content. If available it prioritizes Open Graph meta tags.
@@ -80,23 +143,21 @@ class ParseUrl
* \
*
* @param string $url The url of the page which should be scraped
- * @param bool $no_guessing If true the parse doens't search for
- * preview pictures
- * @param bool $do_oembed The false option is used by the function fetch_oembed()
- * to avoid endless loops
- * @param int $count Internal counter to avoid endless loops
+ * @param bool $do_oembed The false option is used by the function fetch_oembed()
+ * to avoid endless loops
+ * @param int $count Internal counter to avoid endless loops
*
* @return array which contains needed data for embedding
- * string 'url' => The url of the parsed page
- * string 'type' => Content type
- * string 'title' => The title of the content
- * string 'text' => The description for the content
- * string 'image' => A preview image of the content (only available
- * if $no_geuessing = false
- * array'images' = Array of preview pictures
- * string 'keywords' => The tags which belong to the content
+ * string 'url' => The url of the parsed page
+ * string 'type' => Content type (error, link, photo, image, audio, video)
+ * string 'title' => (optional) The title of the content
+ * string 'text' => (optional) The description for the content
+ * string 'image' => (optional) A preview image of the content
+ * array 'images' => (optional) Array of preview pictures
+ * string 'keywords' => (optional) The tags which belong to the content
*
- * @todo https://developers.google.com/+/plugins/snippet/
+ * @throws \Friendica\Network\HTTPException\InternalServerErrorException
+ * @todo https://developers.google.com/+/plugins/snippet/
* @verbatim
*
*
@@ -109,79 +170,129 @@ class ParseUrl
*