X-Git-Url: https://git.mxchange.org/?a=blobdiff_plain;f=include%2FParseUrl.php;h=549d705da46fa2078449df4c16d6fd82157d3b66;hb=b067a1114679ebc3dc6132759c511c3ac8b73e49;hp=8a3392e73bcecdb0ea18d61a0786cda360fccdf7;hpb=e9226eaf45e70bcd5a9a9f66b6b922dbc15c47ba;p=friendica.git diff --git a/include/ParseUrl.php b/include/ParseUrl.php index 8a3392e73b..549d705da4 100644 --- a/include/ParseUrl.php +++ b/include/ParseUrl.php @@ -12,12 +12,35 @@ use \Friendica\Core\Config; require_once("include/network.php"); require_once("include/Photo.php"); require_once("include/oembed.php"); +require_once("include/xml.php"); /** * @brief Class with methods for extracting certain content from an url */ class ParseUrl { + /** + * @brief Search for cached embeddable data of an url otherwise fetch it + * + * @param type $url The url of the page which should be scraped + * @param type $no_guessing If true the parser doesn't search for + * preview pictures + * @param type $do_oembed The false option is used by the function fetch_oembed() + * to avoid endless loops + * + * @return array which contains needed data for embedding + * string 'url' => The url of the parsed page + * string 'type' => Content type + * string 'title' => The title of the content + * string 'text' => The description for the content + * string 'image' => A preview image of the content (only available + * if $no_guessing = false) + * array 'images' => Array of preview pictures + * string 'keywords' => The tags which belong to the content + * + * @see ParseUrl::getSiteinfo() for more information about scraping + * embeddable content + */ public static function getSiteinfoCached($url, $no_guessing = false, $do_oembed = true) { if ($url == "") { @@ -46,7 +69,46 @@ class ParseUrl { return $data; } - + /** + * @brief Parse a page for embeddable content information + * + * This method parses the url for meta data which can be used to embed + * the content. If available it prioritizes Open Graph meta tags. 
+ * If this is not available it uses the twitter cards meta tags. + * As fallback it uses standard html elements with meta information + * like \<title\>Awesome Title\</title\> or + * \<meta name="description" content="An awesome description"\> + * + * @param type $url The url of the page which should be scraped + * @param type $no_guessing If true the parser doesn't search for + * preview pictures + * @param type $do_oembed The false option is used by the function fetch_oembed() + * to avoid endless loops + * @param type $count Internal counter to avoid endless loops + * + * @return array which contains needed data for embedding + * string 'url' => The url of the parsed page + * string 'type' => Content type + * string 'title' => The title of the content + * string 'text' => The description for the content + * string 'image' => A preview image of the content (only available + * if $no_guessing = false) + * array 'images' => Array of preview pictures + * string 'keywords' => The tags which belong to the content + * + * @todo https://developers.google.com/+/plugins/snippet/ + * @verbatim + * <meta itemprop="name" content="Awesome title"> + * <meta itemprop="description" content="An awesome description"> + * <meta itemprop="image" content="http://maple.libertreeproject.org/images/tree-icon.png"> + * + * <body itemscope itemtype="http://schema.org/Product"> + * <h1 itemprop="name">Shiny Trinket</h1> + * <img itemprop="image" src="{image-url}" /> + * <p itemprop="description">Shiny trinkets are shiny.</p> + * </body> + * @endverbatim + */ public static function getSiteinfo($url, $no_guessing = false, $do_oembed = true, $count = 1) { $a = get_app(); @@ -184,17 +246,17 @@ class ParseUrl { $doc = new \DOMDocument(); @$doc->loadHTML($body); - self::deleteNode($doc, "style"); - self::deleteNode($doc, "script"); - self::deleteNode($doc, "option"); - self::deleteNode($doc, "h1"); - self::deleteNode($doc, "h2"); - self::deleteNode($doc, "h3"); - self::deleteNode($doc, "h4"); - self::deleteNode($doc, "h5"); - self::deleteNode($doc, "h6"); - self::deleteNode($doc, "ol"); - self::deleteNode($doc, "ul"); + \xml::deleteNode($doc, "style"); + \xml::deleteNode($doc, "script"); + \xml::deleteNode($doc, "option"); + \xml::deleteNode($doc, "h1"); + \xml::deleteNode($doc, "h2"); + \xml::deleteNode($doc, "h3"); + \xml::deleteNode($doc, "h4"); + \xml::deleteNode($doc, "h5"); + \xml::deleteNode($doc, "h6"); + \xml::deleteNode($doc, "ol"); + \xml::deleteNode($doc, "ul"); $xpath = new \DomXPath($doc); @@ -440,17 +502,25 @@ class ParseUrl { $tag = "#" . $tag; } - private static function deleteNode(&$doc, $node) { - $xpath = new \DomXPath($doc); - $list = $xpath->query("//".$node); - foreach ($list as $child) { - $child->parentNode->removeChild($child); - } - } - + /** + * @brief Add a scheme to an url + * + * The src attribute of some html elements (e.g. images) + * can miss the scheme so we need to add the correct + * scheme + * + * @param string $url The url which possibly does have + * a missing scheme (a link to an image) + * @param string $scheme The url with a correct scheme + * (e.g. the url from the webpage which does contain the image) + * + * @return string The url with a scheme + */ private static function completeUrl($url, $scheme) { $urlarr = parse_url($url); + // If the url already has a scheme + // we can stop the process here if (isset($urlarr["scheme"])) { return($url); }