From: hannes Date: Tue, 26 Jan 2016 13:37:52 +0000 (+0000) Subject: don't mess up charsets in oembed/og! check for utf-8 in http header and meta tags... X-Git-Url: https://git.mxchange.org/?a=commitdiff_plain;h=aa76e5863f6a00e3ee642326c3d7d11dd89875ed;p=quix0rs-gnu-social.git don't mess up charsets in oembed/og! check for utf-8 in http header and meta tags, and add prolog when loading html with DOMDocument() --- diff --git a/plugins/Oembed/lib/oembedhelper.php b/plugins/Oembed/lib/oembedhelper.php index 5d84d68d8c..6f514983f3 100644 --- a/plugins/Oembed/lib/oembedhelper.php +++ b/plugins/Oembed/lib/oembedhelper.php @@ -74,20 +74,57 @@ class oEmbedHelper if (Event::handle('GetRemoteUrlMetadata', array($url, &$metadata))) { // If that event didn't return anything, try downloading the body and parse it - $body = HTTPClient::quickGet($url); + + // don't use quickGet since we want to check Content-Type header for utf-8 + $client = new HTTPClient(); + $response = $client->get($url); + if (!$response->isOk()) { + // TRANS: Exception. %s is the URL we tried to GET. + throw new Exception(sprintf(_m('Could not GET URL %s.'), $url), $response->getStatus()); + } + $body = $response->getBody(); // DOMDocument::loadHTML may throw warnings on unrecognized elements, // and notices on unrecognized namespaces. 
$old = error_reporting(error_reporting() & ~(E_WARNING | E_NOTICE)); + + // DOMDocument assumes ISO-8859-1 per HTML spec + // use UTF-8 if we find any evidence of that encoding + $utf8_evidence = false; + $unicode_check_dom = new DOMDocument(); + $ok = $unicode_check_dom->loadHTML($body); + if (!$ok) throw new oEmbedHelper_BadHtmlException(); + $metaNodes = $unicode_check_dom->getElementsByTagName('meta'); + foreach($metaNodes as $metaNode) { + // case-insensitive since Content-type and utf-8 can be written in many ways + if(stristr($metaNode->getAttribute('http-equiv'),'content-type') + && stristr($metaNode->getAttribute('content'),'utf-8')) { + $utf8_evidence = true; + break; + } elseif(stristr($metaNode->getAttribute('charset'),'utf-8')) { + $utf8_evidence = true; + break; + } + } + unset($unicode_check_dom); + + // The Content-Type HTTP response header overrides encoding metatags in DOM + if(stristr($response->getHeader('Content-Type'),'utf-8')) { + $utf8_evidence = true; + } + + // add utf-8 encoding prolog if we have reason to believe this is utf-8 content + $utf8_tag = $utf8_evidence ? '<?xml encoding="utf-8" ?>' : ''; + $dom = new DOMDocument(); - $ok = $dom->loadHTML($body); + $ok = $dom->loadHTML($utf8_tag.$body); unset($body); // storing the DOM in memory is enough... error_reporting($old); if (!$ok) { throw new oEmbedHelper_BadHtmlException(); } - + Event::handle('GetRemoteUrlMetadataFromDom', array($url, $dom, &$metadata)); }