git.mxchange.org Git - quix0rs-gnu-social.git/commitdiff
don't mess up charsets in oembed/og! check for utf-8 in http header and meta tags...
author hannes <h@nnesmannerhe.im>
Tue, 26 Jan 2016 13:37:52 +0000 (13:37 +0000)
committer hannes <h@nnesmannerhe.im>
Tue, 26 Jan 2016 13:37:52 +0000 (13:37 +0000)
plugins/Oembed/lib/oembedhelper.php

index 5d84d68d8c35af8f64c6109046176b865f791cb7..6f514983f365a57d52c846058ab539e4644d463b 100644 (file)
@@ -74,20 +74,57 @@ class oEmbedHelper
 
         if (Event::handle('GetRemoteUrlMetadata', array($url, &$metadata))) {
             // If that event didn't return anything, try downloading the body and parse it
-            $body = HTTPClient::quickGet($url);
+
+            // don't use quickGet since we want to check Content-Type header for utf-8
+            $client = new HTTPClient();
+            $response = $client->get($url);
+            if (!$response->isOk()) {
+                // TRANS: Exception. %s is the URL we tried to GET.
+                throw new Exception(sprintf(_m('Could not GET URL %s.'), $url), $response->getStatus());
+            }
+            $body = $response->getBody();
 
             // DOMDocument::loadHTML may throw warnings on unrecognized elements,
             // and notices on unrecognized namespaces.
             $old = error_reporting(error_reporting() & ~(E_WARNING | E_NOTICE));
+            
+            // DOMDocument assumes ISO-8859-1 per HTML spec
+            // use UTF-8 if we find any evidence of that encoding
+            $utf8_evidence = false;
+            $unicode_check_dom = new DOMDocument();
+            $ok = $unicode_check_dom->loadHTML($body);
+            if (!$ok) throw new oEmbedHelper_BadHtmlException();
+            $metaNodes = $unicode_check_dom->getElementsByTagName('meta');
+            foreach($metaNodes as $metaNode) {
+                // case in-sensitive since Content-type and utf-8 can be written in many ways
+                if(stristr($metaNode->getAttribute('http-equiv'),'content-type')
+                && stristr($metaNode->getAttribute('content'),'utf-8')) {
+                    $utf8_evidence = true;        
+                    break;                  
+                } elseif(stristr($metaNode->getAttribute('charset'),'utf-8')) {
+                    $utf8_evidence = true;        
+                    break;
+                }
+            }
+            unset($unicode_check_dom);
+            
+            // The Content-Type HTTP response header overrides encoding metatags in DOM
+            if(stristr($response->getHeader('Content-Type'),'utf-8')) {
+                $utf8_evidence = true;              
+            }
+           
+            // add utf-8 encoding prolog if we have reason to believe this is utf-8 content        
+            $utf8_tag = $utf8_evidence ? '<?xml encoding="utf-8" ?>' : '';          
+            
             $dom = new DOMDocument();
-            $ok = $dom->loadHTML($body);
+            $ok = $dom->loadHTML($utf8_tag.$body);
             unset($body);   // storing the DOM in memory is enough...
             error_reporting($old);
 
             if (!$ok) {
                 throw new oEmbedHelper_BadHtmlException();
             }
-
+            
             Event::handle('GetRemoteUrlMetadataFromDom', array($url, $dom, &$metadata));
         }