git.mxchange.org Git - friendica.git/commitdiff
Improve charset detection in Util\ParseUrl
author Hypolite Petovan <hypolite@mrpetovan.com>
Wed, 23 Sep 2020 00:35:08 +0000 (20:35 -0400)
committer Hypolite Petovan <hypolite@mrpetovan.com>
Wed, 23 Sep 2020 00:35:36 +0000 (20:35 -0400)
- Pages with charset meta tag weren't properly decoded

src/Util/ParseUrl.php

index 01ad79d4f114128d092c1eaec32d3853d2a59427..bb3ebbc10b219863d23dbda496200fd97d2fee7d 100644 (file)
@@ -201,9 +201,18 @@ class ParseUrl
                        }
                }
 
-               // Fetch the first mentioned charset. Can be in body or header
                $charset = '';
-               if (preg_match('/charset=(.*?)[\'"\s\n]/', $header, $matches)) {
+               // Look for a charset, first in headers
+               // Expected form: Content-Type: text/html; charset=ISO-8859-4
+               if (preg_match('/charset=(.+?)\s/', $header, $matches)) {
+                       $charset = trim(trim(trim(array_pop($matches)), ';,'));
+               }
+
+               // Then in body that gets precedence
+               // Expected forms:
+               // - <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+               // - <meta charset="utf-8">
+               if (preg_match('/charset=["\']?([^\'"]*?)[\'"]/', $body, $matches)) {
                        $charset = trim(trim(trim(array_pop($matches)), ';,'));
                }