git.mxchange.org Git - friendica.git/commitdiff
Improve HTML::toPlaintext
author Hypolite Petovan <hypolite@mrpetovan.com>
Sun, 4 Aug 2019 14:11:59 +0000 (10:11 -0400)
committer Hypolite Petovan <hypolite@mrpetovan.com>
Sun, 4 Aug 2019 14:11:59 +0000 (10:11 -0400)
- Ignore empty trimmed text nodes
- Ignore anchor links
- Ignore blank tags and avoid adding a doctype to transitional DOM objects

src/Content/Text/HTML.php

index 4be217b3e43dd213175f6f2aba258fc1e8ce6e1d..b9132c5d46e90d1663c82957ce5e7203e5c59ded 100644 (file)
@@ -56,6 +56,7 @@ class HTML
 
                $xpath = new DOMXPath($doc);
 
+               /** @var \DOMNode[] $list */
                $list = $xpath->query("//" . $tag);
                foreach ($list as $node) {
                        $attr = [];
@@ -98,9 +99,12 @@ class HTML
                                $node->parentNode->insertBefore($StartCode, $node);
 
                                if ($node->hasChildNodes()) {
+                                       /** @var \DOMNode $child */
                                        foreach ($node->childNodes as $child) {
-                                               $newNode = $child->cloneNode(true);
-                                               $node->parentNode->insertBefore($newNode, $node);
+                                               if (trim($child->nodeValue)) {
+                                                       $newNode = $child->cloneNode(true);
+                                                       $node->parentNode->insertBefore($newNode, $node);
+                                               }
                                        }
                                }
 
@@ -559,6 +563,8 @@ class HTML
                                $ignore = false;
                        }
 
+                       $ignore = $ignore || strpos($treffer[1], '#') === 0;
+
                        if (!$ignore) {
                                $urls[$treffer[1]] = $treffer[1];
                        }
@@ -582,7 +588,7 @@ class HTML
 
                $message = mb_convert_encoding($message, 'HTML-ENTITIES', "UTF-8");
 
-               @$doc->loadHTML($message);
+               @$doc->loadHTML($message, LIBXML_HTML_NODEFDTD | LIBXML_NOBLANKS);
 
                $message = $doc->saveHTML();
                // Remove eventual UTF-8 BOM
@@ -591,7 +597,7 @@ class HTML
                // Collecting all links
                $urls = self::collectURLs($message);
 
-               @$doc->loadHTML($message);
+               @$doc->loadHTML($message, LIBXML_HTML_NODEFDTD | LIBXML_NOBLANKS);
 
                self::tagToBBCode($doc, 'html', [], '', '');
                self::tagToBBCode($doc, 'body', [], '', '');