git.mxchange.org Git - quix0rs-gnu-social.git/commitdiff
Test cases and fixes for Atom and RSS content decoding.
author: Brion Vibber <brion@pobox.com>
Fri, 23 Apr 2010 22:40:48 +0000 (15:40 -0700)
committer: Brion Vibber <brion@pobox.com>
Fri, 23 Apr 2010 22:40:48 +0000 (15:40 -0700)
Fix extraction of Atom <content type="text"> and <content type="html">; we were failing to escape plaintext source data to HTML, and doing an extraneous double-deescape on HTML source resulting in breakage of notices containing text that looks like HTML. Only <content type="xhtml"> was working correctly previously.
Fixes for RSS2 content processing: we were failing to load <content:encoded> at all due to using wrong element name, and were applying an extraneous de-escape for <description> rather than the escaping that is required to turn plaintext into HTML. (Per spec, <description> must be plaintext.)

lib/activity.php
lib/activityutils.php
tests/ActivityParseTests.php

index 5d6230c6df623ac9000d76aa92d6d9dac1929622..27f09ab4d4639d66f37aef672c9be5fb88e1526e 100644 (file)
@@ -83,6 +83,7 @@ class Activity
     const CREATOR = 'creator';
 
     const CONTENTNS = 'http://purl.org/rss/1.0/modules/content/';
+    const ENCODED = 'encoded';
 
     public $actor;   // an ActivityObject
     public $verb;    // a string (the URL)
@@ -268,14 +269,21 @@ class Activity
 
         $this->title = ActivityUtils::childContent($item, ActivityObject::TITLE, self::RSS);
 
-        $contentEl = ActivityUtils::child($item, ActivityUtils::CONTENT, self::CONTENTNS);
+        $contentEl = ActivityUtils::child($item, self::ENCODED, self::CONTENTNS);
 
         if (!empty($contentEl)) {
-            $this->content = htmlspecialchars_decode($contentEl->textContent, ENT_QUOTES);
+            // <content:encoded> XML node's text content is HTML; no further processing needed.
+            $this->content = $contentEl->textContent;
         } else {
             $descriptionEl = ActivityUtils::child($item, self::DESCRIPTION, self::RSS);
             if (!empty($descriptionEl)) {
-                $this->content = htmlspecialchars_decode($descriptionEl->textContent, ENT_QUOTES);
+                // Per spec, <description> must be plaintext.
+                // In practice, often there's HTML... but these days good
+                // feeds are using <content:encoded> which is explicitly
+                // real HTML.
+                // We'll treat this following spec, and do HTML escaping
+                // to convert from plaintext to HTML.
+                $this->content = htmlspecialchars($descriptionEl->textContent);
             }
         }
 
index a7e99fb11e32b91762649247fb51a2b80c9d3b42..401fd7fc283ff92b9b5d5a021ecbd6e74780fa74 100644 (file)
@@ -213,11 +213,19 @@ class ActivityUtils
         // slavishly following http://atompub.org/rfc4287.html#rfc.section.4.1.3.3
 
         if (empty($type) || $type == 'text') {
-            return $el->textContent;
+            // We have plaintext saved as the XML text content.
+            // Since we want HTML, we need to escape any special chars.
+            return htmlspecialchars($el->textContent);
         } else if ($type == 'html') {
+            // We have HTML saved as the XML text content.
+            // No additional processing required once we've got it.
             $text = $el->textContent;
-            return htmlspecialchars_decode($text, ENT_QUOTES);
+            return $text;
         } else if ($type == 'xhtml') {
+            // Per spec, the <content type="xhtml"> contains a single
+            // HTML <div> with XHTML namespace on it as a child node.
+            // We need to pull all of that <div>'s child nodes and
+            // serialize them back to an (X)HTML source fragment.
             $divEl = ActivityUtils::child($el, 'div', 'http://www.w3.org/1999/xhtml');
             if (empty($divEl)) {
                 return null;
index 4563da914689cb1d642799509364bfef0ff0b0d9..378478d741d1cd3ce205a206ab8aaa7df0f6923b 100644 (file)
@@ -32,6 +32,18 @@ class ActivityParseTests extends PHPUnit_Framework_TestCase
         $this->assertEquals('tag:versioncentral.example.org,2009:/change/1643245', $act->objects[0]->id);
     }
 
+    public function testExample2()
+    {
+        global $_example2;
+        $dom = DOMDocument::loadXML($_example2);
+        $act = new Activity($dom->documentElement);
+
+        $this->assertFalse(empty($act));
+        // Did we handle <content type="html"> correctly with a typical payload?
+        $this->assertEquals("<p>Geraldine posted a Photo on PhotoPanic</p>\n     " .
+                            "<img src=\"/geraldine/photo1.jpg\">", trim($act->content));
+    }
+
     public function testExample3()
     {
         global $_example3;
@@ -305,6 +317,71 @@ class ActivityParseTests extends PHPUnit_Framework_TestCase
 
     }
 
+    public function testAtomContent()
+    {
+        $tests = array(array("<content>Some regular plain text.</content>",
+                             "Some regular plain text."),
+                       array("<content>&lt;b&gt;this is not HTML&lt;/b&gt;</content>",
+                             "&lt;b&gt;this is not HTML&lt;/b&gt;"),
+                       array("<content type='html'>Some regular plain HTML.</content>",
+                             "Some regular plain HTML."),
+                       array("<content type='html'>&lt;b&gt;this is too HTML&lt;/b&gt;</content>",
+                             "<b>this is too HTML</b>"),
+                       array("<content type='html'>&amp;lt;b&amp;gt;but this is not HTML!&amp;lt;/b&amp;gt;</content>",
+                             "&lt;b&gt;but this is not HTML!&lt;/b&gt;"),
+                       array("<content type='xhtml'><div xmlns='http://www.w3.org/1999/xhtml'>Some regular plain XHTML.</div></content>",
+                             "Some regular plain XHTML."),
+                       array("<content type='xhtml'><div xmlns='http://www.w3.org/1999/xhtml'><b>This is some XHTML!</b></div></content>",
+                             "<b>This is some XHTML!</b>"),
+                       array("<content type='xhtml'><div xmlns='http://www.w3.org/1999/xhtml'>&lt;b&gt;This is not some XHTML!&lt;/b&gt;</div></content>",
+                             "&lt;b&gt;This is not some XHTML!&lt;/b&gt;"),
+                       array("<content type='xhtml'><div xmlns='http://www.w3.org/1999/xhtml'>&amp;lt;b&amp;gt;This is not some XHTML either!&amp;lt;/b&amp;gt;</div></content>",
+                             "&amp;lt;b&amp;gt;This is not some XHTML either!&amp;lt;/b&amp;gt;"));
+        foreach ($tests as $data) {
+            list($source, $output) = $data;
+            $xml = "<entry xmlns='http://www.w3.org/2005/Atom'>" .
+                   "<id>http://example.com/fakeid</id>" .
+                   "<author><name>Test</name></author>" .
+                   "<title>Atom content tests</title>" .
+                   $source .
+                   "</entry>";
+            $dom = DOMDocument::loadXML($xml);
+            $act = new Activity($dom->documentElement);
+
+            $this->assertFalse(empty($act));
+            $this->assertEquals($output, trim($act->content));
+        }
+    }
+
+    public function testRssContent()
+    {
+        $tests = array(array("<content:encoded>Some regular plain HTML.</content:encoded>",
+                             "Some regular plain HTML."),
+                       array("<content:encoded>Some &lt;b&gt;exciting bold HTML&lt;/b&gt;</content:encoded>",
+                             "Some <b>exciting bold HTML</b>"),
+                       array("<content:encoded>Some &amp;lt;b&amp;gt;escaped non-HTML.&amp;lt;/b&amp;gt;</content:encoded>",
+                             "Some &lt;b&gt;escaped non-HTML.&lt;/b&gt;"),
+                       array("<description>Some plain text.</description>",
+                             "Some plain text."),
+                       array("<description>Some &lt;b&gt;non-HTML text&lt;/b&gt;</description>",
+                             "Some &lt;b&gt;non-HTML text&lt;/b&gt;"),
+                       array("<description>Some &amp;lt;b&amp;gt;double-escaped text&amp;lt;/b&amp;gt;</description>",
+                             "Some &amp;lt;b&amp;gt;double-escaped text&amp;lt;/b&amp;gt;"));
+        foreach ($tests as $data) {
+            list($source, $output) = $data;
+            $xml = "<item xmlns:content='http://purl.org/rss/1.0/modules/content/'>" .
+                   "<guid>http://example.com/fakeid</guid>" .
+                   "<title>RSS content tests</title>" .
+                   $source .
+                   "</item>";
+            $dom = DOMDocument::loadXML($xml);
+            $act = new Activity($dom->documentElement);
+
+            $this->assertFalse(empty($act));
+            $this->assertEquals($output, trim($act->content));
+        }
+    }
+
 }
 
 $_example1 = <<<EXAMPLE1