]> git.mxchange.org Git - quix0rs-gnu-social.git/commitdiff
Parse RSS items as activities
author Evan Prodromou <evan@status.net>
Fri, 19 Mar 2010 14:48:39 +0000 (09:48 -0500)
committer Evan Prodromou <evan@status.net>
Fri, 19 Mar 2010 14:48:39 +0000 (09:48 -0500)
First steps to parsing RSS items as activities. RSS feeds don't seem
to have enough data to make good remote profiles, but this may work
with some "hints".

lib/activity.php
tests/ActivityParseTests.php

index c67d090f725fedcaa5ce7dc7cda9ebd8530a91e8..5b304020d6be228cf51c707a205f400880510700 100644 (file)
@@ -643,38 +643,11 @@ class ActivityObject
         );
 
         if ($element->tagName == 'author') {
-
-            $this->type  = self::PERSON; // XXX: is this fair?
-            $this->title = $this->_childContent($element, self::NAME);
-            $this->id    = $this->_childContent($element, self::URI);
-
-            if (empty($this->id)) {
-                $email = $this->_childContent($element, self::EMAIL);
-                if (!empty($email)) {
-                    // XXX: acct: ?
-                    $this->id = 'mailto:'.$email;
-                }
-            }
-
+            $this->_fromAuthor($element);
+        } else if ($element->tagName == 'item') {
+            $this->_fromRssItem($element);
         } else {
-
-            $this->type = $this->_childContent($element, Activity::OBJECTTYPE,
-                                               Activity::SPEC);
-
-            if (empty($this->type)) {
-                $this->type = ActivityObject::NOTE;
-            }
-
-            $this->id      = $this->_childContent($element, self::ID);
-            $this->title   = $this->_childContent($element, self::TITLE);
-            $this->summary = $this->_childContent($element, self::SUMMARY);
-
-            $this->source  = $this->_getSource($element);
-
-            $this->content = ActivityUtils::getContent($element);
-
-            $this->link = ActivityUtils::getPermalink($element);
-
+            $this->_fromAtomEntry($element);
         }
 
         // Some per-type attributes...
@@ -697,6 +670,72 @@ class ActivityObject
         }
     }
 
+    /**
+     * Fill in this object from an <author> element.
+     *
+     * The object is typed as a person, titled with the author's name and
+     * identified by the author's URI. When no URI is given, a mailto: URI
+     * built from the author's e-mail address is used instead.
+     *
+     * @param DOMElement $element the <author> element to read
+     *
+     * @return void
+     */
+    private function _fromAuthor($element)
+    {
+        $this->type  = self::PERSON; // XXX: is this fair?
+        $this->title = $this->_childContent($element, self::NAME);
+
+        $uri      = $this->_childContent($element, self::URI);
+        $this->id = $uri;
+
+        if (!empty($uri)) {
+            return;
+        }
+
+        // No URI available: fall back on the e-mail address, if any.
+        $address = $this->_childContent($element, self::EMAIL);
+
+        if (empty($address)) {
+            return;
+        }
+
+        // XXX: acct: ?
+        $this->id = 'mailto:' . $address;
+    }
+
+    /**
+     * Fill in this object from an Atom-style entry element.
+     *
+     * Reads the declared object type (defaulting to a note), then the id,
+     * title, summary, source, content and permalink of the element.
+     *
+     * @param DOMElement $element the element to read
+     *
+     * @return void
+     */
+    private function _fromAtomEntry($element)
+    {
+        $declaredType = $this->_childContent($element, Activity::OBJECTTYPE,
+                                             Activity::SPEC);
+
+        // Elements that do not declare an object type are treated as notes.
+        $this->type = empty($declaredType) ? ActivityObject::NOTE : $declaredType;
+
+        $this->id      = $this->_childContent($element, self::ID);
+        $this->title   = $this->_childContent($element, self::TITLE);
+        $this->summary = $this->_childContent($element, self::SUMMARY);
+
+        $this->source  = $this->_getSource($element);
+        $this->content = ActivityUtils::getContent($element);
+        $this->link    = ActivityUtils::getPermalink($element);
+    }
+
+    // @fixme rationalize with Activity::_fromRssItem()
+
+    /**
+     * Fill in this object from an RSS <item> element.
+     *
+     * The title and link come from the item's own children. The content is
+     * taken from the content-module element when there is one and from
+     * <description> otherwise, with special HTML entities decoded. The
+     * <guid>, when present, becomes the id.
+     *
+     * @param DOMElement $item the RSS <item> element to read
+     *
+     * @return void
+     */
+    private function _fromRssItem($item)
+    {
+        $this->title = ActivityUtils::childContent($item, ActivityObject::TITLE, Activity::RSS);
+
+        // NOTE(review): feeds spell the content-module element
+        // <content:encoded>; confirm ActivityUtils::CONTENT matches that name.
+        $contentEl = ActivityUtils::child($item, ActivityUtils::CONTENT, Activity::CONTENTNS);
+
+        if (empty($contentEl)) {
+            // No content-module element: use the plain description instead.
+            $contentEl = ActivityUtils::child($item, Activity::DESCRIPTION, Activity::RSS);
+        }
+
+        if (!empty($contentEl)) {
+            $this->content = htmlspecialchars_decode($contentEl->textContent, ENT_QUOTES);
+        }
+
+        $this->link = ActivityUtils::childContent($item, ActivityUtils::LINK, Activity::RSS);
+
+        $guidEl = ActivityUtils::child($item, Activity::GUID, Activity::RSS);
+
+        if (!empty($guidEl)) {
+            $this->id = $guidEl->textContent;
+
+            // Only let the guid replace <link> when isPermaLink is present
+            // and not explicitly "false". Checking for the attribute alone
+            // would turn <guid isPermaLink="false"> into the link. This is
+            // the same rule Activity::_fromRssItem() applies.
+            if ($guidEl->hasAttribute('isPermaLink')
+                && $guidEl->getAttribute('isPermaLink') != 'false') {
+                // overwrites <link>
+                $this->link = $this->id;
+            }
+        }
+    }
+
     private function _childContent($element, $tag, $namespace=ActivityUtils::ATOM)
     {
         return ActivityUtils::childContent($element, $tag, $namespace);
@@ -1051,6 +1090,21 @@ class Activity
     const PUBLISHED = 'published';
     const UPDATED   = 'updated';
 
+    const RSS = null; // no namespace!
+
+    const PUBDATE     = 'pubDate'; // RSS item element holding the publication date
+    const DESCRIPTION = 'description'; // RSS element used as content when no content-module element exists
+    const GUID        = 'guid'; // RSS item element used as the activity id
+    const SELF        = 'self'; // link relation passed to ActivityUtils::getLink() for the channel id
+    const IMAGE       = 'image'; // RSS channel element describing the channel image
+    const URL         = 'url'; // child of the image element read as an avatar link
+
+    const DC = 'http://purl.org/dc/elements/1.1/'; // Dublin Core elements namespace
+
+    const CREATOR = 'creator'; // element looked up in the DC namespace for the item's author
+
+    const CONTENTNS = 'http://purl.org/rss/1.0/modules/content/'; // RSS 1.0 content module namespace
+
     public $actor;   // an ActivityObject
     public $verb;    // a string (the URL)
     public $object;  // an ActivityObject
@@ -1081,8 +1135,6 @@ class Activity
             return;
         }
 
-        $this->entry = $entry;
-
         // Insist on a feed's root DOMElement; don't allow a DOMDocument
         if ($feed instanceof DOMDocument) {
             throw new ClientException(
@@ -1090,8 +1142,22 @@ class Activity
             );
         }
 
+        $this->entry = $entry;
         $this->feed  = $feed;
 
+        if ($entry->namespaceURI == Activity::ATOM &&
+            $entry->localName == 'entry') {
+            $this->_fromAtomEntry($entry, $feed);
+        } else if ($entry->namespaceURI == Activity::RSS &&
+                   $entry->localName == 'item') {
+            $this->_fromRssItem($entry, $feed);
+        } else {
+            throw new Exception("Unknown DOM element: {$entry->namespaceURI} {$entry->localName}");
+        }
+    }
+
+    function _fromAtomEntry($entry, $feed)
+    {
         $pubEl = $this->_child($entry, self::PUBLISHED, self::ATOM);
 
         if (!empty($pubEl)) {
@@ -1177,6 +1243,69 @@ class Activity
         }
     }
 
+    /**
+     * Fill in this activity from an RSS <item> element.
+     *
+     * Sets the verb (defaulting to "post"), the time from <pubDate>, the
+     * actor (from the item's author, else its dc:creator, else the feed
+     * element itself), then title, content, link and id, and finally builds
+     * the activity object and context from the same item.
+     *
+     * @param DOMElement $item the RSS <item> element
+     * @param DOMElement $rss  the feed element passed along with the item;
+     *                         used as a last-resort source for the actor
+     *
+     * @return void
+     */
+    function _fromRssItem($item, $rss)
+    {
+        // Verb: an explicit verb element wins, otherwise assume a post.
+        // XXX: do other implied stuff here
+        $verbNode   = $this->_child($item, self::VERB);
+        $this->verb = empty($verbNode)
+            ? ActivityVerb::POST
+            : trim($verbNode->textContent);
+
+        // Time
+        $pubDateNode = $this->_child($item, self::PUBDATE, self::RSS);
+
+        if (!empty($pubDateNode)) {
+            $this->time = strtotime($pubDateNode->textContent);
+        }
+
+        // Actor: <author>, then dc:creator, then the feed as a whole.
+        $authorNode  = $this->_child($item, self::AUTHOR, self::RSS);
+        $creatorNode = empty($authorNode)
+            ? $this->_child($item, self::CREATOR, self::DC)
+            : null;
+
+        if (!empty($authorNode)) {
+            $this->actor = $this->_fromRssAuthor($authorNode);
+        } elseif (!empty($creatorNode)) {
+            $this->actor = $this->_fromDcCreator($creatorNode);
+        } elseif (!empty($rss)) {
+            $this->actor = $this->_fromRss($rss);
+        }
+
+        $this->title = ActivityUtils::childContent($item, ActivityObject::TITLE, self::RSS);
+
+        // Content: prefer the content-module element, else <description>.
+        // NOTE(review): feeds spell the content-module element
+        // <content:encoded>; confirm ActivityUtils::CONTENT matches that name.
+        $bodyNode = ActivityUtils::child($item, ActivityUtils::CONTENT, self::CONTENTNS);
+
+        if (empty($bodyNode)) {
+            $bodyNode = ActivityUtils::child($item, self::DESCRIPTION, self::RSS);
+        }
+
+        if (!empty($bodyNode)) {
+            $this->content = htmlspecialchars_decode($bodyNode->textContent, ENT_QUOTES);
+        }
+
+        $this->link = ActivityUtils::childContent($item, ActivityUtils::LINK, self::RSS);
+
+        // @fixme enclosures
+        // @fixme thumbnails... maybe
+
+        $guidNode = ActivityUtils::child($item, self::GUID, self::RSS);
+
+        if (!empty($guidNode)) {
+            $this->id = $guidNode->textContent;
+
+            $guidIsPermalink = $guidNode->hasAttribute('isPermaLink')
+                && $guidNode->getAttribute('isPermaLink') != 'false';
+
+            if ($guidIsPermalink) {
+                // overwrites <link>
+                $this->link = $this->id;
+            }
+        }
+
+        $this->object  = new ActivityObject($item);
+        $this->context = new ActivityContext($item);
+    }
+
     /**
      * Returns an Atom <entry> based on this activity
      *
@@ -1249,6 +1378,83 @@ class Activity
         return $xs->getString();
     }
 
+    /**
+     * Build a person object from the text of an RSS <author> element.
+     *
+     * Recognised shapes, tried in this order:
+     *   "email (name)", "name <email>", any text containing "@" (taken as
+     *   an e-mail address), and anything else (taken as a name).
+     *
+     * @param DOMElement $el the <author> element
+     *
+     * @return ActivityObject person titled with the name, and identified
+     *                        by a mailto: URI when an e-mail was found
+     */
+    function _fromRssAuthor($el)
+    {
+        $raw = $el->textContent;
+
+        $name  = null;
+        $email = null;
+        $parts = array();
+
+        if (preg_match('/^(.*?) \((.*)\)$/', $raw, $parts)) {
+            // email (name)
+            $email = $parts[1];
+            $name  = $parts[2];
+        } elseif (preg_match('/^(.*?) <(.*)>$/', $raw, $parts)) {
+            // name <email>
+            $name  = $parts[1];
+            $email = $parts[2];
+        } elseif (preg_match('/.*@.*/', $raw)) {
+            $email = $raw;
+        } else {
+            $name = $raw;
+        }
+
+        // Not really enough info
+
+        $person = new ActivityObject();
+
+        $person->element = $el;
+        $person->type    = ActivityObject::PERSON;
+        $person->title   = $name;
+
+        if (!empty($email)) {
+            $person->id = 'mailto:' . $email;
+        }
+
+        return $person;
+    }
+
+    /**
+     * Build a person object from a dc:creator element.
+     *
+     * Only the element's text is available, so it is used as the title.
+     *
+     * @param DOMElement $el the dc:creator element
+     *
+     * @return ActivityObject person titled with the element's text
+     */
+    function _fromDcCreator($el)
+    {
+        // Not really enough info
+
+        $person = new ActivityObject();
+
+        $person->element = $el;
+        $person->title   = $el->textContent;
+        $person->type    = ActivityObject::PERSON;
+
+        return $person;
+    }
+
+    /**
+     * Build an actor object out of the feed element itself.
+     *
+     * Used when an item names no author of its own: the element's title,
+     * link, "self" link, description and image stand in for a profile.
+     *
+     * @param DOMElement $el the feed element handed to the activity
+     *
+     * @return ActivityObject actor described by the feed's own metadata
+     */
+    function _fromRss($el)
+    {
+        $feedActor = new ActivityObject();
+
+        $feedActor->element = $el;
+
+        $feedActor->type = ActivityObject::PERSON; // @fixme guess better
+
+        $feedActor->title = ActivityUtils::childContent($el, ActivityObject::TITLE, self::RSS);
+        $feedActor->link  = ActivityUtils::childContent($el, ActivityUtils::LINK, self::RSS);
+        $feedActor->id    = ActivityUtils::getLink($el, self::SELF);
+
+        $description = ActivityUtils::childContent($el, self::DESCRIPTION, self::RSS);
+
+        if (!empty($description)) {
+            $feedActor->content = htmlspecialchars_decode($description, ENT_QUOTES);
+        }
+
+        $imageNode = ActivityUtils::child($el, self::IMAGE, self::RSS);
+
+        if (!empty($imageNode)) {
+            $imageUrl = ActivityUtils::childContent($imageNode, self::URL, self::RSS);
+
+            $feedActor->avatarLinks[] = $imageUrl;
+        }
+
+        return $feedActor;
+    }
+
     private function _child($element, $tag, $namespace=self::SPEC)
     {
         return ActivityUtils::child($element, $tag, $namespace);
index 7bf9cec7c453d57e048b15ec2671ba31c602faf2..b6980a6bb96a9f521ca9b9a5206af7cafa6c4de8 100644 (file)
@@ -138,9 +138,38 @@ class ActivityParseTests extends PHPUnit_Framework_TestCase
         $this->assertEquals($poco->urls[0]->value, 'http://example.com/blog.html');
         $this->assertEquals($poco->urls[0]->primary, 'true');
         $this->assertEquals($act->actor->geopoint, '37.7749295 -122.4194155');
-
     }
 
+    /**
+     * The first item of an RSS 2.0 feed parses into a "post" activity
+     * whose id, link, title, time and actor come from the item.
+     *
+     * @return void
+     */
+    public function testExample6()
+    {
+        global $_example6;
+
+        // loadXML() is an instance method: calling it statically is
+        // deprecated and fails outright on PHP 8.
+        $dom = new DOMDocument();
+
+        $this->assertTrue($dom->loadXML($_example6));
+
+        $channel = $dom->getElementsByTagName('channel')->item(0);
+        $item    = $channel->getElementsByTagName('item')->item(0);
+
+        $act = new Activity($item, $channel);
+
+        // Expected value first, so failure messages read correctly.
+        $this->assertEquals(ActivityVerb::POST, $act->verb);
+
+        $this->assertEquals('http://en.blog.wordpress.com/?p=3857', $act->id);
+        $this->assertEquals('http://en.blog.wordpress.com/2010/03/03/rub-a-dub-dub-in-the-pubsubhubbub/', $act->link);
+        $this->assertEquals('Rub-a-Dub-Dub in the PubSubHubbub', $act->title);
+        $this->assertEquals(1267634892, $act->time);
+
+        $actor = $act->actor;
+
+        $this->assertFalse(empty($actor));
+        $this->assertEquals("Joseph Scott", $actor->title);
+    }
 }
 
 $_example1 = <<<EXAMPLE1
@@ -330,3 +359,67 @@ $_example5 = <<<EXAMPLE5
 </entry>
 </feed>
 EXAMPLE5;
+
+$_example6 = <<<EXAMPLE6
+<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0"
+       xmlns:content="http://purl.org/rss/1.0/modules/content/"
+       xmlns:wfw="http://wellformedweb.org/CommentAPI/"
+       xmlns:dc="http://purl.org/dc/elements/1.1/"
+       xmlns:atom="http://www.w3.org/2005/Atom"
+       xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
+       xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
+       xmlns:georss="http://www.georss.org/georss" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:media="http://search.yahoo.com/mrss/"
+       >
+
+       <channel>
+               <title>WordPress.com News</title>
+               <atom:link href="http://en.blog.wordpress.com/feed/" rel="self" type="application/rss+xml" />
+               <link>http://en.blog.wordpress.com</link>
+               <description>The latest news on WordPress.com and the WordPress community.</description>
+               <lastBuildDate>Thu, 18 Mar 2010 23:25:35 +0000</lastBuildDate>
+
+               <generator>http://wordpress.com/</generator>
+               <language>en</language>
+               <sy:updatePeriod>hourly</sy:updatePeriod>
+               <sy:updateFrequency>1</sy:updateFrequency>
+               <cloud domain='en.blog.wordpress.com' port='80' path='/?rsscloud=notify' registerProcedure='' protocol='http-post' />
+               <image>
+                       <url>http://www.gravatar.com/blavatar/e6392390e3bcfadff3671c5a5653d95b?s=96&#038;d=http://s2.wp.com/i/buttonw-com.png</url>
+                       <title>WordPress.com News</title>
+                       <link>http://en.blog.wordpress.com</link>
+               </image>
+               <atom:link rel="search" type="application/opensearchdescription+xml" href="http://en.blog.wordpress.com/osd.xml" title="WordPress.com News" />
+               <atom:link rel='hub' href='http://en.blog.wordpress.com/?pushpress=hub'/>
+
+               <item>
+                       <title>Rub-a-Dub-Dub in the PubSubHubbub</title>
+                       <link>http://en.blog.wordpress.com/2010/03/03/rub-a-dub-dub-in-the-pubsubhubbub/</link>
+                       <comments>http://en.blog.wordpress.com/2010/03/03/rub-a-dub-dub-in-the-pubsubhubbub/#comments</comments>
+                       <pubDate>Wed, 03 Mar 2010 16:48:12 +0000</pubDate>
+                       <dc:creator>Joseph Scott</dc:creator>
+
+                       <category><![CDATA[Feeds]]></category>
+                       <category><![CDATA[atom]]></category>
+                       <category><![CDATA[pubsubhubbub]]></category>
+                       <category><![CDATA[rss]]></category>
+
+                       <guid isPermaLink="false">http://en.blog.wordpress.com/?p=3857</guid>
+                       <description><![CDATA[From the tongue twisting name department we welcome PubSubHubbub, or as some people have shortened it to: PuSH.  Like rssCloud, PuSH is a way for services that subscribe to updates from your blog (think Google Reader, Bloglines or Netvibes) to get updates even faster.  In a nutshell, instead of having to periodically ask [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=en.blog.wordpress.com&blog=3584907&post=3857&subd=en.blog&ref=&feed=1" />]]></description>
+                               <content:encoded><![CDATA[<p>From the tongue twisting name department we welcome <a href="http://code.google.com/p/pubsubhubbub/">PubSubHubbub</a>, or as some people have shortened it to: PuSH.  Like <a href="http://en.blog.wordpress.com/2009/09/07/rss-in-the-clouds/">rssCloud</a>, PuSH is a way for services that subscribe to updates from your blog (think Google Reader, Bloglines or Netvibes) to get updates even faster.  In a nutshell, instead of having to periodically ask your blog if there are any updates they can now register to automatically receive updates each time you publish new content.  In most cases these updates are sent out within a second or two of when you hit the publish button.</p>
+       <p>Today we&#8217;ve turned on PuSH support for the more than 10.5 million blogs on WordPress.com.  There&#8217;s nothing to configure, it&#8217;s working right now behind the scenes to help others keep up to date with your posts.</p>
+       <p>For those using the WordPress.org software we are releasing a new PuSH plugin: <a href="http://wordpress.org/extend/plugins/pushpress/">PuSHPress</a>.  This plugin differs from the current PuSH related plugins by including a built-in hub.</p>
+       <p>For more PuSH related reading check out the <a href="http://code.google.com/p/pubsubhubbub/">PubSubHubbub project site</a> and <a href="http://groups.google.com/group/pubsubhubbub?pli=1">Google Group</a>.  And if you really want to geek out there&#8217;s always the <a href="http://pubsubhubbub.googlecode.com/svn/trunk/pubsubhubbub-core-0.3.html">PubSubHubbub Spec</a> <img src='http://s.wordpress.com/wp-includes/images/smilies/icon_smile.gif' alt=':-)' class='wp-smiley' /> </p>
+       <br />  <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/en.blog.wordpress.com/3857/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/en.blog.wordpress.com/3857/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/en.blog.wordpress.com/3857/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/en.blog.wordpress.com/3857/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/en.blog.wordpress.com/3857/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/en.blog.wordpress.com/3857/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/en.blog.wordpress.com/3857/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/en.blog.wordpress.com/3857/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/en.blog.wordpress.com/3857/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/en.blog.wordpress.com/3857/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=en.blog.wordpress.com&blog=3584907&post=3857&subd=en.blog&ref=&feed=1" />]]></content:encoded>
+                               <wfw:commentRss>http://en.blog.wordpress.com/2010/03/03/rub-a-dub-dub-in-the-pubsubhubbub/feed/</wfw:commentRss>
+
+                       <slash:comments>96</slash:comments>
+
+                       <media:content url="http://1.gravatar.com/avatar/582b66ad5ae1b69c7601a990cb9a661a?s=96&#38;d=identicon" medium="image">
+                               <media:title type="html">josephscott</media:title>
+                       </media:content>
+               </item>
+       </channel>
+</rss>
+EXAMPLE6;
+