+<?php
+/**
+ * StatusNet - the distributed open-source microblogging tool
+ * Copyright (C) 2010, StatusNet, Inc.
+ *
+ * Importer class for Delicious.com backups
+ *
+ * PHP version 5
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * @category Bookmark
+ * @package StatusNet
+ * @author Evan Prodromou <evan@status.net>
+ * @copyright 2010 StatusNet, Inc.
+ * @license http://www.fsf.org/licensing/licenses/agpl-3.0.html AGPL 3.0
+ * @link http://status.net/
+ */
+
+// Bail out unless the STATUSNET constant is defined. This check helps
+// protect against security problems: the file can't be executed
+// directly from the web.
+defined('STATUSNET') or exit(1);
+
+/**
+ * Importer class for Delicious bookmarks
+ *
+ * @category Bookmark
+ * @package StatusNet
+ * @author Evan Prodromou <evan@status.net>
+ * @copyright 2010 StatusNet, Inc.
+ * @license http://www.fsf.org/licensing/licenses/agpl-3.0.html AGPL 3.0
+ * @link http://status.net/
+ */
+class DeliciousBackupImporter extends QueueHandler
+{
+    /**
+     * Transport of the importer
+     *
+     * @return string transport string
+     */
+    function transport()
+    {
+        return 'dlcsback';
+    }
+
+    /**
+     * Import an in-memory bookmark list to a user's account
+     *
+     * Take a delicious.com backup file (same as Netscape bookmarks.html)
+     * and enqueue one 'dlcsbkmk' item per public bookmark found in it.
+     *
+     * The document format is terrible. It consists of a <dl> with
+     * a bunch of <dt>'s, occasionally with <dd>'s adding descriptions.
+     * There are sometimes <p>'s lost inside.
+     *
+     * A failure to import one bookmark is logged and does not affect
+     * the bookmarks that follow it.
+     *
+     * @param array $data pair of user, text
+     *
+     * @return boolean success value; always true, including for files
+     *                 that cannot be parsed (those are only logged)
+     */
+    function handle($data)
+    {
+        list($user, $body) = $data;
+
+        try {
+            $doc = $this->importHTML($body);
+        } catch (ClientException $cex) {
+            // XXX: message to the user
+            common_log(LOG_WARNING, $cex->getMessage());
+            return true;
+        }
+
+        // If we can't parse it, it's no good
+        if (empty($doc)) {
+            return true;
+        }
+
+        $dls = $doc->getElementsByTagName('dl');
+
+        if ($dls->length != 1) {
+            // XXX: message to the user
+            common_log(LOG_WARNING, 'Bad input file');
+            return true;
+        }
+
+        $children = $dls->item(0)->childNodes;
+
+        // The most recent <dt> that has not been imported yet. It is held
+        // back because it may be followed by a <dd> with its description.
+        $dt = null;
+
+        for ($i = 0; $i < $children->length; $i++) {
+            $child = $children->item($i);
+            if ($child->nodeType != XML_ELEMENT_NODE) {
+                continue;
+            }
+            switch (strtolower($child->tagName)) {
+                case 'dt':
+                    // Remember the new <dt> *before* flushing the previous
+                    // one, so that a failure while importing the previous
+                    // bookmark cannot make us lose the current one.
+                    $previous = $dt;
+                    $dt       = $child;
+                    if ($previous !== null) {
+                        // No DD provided for the previous bookmark
+                        $this->tryImportBookmark($user, $previous);
+                    }
+                    break;
+                case 'dd':
+                    if ($dt !== null) {
+                        // This <dd> contains a description for the bookmark
+                        // in the preceding <dt> node.
+                        $this->tryImportBookmark($user, $dt, $child);
+                    }
+                    // A <dd> always ends the current bookmark, even a
+                    // stray one with no <dt> in front of it.
+                    $dt = null;
+                    break;
+                case 'p':
+                    common_log(LOG_INFO, 'Skipping the <p> in the <dl>.');
+                    break;
+                default:
+                    common_log(LOG_WARNING,
+                               "Unexpected element $child->tagName ".
+                               " found in import.");
+            }
+        }
+
+        if ($dt !== null) {
+            // There was a final bookmark without a description.
+            $this->tryImportBookmark($user, $dt);
+        }
+
+        return true;
+    }
+
+    /**
+     * Import a single bookmark, logging instead of propagating failures
+     *
+     * Wraps importBookmark() so that one bad or private entry never
+     * interrupts the import of the remaining entries.
+     *
+     * @param User       $user User to import data as
+     * @param DOMElement $dt   <dt> element
+     * @param DOMElement $dd   <dd> element, or null if there is none
+     *
+     * @return void
+     */
+    protected function tryImportBookmark($user, $dt, $dd = null)
+    {
+        try {
+            $this->importBookmark($user, $dt, $dd);
+        } catch (Exception $e) {
+            common_log(LOG_ERR, $e->getMessage());
+        }
+    }
+
+    /**
+     * Import a single bookmark
+     *
+     * Takes a <dt>/<dd> pair. The first <a> found in the <dt> carries
+     * the bookmark data in its 'href', 'tags', 'add_date' and 'private'
+     * attributes.
+     *
+     * Incorrectly nested <dt>/<dd> elements are expected to have been
+     * flattened by fixListItem() before this method is called.
+     *
+     * The bookmark is not saved here; its data is pushed onto the
+     * 'dlcsbkmk' queue.
+     *
+     * @param User       $user User to import data as
+     * @param DOMElement $dt   <dt> element
+     * @param DOMElement $dd   <dd> element
+     *
+     * @return void
+     *
+     * @throws ClientException if the <dt> has no <a>, or the bookmark
+     *                         is marked private
+     */
+    function importBookmark($user, $dt, $dd = null)
+    {
+        $as = $dt->getElementsByTagName('a');
+
+        if ($as->length == 0) {
+            // TRANS: Client exception thrown when a bookmark in an import file is incorrectly formatted.
+            throw new ClientException(_m("No <A> tag in a <DT>."));
+        }
+
+        $a = $as->item(0);
+
+        // getAttribute() returns '' when the attribute is missing. Convert
+        // explicitly: a loose '' != 0 comparison is true under PHP 8, which
+        // would reject every bookmark lacking a 'private' attribute.
+        if (intval($a->getAttribute('private')) !== 0) {
+            // TRANS: Client exception thrown when a bookmark in an import file is private.
+            throw new ClientException(_m('Skipping private bookmark.'));
+        }
+
+        $description = ($dd === null) ? null : $dd->nodeValue;
+        $addDate     = $a->getAttribute('add_date');
+
+        $data = array(
+            'profile_id'  => $user->id,
+            'title'       => $a->nodeValue,
+            'description' => $description,
+            'url'         => $a->getAttribute('href'),
+            'tags'        => $a->getAttribute('tags'),
+            'created'     => common_sql_date(intval($addDate))
+        );
+
+        $qm = QueueManager::get();
+        $qm->enqueue($data, 'dlcsbkmk');
+    }
+
+    /**
+     * Parse some HTML
+     *
+     * Hides the errors that the dom parser returns, and repairs the
+     * nesting of definition lists found directly under <body>.
+     *
+     * @param string $body Data to import
+     *
+     * @return DOMDocument parsed document, or null if the input is
+     *                     empty or could not be parsed
+     */
+    function importHTML($body)
+    {
+        // Nothing to parse. This also avoids handing an empty string to
+        // DOMDocument::loadHTML(), which throws a ValueError on PHP 8.
+        if (!is_string($body) || $body === '') {
+            return null;
+        }
+
+        // DOMDocument::loadHTML may complain about unrecognized elements
+        // and namespaces. Have libxml collect those errors internally
+        // instead of raising PHP warnings, then discard them.
+        $previous = libxml_use_internal_errors(true);
+        $dom      = new DOMDocument();
+        $ok       = $dom->loadHTML($body);
+        libxml_clear_errors();
+        libxml_use_internal_errors($previous);
+
+        if (!$ok) {
+            return null;
+        }
+
+        foreach ($dom->getElementsByTagName('body') as $node) {
+            $this->fixListsIn($node);
+        }
+
+        return $dom;
+    }
+
+    /**
+     * Repair every <dl> that is a direct child of the given node
+     *
+     * @param DOMNode $body node whose immediate children are scanned
+     *
+     * @return void
+     */
+    function fixListsIn(DOMNode $body)
+    {
+        // Collect first, fix afterwards: fixing moves nodes around, and
+        // childNodes is a live list.
+        $toFix = array();
+
+        foreach ($body->childNodes as $node) {
+            if ($node->nodeType == XML_ELEMENT_NODE
+                && strtolower($node->nodeName) == 'dl') {
+                $toFix[] = $node;
+            }
+        }
+
+        foreach ($toFix as $node) {
+            $this->fixList($node);
+        }
+    }
+
+    /**
+     * Repair the items of a single <dl>
+     *
+     * Runs fixListItem() on each <dt>/<dd> child, and recurses into any
+     * <dl> found directly inside the list.
+     *
+     * @param DOMNode $list <dl> element to repair
+     *
+     * @return void
+     */
+    function fixList(DOMNode $list)
+    {
+        $toFix = array();
+
+        foreach ($list->childNodes as $node) {
+            if ($node->nodeType != XML_ELEMENT_NODE) {
+                continue;
+            }
+            $el = strtolower($node->nodeName);
+            if ($el == 'dt' || $el == 'dd') {
+                $toFix[] = $node;
+            }
+            if ($el == 'dl') {
+                // Sublist.
+                // Technically, these can only appear inside a <dd>...
+                $this->fixList($node);
+            }
+        }
+
+        foreach ($toFix as $node) {
+            $this->fixListItem($node);
+        }
+    }
+
+    /**
+     * Un-nest <dt>/<dd> elements wrongly parsed as children of an item
+     *
+     * Any <dt> or <dd> found directly inside the given item is moved up
+     * to become a following sibling of the item, preserving document
+     * order, and is then repaired in the same way. Any <dl> found
+     * directly inside the item is repaired with fixList().
+     *
+     * @param DOMNode $item <dt> or <dd> element to repair
+     *
+     * @return void
+     */
+    function fixListItem(DOMNode $item)
+    {
+        // The HTML parser in libxml2 doesn't seem to properly handle
+        // many cases of implied close tags, apparently because it doesn't
+        // understand the nesting rules specified in the HTML DTD.
+        //
+        // This leads to sequences of adjacent <dt>s or <dd>s being incorrectly
+        // interpreted as parent->child trees instead of siblings:
+        //
+        // When parsing this input: "<dt>aaa <dt>bbb"
+        // should be equivalent to: "<dt>aaa </dt><dt>bbb</dt>"
+        // but we're seeing instead: "<dt>aaa <dt>bbb</dt></dt>"
+        //
+        // It does at least know that going from dt to dd, or dd to dt,
+        // should make a break.
+
+        $toMove = array();
+
+        foreach ($item->childNodes as $node) {
+            if ($node->nodeType != XML_ELEMENT_NODE) {
+                continue;
+            }
+            $el = strtolower($node->nodeName);
+            if ($el == 'dt' || $el == 'dd') {
+                // dt & dd cannot contain each other;
+                // This node was incorrectly placed; move it up a level!
+                $toMove[] = $node;
+            }
+            if ($el == 'dl') {
+                // Sublist.
+                // Technically, these can only appear inside a <dd>.
+                $this->fixList($node);
+            }
+        }
+
+        // Every moved node goes in front of the item's original next
+        // sibling, so moved nodes end up right after the item and keep
+        // their relative order.
+        $parent = $item->parentNode;
+        $next   = $item->nextSibling;
+        foreach ($toMove as $node) {
+            $item->removeChild($node);
+            $parent->insertBefore($node, $next);
+            // The moved node may itself contain wrongly nested items.
+            $this->fixListItem($node);
+        }
+    }
+}