* Copyright (C) 2010, StatusNet, Inc.
*
* Importer class for Delicious.com backups
- *
+ *
* PHP version 5
*
* This program is free software: you can redistribute it and/or modify
* @license http://www.fsf.org/licensing/licenses/agpl-3.0.html AGPL 3.0
* @link http://status.net/
*/
-
class DeliciousBackupImporter extends QueueHandler
{
/**
*
* @return string transport string
*/
-
function transport()
{
return 'dlcsback';
* and import to StatusNet as Bookmark activities.
*
* The document format is terrible. It consists of a <dl> with
- * a bunch of <dt>'s, occasionally with <dd>'s.
+ * a bunch of <dt>'s, occasionally with <dd>'s adding descriptions.
* There are sometimes <p>'s lost inside.
*
* @param array $data pair of user, text
*
* @return boolean success value
*/
-
function handle($data)
{
list($user, $body) = $data;
- $doc = $this->importHTML($body);
+ try {
+ $doc = $this->importHTML($body);
+ } catch (ClientException $cex) {
+ // XXX: message to the user
+ common_log(LOG_WARNING, $cex->getMessage());
+ return true;
+ }
+
+ // If we can't parse it, it's no good
+
+ if (empty($doc)) {
+ return true;
+ }
$dls = $doc->getElementsByTagName('dl');
if ($dls->length != 1) {
- throw new ClientException(_("Bad import file."));
+ // XXX: message to the user
+ common_log(LOG_WARNING, 'Bad input file');
+ return true;
}
$dl = $dls->item(0);
}
switch (strtolower($child->tagName)) {
case 'dt':
+ // <dt> nodes contain primary information about a bookmark.
+ // We can't import the current one just yet though, since
+ // it may be followed by a <dd>.
if (!empty($dt)) {
// No DD provided
$this->importBookmark($user, $dt);
case 'dd':
$dd = $child;
- $saved = $this->importBookmark($user, $dt, $dd);
+ if (!empty($dt)) {
+ // This <dd> contains a description for the bookmark in
+ // the preceding <dt> node.
+ $saved = $this->importBookmark($user, $dt, $dd);
+ }
$dt = null;
$dd = null;
+ break;
case 'p':
common_log(LOG_INFO, 'Skipping the <p> in the <dl>.');
break;
default:
- common_log(LOG_WARNING,
+ common_log(LOG_WARNING,
"Unexpected element $child->tagName ".
" found in import.");
}
$dt = $dd = null;
}
}
+ if (!empty($dt)) {
+ // There was a final bookmark without a description.
+ try {
+ $this->importBookmark($user, $dt);
+ } catch (Exception $e) {
+ common_log(LOG_ERR, $e->getMessage());
+ }
+ }
return true;
}
/**
* Import a single bookmark
- *
+ *
* Takes a <dt>/<dd> pair. The <dt> has a single
* <a> in it with some non-standard attributes.
- *
+ *
* A <dt><dt><dd> sequence will appear as a <dt> with
- * anothe <dt> as a child. We handle this case recursively.
+ * another <dt> as a child. We handle this case recursively.
*
* @param User $user User to import data as
* @param DOMElement $dt <dt> element
*
* @return Notice imported notice
*/
-
function importBookmark($user, $dt, $dd = null)
{
+ // Builds a plain data array from one <dt> (and optional <dd>) and
+ // enqueues it on the 'dlcsbkmk' transport. Nothing is returned.
+ //
+ // $user - object whose ->id becomes the bookmark's profile_id
+ // $dt - <dt> element; its first <a> descendant carries the bookmark
+ // $dd - optional <dd> element; its text becomes the description
+ //
+ // Throws ClientException when the <dt> has no <a>, or when the
+ // bookmark is flagged private.
- // We have to go squirrelling around in the child nodes
- // on the off chance that we've received another <dt>
- // as a child.
-
- for ($i = 0; $i < $dt->childNodes->length; $i++) {
- $child = $dt->childNodes->item($i);
- if ($child->nodeType == XML_ELEMENT_NODE) {
- if ($child->tagName == 'dt' && !is_null($dd)) {
- $this->importBookmark($user, $dt);
- $this->importBookmark($user, $child, $dd);
- return;
- }
- }
+ $as = $dt->getElementsByTagName('a');
+
+ if ($as->length == 0) {
+ // TRANS: Client exception thrown when a bookmark in an import file is incorrectly formatted.
+ throw new ClientException(_m("No <A> tag in a <DT>."));
}
+ $a = $as->item(0);
+
+ $private = $a->getAttribute('private');
+
+ // getAttribute() returns '' when the attribute is absent. A loose
+ // `'' != 0` is false on PHP 5/7 but true on PHP 8, which would reject
+ // every bookmark lacking the attribute. Casting first keeps the
+ // PHP 5 outcome on all versions: only a non-zero number is private.
+ if (intval($private) !== 0) {
+ // TRANS: Client exception thrown when a bookmark in an import file is private.
+ throw new ClientException(_m('Skipping private bookmark.'));
+ }
+
+ if (!empty($dd)) {
+ $description = $dd->nodeValue;
+ } else {
+ $description = null;
+ }
+ $addDate = $a->getAttribute('add_date');
+
+ // Only scalar values are queued; no DOM nodes are passed along.
+ $data = array(
+ 'profile_id' => $user->id,
+ 'title' => $a->nodeValue,
+ 'description' => $description,
+ 'url' => $a->getAttribute('href'),
+ 'tags' => $a->getAttribute('tags'),
+ 'created' => common_sql_date(intval($addDate))
+ );
+
$qm = QueueManager::get();
-
- $qm->enqueue(array($user, $dt, $dd), 'dlcsbkmk');
+ $qm->enqueue($data, 'dlcsbkmk');
}
/**
error_reporting($old);
if ($ok) {
+ foreach ($dom->getElementsByTagName('body') as $node) {
+ $this->fixListsIn($node);
+ }
return $dom;
} else {
return null;
}
}
+
+ /** Runs fixList() on every <dl> element that is a direct child of $body. */
+ function fixListsIn(DOMNode $body) {
+ $toFix = array();
+ // Pass 1: gather the <dl> elements among $body's direct children.
+ foreach ($body->childNodes as $node) {
+ if ($node->nodeType == XML_ELEMENT_NODE) {
+ $el = strtolower($node->nodeName);
+ if ($el == 'dl') {
+ $toFix[] = $node;
+ }
+ }
+ }
+ // Pass 2: repair each gathered list.
+ foreach ($toFix as $node) {
+ $this->fixList($node);
+ }
+ }
+ /** Repairs one <dl>: recurses into directly nested <dl>s, then runs fixListItem() on each direct <dt>/<dd>. */
+ function fixList(DOMNode $list) {
+ $toFix = array();
+ // fixListItem() inserts nodes into $list, so the items are gathered before any fixing starts.
+ foreach ($list->childNodes as $node) {
+ if ($node->nodeType == XML_ELEMENT_NODE) {
+ $el = strtolower($node->nodeName);
+ if ($el == 'dt' || $el == 'dd') {
+ $toFix[] = $node;
+ }
+ if ($el == 'dl') {
+ // Sublist.
+ // Technically, these can only appear inside a <dd>...
+ $this->fixList($node);
+ }
+ }
+ }
+ // Now repair each gathered <dt>/<dd>.
+ foreach ($toFix as $node) {
+ $this->fixListItem($node);
+ }
+ }
+ /** Hoists <dt>/<dd> elements wrongly nested inside $item so they become siblings that follow it. */
+ function fixListItem(DOMNode $item) {
+ // The HTML parser in libxml2 doesn't seem to properly handle
+ // many cases of implied close tags, apparently because it doesn't
+ // understand the nesting rules specified in the HTML DTD.
+ //
+ // This leads to sequences of adjacent <dt>s or <dd>s being incorrectly
+ // interpreted as parent->child trees instead of siblings:
+ //
+ // When parsing this input: "<dt>aaa <dt>bbb"
+ // should be equivalent to: "<dt>aaa </dt><dt>bbb</dt>"
+ // but we're seeing instead: "<dt>aaa <dt>bbb</dt></dt>"
+ //
+ // It does at least know that going from dt to dd, or dd to dt,
+ // should make a break.
+
+ $toMove = array();
+ // Scan direct children only; the nodes to move are gathered first and relocated after the scan.
+ foreach ($item->childNodes as $node) {
+ if ($node->nodeType == XML_ELEMENT_NODE) {
+ $el = strtolower($node->nodeName);
+ if ($el == 'dt' || $el == 'dd') {
+ // dt & dd cannot contain each other;
+ // This node was incorrectly placed; move it up a level!
+ $toMove[] = $node;
+ }
+ if ($el == 'dl') {
+ // Sublist.
+ // Technically, these can only appear inside a <dd>.
+ $this->fixList($node);
+ }
+ }
+ }
+ // Each node is inserted before $item's original next sibling, so the moved nodes keep their order.
+ $parent = $item->parentNode;
+ $next = $item->nextSibling;
+ foreach ($toMove as $node) {
+ $item->removeChild($node);
+ $parent->insertBefore($node, $next);
+ $this->fixListItem($node);
+ }
+ }
}