From fedfde9bbb3c898328c6395a41c3243d6e97a2bf Mon Sep 17 00:00:00 2001 From: Brion Vibber Date: Fri, 31 Dec 2010 12:09:54 -0800 Subject: [PATCH] Bookmark plugin: fixes for bad DOM element nesting in delicious import data delicious bookmark exports use the godawful HTML bookmark file format that ancient versions of Netscape used (and has thus been the common import/export format for bookmarks since the dark ages of the web :) This arranges bookmark entries as an HTML definition list, using a lot of implied close tags (leaving off the and ). DOMDocument->loadHTML() uses libxml2's HTML mode, which generally does ok with muddling through things but apparently is really, really bad about handling those implied close tags. Sequences of adjacent
elements (eg bookmark without a description, followed by another bookmark "
"), end up interpreted as nested ("
") instead of as siblings ("
"). The first round of code tried to resolve the nesting inline, but ended up a bit funky in places. I've replaced this with a standalone run through the data to re-order the elements, based on our knowing that
and
cannot directly contain one another; once that's done, our main logic loop can be a bit cleaner. I'm not 100% sure it's doing nested sublists correctly, but these don't seem to show up in delicious export (and even if they do, with the way we flatten the input it shouldn't make a difference). Also fixed a clearer edge case where some bookmarks didn't get imported when missing descriptions. --- plugins/Bookmark/deliciousbackupimporter.php | 117 ++++++++++++++++--- 1 file changed, 101 insertions(+), 16 deletions(-) diff --git a/plugins/Bookmark/deliciousbackupimporter.php b/plugins/Bookmark/deliciousbackupimporter.php index 1b55115d6d..bc5a91be80 100644 --- a/plugins/Bookmark/deliciousbackupimporter.php +++ b/plugins/Bookmark/deliciousbackupimporter.php @@ -65,7 +65,7 @@ class DeliciousBackupImporter extends QueueHandler * and import to StatusNet as Bookmark activities. * * The document format is terrible. It consists of a
with - * a bunch of
's, occasionally with
's. + * a bunch of
's, occasionally with
's adding descriptions. * There are sometimes

's lost inside. * * @param array $data pair of user, text @@ -99,6 +99,9 @@ class DeliciousBackupImporter extends QueueHandler } switch (strtolower($child->tagName)) { case 'dt': + //

nodes contain primary information about a bookmark. + // We can't import the current one just yet though, since + // it may be followed by a
. if (!empty($dt)) { // No DD provided $this->importBookmark($user, $dt); @@ -109,10 +112,13 @@ class DeliciousBackupImporter extends QueueHandler case 'dd': $dd = $child; + // This
contains a description for the bookmark in + // the preceding
node. $saved = $this->importBookmark($user, $dt, $dd); $dt = null; $dd = null; + break; case 'p': common_log(LOG_INFO, 'Skipping the

in the

.'); break; @@ -126,6 +132,14 @@ class DeliciousBackupImporter extends QueueHandler $dt = $dd = null; } } + if (!empty($dt)) { + // There was a final bookmark without a description. + try { + $this->importBookmark($user, $dt); + } catch (Exception $e) { + common_log(LOG_ERR, $e->getMessage()); + } + } return true; } @@ -148,21 +162,6 @@ class DeliciousBackupImporter extends QueueHandler function importBookmark($user, $dt, $dd = null) { - // We have to go squirrelling around in the child nodes - // on the off chance that we've received another
- // as a child. - - for ($i = 0; $i < $dt->childNodes->length; $i++) { - $child = $dt->childNodes->item($i); - if ($child->nodeType == XML_ELEMENT_NODE) { - if ($child->tagName == 'dt' && !is_null($dd)) { - $this->importBookmark($user, $dt); - $this->importBookmark($user, $child, $dd); - return; - } - } - } - $qm = QueueManager::get(); $qm->enqueue(array($user, $dt, $dd), 'dlcsbkmk'); @@ -188,9 +187,95 @@ class DeliciousBackupImporter extends QueueHandler error_reporting($old); if ($ok) { + foreach ($dom->getElementsByTagName('body') as $node) { + $this->fixListsIn($node); + } return $dom; } else { return null; } } + + + function fixListsIn(DOMNode $body) { + $toFix = array(); + + foreach ($body->childNodes as $node) { + if ($node->nodeType == XML_ELEMENT_NODE) { + $el = strtolower($node->nodeName); + if ($el == 'dl') { + $toFix[] = $node; + } + } + } + + foreach ($toFix as $node) { + $this->fixList($node); + } + } + + function fixList(DOMNode $list) { + $toFix = array(); + + foreach ($list->childNodes as $node) { + if ($node->nodeType == XML_ELEMENT_NODE) { + $el = strtolower($node->nodeName); + if ($el == 'dt' || $el == 'dd') { + $toFix[] = $node; + } + if ($el == 'dl') { + // Sublist. + // Technically, these can only appear inside a
... + $this->fixList($node); + } + } + } + + foreach ($toFix as $node) { + $this->fixListItem($node); + } + } + + function fixListItem(DOMNode $item) { + // The HTML parser in libxml2 doesn't seem to properly handle + // many cases of implied close tags, apparently because it doesn't + // understand the nesting rules specified in the HTML DTD. + // + // This leads to sequences of adjacent
s or
s being incorrectly + // interpreted as parent->child trees instead of siblings: + // + // When parsing this input: "
aaa
bbb" + // should be equivalent to: "
aaa
bbb
" + // but we're seeing instead: "
aaa
bbb
" + // + // It does at least know that going from dt to dd, or dd to dt, + // should make a break. + + $toMove = array(); + + foreach ($item->childNodes as $node) { + if ($node->nodeType == XML_ELEMENT_NODE) { + $el = strtolower($node->nodeName); + if ($el == 'dt' || $el == 'dd') { + // dt & dd cannot contain each other; + // This node was incorrectly placed; move it up a level! + $toMove[] = $node; + } + if ($el == 'dl') { + // Sublist. + // Technically, these can only appear inside a
. + $this->fixList($node); + } + } + } + + $parent = $item->parentNode; + $next = $item->nextSibling; + foreach ($toMove as $node) { + $item->removeChild($node); + $parent->insertBefore($node, $next); + $this->fixListItem($node); + } + } + } -- 2.39.5