3 * StatusNet - the distributed open-source microblogging tool
4 * Copyright (C) 2010, StatusNet, Inc.
6 * Importer class for Delicious.com backups
10 * This program is free software: you can redistribute it and/or modify
11 * it under the terms of the GNU Affero General Public License as published by
12 * the Free Software Foundation, either version 3 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU Affero General Public License for more details.
20 * You should have received a copy of the GNU Affero General Public License
21 * along with this program. If not, see <http://www.gnu.org/licenses/>.
25 * @author Evan Prodromou <evan@status.net>
26 * @copyright 2010 StatusNet, Inc.
27 * @license http://www.fsf.org/licensing/licenses/agpl-3.0.html AGPL 3.0
28 * @link http://status.net/
31 if (!defined('STATUSNET')) {
// Taken only when the STATUSNET constant has not been defined by whatever
// loaded this file; presumably the guarded statement aborts the request.
32 // This check helps protect against security problems;
33 // your code file can't be executed directly from the web.
38 * Importer class for Delicious bookmarks
42 * @author Evan Prodromou <evan@status.net>
43 * @copyright 2010 StatusNet, Inc.
44 * @license http://www.fsf.org/licensing/licenses/agpl-3.0.html AGPL 3.0
45 * @link http://status.net/
47 class DeliciousBackupImporter extends QueueHandler
50 * Transport of the importer
52 * @return string transport string
60 * Import an in-memory bookmark list to a user's account
62 * Take a delicious.com backup file (same as Netscape bookmarks.html)
63 * and import to StatusNet as Bookmark activities.
65 * The document format is terrible. It consists of a <dl> with
66 * a bunch of <dt>'s, occasionally with <dd>'s adding descriptions.
67 * There are sometimes <p>'s lost inside.
69 * @param array $data pair of user, text
71 * @return boolean success value
73 function handle($data)
// Queue entry point. $data is unpacked into the target user and the raw
// backup text, the text is parsed into a DOM, and the children of its single
// <dl> are walked so that each <dt> (optionally paired with the <dd> that
// follows it) is handed to importBookmark().
75 list($user, $body) = $data;
// A ClientException raised while parsing is logged at warning level rather
// than propagated from this catch block.
78 $doc = $this->importHTML($body);
79 } catch (ClientException $cex) {
80 // XXX: message to the user
81 common_log(LOG_WARNING, $cex->getMessage());
85 // If we can't parse it, it's no good
// Exactly one <dl> is expected in the document; any other count is logged
// as a bad input file.
91 $dls = $doc->getElementsByTagName('dl');
93 if ($dls->length != 1) {
94 // XXX: message to the user
95 common_log(LOG_WARNING, 'Bad input file');
101 $children = $dl->childNodes;
// Index-based walk over the direct children of the <dl>.
105 for ($i = 0; $i < $children->length; $i++) {
107 $child = $children->item($i);
// Only element nodes are dispatched on below; presumably text and other
// non-element nodes are skipped here.
108 if ($child->nodeType != XML_ELEMENT_NODE) {
// Tag names are lower-cased before dispatch, so matching is
// case-insensitive.
111 switch (strtolower($child->tagName)) {
113 // <dt> nodes contain primary information about a bookmark.
114 // We can't import the current one just yet though, since
115 // it may be followed by a <dd>.
// Imports a <dt> on its own, i.e. without a description; presumably this is
// the previously remembered <dt> that turned out to have no <dd>.
118 $this->importBookmark($user, $dt);
127 // This <dd> contains a description for the bookmark in
128 // the preceding <dt> node.
// NOTE(review): $saved is assigned here but never read in the visible code.
129 $saved = $this->importBookmark($user, $dt, $dd);
136 common_log(LOG_INFO, 'Skipping the <p> in the <dl>.');
139 common_log(LOG_WARNING,
140 "Unexpected element $child->tagName ".
141 " found in import.");
// Failures are logged at error level instead of being rethrown; presumably
// this lets the walk continue with the next child.
143 } catch (Exception $e) {
144 common_log(LOG_ERR, $e->getMessage());
149 // There was a final bookmark without a description.
// Same log-and-carry-on handling for the trailing <dt>.
151 $this->importBookmark($user, $dt);
152 } catch (Exception $e) {
153 common_log(LOG_ERR, $e->getMessage());
161 * Import a single bookmark
163 * Takes a <dt>/<dd> pair. The <dt> has a single
164 * <a> in it with some non-standard attributes.
166 * A <dt><dt><dd> sequence will appear as a <dt> with
167 * another <dt> as a child. We handle this case recursively.
169 * @param User $user User to import data as
170 * @param DOMElement $dt <dt> element
171 * @param DOMElement $dd <dd> element
173 * @return Notice imported notice
175 function importBookmark($user, $dt, $dd = null)
// Builds one bookmark record from the link inside $dt (plus the optional
// description in $dd) and enqueues it on the 'dlcsbkmk' transport.
// Throws ClientException when $dt holds no <a>, and again with the message
// 'Skipping private bookmark.' for private entries.
177 $as = $dt->getElementsByTagName('a');
// A <dt> without any <a> descendant cannot be imported.
179 if ($as->length == 0) {
180 // TRANS: Client exception thrown when a bookmark in an import file is incorrectly formatted.
181 throw new ClientException(_m("No <A> tag in a <DT>."));
// $a is presumably the first of the matched links. Its non-standard
// 'private' attribute decides whether the bookmark is rejected below.
186 $private = $a->getAttribute('private');
189 // TRANS: Client exception thrown when a bookmark in an import file is private.
190 throw new ClientException(_m('Skipping private bookmark.'));
// $dd defaults to null, so this read has to be guarded by a check on $dd;
// NOTE(review): confirm the fallback description used when $dd is absent.
194 $description = $dd->nodeValue;
198 $addDate = $a->getAttribute('add_date');
201 'profile_id' => $user->id,
202 'title' => $a->nodeValue,
203 'description' => $description,
204 'url' => $a->getAttribute('href'),
// Tags are split on runs of whitespace and/or commas, dropping empty pieces.
// The limit is -1 ("no limit") rather than null: passing null to this
// non-nullable int parameter is deprecated as of PHP 8.1, and -1 yields the
// same result.
205 'tags' => preg_split('/[\s,]+/', $a->getAttribute('tags'), -1, PREG_SPLIT_NO_EMPTY),
// add_date is cast to int before formatting; presumably a Unix timestamp.
206 'created' => common_sql_date(intval($addDate))
// The record is not saved here; it is handed to the queue manager for the
// 'dlcsbkmk' transport.
209 $qm = QueueManager::get();
210 $qm->enqueue($data, 'dlcsbkmk');
216 * Hides the errors that the dom parser returns
218 * @param string $body Data to import
220 * @return DOMDocument parsed document
223 function importHTML($body)
// Parses $body with DOMDocument::loadHTML() while E_WARNING and E_NOTICE are
// masked out of error_reporting, then runs fixListsIn() on every <body>
// element of the parsed document.
225 // DOMDocument::loadHTML may throw warnings on unrecognized elements,
226 // and notices on unrecognized namespaces.
// error_reporting() returns the previous level, which is kept in $old so it
// can be put back after parsing.
227 $old = error_reporting(error_reporting() & ~(E_WARNING | E_NOTICE));
228 $dom = new DOMDocument();
229 $ok = $dom->loadHTML($body);
// NOTE(review): the level is only restored if loadHTML() returns normally.
// On PHP 8 an empty $body makes loadHTML() throw (verify), which would leave
// warnings and notices masked; try/finally or libxml_use_internal_errors()
// would avoid that.
230 error_reporting($old);
// Repair mis-nested definition lists before the document is used.
233 foreach ($dom->getElementsByTagName('body') as $node) {
234 $this->fixListsIn($node);
243 function fixListsIn(DOMNode $body) {
// Looks at the direct children of $body and passes the selected element
// nodes to fixList(). Which tag names are selected is decided on $el;
// presumably these are the <dl> lists.
246 foreach ($body->childNodes as $node) {
247 if ($node->nodeType == XML_ELEMENT_NODE) {
// Lower-cased so the tag comparison is case-insensitive.
248 $el = strtolower($node->nodeName);
// Fixing happens in a second pass over the collected nodes, presumably so
// the child list is not modified while it is being iterated.
255 foreach ($toFix as $node) {
256 $this->fixList($node);
260 function fixList(DOMNode $list) {
// Walks the direct children of $list. <dt>/<dd> element children end up in
// $toFix and are repaired with fixListItem() in the second loop; fixList()
// is also called recursively on a child node, presumably a nested list.
263 foreach ($list->childNodes as $node) {
264 if ($node->nodeType == XML_ELEMENT_NODE) {
// Lower-cased so the tag comparison is case-insensitive.
265 $el = strtolower($node->nodeName);
266 if ($el == 'dt' || $el == 'dd') {
271 // Technically, these can only appear inside a <dd>...
272 $this->fixList($node);
// Items are repaired after the scan, presumably because fixListItem() moves
// nodes around inside the list being scanned.
277 foreach ($toFix as $node) {
278 $this->fixListItem($node);
282 function fixListItem(DOMNode $item) {
// Flattens <dt>/<dd> elements that the parser nested inside $item: each such
// child is detached from $item and re-inserted into $item's parent, in front
// of $item's original next sibling, and is then processed the same way.
283 // The HTML parser in libxml2 doesn't seem to properly handle
284 // many cases of implied close tags, apparently because it doesn't
285 // understand the nesting rules specified in the HTML DTD.
287 // This leads to sequences of adjacent <dt>s or <dd>s being incorrectly
288 // interpreted as parent->child trees instead of siblings:
290 // When parsing this input: "<dt>aaa <dt>bbb"
291 // should be equivalent to: "<dt>aaa </dt><dt>bbb</dt>"
292 // but we're seeing instead: "<dt>aaa <dt>bbb</dt></dt>"
294 // It does at least know that going from dt to dd, or dd to dt,
295 // should make a break.
299 foreach ($item->childNodes as $node) {
300 if ($node->nodeType == XML_ELEMENT_NODE) {
// Lower-cased so the tag comparison is case-insensitive.
301 $el = strtolower($node->nodeName);
302 if ($el == 'dt' || $el == 'dd') {
303 // dt & dd cannot contain each other;
304 // This node was incorrectly placed; move it up a level!
309 // Technically, these can only appear inside a <dd>.
310 $this->fixList($node);
// $next is captured once, before any move, so every moved node is inserted
// in front of the same reference sibling and the moved nodes keep their
// relative order directly after $item.
315 $parent = $item->parentNode;
316 $next = $item->nextSibling;
317 foreach ($toMove as $node) {
318 $item->removeChild($node);
// When $item was the last child, $next is null and insertBefore() appends.
319 $parent->insertBefore($node, $next);
// The moved node may itself contain mis-nested items, so recurse into it.
320 $this->fixListItem($node);