]> git.mxchange.org Git - friendica.git/commitdiff
better handling of troublesome feeds.
authorFriendika <info@friendika.com>
Tue, 5 Apr 2011 02:36:18 +0000 (19:36 -0700)
committerFriendika <info@friendika.com>
Tue, 5 Apr 2011 02:36:18 +0000 (19:36 -0700)
boot.php
include/Scrape.php
include/items.php
include/poller.php
mod/dfrn_confirm.php
mod/dfrn_poll.php
mod/follow.php

index 3b86d0dbeb3f5484de1a090ca171c2531ace9cd2..f5c0e6f92c4caa8fe83e604df8bd3d4138f537c6 100644 (file)
--- a/boot.php
+++ b/boot.php
@@ -1478,7 +1478,9 @@ function lrdd($uri) {
                return array();
 
        logger('lrdd: host_meta: ' . $xml, LOGGER_DATA);
-       $h = simplexml_load_string($xml);
+
+       $h = parse_xml_string($xml);
+
        $arr = convert_xml_element_to_array($h);
 
        if(isset($arr['xrd']['property'])) {
@@ -1550,16 +1552,19 @@ function lrdd($uri) {
        $headers = $a->get_curl_headers();
        logger('lrdd: headers=' . $headers, LOGGER_DEBUG);
 
-       require_once('library/HTML5/Parser.php');
-       $dom = @HTML5_Parser::parse($html);
-
-       if($dom) {
-               $items = $dom->getElementsByTagName('link');
-               foreach($items as $item) {
-                       $x = $item->getAttribute('rel');
-                       if($x == "lrdd") {
-                               $pagelink = $item->getAttribute('href');
-                               break;
+       // don't try and parse raw xml as html
+       if(! strstr($html,'<?xml')) {
+               require_once('library/HTML5/Parser.php');
+               $dom = @HTML5_Parser::parse($html);
+
+               if($dom) {
+                       $items = $dom->getElementsByTagName('link');
+                       foreach($items as $item) {
+                               $x = $item->getAttribute('rel');
+                               if($x == "lrdd") {
+                                       $pagelink = $item->getAttribute('href');
+                                       break;
+                               }
                        }
                }
        }
@@ -1638,7 +1643,7 @@ function fetch_xrd_links($url) {
                return array();
 
        logger('fetch_xrd_links: ' . $xml, LOGGER_DATA);
-       $h = simplexml_load_string($xml);
+       $h = parse_xml_string($xml);
        $arr = convert_xml_element_to_array($h);
 
        $links = array();
@@ -2759,3 +2764,18 @@ function lang_selector() {
        $o .= '</select></form></div>';
        return $o;
 }}
+
+
+if(! function_exists('parse_xml_string')) {
+/**
+ * Parse a string as XML, ignoring anything that precedes the XML declaration.
+ *
+ * Everything before the first '<?xml' is discarded before the text is handed
+ * to SimpleXML. libxml errors are collected internally and written to the
+ * log instead of being emitted as PHP warnings.
+ *
+ * Note: libxml internal error collection is switched on here and is not
+ * switched back off when the function returns.
+ *
+ * @param string $s raw text expected to contain an XML document
+ * @return SimpleXMLElement|false the parsed document; false when $s holds no
+ *         '<?xml' declaration or the document could not be parsed
+ */
+function parse_xml_string($s) {
+       // locate the declaration once; it doubles as the "is this xml?" test
+       $start = strpos($s,'<?xml');
+       if($start === false)
+               return false;
+
+       libxml_use_internal_errors(true);
+       $x = @simplexml_load_string(substr($s,$start));
+
+       // LibXMLError is a plain object with no string conversion, so the log
+       // line has to be assembled from its individual fields.
+       foreach(libxml_get_errors() as $err)
+               logger('libxml: parse: ' . $err->code . ' at ' . $err->line . ':' . $err->column . ' : ' . trim($err->message), LOGGER_DATA);
+       libxml_clear_errors();
+
+       return $x;
+}}
index ff98992526314474b6dcc7aa20386a92f114c332..21820ddaff10cdb2886e9cc87ea9d7edc8ccfb7f 100644 (file)
@@ -216,7 +216,7 @@ function scrape_feed($url) {
                                }
                                if(stristr($line,'application/rss+xml') || stristr($s,'<rss')) {
                                        $ret['feed_rss'] = $url;
-                                       return ret;
+                                       return $ret;
                                }
                        }
                }
index 1dd39d2ba9f4f7012d8b8bb520c16088b6cd05c1..a9ac859694277b0b2819804b72704ddb3b050a10 100644 (file)
@@ -180,7 +180,7 @@ function construct_activity_object($item) {
 
        if($item['object']) {
                $o = '<as:object>' . "\r\n";
-               $r = @simplexml_load_string($item['object']);
+               $r = parse_xml_string($item['object']);
                if($r->type)
                        $o .= '<as:object-type>' . xmlify($r->type) . '</as:object-type>' . "\r\n";
                if($r->id)
@@ -206,7 +206,7 @@ function construct_activity_target($item) {
 
        if($item['target']) {
                $o = '<as:target>' . "\r\n";
-               $r = @simplexml_load_string($item['target']);
+               $r = parse_xml_string($item['target']);
                if($r->type)
                        $o .= '<as:object-type>' . xmlify($r->type) . '</as:object-type>' . "\r\n";
                if($r->id)
@@ -241,8 +241,14 @@ function get_atom_elements($feed,$item) {
        $res = array();
 
        $author = $item->get_author();
-       $res['author-name'] = unxmlify($author->get_name());
-       $res['author-link'] = unxmlify($author->get_link());
+       if($author) { 
+               $res['author-name'] = unxmlify($author->get_name());
+               $res['author-link'] = unxmlify($author->get_link());
+       }
+       else {
+               $res['author-name'] = unxmlify($feed->get_title());
+               $res['author-link'] = unxmlify($feed->get_permalink());
+       }
        $res['uri'] = unxmlify($item->get_id());
        $res['title'] = unxmlify($item->get_title());
        $res['body'] = unxmlify($item->get_content());
@@ -343,7 +349,6 @@ function get_atom_elements($feed,$item) {
        // the wild, by sanitising it and converting supported tags to bbcode before we rip out any remaining 
        // html.
 
-
        if((strpos($res['body'],'<') !== false) || (strpos($res['body'],'>') !== false)) {
 
                $res['body'] = preg_replace('#<object[^>]+>.+?' . 'http://www.youtube.com/((?:v|cp)/[A-Za-z0-9\-_=]+).+?</object>#s',
@@ -783,7 +788,7 @@ function dfrn_deliver($owner,$contact,$atom, $dissolve = false) {
                return 3;
        }
 
-       $res = simplexml_load_string($xml);
+       $res = parse_xml_string($xml);
 
        if((intval($res->status) != 0) || (! strlen($res->challenge)) || (! strlen($res->dfrn_id)))
                return (($res->status) ? $res->status : 3);
@@ -878,7 +883,7 @@ function dfrn_deliver($owner,$contact,$atom, $dissolve = false) {
                return 3;
        }
 
-       $res = simplexml_load_string($xml);
+       $res = parse_xml_string($xml);
 
        return $res->status;
  
@@ -916,6 +921,7 @@ function consume_feed($xml,$importer,&$contact, &$hub, $datedir = 0, $secure_fee
        if($feed->error())
                logger('consume_feed: Error parsing XML: ' . $feed->error());
 
+       $permalink = $feed->get_permalink();
 
        // Check at the feed level for updated contact name and/or photo
 
@@ -1230,6 +1236,7 @@ function consume_feed($xml,$importer,&$contact, &$hub, $datedir = 0, $secure_fee
                                // Head post of a conversation. Have we seen it? If not, import it.
 
                                $item_id  = $item->get_id();
+
                                $datarray = get_atom_elements($feed,$item);
 
                                $r = q("SELECT `uid`, `last-child`, `edited`, `body` FROM `item` WHERE `uri` = '%s' AND `uid` = %d LIMIT 1",
@@ -1275,7 +1282,7 @@ function consume_feed($xml,$importer,&$contact, &$hub, $datedir = 0, $secure_fee
                                if(! is_array($contact))
                                        return;
 
-                               if($contact['network'] === 'stat') {
+                               if($contact['network'] === 'stat' || stristr($permalink,'twitter.com')) {
                                        if(strlen($datarray['title']))
                                                unset($datarray['title']);
                                        $datarray['last-child'] = 1;
index 3b80c1c04b2aa79dba370a03c7b3f13af68c5971..9362c28b31bb4997127cf1773a992069a2720148 100644 (file)
@@ -203,7 +203,7 @@ function poller_run($argv, $argc){
                                }
 
 
-                               $res = simplexml_load_string($xml);
+                               $res = parse_xml_string($xml);
        
                                if(intval($res->status) == 1) {
                                        logger("poller: $url replied status 1 - marking for death ");
index 1bf1ba9549ebe2c86d5080e6a7de03a7d1867898..2db745d25eceb89bc37148a2e4b4d60d8f8b3832 100644 (file)
@@ -240,7 +240,7 @@ function dfrn_confirm_post(&$a,$handsfree = null) {
                                notice( t('Unexpected response from remote site: ') . EOL . $leading_junk . EOL );
                        }
 
-                       $xml = simplexml_load_string($res);
+                       $xml = parse_xml_string($res);
                        $status = (int) $xml->status;
                        $message = unxmlify($xml->message);   // human readable text of what may have gone wrong.
                        switch($status) {
index 5149dc3b211afd6b62de6074bd92d3a841c1b1fe..2ccfadd03e0fc0198c0f924f4d84a01398ac89ff 100644 (file)
@@ -69,7 +69,7 @@ function dfrn_poll_init(&$a) {
 
                        if(strlen($s)) {
 
-                               $xml = simplexml_load_string($s);
+                               $xml = parse_xml_string($s);
 
                                if((int) $xml->status == 1) {
                                        $_SESSION['authenticated'] = 1;
@@ -468,7 +468,7 @@ function dfrn_poll_content(&$a) {
 
                        if(strlen($s) && strstr($s,'<?xml')) {
 
-                               $xml = simplexml_load_string($s);
+                               $xml = parse_xml_string($s);
 
                                logger('dfrn_poll: profile: parsed xml: ' . print_r($xml,true), LOGGER_DATA);
 
index 4ce3ccb82c0188440191fb0df60ee6d4b8dc64b0..06e81ceedd66b7aed2f7ac62d00fca875ff71842 100644 (file)
@@ -15,7 +15,8 @@ function follow_post(&$a) {
        $email_conversant = false;
 
        if($url) {
-               $links = @lrdd($url);
+               $links = lrdd($url);
+
                if(count($links)) {
                        foreach($links as $link) {
                                if($link['@attributes']['rel'] === NAMESPACE_DFRN)
@@ -107,7 +108,7 @@ function follow_post(&$a) {
        if((! isset($vcard)) && (! $poll)) {
 
                $ret = scrape_feed($url);
-
+               logger('mod_follow: scrape_feed returns: ' . print_r($ret,true), LOGGER_DATA);
                if(count($ret) && ($ret['feed_atom'] || $ret['feed_rss'])) {
                        $poll = ((x($ret,'feed_atom')) ? unamp($ret['feed_atom']) : unamp($ret['feed_rss']));
                        $vcard = array();
@@ -156,7 +157,14 @@ function follow_post(&$a) {
                        }
                        if((! $vcard['photo']) && strlen($email))
                                $vcard['photo'] = gravatar_img($email);
-                       
+                       if($poll === $profile)
+                               $lnk = $feed->get_permalink();
+                       if(isset($lnk) && strlen($lnk))
+                               $profile = $lnk;        
+                       if(! (x($vcard,'fn')))
+                               $vcard['fn'] = notags($feed->get_title());
+                       if(! (x($vcard,'fn')))
+                               $vcard['fn'] = notags($feed->get_description());
                        $network = 'feed';
                        $priority = 2;
                }