]> git.mxchange.org Git - friendica.git/blobdiff - include/Scrape.php
Scrape now doesn't fetch page data when it's twitter and Co./Update script that update...
[friendica.git] / include / Scrape.php
index 611cbda8860e05a48eb6237523001f0da90af354..7df86d9f2d7cd4fe3d85372a85e8f80b61873cf9 100644 (file)
@@ -14,7 +14,7 @@ function scrape_dfrn($url) {
 
        $s = fetch_url($url);
 
-       if(! $s) 
+       if(! $s)
                return $ret;
 
        $headers = $a->get_curl_headers();
@@ -23,7 +23,7 @@ function scrape_dfrn($url) {
 
        $lines = explode("\n",$headers);
        if(count($lines)) {
-               foreach($lines as $line) {                              
+               foreach($lines as $line) {
                        // don't try and run feeds through the html5 parser
                        if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml'))))
                                return ret;
@@ -120,7 +120,7 @@ function scrape_meta($url) {
 
        $s = fetch_url($url);
 
-       if(! $s) 
+       if(! $s)
                return $ret;
 
        $headers = $a->get_curl_headers();
@@ -128,7 +128,7 @@ function scrape_meta($url) {
 
        $lines = explode("\n",$headers);
        if(count($lines)) {
-               foreach($lines as $line) {                              
+               foreach($lines as $line) {
                        // don't try and run feeds through the html5 parser
                        if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml'))))
                                return ret;
@@ -169,13 +169,13 @@ function scrape_vcard($url) {
 
        $s = fetch_url($url);
 
-       if(! $s) 
+       if(! $s)
                return $ret;
 
        $headers = $a->get_curl_headers();
        $lines = explode("\n",$headers);
        if(count($lines)) {
-               foreach($lines as $line) {                              
+               foreach($lines as $line) {
                        // don't try and run feeds through the html5 parser
                        if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml'))))
                                return ret;
@@ -236,14 +236,14 @@ function scrape_feed($url) {
        logger('scrape_feed: returns: ' . $code . ' headers=' . $headers, LOGGER_DEBUG);
 
        if(! $s) {
-               logger('scrape_feed: no data returned for ' . $url); 
+               logger('scrape_feed: no data returned for ' . $url);
                return $ret;
        }
 
 
        $lines = explode("\n",$headers);
        if(count($lines)) {
-               foreach($lines as $line) {                              
+               foreach($lines as $line) {
                        if(stristr($line,'content-type:')) {
                                if(stristr($line,'application/atom+xml') || stristr($s,'<feed')) {
                                        $ret['feed_atom'] = $url;
@@ -299,7 +299,7 @@ function scrape_feed($url) {
                                if(! x($ret,'feed_rss'))
                                        $ret['feed_rss'] = $item->getAttribute('href');
                        }
-               }       
+               }
        }
 
        // Drupal and perhaps others only provide relative URL's. Turn them into absolute.
@@ -343,20 +343,44 @@ function probe_url($url, $mode = PROBE_NORMAL) {
        if(! $url)
                return $result;
 
+       $result = Cache::get("probe_url:".$mode.":".$url);
+       if (!is_null($result)) {
+               $result = unserialize($result);
+               return $result;
+       }
+
        $network = null;
        $diaspora = false;
        $diaspora_base = '';
-       $diaspora_guid = '';    
+       $diaspora_guid = '';
        $diaspora_key = '';
        $has_lrdd = false;
        $email_conversant = false;
+       $connectornetworks = false;
+       $appnet = false;
+
+       if (strpos($url,'twitter.com')) {
+               $connectornetworks = true;
+               $network = NETWORK_TWITTER;
+       }
+
+       if (strpos($url,'www.facebook.com')) {
+               $connectornetworks = true;
+               $network = NETWORK_FACEBOOK;
+       }
+
+       if (strpos($url,'alpha.app.net')) {
+               $appnet = true;
+               $network = NETWORK_APPNET;
+       }
 
-       $twitter = ((strpos($url,'twitter.com') !== false) ? true : false);
+       // Twitter is deactivated since twitter closed its old API
+       //$twitter = ((strpos($url,'twitter.com') !== false) ? true : false);
        $lastfm  = ((strpos($url,'last.fm/user') !== false) ? true : false);
 
        $at_addr = ((strpos($url,'@') !== false) ? true : false);
 
-       if((! $twitter) && (! $lastfm)) {
+       if((!$appnet) && (!$lastfm) && !$connectornetworks) {
 
                if(strpos($url,'mailto:') !== false && $at_addr) {
                        $url = str_replace('mailto:','',$url);
@@ -400,6 +424,9 @@ function probe_url($url, $mode = PROBE_NORMAL) {
                                                $pubkey = $diaspora_key;
                                        $diaspora = true;
                                }
+                               if($link['@attributes']['rel'] === 'http://ostatus.org/schema/1.0/subscribe') {
+                                       $diaspora = false;
+                               }
                        }
 
                        // Status.Net can have more than one profile URL. We need to match the profile URL
@@ -485,7 +512,7 @@ function probe_url($url, $mode = PROBE_NORMAL) {
                                }
                        }
                }
-       }       
+       }
 
        if($mode == PROBE_NORMAL) {
                if(strlen($zot)) {
@@ -531,7 +558,7 @@ function probe_url($url, $mode = PROBE_NORMAL) {
                }
                if(strpos($url,'@'))
                        $addr = str_replace('acct:', '', $url);
-       }                       
+       }
 
        if($network !== NETWORK_ZOT && $network !== NETWORK_DFRN && $network !== NETWORK_MAIL) {
                if($diaspora)
@@ -544,13 +571,13 @@ function probe_url($url, $mode = PROBE_NORMAL) {
                        $vcard = scrape_vcard($hcard);
 
                        // Google doesn't use absolute url in profile photos
-       
+
                        if((x($vcard,'photo')) && substr($vcard['photo'],0,1) == '/') {
                                $h = @parse_url($hcard);
                                if($h)
                                        $vcard['photo'] = $h['scheme'] . '://' . $h['host'] . $vcard['photo'];
                        }
-               
+
                        logger('probe_url: scrape_vcard: ' . print_r($vcard,true), LOGGER_DATA);
                }
 
@@ -561,7 +588,7 @@ function probe_url($url, $mode = PROBE_NORMAL) {
                        $vcard['nick'] = $addr_parts[0];
                }
 
-               if($twitter) {          
+               /* if($twitter) {
                        logger('twitter: setup');
                        $tid = basename($url);
                        $tapi = 'https://api.twitter.com/1/statuses/user_timeline.rss';
@@ -574,7 +601,7 @@ function probe_url($url, $mode = PROBE_NORMAL) {
                        $vcard['photo'] = 'https://api.twitter.com/1/users/profile_image?screen_name=' . $tid . '&size=bigger';
                        $vcard['nick'] = $tid;
                        $vcard['fn'] = $tid;
-               }
+               } */
 
                if($lastfm) {
                        $profile = $url;
@@ -596,27 +623,30 @@ function probe_url($url, $mode = PROBE_NORMAL) {
                        // Will leave it to others to figure out how to grab the avatar, which is on the $url page in the open graph meta links
                }
 
-               if($twitter || ! $poll)
+               if($appnet || ! $poll)
                        $check_feed = true;
                if((! isset($vcard)) || (! x($vcard,'fn')) || (! $profile))
                        $check_feed = true;
                if(($at_addr) && (! count($links)))
                        $check_feed = false;
 
+               if ($connectornetworks)
+                       $check_feed = false;
+
                if($check_feed) {
 
                        $feedret = scrape_feed(($poll) ? $poll : $url);
                        logger('probe_url: scrape_feed ' . (($poll)? $poll : $url) . ' returns: ' . print_r($feedret,true), LOGGER_DATA);
                        if(count($feedret) && ($feedret['feed_atom'] || $feedret['feed_rss'])) {
                                $poll = ((x($feedret,'feed_atom')) ? unamp($feedret['feed_atom']) : unamp($feedret['feed_rss']));
-                               if(! x($vcard)) 
+                               if(! x($vcard))
                                        $vcard = array();
                        }
 
                        if(x($feedret,'photo') && (! x($vcard,'photo')))
                                $vcard['photo'] = $feedret['photo'];
                        require_once('library/simplepie/simplepie.inc');
-                   $feed = new SimplePie();
+                       $feed = new SimplePie();
                        $xml = fetch_url($poll);
 
                        logger('probe_url: fetch feed: ' . $poll . ' returns: ' . $xml, LOGGER_DATA);
@@ -624,9 +654,10 @@ function probe_url($url, $mode = PROBE_NORMAL) {
 
                        logger('probe_url: scrape_feed: headers: ' . $a->get_curl_headers(), LOGGER_DATA);
 
-                       $feed->set_raw_data($xml);
+                       // Don't try and parse an empty string
+                       $feed->set_raw_data(($xml) ? $xml : '<?xml version="1.0" encoding="utf-8" ?><xml></xml>');
 
-                   $feed->init();
+                       $feed->init();
                        if($feed->error())
                                logger('probe_url: scrape_feed: Error parsing XML: ' . $feed->error());
 
@@ -635,7 +666,7 @@ function probe_url($url, $mode = PROBE_NORMAL) {
                                $vcard['photo'] = $feed->get_image_url();
                        $author = $feed->get_author();
 
-                       if($author) {                   
+                       if($author) {
                                $vcard['fn'] = unxmlify(trim($author->get_name()));
                                if(! $vcard['fn'])
                                        $vcard['fn'] = trim(unxmlify($author->get_email()));
@@ -646,18 +677,18 @@ function probe_url($url, $mode = PROBE_NORMAL) {
                                        $profile = trim(unxmlify($author->get_link()));
                                if(! $vcard['photo']) {
                                        $rawtags = $feed->get_feed_tags( SIMPLEPIE_NAMESPACE_ATOM_10, 'author');
-                               if($rawtags) {
+                                       if($rawtags) {
                                                $elems = $rawtags[0]['child'][SIMPLEPIE_NAMESPACE_ATOM_10];
                                                if((x($elems,'link')) && ($elems['link'][0]['attribs']['']['rel'] === 'photo'))
                                                        $vcard['photo'] = $elems['link'][0]['attribs']['']['href'];
-                               }
+                                       }
                                }
                        }
                        else {
                                $item = $feed->get_item(0);
                                if($item) {
                                        $author = $item->get_author();
-                                       if($author) {                   
+                                       if($author) {
                                                $vcard['fn'] = trim(unxmlify($author->get_name()));
                                                if(! $vcard['fn'])
                                                        $vcard['fn'] = trim(unxmlify($author->get_email()));
@@ -674,11 +705,11 @@ function probe_url($url, $mode = PROBE_NORMAL) {
                                        }
                                        if(! $vcard['photo']) {
                                                $rawtags = $item->get_item_tags( SIMPLEPIE_NAMESPACE_ATOM_10, 'author');
-                                       if($rawtags) {
+                                               if($rawtags) {
                                                        $elems = $rawtags[0]['child'][SIMPLEPIE_NAMESPACE_ATOM_10];
                                                        if((x($elems,'link')) && ($elems['link'][0]['attribs']['']['rel'] === 'photo'))
                                                                $vcard['photo'] = $elems['link'][0]['attribs']['']['href'];
-                                       }
+                                               }
                                        }
                                }
                        }
@@ -688,8 +719,13 @@ function probe_url($url, $mode = PROBE_NORMAL) {
                        if($poll === $profile)
                                $lnk = $feed->get_permalink();
                        if(isset($lnk) && strlen($lnk))
-                               $profile = $lnk;        
+                               $profile = $lnk;
 
+                       if(! $network) {
+                               $network = NETWORK_FEED;
+                               // If it is a feed, don't take the author name as feed name
+                               unset($vcard['fn']);
+                       }
                        if(! (x($vcard,'fn')))
                                $vcard['fn'] = notags($feed->get_title());
                        if(! (x($vcard,'fn')))
@@ -704,8 +740,6 @@ function probe_url($url, $mode = PROBE_NORMAL) {
                                if(strpos($vcard['nick'],' '))
                                        $vcard['nick'] = trim(substr($vcard['nick'],0,strpos($vcard['nick'],' ')));
                        }
-                       if(! $network)
-                               $network = NETWORK_FEED;
                        if(! $priority)
                                $priority = 2;
                }
@@ -713,7 +747,7 @@ function probe_url($url, $mode = PROBE_NORMAL) {
 
        if(! x($vcard,'photo')) {
                $a = get_app();
-               $vcard['photo'] = $a->get_baseurl() . '/images/person-175.jpg' ; 
+               $vcard['photo'] = $a->get_baseurl() . '/images/person-175.jpg' ;
        }
 
        if(! $profile)
@@ -726,7 +760,7 @@ function probe_url($url, $mode = PROBE_NORMAL) {
 
        $vcard['fn'] = notags($vcard['fn']);
        $vcard['nick'] = str_replace(' ','',notags($vcard['nick']));
-               
+
        $result['name'] = $vcard['fn'];
        $result['nick'] = $vcard['nick'];
        $result['url'] = $profile;
@@ -745,5 +779,16 @@ function probe_url($url, $mode = PROBE_NORMAL) {
 
        logger('probe_url: ' . print_r($result,true), LOGGER_DEBUG);
 
+       // Check whether it may be a diaspora account
+       if ($result['network'] == NETWORK_FEED) {
+               require_once('include/bbcode.php');
+               $address = GetProfileUsername($url, "", true);
+               $result2 = probe_url($address, $mode);
+               if ($result2['network'] != "")
+                       $result = $result2;
+       }
+
+       Cache::set("probe_url:".$mode.":".$url,serialize($result));
+
        return $result;
 }