]> git.mxchange.org Git - friendica.git/blobdiff - include/Scrape.php
quattro: in post display page, scroll to and flash selected comment
[friendica.git] / include / Scrape.php
index 9bf89a49eda8952e51d86dcf0450ffefb5f13e12..ca8f6e83ae3d9f3b0fa0e2508f76374465c6e722 100644 (file)
@@ -1,6 +1,7 @@
 <?php
 
 require_once('library/HTML5/Parser.php');
+require_once('include/crypto.php');
 
 if(! function_exists('scrape_dfrn')) {
 function scrape_dfrn($url) {
@@ -29,8 +30,11 @@ function scrape_dfrn($url) {
                }
        }
 
-
-       $dom = HTML5_Parser::parse($s);
+       try {
+               $dom = HTML5_Parser::parse($s);
+       } catch (DOMException $e) {
+               logger('scrape_dfrn: parse error: ' . $e);
+       }
 
        if(! $dom)
                return $ret;
@@ -43,8 +47,9 @@ function scrape_dfrn($url) {
                $x = $item->getAttribute('rel');
                if(($x === 'alternate') && ($item->getAttribute('type') === 'application/atom+xml'))
                        $ret['feed_atom'] = $item->getAttribute('href');
-               if(substr($x,0,5) == "dfrn-")
+               if(substr($x,0,5) == "dfrn-") {
                        $ret[$x] = $item->getAttribute('href');
+               }
                if($x === 'lrdd') {
                        $decoded = urldecode($item->getAttribute('href'));
                        if(preg_match('/acct:([^@]*)@/',$decoded,$matches))
@@ -54,17 +59,28 @@ function scrape_dfrn($url) {
 
        // Pull out hCard profile elements
 
+       $largest_photo = 0;
+
        $items = $dom->getElementsByTagName('*');
        foreach($items as $item) {
                if(attribute_contains($item->getAttribute('class'), 'vcard')) {
                        $level2 = $item->getElementsByTagName('*');
                        foreach($level2 as $x) {
-                               if(attribute_contains($x->getAttribute('class'),'fn'))
+                               if(attribute_contains($x->getAttribute('class'),'fn')) {
                                        $ret['fn'] = $x->textContent;
-                               if(attribute_contains($x->getAttribute('class'),'photo'))
-                                       $ret['photo'] = $x->getAttribute('src');
-                               if(attribute_contains($x->getAttribute('class'),'key'))
+                               }
+                               if((attribute_contains($x->getAttribute('class'),'photo'))
+                                       || (attribute_contains($x->getAttribute('class'),'avatar'))) {
+                                       $size = intval($x->getAttribute('width'));
+                                       // dfrn prefers a 175px-wide photo, so if we find one we set $largest_photo to 9999 so that no later photo can replace it.
+                                       if(($size > $largest_photo) || ($size == 175) || (! $largest_photo)) {
+                                               $ret['photo'] = $x->getAttribute('src');
+                                               $largest_photo = (($size == 175) ? 9999 : $size);
+                                       }
+                               }
+                               if(attribute_contains($x->getAttribute('class'),'key')) {
                                        $ret['key'] = $x->textContent;
+                               }
                        }
                }
        }
@@ -119,9 +135,11 @@ function scrape_meta($url) {
                }
        }
 
-
-
-       $dom = HTML5_Parser::parse($s);
+       try {
+               $dom = HTML5_Parser::parse($s);
+       } catch (DOMException $e) {
+               logger('scrape_meta: parse error: ' . $e);
+       }
 
        if(! $dom)
                return $ret;
@@ -164,13 +182,19 @@ function scrape_vcard($url) {
                }
        }
 
-       $dom = HTML5_Parser::parse($s);
+       try {
+               $dom = HTML5_Parser::parse($s);
+       } catch (DOMException $e) {
+               logger('scrape_vcard: parse error: ' . $e);
+       }
 
        if(! $dom)
                return $ret;
 
        // Pull out hCard profile elements
 
+       $largest_photo = 0;
+
        $items = $dom->getElementsByTagName('*');
        foreach($items as $item) {
                if(attribute_contains($item->getAttribute('class'), 'vcard')) {
@@ -179,11 +203,17 @@ function scrape_vcard($url) {
                                if(attribute_contains($x->getAttribute('class'),'fn'))
                                        $ret['fn'] = $x->textContent;
                                if((attribute_contains($x->getAttribute('class'),'photo'))
-                                       || (attribute_contains($x->getAttribute('class'),'avatar')))
-                                       $ret['photo'] = $x->getAttribute('src');
+                                       || (attribute_contains($x->getAttribute('class'),'avatar'))) {
+                                       $size = intval($x->getAttribute('width'));
+                                       if(($size > $largest_photo) || (! $largest_photo)) {
+                                               $ret['photo'] = $x->getAttribute('src');
+                                               $largest_photo = $size;
+                                       }
+                               }
                                if((attribute_contains($x->getAttribute('class'),'nickname'))
-                                       || (attribute_contains($x->getAttribute('class'),'uid')))
+                                       || (attribute_contains($x->getAttribute('class'),'uid'))) {
                                        $ret['nick'] = $x->textContent;
+                               }
                        }
                }
        }
@@ -200,11 +230,16 @@ function scrape_feed($url) {
        $ret = array();
        $s = fetch_url($url);
 
-       if(! $s) 
+       $headers = $a->get_curl_headers();
+       $code = $a->get_curl_code();
+
+       logger('scrape_feed: returns: ' . $code . ' headers=' . $headers, LOGGER_DEBUG);
+
+       if(! $s) {
+               logger('scrape_feed: no data returned for ' . $url); 
                return $ret;
+       }
 
-       $headers = $a->get_curl_headers();
-       logger('scrape_feed: headers=' . $headers, LOGGER_DEBUG);
 
        $lines = explode("\n",$headers);
        if(count($lines)) {
@@ -220,25 +255,22 @@ function scrape_feed($url) {
                                }
                        }
                }
+               // perhaps an RSS version 1 feed with a generic or incorrect content-type?
+               if(stristr($s,'</item>')) {
+                       $ret['feed_rss'] = $url;
+                       return $ret;
+               }
        }
 
-       $dom = HTML5_Parser::parse($s);
+       try {
+               $dom = HTML5_Parser::parse($s);
+       } catch (DOMException $e) {
+               logger('scrape_feed: parse error: ' . $e);
+       }
 
-       if(! $dom)
+       if(! $dom) {
+               logger('scrape_feed: failed to parse.');
                return $ret;
-
-
-       $items = $dom->getElementsByTagName('img');
-
-       // get img elements (twitter)
-
-       if($items) {
-               foreach($items as $item) {
-                       $x = $item->getAttribute('id');
-                       if($x === 'profile-image') {
-                               $ret['photo'] = $item->getAttribute('src');
-                       }
-               }
        }
 
 
@@ -250,7 +282,7 @@ function scrape_feed($url) {
                }
        }
        if(! $basename)
-               $basename = substr($url,0,strrpos($url,'/')) . '/';
+               $basename = implode('/', array_slice(explode('/',$url),0,3)) . '/';
 
        $items = $dom->getElementsByTagName('link');
 
@@ -281,7 +313,29 @@ function scrape_feed($url) {
 }}
 
 
-function probe_url($url) {
+/**
+ *
+ * Probe a network address to discover what kind of protocols we need to communicate with it.
+ *
+ * Warning: this function is a bit touchy and there are some subtle dependencies within the logic flow.
+ * Edit with care.
+ *
+ */
+
+/**
+ *
+ * PROBE_DIASPORA has a bias towards returning Diaspora information
+ * while PROBE_NORMAL has a bias towards dfrn/zot - in the case where
+ * an address (such as a Friendica address) supports more than one type
+ * of network. 
+ *
+ */
+
+
+define ( 'PROBE_NORMAL',   0);
+define ( 'PROBE_DIASPORA', 1);
+
+function probe_url($url, $mode = PROBE_NORMAL) {
        require_once('include/email.php');
 
        $result = array();
@@ -289,7 +343,12 @@ function probe_url($url) {
        if(! $url)
                return $result;
 
-       $diaspora = false;      
+       $network = null;
+       $diaspora = false;
+       $diaspora_base = '';
+       $diaspora_guid = '';    
+       $diaspora_key = '';
+       $has_lrdd = false;
        $email_conversant = false;
 
        $twitter = ((strpos($url,'twitter.com') !== false) ? true : false);
@@ -297,9 +356,17 @@ function probe_url($url) {
        $at_addr = ((strpos($url,'@') !== false) ? true : false);
 
        if(! $twitter) {
-               $links = lrdd($url);
+
+               if(strpos($url,'mailto:') !== false && $at_addr) {
+                       $url = str_replace('mailto:','',$url);
+                       $links = array();
+               }
+               else
+                       $links = lrdd($url);
 
                if(count($links)) {
+                       $has_lrdd = true;
+
                        logger('probe_url: found lrdd links: ' . print_r($links,true), LOGGER_DATA);
                        foreach($links as $link) {
                                if($link['@attributes']['rel'] === NAMESPACE_ZOT)
@@ -314,8 +381,21 @@ function probe_url($url) {
                                        $hcard = unamp($link['@attributes']['href']);
                                if($link['@attributes']['rel'] === 'http://webfinger.net/rel/profile-page')
                                        $profile = unamp($link['@attributes']['href']);
-                               if($link['@attributes']['rel'] === 'http://joindiaspora.com/seed_location')
+                               if($link['@attributes']['rel'] === 'http://portablecontacts.net/spec/1.0')
+                                       $poco = unamp($link['@attributes']['href']);
+                               if($link['@attributes']['rel'] === 'http://joindiaspora.com/seed_location') {
+                                       $diaspora_base = unamp($link['@attributes']['href']);
+                                       $diaspora = true;
+                               }
+                               if($link['@attributes']['rel'] === 'http://joindiaspora.com/guid') {
+                                       $diaspora_guid = unamp($link['@attributes']['href']);
                                        $diaspora = true;
+                               }
+                               if($link['@attributes']['rel'] === 'diaspora-public-key') {
+                                       $diaspora_key = base64_decode(unamp($link['@attributes']['href']));
+                                       $pubkey = rsatopem($diaspora_key);
+                                       $diaspora = true;
+                               }
                        }
 
                        // Status.Net can have more than one profile URL. We need to match the profile URL
@@ -338,7 +418,7 @@ function probe_url($url) {
                                }
                        }
                }
-               else {
+               elseif($mode == PROBE_NORMAL) {
 
                        // Check email
 
@@ -355,17 +435,21 @@ function probe_url($url) {
                                        $password = '';
                                        openssl_private_decrypt(hex2bin($r[0]['pass']),$password,$x[0]['prvkey']);
                                        $mbox = email_connect($mailbox,$r[0]['user'],$password);
+                                       if(! $mbox)
+                                               logger('probe_url: email_connect failed.');
                                        unset($password);
                                }
                                if($mbox) {
                                        $msgs = email_poll($mbox,$orig_url);
+                                       logger('probe_url: searching ' . $orig_url . ', ' . count($msgs) . ' messages found.', LOGGER_DEBUG);
                                        if(count($msgs)) {
                                                $addr = $orig_url;
                                                $network = NETWORK_MAIL;
                                                $name = substr($url,0,strpos($url,'@'));
-                                               $profile = 'http://' . substr($url,strpos($url,'@')+1);
+                                               $phost = substr($url,strpos($url,'@')+1);
+                                               $profile = 'http://' . $phost;
                                                // fix nick character range
-                                               $vcard = array('fn' => $name, 'nick' => $name, 'photo' => gravatar_img($url));
+                                               $vcard = array('fn' => $name, 'nick' => $name, 'photo' => avatar_img($url));
                                                $notify = 'smtp ' . random_string();
                                                $poll = 'email ' . random_string();
                                                $priority = 0;
@@ -374,8 +458,24 @@ function probe_url($url) {
                                                        $adr = imap_rfc822_parse_adrlist($x->from,'');
                                                elseif(stristr($x->to,$orig_url))
                                                        $adr = imap_rfc822_parse_adrlist($x->to,'');
-                                               if(isset($adr) && strlen($adr[0]->personal))
-                                                       $vcard['fn'] = notags($adr[0]->personal);
+                                               if(isset($adr)) {
+                                                       foreach($adr as $feadr) {
+                                                               if((strcasecmp($feadr->mailbox,$name) == 0)
+                                                                       &&(strcasecmp($feadr->host,$phost) == 0)
+                                                                       && (strlen($feadr->personal))) {
+
+                                                                       $personal = imap_mime_header_decode($feadr->personal);
+                                                                       $vcard['fn'] = "";
+                                                                       foreach($personal as $perspart)
+                                                                               if ($perspart->charset != "default")
+                                                                                       $vcard['fn'] .= iconv($perspart->charset, 'UTF-8//IGNORE', $perspart->text);
+                                                                               else
+                                                                                       $vcard['fn'] .= $perspart->text;
+
+                                                                       $vcard['fn'] = notags($vcard['fn']);
+                                                               }
+                                                       }
+                                               }
                                        }
                                        imap_close($mbox);
                                }
@@ -383,41 +483,60 @@ function probe_url($url) {
                }
        }       
 
-       if(strlen($zot)) {
-               $s = fetch_url($zot);
-               if($s) {
-                       $j = json_decode($s);
-                       if($j) {
-                               $network = NETWORK_ZOT;
-                               $vcard   = array(
-                                       'fn'    => $j->fullname, 
-                                       'nick'  => $j->nickname, 
-                                       'photo' => $j->photo
-                               );
-                               $profile  = $j->url;
-                               $notify   = $j->post;
-                               $pubkey   = $j->pubkey;
-                               $poll     = 'N/A';
+       if($mode == PROBE_NORMAL) {
+               if(strlen($zot)) {
+                       $s = fetch_url($zot);
+                       if($s) {
+                               $j = json_decode($s);
+                               if($j) {
+                                       $network = NETWORK_ZOT;
+                                       $vcard   = array(
+                                               'fn'    => $j->fullname, 
+                                               'nick'  => $j->nickname, 
+                                               'photo' => $j->photo
+                                       );
+                                       $profile  = $j->url;
+                                       $notify   = $j->post;
+                                       $pubkey   = $j->pubkey;
+                                       $poll     = 'N/A';
+                               }
                        }
                }
-       }
 
-       if(strlen($dfrn)) {
-               $ret = scrape_dfrn($dfrn);
-               if(is_array($ret) && x($ret,'dfrn-request')) {
-                       $network = NETWORK_DFRN;
-                       $request = $ret['dfrn-request'];
-                       $confirm = $ret['dfrn-confirm'];
-                       $notify  = $ret['dfrn-notify'];
-                       $poll    = $ret['dfrn-poll'];
+               if(strlen($dfrn)) {
+                       $ret = scrape_dfrn(($hcard) ? $hcard : $dfrn);
+                       if(is_array($ret) && x($ret,'dfrn-request')) {
+                               $network = NETWORK_DFRN;
+                               $request = $ret['dfrn-request'];
+                               $confirm = $ret['dfrn-confirm'];
+                               $notify  = $ret['dfrn-notify'];
+                               $poll    = $ret['dfrn-poll'];
+
+                               $vcard = array();
+                               $vcard['fn'] = $ret['fn'];
+                               $vcard['nick'] = $ret['nick'];
+                               $vcard['photo'] = $ret['photo'];
+                       }
                }
        }
 
+       if($diaspora && $diaspora_base && $diaspora_guid) {
+               if($mode == PROBE_DIASPORA || ! $notify) {
+                       $notify = $diaspora_base . 'receive/users/' . $diaspora_guid;
+                       $batch  = $diaspora_base . 'receive/public' ;
+               }
+               if(strpos($url,'@'))
+                       $addr = str_replace('acct:', '', $url);
+       }                       
+
        if($network !== NETWORK_ZOT && $network !== NETWORK_DFRN && $network !== NETWORK_MAIL) {
-               $network  = NETWORK_OSTATUS;
+               if($diaspora)
+                       $network = NETWORK_DIASPORA;
+               elseif($has_lrdd)
+                       $network  = NETWORK_OSTATUS;
                $priority = 0;
 
-               if($hcard) {
+               if($hcard && ! $vcard) {
                        $vcard = scrape_vcard($hcard);
 
                        // Google doesn't use absolute url in profile photos
@@ -431,13 +550,6 @@ function probe_url($url) {
                        logger('probe_url: scrape_vcard: ' . print_r($vcard,true), LOGGER_DATA);
                }
 
-               if(! $profile) {
-                       if($diaspora)
-                               $profile = $hcard;
-                       else
-                               $profile = $url;
-               }
-
                if($twitter) {          
                        logger('twitter: setup');
                        $tid = basename($url);
@@ -447,23 +559,35 @@ function probe_url($url) {
                        else
                                $poll = $tapi . '?screen_name=' . $tid;
                        $profile = 'http://twitter.com/#!/' . $tid;
+                       $vcard['photo'] = 'https://api.twitter.com/1/users/profile_image/' . $tid;
+                       $vcard['nick'] = $tid;
+                       $vcard['fn'] = $tid . '@twitter';
                }
 
                if(! x($vcard,'fn'))
                        if(x($vcard,'nick'))
                                $vcard['fn'] = $vcard['nick'];
 
-       
-               if(((! isset($vcard)) && (! $poll) && (! $at_addr)) || ($twitter)) {
+               $check_feed = false;
+
+               if($twitter || ! $poll)
+                       $check_feed = true;
+               if((! isset($vcard)) || (! x($vcard,'fn')) || (! $profile))
+                       $check_feed = true;
+               if(($at_addr) && (! count($links)))
+                       $check_feed = false;
+
+               if($check_feed) {
 
-                       $feedret = scrape_feed($url);
-                       logger('probe_url: scrape_feed returns: ' . print_r($feedret,true), LOGGER_DATA);
+                       $feedret = scrape_feed(($poll) ? $poll : $url);
+                       logger('probe_url: scrape_feed ' . (($poll)? $poll : $url) . ' returns: ' . print_r($feedret,true), LOGGER_DATA);
                        if(count($feedret) && ($feedret['feed_atom'] || $feedret['feed_rss'])) {
                                $poll = ((x($feedret,'feed_atom')) ? unamp($feedret['feed_atom']) : unamp($feedret['feed_rss']));
-                               $vcard = array();
+                               if(! x($vcard)) 
+                                       $vcard = array();
                        }
 
-                       if(x($feedret,'photo'))
+                       if(x($feedret,'photo') && (! x($vcard,'photo')))
                                $vcard['photo'] = $feedret['photo'];
                        require_once('library/simplepie/simplepie.inc');
                    $feed = new SimplePie();
@@ -472,7 +596,7 @@ function probe_url($url) {
                        logger('probe_url: fetch feed: ' . $poll . ' returns: ' . $xml, LOGGER_DATA);
                        $a = get_app();
 
-                       logger('probe_url: scrape_feed: headers: ' . $a->get_curl_headers(), $LOGGER_DATA);
+                       logger('probe_url: scrape_feed: headers: ' . $a->get_curl_headers(), LOGGER_DATA);
 
                        $feed->set_raw_data($xml);
 
@@ -480,9 +604,11 @@ function probe_url($url) {
                        if($feed->error())
                                logger('probe_url: scrape_feed: Error parsing XML: ' . $feed->error());
 
+
                        if(! x($vcard,'photo'))
                                $vcard['photo'] = $feed->get_image_url();
                        $author = $feed->get_author();
+
                        if($author) {                   
                                $vcard['fn'] = unxmlify(trim($author->get_name()));
                                if(! $vcard['fn'])
@@ -490,6 +616,8 @@ function probe_url($url) {
                                if(strpos($vcard['fn'],'@') !== false)
                                        $vcard['fn'] = substr($vcard['fn'],0,strpos($vcard['fn'],'@'));
                                $email = unxmlify($author->get_email());
+                               if(! $profile && $author->get_link())
+                                       $profile = trim(unxmlify($author->get_link()));
                                if(! $vcard['photo']) {
                                        $rawtags = $feed->get_feed_tags( SIMPLEPIE_NAMESPACE_ATOM_10, 'author');
                                if($rawtags) {
@@ -510,6 +638,8 @@ function probe_url($url) {
                                                if(strpos($vcard['fn'],'@') !== false)
                                                        $vcard['fn'] = substr($vcard['fn'],0,strpos($vcard['fn'],'@'));
                                                $email = unxmlify($author->get_email());
+                                               if(! $profile && $author->get_link())
+                                                       $profile = trim(unxmlify($author->get_link()));
                                        }
                                        if(! $vcard['photo']) {
                                                $rawmedia = $item->get_item_tags('http://search.yahoo.com/mrss/','thumbnail');
@@ -526,8 +656,9 @@ function probe_url($url) {
                                        }
                                }
                        }
+
                        if((! $vcard['photo']) && strlen($email))
-                               $vcard['photo'] = gravatar_img($email);
+                               $vcard['photo'] = avatar_img($email);
                        if($poll === $profile)
                                $lnk = $feed->get_permalink();
                        if(isset($lnk) && strlen($lnk))
@@ -547,27 +678,39 @@ function probe_url($url) {
                                if(strpos($vcard['nick'],' '))
                                        $vcard['nick'] = trim(substr($vcard['nick'],0,strpos($vcard['nick'],' ')));
                        }
-                       $network = 'feed';
-                       $priority = 2;
+                       if(! $network)
+                               $network = NETWORK_FEED;
+                       if(! $priority)
+                               $priority = 2;
                }
        }
 
        if(! x($vcard,'photo')) {
                $a = get_app();
-               $vcard['photo'] = $a->get_baseurl() . '/images/default-profile.jpg' ; 
+               $vcard['photo'] = $a->get_baseurl() . '/images/person-175.jpg' ; 
        }
-       $vcard['fn'] = notags($vcard['fn']);
-       $vcard['nick'] = notags($vcard['nick']);
 
+       if(! $profile)
+               $profile = $url;
 
+       // No human could be associated with this link, use the URL as the contact name
+
+       if(($network === NETWORK_FEED) && ($poll) && (! x($vcard,'fn')))
+               $vcard['fn'] = $url;
+
+       $vcard['fn'] = notags($vcard['fn']);
+       $vcard['nick'] = str_replace(' ','',notags($vcard['nick']));
+               
        $result['name'] = $vcard['fn'];
        $result['nick'] = $vcard['nick'];
        $result['url'] = $profile;
        $result['addr'] = $addr;
+       $result['batch'] = $batch;
        $result['notify'] = $notify;
        $result['poll'] = $poll;
        $result['request'] = $request;
        $result['confirm'] = $confirm;
+       $result['poco'] = $poco;
        $result['photo'] = $vcard['photo'];
        $result['priority'] = $priority;
        $result['network'] = $network;