]> git.mxchange.org Git - friendica.git/blob - include/Scrape.php
dbd98c9ecc134dc9e05728ebc8543ca2b7ca6aa8
[friendica.git] / include / Scrape.php
1 <?php
2
3 require_once('library/HTML5/Parser.php');
4 require_once('include/crypto.php');
5
/**
 * Fetch a profile page and collect the DFRN endpoints and hCard fields it advertises.
 *
 * The page is fetched with fetch_url(). If the response headers announce an
 * Atom or RSS content type the body is not parsed and an empty array is returned.
 *
 * Keys that may be present in the result:
 *   - 'feed_atom'  href of a <link rel="alternate" type="application/atom+xml">
 *   - 'dfrn-*'     href of every <link> whose rel starts with "dfrn-", keyed by that rel
 *   - 'nick'       the local part of an acct: URI found in a <link rel="lrdd"> href
 *   - 'fn', 'photo', 'key'  values taken from elements nested inside a "vcard" element
 *
 * @param string $url page to scrape
 * @return array associative array as described above; empty on fetch/parse failure
 */
if(! function_exists('scrape_dfrn')) {
function scrape_dfrn($url) {

	$a = get_app();

	$ret = array();

	logger('scrape_dfrn: url=' . $url);

	$s = fetch_url($url);

	if(! $s)
		return $ret;

	$headers = $a->get_curl_headers();
	logger('scrape_dfrn: headers=' . $headers, LOGGER_DEBUG);


	$lines = explode("\n",$headers);
	if(count($lines)) {
		foreach($lines as $line) {
			// don't try and run feeds through the html5 parser
			if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml'))))
				return $ret;
		}
	}


	$dom = HTML5_Parser::parse($s);

	if(! $dom)
		return $ret;

	$items = $dom->getElementsByTagName('link');

	// get DFRN link elements

	foreach($items as $item) {
		$x = $item->getAttribute('rel');
		if(($x === 'alternate') && ($item->getAttribute('type') === 'application/atom+xml'))
			$ret['feed_atom'] = $item->getAttribute('href');
		if(substr($x,0,5) == "dfrn-")
			$ret[$x] = $item->getAttribute('href');
		if($x === 'lrdd') {
			// the href is URL-encoded; the nickname is whatever sits between "acct:" and "@"
			$decoded = urldecode($item->getAttribute('href'));
			if(preg_match('/acct:([^@]*)@/',$decoded,$matches))
				$ret['nick'] = $matches[1];
		}
	}

	// Pull out hCard profile elements
	// NOTE(review): attribute_contains() is defined elsewhere; presumably it tests
	// whether the class attribute contains the given token - confirm.

	$items = $dom->getElementsByTagName('*');
	foreach($items as $item) {
		if(attribute_contains($item->getAttribute('class'), 'vcard')) {
			$level2 = $item->getElementsByTagName('*');
			foreach($level2 as $x) {
				if(attribute_contains($x->getAttribute('class'),'fn'))
					$ret['fn'] = $x->textContent;
				if(attribute_contains($x->getAttribute('class'),'photo'))
					$ret['photo'] = $x->getAttribute('src');
				if(attribute_contains($x->getAttribute('class'),'key'))
					$ret['key'] = $x->textContent;
			}
		}
	}

	return $ret;
}}
75
76
77
78
79
80
/**
 * Count how many of the fields required for a DFRN contact are missing.
 *
 * A field counts as missing when x() reports it as absent/empty in $a.
 *
 * @param array $a scraped profile data (e.g. the result of scrape_dfrn())
 * @return int number of missing fields; 0 means every required field is present
 */
if(! function_exists('validate_dfrn')) {
function validate_dfrn($a) {
	$required = array('key', 'dfrn-request', 'dfrn-confirm', 'dfrn-notify', 'dfrn-poll');

	$missing = 0;
	foreach($required as $field) {
		if(x($a,$field))
			continue;
		$missing ++;
	}

	return $missing;
}}
96
/**
 * Fetch a page and collect the DFRN values it advertises through <meta> elements.
 *
 * Every <meta> whose name starts with "dfrn-" contributes one entry to the
 * result, keyed by that name and holding the element's content attribute.
 * If the response headers announce an Atom or RSS content type the body is
 * not parsed and an empty array is returned.
 *
 * @param string $url page to scrape
 * @return array associative array of "dfrn-*" => content; empty on fetch/parse failure
 */
if(! function_exists('scrape_meta')) {
function scrape_meta($url) {

	$a = get_app();

	$ret = array();

	logger('scrape_meta: url=' . $url);

	$s = fetch_url($url);

	if(! $s)
		return $ret;

	$headers = $a->get_curl_headers();
	logger('scrape_meta: headers=' . $headers, LOGGER_DEBUG);

	$lines = explode("\n",$headers);
	if(count($lines)) {
		foreach($lines as $line) {
			// don't try and run feeds through the html5 parser
			if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml'))))
				return $ret;
		}
	}



	$dom = HTML5_Parser::parse($s);

	if(! $dom)
		return $ret;

	$items = $dom->getElementsByTagName('meta');

	// get DFRN meta elements

	foreach($items as $item) {
		$x = $item->getAttribute('name');
		if(substr($x,0,5) == "dfrn-")
			$ret[$x] = $item->getAttribute('content');
	}

	return $ret;
}}
142
143
/**
 * Fetch a page and extract hCard profile fields from it.
 *
 * Looks at every element nested inside an element whose class matches
 * "vcard" and records:
 *   - 'fn'    text of an element with class "fn"
 *   - 'photo' src of the "photo"/"avatar" element with the largest width attribute
 *   - 'nick'  text of an element with class "nickname" or "uid"
 *
 * If the response headers announce an Atom or RSS content type the body is
 * not parsed and an empty array is returned.
 *
 * @param string $url page to scrape
 * @return array associative array as described above; empty on fetch/parse failure
 */
if(! function_exists('scrape_vcard')) {
function scrape_vcard($url) {

	$a = get_app();

	$ret = array();

	logger('scrape_vcard: url=' . $url);

	$s = fetch_url($url);

	if(! $s)
		return $ret;

	$headers = $a->get_curl_headers();
	$lines = explode("\n",$headers);
	if(count($lines)) {
		foreach($lines as $line) {
			// don't try and run feeds through the html5 parser
			if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml'))))
				return $ret;
		}
	}

	$dom = HTML5_Parser::parse($s);

	if(! $dom)
		return $ret;

	// Pull out hCard profile elements
	// NOTE(review): attribute_contains() is defined elsewhere; presumably it tests
	// whether the class attribute contains the given token - confirm.

	$largest_photo = 0;

	$items = $dom->getElementsByTagName('*');
	foreach($items as $item) {
		if(attribute_contains($item->getAttribute('class'), 'vcard')) {
			$level2 = $item->getElementsByTagName('*');
			foreach($level2 as $x) {
				if(attribute_contains($x->getAttribute('class'),'fn'))
					$ret['fn'] = $x->textContent;
				if((attribute_contains($x->getAttribute('class'),'photo'))
					|| (attribute_contains($x->getAttribute('class'),'avatar'))) {
					// Keep the widest image seen so far. While no width has been
					// recorded yet (still 0) every candidate replaces the previous one.
					$size = intval($x->getAttribute('width'));
					if(($size > $largest_photo) || (! $largest_photo)) {
						$ret['photo'] = $x->getAttribute('src');
						$largest_photo = $size;
					}
				}
				if((attribute_contains($x->getAttribute('class'),'nickname'))
					|| (attribute_contains($x->getAttribute('class'),'uid')))
					$ret['nick'] = $x->textContent;
			}
		}
	}

	return $ret;
}}
201
202
/**
 * Work out which Atom/RSS feed belongs to a URL.
 *
 * If the response itself looks like a feed (judged from its Content-Type
 * header line or from "<feed"/"<rss" appearing in the body) the URL is returned
 * as 'feed_atom' or 'feed_rss'. Otherwise the document is parsed as HTML and
 * the first <link rel="alternate"> of each feed type is used. Relative feed
 * links are prefixed with the <base href> of the page or, when there is none,
 * with the URL up to and including its last slash.
 *
 * Keys that may be present in the result:
 *   - 'feed_atom' Atom feed URL
 *   - 'feed_rss'  RSS feed URL
 *   - 'photo'     src of an <img id="profile-image">
 *
 * @param string $url page or feed to inspect
 * @return array associative array as described above; empty on fetch/parse failure
 */
if(! function_exists('scrape_feed')) {
function scrape_feed($url) {

	$a = get_app();

	$ret = array();
	$s = fetch_url($url);

	if(! $s)
		return $ret;

	$headers = $a->get_curl_headers();
	logger('scrape_feed: headers=' . $headers, LOGGER_DEBUG);

	$lines = explode("\n",$headers);
	if(count($lines)) {
		foreach($lines as $line) {
			// the body sniffing below only happens once a content-type header line is seen
			if(stristr($line,'content-type:')) {
				if(stristr($line,'application/atom+xml') || stristr($s,'<feed')) {
					$ret['feed_atom'] = $url;
					return $ret;
				}
				if(stristr($line,'application/rss+xml') || stristr($s,'<rss')) {
					$ret['feed_rss'] = $url;
					return $ret;
				}
			}
		}
	}

	$dom = HTML5_Parser::parse($s);

	if(! $dom)
		return $ret;


	$items = $dom->getElementsByTagName('img');

	// get img elements (twitter)

	if($items) {
		foreach($items as $item) {
			$x = $item->getAttribute('id');
			if($x === 'profile-image') {
				$ret['photo'] = $item->getAttribute('src');
			}
		}
	}


	// Start from an empty base so the fallback below is well defined when the
	// document has no <base> element at all.
	$basename = '';

	$head = $dom->getElementsByTagName('base');
	if($head) {
		foreach($head as $head0) {
			// only the first <base> element counts
			$basename = $head0->getAttribute('href');
			break;
		}
	}
	if(! $basename)
		$basename = substr($url,0,strrpos($url,'/')) . '/';

	$items = $dom->getElementsByTagName('link');

	// get Atom/RSS link elements, take the first one of either.

	if($items) {
		foreach($items as $item) {
			$x = $item->getAttribute('rel');
			if(($x === 'alternate') && ($item->getAttribute('type') === 'application/atom+xml')) {
				if(! x($ret,'feed_atom'))
					$ret['feed_atom'] = $item->getAttribute('href');
			}
			if(($x === 'alternate') && ($item->getAttribute('type') === 'application/rss+xml')) {
				if(! x($ret,'feed_rss'))
					$ret['feed_rss'] = $item->getAttribute('href');
			}
		}
	}

	// Drupal and perhaps others only provide relative URLs. Turn them into absolute.

	if(x($ret,'feed_atom') && (! strstr($ret['feed_atom'],'://')))
		$ret['feed_atom'] = $basename . $ret['feed_atom'];
	if(x($ret,'feed_rss') && (! strstr($ret['feed_rss'],'://')))
		$ret['feed_rss'] = $basename . $ret['feed_rss'];

	return $ret;
}}
290
291
/**
 * Probe a URL or address and work out how to talk to the contact behind it.
 *
 * Unless the URL contains "twitter.com", discovery starts with lrdd() links.
 * A "mailto:" address skips lrdd(), and when no links are found the address is
 * tried as an email contact through the local user's mail account.
 * Depending on what was found the contact is classified as Zot, DFRN, mail,
 * Diaspora or OStatus; when profile or feed data is still missing the feed
 * itself is fetched and parsed with SimplePie to fill in the gaps.
 *
 * Keys of the returned array:
 *   'name', 'nick', 'url', 'addr', 'notify', 'poll', 'request', 'confirm',
 *   'photo', 'priority', 'network', 'alias', 'pubkey'.
 * Values that could not be determined are null, except 'photo' (falls back to
 * the site default profile image) and 'url' (falls back to $url).
 *
 * @param string $url profile URL, webfinger address or mailto: address
 * @return array contact description as listed above; empty array when $url is empty
 */
function probe_url($url) {
	require_once('include/email.php');

	$result = array();

	if(! $url)
		return $result;

	// Everything below is filled in conditionally. Give every local a defined
	// starting value so no code path reads an unassigned variable.
	// Values that end up in $result (or are tested with isset()) start as null,
	// which keeps both the isset() checks and the "unknown" result values intact.
	$vcard    = null;
	$profile  = null;
	$alias    = null;
	$addr     = null;
	$notify   = null;
	$poll     = null;
	$request  = null;
	$confirm  = null;
	$pubkey   = null;
	$network  = null;
	$priority = null;
	$hcard    = null;
	$mbox     = null;

	// These are only ever measured with strlen(), so they start as strings.
	$zot   = '';
	$dfrn  = '';
	$email = '';

	// Stays empty on the twitter path, where lrdd() is never consulted.
	$links = array();

	$diaspora = false;
	$diaspora_base = '';
	$diaspora_guid = '';
	$diaspora_key = '';

	$twitter = ((strpos($url,'twitter.com') !== false) ? true : false);

	$at_addr = ((strpos($url,'@') !== false) ? true : false);

	if(! $twitter) {

		if(strpos($url,'mailto:') !== false && $at_addr) {
			$url = str_replace('mailto:','',$url);
			$links = array();
		}
		else
			$links = lrdd($url);

		if(count($links)) {
			logger('probe_url: found lrdd links: ' . print_r($links,true), LOGGER_DATA);
			foreach($links as $link) {
				if($link['@attributes']['rel'] === NAMESPACE_ZOT)
					$zot = unamp($link['@attributes']['href']);
				if($link['@attributes']['rel'] === NAMESPACE_DFRN)
					$dfrn = unamp($link['@attributes']['href']);
				if($link['@attributes']['rel'] === 'salmon')
					$notify = unamp($link['@attributes']['href']);
				if($link['@attributes']['rel'] === NAMESPACE_FEED)
					$poll = unamp($link['@attributes']['href']);
				if($link['@attributes']['rel'] === 'http://microformats.org/profile/hcard')
					$hcard = unamp($link['@attributes']['href']);
				if($link['@attributes']['rel'] === 'http://webfinger.net/rel/profile-page')
					$profile = unamp($link['@attributes']['href']);
				if($link['@attributes']['rel'] === 'http://joindiaspora.com/seed_location') {
					$diaspora_base = unamp($link['@attributes']['href']);
					$diaspora = true;
				}
				if($link['@attributes']['rel'] === 'http://joindiaspora.com/guid') {
					$diaspora_guid = unamp($link['@attributes']['href']);
					$diaspora = true;
				}
				if($link['@attributes']['rel'] === 'diaspora-public-key') {
					$diaspora_key = base64_decode(unamp($link['@attributes']['href']));
					$pubkey = rsatopem($diaspora_key);
					$diaspora = true;
				}
			}

			// Status.Net can have more than one profile URL. We need to match the profile URL
			// to a contact on incoming messages to prevent spam, and we won't know which one
			// to match. So in case of two, one of them is stored as an alias. Only store URLs
			// and not webfinger user@host aliases. If they've got more than two non-email style
			// aliases, let's hope we're lucky and get one that matches the feed author-uri because
			// otherwise we're screwed.

			foreach($links as $link) {
				if($link['@attributes']['rel'] === 'alias') {
					if(strpos($link['@attributes']['href'],'@') === false) {
						if(isset($profile)) {
							if($link['@attributes']['href'] !== $profile)
								$alias = unamp($link['@attributes']['href']);
						}
						else
							$profile = unamp($link['@attributes']['href']);
					}
				}
			}
		}
		else {

			// Check email

			$orig_url = $url;
			if((strpos($orig_url,'@')) && validate_email($orig_url)) {
				$x = q("SELECT `prvkey` FROM `user` WHERE `uid` = %d LIMIT 1",
					intval(local_user())
				);
				$r = q("SELECT * FROM `mailacct` WHERE `uid` = %d AND `server` != '' LIMIT 1",
					intval(local_user())
				);
				if(count($x) && count($r)) {
					$mailbox = construct_mailbox_name($r[0]);
					$password = '';
					// the stored mail password is encrypted with the user's key pair
					openssl_private_decrypt(hex2bin($r[0]['pass']),$password,$x[0]['prvkey']);
					$mbox = email_connect($mailbox,$r[0]['user'],$password);
					unset($password);
				}
				if($mbox) {
					$msgs = email_poll($mbox,$orig_url);
					if(count($msgs)) {
						$addr = $orig_url;
						$network = NETWORK_MAIL;
						$name = substr($url,0,strpos($url,'@'));
						$profile = 'http://' . substr($url,strpos($url,'@')+1);
						// fix nick character range
						$vcard = array('fn' => $name, 'nick' => $name, 'photo' => gravatar_img($url));
						$notify = 'smtp ' . random_string();
						$poll = 'email ' . random_string();
						$priority = 0;
						// prefer the display name used in existing correspondence
						$x = email_msg_meta($mbox,$msgs[0]);
						if(stristr($x->from,$orig_url))
							$adr = imap_rfc822_parse_adrlist($x->from,'');
						elseif(stristr($x->to,$orig_url))
							$adr = imap_rfc822_parse_adrlist($x->to,'');
						if(isset($adr) && strlen($adr[0]->personal))
							$vcard['fn'] = notags($adr[0]->personal);
					}
					imap_close($mbox);
				}
			}
		}
	}

	if(strlen($zot)) {
		$s = fetch_url($zot);
		if($s) {
			$j = json_decode($s);
			if($j) {
				$network = NETWORK_ZOT;
				$vcard   = array(
					'fn'    => $j->fullname,
					'nick'  => $j->nickname,
					'photo' => $j->photo
				);
				$profile  = $j->url;
				$notify   = $j->post;
				$pubkey   = $j->pubkey;
				$poll     = 'N/A';
			}
		}
	}

	if(strlen($dfrn)) {
		$ret = scrape_dfrn($dfrn);
		if(is_array($ret) && x($ret,'dfrn-request')) {
			$network = NETWORK_DFRN;
			$request = $ret['dfrn-request'];
			$confirm = $ret['dfrn-confirm'];
			$notify  = $ret['dfrn-notify'];
			$poll    = $ret['dfrn-poll'];
		}
	}

	if($diaspora && $diaspora_base && $diaspora_guid) {
		$notify = $diaspora_base . 'receive/post/' . $diaspora_guid;
		if(strpos($url,'@'))
			$addr = str_replace('acct:', '', $url);
	}

	if($network !== NETWORK_ZOT && $network !== NETWORK_DFRN && $network !== NETWORK_MAIL) {
		if($diaspora)
			$network = NETWORK_DIASPORA;
		else
			$network  = NETWORK_OSTATUS;
		$priority = 0;

		if($hcard) {
			$vcard = scrape_vcard($hcard);

			// Google doesn't use absolute url in profile photos

			if((x($vcard,'photo')) && substr($vcard['photo'],0,1) == '/') {
				$h = @parse_url($hcard);
				if($h)
					$vcard['photo'] = $h['scheme'] . '://' . $h['host'] . $vcard['photo'];
			}

			logger('probe_url: scrape_vcard: ' . print_r($vcard,true), LOGGER_DATA);
		}

		if($twitter) {
			logger('twitter: setup');
			$tid = basename($url);
			$tapi = 'https://api.twitter.com/1/statuses/user_timeline.rss';
			// a numeric last path segment is treated as a user id, anything else as a screen name
			if(intval($tid))
				$poll = $tapi . '?user_id=' . $tid;
			else
				$poll = $tapi . '?screen_name=' . $tid;
			$profile = 'http://twitter.com/#!/' . $tid;
		}

		if((! x($vcard,'fn')) && x($vcard,'nick'))
			$vcard['fn'] = $vcard['nick'];

		$check_feed = false;

		if($twitter || ! $poll)
			$check_feed = true;
		if((! isset($vcard)) || (! $profile))
			$check_feed = true;
		// a user@host address that produced no lrdd links is not worth fetching as a feed
		if(($at_addr) && (! count($links)))
			$check_feed = false;

		if($check_feed) {

			$feedret = scrape_feed(($poll) ? $poll : $url);
			logger('probe_url: scrape_feed returns: ' . print_r($feedret,true), LOGGER_DATA);
			if(count($feedret) && (x($feedret,'feed_atom') || x($feedret,'feed_rss'))) {
				$poll = ((x($feedret,'feed_atom')) ? unamp($feedret['feed_atom']) : unamp($feedret['feed_rss']));
				if(! x($vcard))
					$vcard = array();
			}

			if(x($feedret,'photo') && (! x($vcard,'photo')))
				$vcard['photo'] = $feedret['photo'];
			require_once('library/simplepie/simplepie.inc');
			$feed = new SimplePie();
			$xml = fetch_url($poll);

			logger('probe_url: fetch feed: ' . $poll . ' returns: ' . $xml, LOGGER_DATA);
			$a = get_app();

			logger('probe_url: scrape_feed: headers: ' . $a->get_curl_headers(), LOGGER_DATA);

			$feed->set_raw_data($xml);

			$feed->init();
			if($feed->error())
				logger('probe_url: scrape_feed: Error parsing XML: ' . $feed->error());


			if(! x($vcard,'photo'))
				$vcard['photo'] = $feed->get_image_url();
			$author = $feed->get_author();

			if($author) {
				$vcard['fn'] = unxmlify(trim($author->get_name()));
				if(! $vcard['fn'])
					$vcard['fn'] = trim(unxmlify($author->get_email()));
				if(strpos($vcard['fn'],'@') !== false)
					$vcard['fn'] = substr($vcard['fn'],0,strpos($vcard['fn'],'@'));
				$email = unxmlify($author->get_email());
				if(! $profile && $author->get_link())
					$profile = trim(unxmlify($author->get_link()));
				if(! $vcard['photo']) {
					$rawtags = $feed->get_feed_tags( SIMPLEPIE_NAMESPACE_ATOM_10, 'author');
					if($rawtags) {
						$elems = $rawtags[0]['child'][SIMPLEPIE_NAMESPACE_ATOM_10];
						if((x($elems,'link')) && ($elems['link'][0]['attribs']['']['rel'] === 'photo'))
							$vcard['photo'] = $elems['link'][0]['attribs']['']['href'];
					}
				}
			}
			else {
				// no feed-level author: fall back to the author of the first item
				$item = $feed->get_item(0);
				if($item) {
					$author = $item->get_author();
					if($author) {
						$vcard['fn'] = trim(unxmlify($author->get_name()));
						if(! $vcard['fn'])
							$vcard['fn'] = trim(unxmlify($author->get_email()));
						if(strpos($vcard['fn'],'@') !== false)
							$vcard['fn'] = substr($vcard['fn'],0,strpos($vcard['fn'],'@'));
						$email = unxmlify($author->get_email());
						if(! $profile && $author->get_link())
							$profile = trim(unxmlify($author->get_link()));
					}
					if(! $vcard['photo']) {
						$rawmedia = $item->get_item_tags('http://search.yahoo.com/mrss/','thumbnail');
						if($rawmedia && $rawmedia[0]['attribs']['']['url'])
							$vcard['photo'] = unxmlify($rawmedia[0]['attribs']['']['url']);
					}
					if(! $vcard['photo']) {
						$rawtags = $item->get_item_tags( SIMPLEPIE_NAMESPACE_ATOM_10, 'author');
						if($rawtags) {
							$elems = $rawtags[0]['child'][SIMPLEPIE_NAMESPACE_ATOM_10];
							if((x($elems,'link')) && ($elems['link'][0]['attribs']['']['rel'] === 'photo'))
								$vcard['photo'] = $elems['link'][0]['attribs']['']['href'];
						}
					}
				}
			}

			if((! $vcard['photo']) && strlen($email))
				$vcard['photo'] = gravatar_img($email);
			// when the profile URL is the feed itself, prefer the feed's permalink
			if($poll === $profile)
				$lnk = $feed->get_permalink();
			if(isset($lnk) && strlen($lnk))
				$profile = $lnk;

			if(! (x($vcard,'fn')))
				$vcard['fn'] = notags($feed->get_title());
			if(! (x($vcard,'fn')))
				$vcard['fn'] = notags($feed->get_description());

			if(strpos($vcard['fn'],'Twitter / ') !== false) {
				$vcard['fn'] = substr($vcard['fn'],strpos($vcard['fn'],'/')+1);
				$vcard['fn'] = trim($vcard['fn']);
			}
			if(! x($vcard,'nick')) {
				// derive a nickname from the first word of the full name
				$vcard['nick'] = strtolower(notags(unxmlify($vcard['fn'])));
				if(strpos($vcard['nick'],' '))
					$vcard['nick'] = trim(substr($vcard['nick'],0,strpos($vcard['nick'],' ')));
			}
			if(! $network)
				$network = 'feed';
			if(! $priority)
				$priority = 2;
		}
	}

	if(! x($vcard,'photo')) {
		$a = get_app();
		$vcard['photo'] = $a->get_baseurl() . '/images/default-profile.jpg' ;
	}

	if(! $profile)
		$profile = $url;

	$vcard['fn'] = notags($vcard['fn']);
	$vcard['nick'] = str_replace(' ','',notags($vcard['nick']));


	$result['name'] = $vcard['fn'];
	$result['nick'] = $vcard['nick'];
	$result['url'] = $profile;
	$result['addr'] = $addr;
	$result['notify'] = $notify;
	$result['poll'] = $poll;
	$result['request'] = $request;
	$result['confirm'] = $confirm;
	$result['photo'] = $vcard['photo'];
	$result['priority'] = $priority;
	$result['network'] = $network;
	$result['alias'] = $alias;
	$result['pubkey'] = $pubkey;

	logger('probe_url: ' . print_r($result,true), LOGGER_DEBUG);

	return $result;
}