include/Scrape.php

   1 <?php
   2
   3 require_once('library/HTML5/Parser.php');
   4
   if(! function_exists('scrape_dfrn')) {
   /**
    * Fetch a page and collect the DFRN discovery data it advertises.
    *
    * The page is retrieved with fetch_url(). Unless the response headers
    * announce an Atom or RSS feed, the body is parsed with HTML5_Parser and
    * the following keys are filled in when the matching markup is present:
    *
    *  - 'feed_atom'  href of <link rel="alternate" type="application/atom+xml">
    *  - 'dfrn-...'   href of every <link> whose rel starts with "dfrn-"
    *                 (the rel value itself is used as the array key)
    *  - 'nick'       text between "acct:" and the next "@" in the urldecoded
    *                 href of <link rel="lrdd">
    *  - 'fn', 'photo', 'key'
    *                 taken from descendants of any element whose class
    *                 attribute matches 'vcard' according to attribute_contains()
    *
    * @param string $url address of the page to scrape
    * @return array collected values; empty when nothing was fetched, the
    *               response is a feed, or the document could not be parsed
    */
   function scrape_dfrn($url) {

       $a = get_app();

       $ret = array();

       logger('scrape_dfrn: url=' . $url);

       $s = fetch_url($url);

       if(! $s)
           return $ret;

       $headers = $a->get_curl_headers();
       logger('scrape_dfrn: headers=' . $headers, LOGGER_DEBUG);

       // don't try and run feeds through the html5 parser
       foreach(explode("\n",$headers) as $line) {
           if(! stristr($line,'content-type:'))
               continue;
           if(stristr($line,'application/atom+xml') || stristr($line,'application/rss+xml')) {
               // Hand back the (still empty) result array. This used to be the
               // bare word `ret`, which is an undefined constant, not $ret.
               return $ret;
           }
       }

       $dom = HTML5_Parser::parse($s);

       if(! $dom)
           return $ret;

       // get DFRN link elements

       $items = $dom->getElementsByTagName('link');

       foreach($items as $item) {
           $x = $item->getAttribute('rel');
           if(($x === 'alternate') && ($item->getAttribute('type') === 'application/atom+xml'))
               $ret['feed_atom'] = $item->getAttribute('href');
           if(substr($x,0,5) == "dfrn-")
               $ret[$x] = $item->getAttribute('href');
           if($x === 'lrdd') {
               // the href is url-encoded; the nickname sits between "acct:" and "@"
               $decoded = urldecode($item->getAttribute('href'));
               if(preg_match('/acct:([^@]*)@/',$decoded,$matches))
                   $ret['nick'] = $matches[1];
           }
       }

       // Pull out hCard profile elements

       $items = $dom->getElementsByTagName('*');
       foreach($items as $item) {
           if(! attribute_contains($item->getAttribute('class'), 'vcard'))
               continue;

           // every descendant of the vcard container is inspected; when several
           // elements match the same class the last one wins
           $level2 = $item->getElementsByTagName('*');
           foreach($level2 as $x) {
               $class = $x->getAttribute('class');
               if(attribute_contains($class,'fn'))
                   $ret['fn'] = $x->textContent;
               if(attribute_contains($class,'photo'))
                   $ret['photo'] = $x->getAttribute('src');
               if(attribute_contains($class,'key'))
                   $ret['key'] = $x->textContent;
           }
       }

       return $ret;
   }}
  74
  75
  76
  77
  78
  79
  if(! function_exists('validate_dfrn')) {
  /**
   * Count how many required DFRN fields are absent from a scrape result.
   *
   * Each of 'key', 'dfrn-request', 'dfrn-confirm', 'dfrn-notify' and
   * 'dfrn-poll' is tested with x(); every field for which x() reports a
   * falsy result adds one to the count.
   *
   * @param array $a scraped values keyed by field name
   * @return int number of missing fields (0 means all five are present)
   */
  function validate_dfrn($a) {
      $required = array('key', 'dfrn-request', 'dfrn-confirm', 'dfrn-notify', 'dfrn-poll');

      $missing = 0;
      foreach($required as $field) {
          if(x($a,$field))
              continue;
          $missing += 1;
      }

      return $missing;
  }}
  95
  if(! function_exists('scrape_meta')) {
  /**
   * Fetch a page and collect the DFRN values published in its <meta> tags.
   *
   * The page is retrieved with fetch_url(). Unless the response headers
   * announce an Atom or RSS feed, the body is parsed with HTML5_Parser and
   * every <meta> element whose name attribute starts with "dfrn-" contributes
   * its content attribute to the result, keyed by that name.
   *
   * @param string $url address of the page to scrape
   * @return array map of "dfrn-..." names to content values; empty when
   *               nothing was fetched, the response is a feed, or the
   *               document could not be parsed
   */
  function scrape_meta($url) {

      $a = get_app();

      $ret = array();

      logger('scrape_meta: url=' . $url);

      $s = fetch_url($url);

      if(! $s)
          return $ret;

      $headers = $a->get_curl_headers();
      logger('scrape_meta: headers=' . $headers, LOGGER_DEBUG);

      // don't try and run feeds through the html5 parser
      foreach(explode("\n",$headers) as $line) {
          if(! stristr($line,'content-type:'))
              continue;
          if(stristr($line,'application/atom+xml') || stristr($line,'application/rss+xml')) {
              // Hand back the (still empty) result array. This used to be the
              // bare word `ret`, which is an undefined constant, not $ret.
              return $ret;
          }
      }

      $dom = HTML5_Parser::parse($s);

      if(! $dom)
          return $ret;

      // get DFRN meta elements

      $items = $dom->getElementsByTagName('meta');

      foreach($items as $item) {
          $x = $item->getAttribute('name');
          if(substr($x,0,5) == "dfrn-")
              $ret[$x] = $item->getAttribute('content');
      }

      return $ret;
  }}
 141
 142
 if(! function_exists('scrape_vcard')) {
 /**
  * Fetch a page and extract basic hCard profile data from it.
  *
  * The page is retrieved with fetch_url(). Unless the response headers
  * announce an Atom or RSS feed, the body is parsed with HTML5_Parser. For
  * every element whose class attribute matches 'vcard' (according to
  * attribute_contains()), its descendants are inspected and may set:
  *
  *  - 'fn'     text content of an element with class 'fn'
  *  - 'photo'  src attribute of an element with class 'photo' or 'avatar'
  *  - 'nick'   text content of an element with class 'nickname' or 'uid'
  *
  * @param string $url address of the page to scrape
  * @return array collected values; empty when nothing was fetched, the
  *               response is a feed, or the document could not be parsed
  */
 function scrape_vcard($url) {

     $a = get_app();

     $ret = array();

     logger('scrape_vcard: url=' . $url);

     $s = fetch_url($url);

     if(! $s)
         return $ret;

     $headers = $a->get_curl_headers();

     // don't try and run feeds through the html5 parser
     foreach(explode("\n",$headers) as $line) {
         if(! stristr($line,'content-type:'))
             continue;
         if(stristr($line,'application/atom+xml') || stristr($line,'application/rss+xml')) {
             // Hand back the (still empty) result array. This used to be the
             // bare word `ret`, which is an undefined constant, not $ret.
             return $ret;
         }
     }

     $dom = HTML5_Parser::parse($s);

     if(! $dom)
         return $ret;

     // Pull out hCard profile elements

     $items = $dom->getElementsByTagName('*');
     foreach($items as $item) {
         if(! attribute_contains($item->getAttribute('class'), 'vcard'))
             continue;

         // every descendant of the vcard container is inspected; when several
         // elements match the same class the last one wins
         $level2 = $item->getElementsByTagName('*');
         foreach($level2 as $x) {
             $class = $x->getAttribute('class');
             if(attribute_contains($class,'fn'))
                 $ret['fn'] = $x->textContent;
             if(attribute_contains($class,'photo') || attribute_contains($class,'avatar'))
                 $ret['photo'] = $x->getAttribute('src');
             if(attribute_contains($class,'nickname') || attribute_contains($class,'uid'))
                 $ret['nick'] = $x->textContent;
         }
     }

     return $ret;
 }}
 193
 194
 if(! function_exists('scrape_feed')) {
 /**
  * Work out which feed(s) a URL offers, plus a profile photo if one is found.
  *
  * The URL is retrieved with fetch_url(). For each response header line that
  * contains "content-type:", the URL itself is reported as the feed when
  * either the header names a feed type or the body contains a feed marker:
  *
  *  - 'feed_atom' => $url  for application/atom+xml or a body containing "<feed"
  *  - 'feed_rss'  => $url  for application/rss+xml or a body containing "<rss"
  *
  * Otherwise the body is parsed with HTML5_Parser and the result may hold:
  *
  *  - 'photo'      src of an <img> whose id is "profile-image"
  *  - 'feed_atom'  href of a <link rel="alternate" type="application/atom+xml">
  *  - 'feed_rss'   href of a <link rel="alternate" type="application/rss+xml">
  *
  * A feed key already holding a value accepted by x() is not overwritten by
  * later <link> elements.
  *
  * @param string $url address to inspect
  * @return array collected values; empty when nothing was fetched or the
  *               document could not be parsed
  */
 function scrape_feed($url) {

     $a = get_app();

     $ret = array();

     $body = fetch_url($url);
     if(! $body)
         return $ret;

     $headers = $a->get_curl_headers();
     logger('scrape_feed: headers=' . $headers, LOGGER_DEBUG);

     // The URL is itself a feed: no need to parse the body as HTML.
     foreach(explode("\n",$headers) as $header) {
         if(! stristr($header,'content-type:'))
             continue;
         if(stristr($header,'application/atom+xml') || stristr($body,'<feed'))
             return array('feed_atom' => $url);
         if(stristr($header,'application/rss+xml') || stristr($body,'<rss'))
             return array('feed_rss' => $url);
     }

     $dom = HTML5_Parser::parse($body);
     if(! $dom)
         return $ret;

     // get img elements (twitter)

     $images = $dom->getElementsByTagName('img');
     if(! $images)
         $images = array();
     foreach($images as $image) {
         if($image->getAttribute('id') === 'profile-image')
             $ret['photo'] = $image->getAttribute('src');
     }

     // get Atom/RSS link elements, take the first one of either.

     $feed_keys = array(
         'application/atom+xml' => 'feed_atom',
         'application/rss+xml'  => 'feed_rss'
     );

     $links = $dom->getElementsByTagName('link');
     if(! $links)
         $links = array();
     foreach($links as $link) {
         if($link->getAttribute('rel') !== 'alternate')
             continue;

         $type = $link->getAttribute('type');
         if(! isset($feed_keys[$type]))
             continue;

         $key = $feed_keys[$type];
         if(! x($ret,$key))
             $ret[$key] = $link->getAttribute('href');
     }

     return $ret;
 }}