Merge commit 'mike/master'

[friendica.git] / include / Scrape.php
diff --git a/include/Scrape.php b/include/Scrape.php

index cc50151657b763e505a03acb5b240bad45f77830..ff98992526314474b6dcc7aa20386a92f114c332 100644 (file)
--- a/include/Scrape.php
+++ b/include/Scrape.php
@@ -2,24 +2,34 @@
  
  require_once('library/HTML5/Parser.php');
  
-if(! function_exists('attribute_contains')) {
-function attribute_contains($attr,$s) {
-       $a = explode(' ', $attr);
-       if(count($a) && in_array($s,$a))
-               return true;
-       return false;
-}}
-
-
  if(! function_exists('scrape_dfrn')) {
  function scrape_dfrn($url) {
  
+       $a = get_app();
+
         $ret = array();
+
+       logger('scrape_dfrn: url=' . $url);
+
         $s = fetch_url($url);
  
         if(! $s) 
                 return $ret;
  
+       $headers = $a->get_curl_headers();
+       logger('scrape_dfrn: headers=' . $headers, LOGGER_DEBUG);
+
+
+       $lines = explode("\n",$headers);
+       if(count($lines)) {
+               foreach($lines as $line) {                              
+                       // don't try and run feeds through the html5 parser
+                       if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml'))))
+                               return ret;
+               }
+       }
+
+
         $dom = HTML5_Parser::parse($s);
  
         if(! $dom)
@@ -31,8 +41,15 @@ function scrape_dfrn($url) {
  
         foreach($items as $item) {
                 $x = $item->getAttribute('rel');
+               if(($x === 'alternate') && ($item->getAttribute('type') === 'application/atom+xml'))
+                       $ret['feed_atom'] = $item->getAttribute('href');
                 if(substr($x,0,5) == "dfrn-")
                         $ret[$x] = $item->getAttribute('href');
+               if($x === 'lrdd') {
+                       $decoded = urldecode($item->getAttribute('href'));
+                       if(preg_match('/acct:([^@]*)@/',$decoded,$matches))
+                               $ret['nick'] = $matches[1];
+               }
         }
  
         // Pull out hCard profile elements
@@ -48,7 +65,7 @@ function scrape_dfrn($url) {
                                         $ret['photo'] = $x->getAttribute('src');
                                 if(attribute_contains($x->getAttribute('class'),'key'))
                                         $ret['key'] = $x->textContent;
-                       }
+                       }
                 }
         }
  
@@ -76,5 +93,151 @@ function validate_dfrn($a) {
         return $errors;
  }}
  
+if(! function_exists('scrape_meta')) {
+/**
+ * Fetch $url and collect the content of its <meta name="dfrn-*"> elements.
+ *
+ * @param string $url address of the page to fetch
+ * @return array meta name => content attribute; empty when the fetch fails,
+ *               the response is a feed, or the document cannot be parsed
+ */
+function scrape_meta($url) {
+       $a = get_app();
+       $ret = array();
+
+       logger('scrape_meta: url=' . $url);
+
+       $s = fetch_url($url);
+       if(! $s)
+               return $ret;
+
+       $headers = $a->get_curl_headers();
+       logger('scrape_meta: headers=' . $headers, LOGGER_DEBUG);
+
+       $lines = explode("\n",$headers);
+       if(count($lines)) {
+               foreach($lines as $line) {
+                       // don't try and run feeds through the html5 parser
+                       if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml'))))
+                               return $ret;
+               }
+       }
+
+       $dom = HTML5_Parser::parse($s);
+       if(! $dom)
+               return $ret;
+
+       // get DFRN meta elements
+       $items = $dom->getElementsByTagName('meta');
+       foreach($items as $item) {
+               $x = $item->getAttribute('name');
+               // keep only names carrying the "dfrn-" prefix
+               if(substr($x,0,5) == "dfrn-")
+                       $ret[$x] = $item->getAttribute('content');
+       }
+
+       return $ret;
+}}
+
+
+if(! function_exists('scrape_vcard')) {
+/**
+ * Fetch $url and pull name, photo and nickname out of its hCard markup.
+ * @param string $url address of the page to fetch
+ * @return array may hold 'fn', 'photo' and 'nick'; empty when the fetch fails,
+ *               the response is a feed, or the document cannot be parsed
+ */
+function scrape_vcard($url) {
+       $a = get_app();
+       $ret = array();
+       logger('scrape_vcard: url=' . $url);
+
+       $s = fetch_url($url);
+       if(! $s)
+               return $ret;
+
+       $headers = $a->get_curl_headers();
+       $lines = explode("\n",$headers);
+       if(count($lines)) {
+               foreach($lines as $line) {
+                       // don't try and run feeds through the html5 parser
+                       if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml'))))
+                               return $ret;
+               }
+       }
+
+       $dom = HTML5_Parser::parse($s);
+       if(! $dom)
+               return $ret;
+
+       // Pull out hCard profile elements
+       $items = $dom->getElementsByTagName('*');
+       foreach($items as $item) {
+               if(attribute_contains($item->getAttribute('class'), 'vcard')) {
+                       $level2 = $item->getElementsByTagName('*');
+                       foreach($level2 as $x) {
+                               if(attribute_contains($x->getAttribute('class'),'fn'))
+                                       $ret['fn'] = $x->textContent;
+                               if((attribute_contains($x->getAttribute('class'),'photo'))
+                                       || (attribute_contains($x->getAttribute('class'),'avatar')))
+                                       $ret['photo'] = $x->getAttribute('src');
+                               if((attribute_contains($x->getAttribute('class'),'nickname'))
+                                       || (attribute_contains($x->getAttribute('class'),'uid')))
+                                       $ret['nick'] = $x->textContent;
+                       }
+               }
+       }
+
+       return $ret;
+}}
+
  
+if(! function_exists('scrape_feed')) {
+/**
+ * Work out which Atom and/or RSS feed $url provides.
+ * @param string $url address to probe
+ * @return array may hold 'feed_atom' and/or 'feed_rss'; empty when the fetch fails or no feed is found
+ */
+function scrape_feed($url) {
  
+       $a = get_app();
+       $ret = array();
+       $s = fetch_url($url);
+       if(! $s)
+               return $ret;
+
+       $headers = $a->get_curl_headers();
+       logger('scrape_feed: headers=' . $headers, LOGGER_DEBUG);
+       // a response that is itself a feed is reported under $url
+       $lines = explode("\n",$headers);
+       if(count($lines)) {
+               foreach($lines as $line) {
+                       if(stristr($line,'content-type:')) {
+                               if(stristr($line,'application/atom+xml') || stristr($s,'<feed')) {
+                                       $ret['feed_atom'] = $url;
+                                       return $ret;
+                               }
+                               if(stristr($line,'application/rss+xml') || stristr($s,'<rss')) {
+                                       $ret['feed_rss'] = $url;
+                                       return $ret;
+                               }
+                       }
+               }
+       }
+
+       $dom = HTML5_Parser::parse($s);
+       if(! $dom)
+               return $ret;
+
+       // otherwise get Atom and RSS link elements
+       $items = $dom->getElementsByTagName('link');
+       foreach($items as $item) {
+               $x = $item->getAttribute('rel');
+               if(($x === 'alternate') && ($item->getAttribute('type') === 'application/atom+xml'))
+                       $ret['feed_atom'] = $item->getAttribute('href');
+               if(($x === 'alternate') && ($item->getAttribute('type') === 'application/rss+xml'))
+                       $ret['feed_rss'] = $item->getAttribute('href');
+       }
+
+       return $ret;
+}}
+\ No newline at end of file