X-Git-Url: https://git.mxchange.org/?a=blobdiff_plain;f=include%2FScrape.php;h=bb42c3bdd3474946db9325ad4fb6846141c5571e;hb=8c5c07b2654f5ff28cea1a7012198f74c5b1af03;hp=cc50151657b763e505a03acb5b240bad45f77830;hpb=6348e70daa113e8b3203de8fbc919d08c90d972e;p=friendica.git

Patch for include/Scrape.php. It removes the local attribute_contains()
helper (still called below, so presumably defined elsewhere now), makes
scrape_dfrn() skip Atom/RSS responses and record 'feed_atom' and 'nick', and
adds scrape_meta(), scrape_vcard() and scrape_feed().

Changes relative to the diff as served from the URL above:
  * the four early exits written `return ret;` now read `return $ret;`
    (the bare word is a constant lookup, not the result array);
  * doc comments were added to the three new functions and one copy-pasted
    comment was corrected, so the last hunk's new-side line count is 174
    instead of 141;
  * line breaks, indentation and blank context lines were reconstructed from
    a whitespace-flattened copy; apply with whitespace-insensitive matching.

diff --git a/include/Scrape.php b/include/Scrape.php
index cc50151657..bb42c3bdd3 100644
--- a/include/Scrape.php
+++ b/include/Scrape.php
@@ -2,24 +2,28 @@
 
 require_once('library/HTML5/Parser.php');
 
-if(! function_exists('attribute_contains')) {
-function attribute_contains($attr,$s) {
-	$a = explode(' ', $attr);
-	if(count($a) && in_array($s,$a))
-		return true;
-	return false;
-}}
-
-
 if(! function_exists('scrape_dfrn')) {
 function scrape_dfrn($url) {
 
+	$a = get_app();
+
 	$ret = array();
 	$s = fetch_url($url);
 
 	if(! $s)
 		return $ret;
 
+	$headers = $a->get_curl_headers();
+	$lines = explode("\n",$headers);
+	if(count($lines)) {
+		foreach($lines as $line) {
+			// don't try and run feeds through the html5 parser
+			if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml'))))
+				return $ret;
+		}
+	}
+
+
 	$dom = HTML5_Parser::parse($s);
 
 	if(! $dom)
@@ -31,8 +35,15 @@ function scrape_dfrn($url) {
 
 	foreach($items as $item) {
 		$x = $item->getAttribute('rel');
+		if(($x === 'alternate') && ($item->getAttribute('type') === 'application/atom+xml'))
+			$ret['feed_atom'] = $item->getAttribute('href');
 		if(substr($x,0,5) == "dfrn-")
 			$ret[$x] = $item->getAttribute('href');
+		if($x === 'lrdd') {
+			$decoded = urldecode($item->getAttribute('href'));
+			if(preg_match('/acct:([^@]*)@/',$decoded,$matches))
+				$ret['nick'] = $matches[1];
+		}
 	}
 
 	// Pull out hCard profile elements
@@ -48,7 +59,7 @@ function scrape_dfrn($url) {
 					$ret['photo'] = $x->getAttribute('src');
 				if(attribute_contains($x->getAttribute('class'),'key'))
 					$ret['key'] = $x->textContent;
-			}
+			}
 		}
 	}
 
@@ -76,5 +87,174 @@ function validate_dfrn($a) {
 
 	return $errors;
 }}
 
 
+if(! function_exists('scrape_meta')) {
+/**
+ * Fetch $url and collect the DFRN <meta> elements it advertises.
+ *
+ * A response whose Content-Type header names Atom or RSS is not parsed.
+ *
+ * @param string $url address of the page to fetch
+ * @return array meta name => content attribute, for names starting "dfrn-";
+ *               empty if the fetch or parse fails or the response is a feed
+ */
+function scrape_meta($url) {
+
+	$a = get_app();
+
+	$ret = array();
+	$s = fetch_url($url);
+
+	if(! $s)
+		return $ret;
+
+	$headers = $a->get_curl_headers();
+	$lines = explode("\n",$headers);
+	if(count($lines)) {
+		foreach($lines as $line) {
+			// don't try and run feeds through the html5 parser
+			if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml'))))
+				return $ret;
+		}
+	}
+
+
+	$dom = HTML5_Parser::parse($s);
+
+	if(! $dom)
+		return $ret;
+
+	$items = $dom->getElementsByTagName('meta');
+
+	// get DFRN meta elements
+
+	foreach($items as $item) {
+		$x = $item->getAttribute('name');
+		if(substr($x,0,5) == "dfrn-")
+			$ret[$x] = $item->getAttribute('content');
+	}
+
+	return $ret;
+}}
+
+
+if(! function_exists('scrape_vcard')) {
+/**
+ * Fetch $url and pull hCard profile fields out of the page.
+ *
+ * Inside every element that attribute_contains() reports as classed
+ * "vcard", descendants classed fn, photo/avatar and nickname/uid supply
+ * the 'fn', 'photo' (src attribute) and 'nick' entries respectively.
+ * A response whose Content-Type header names Atom or RSS is not parsed.
+ *
+ * @param string $url address of the page to fetch
+ * @return array the fields that were found; empty if the fetch or parse
+ *               fails or the response is a feed
+ */
+function scrape_vcard($url) {
+
+	$a = get_app();
+
+	$ret = array();
+	$s = fetch_url($url);
+
+	if(! $s)
+		return $ret;
+
+	$headers = $a->get_curl_headers();
+	$lines = explode("\n",$headers);
+	if(count($lines)) {
+		foreach($lines as $line) {
+			// don't try and run feeds through the html5 parser
+			if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml'))))
+				return $ret;
+		}
+	}
+	$dom = HTML5_Parser::parse($s);
+
+	if(! $dom)
+		return $ret;
+
+	// Pull out hCard profile elements
+
+	$items = $dom->getElementsByTagName('*');
+	foreach($items as $item) {
+		if(attribute_contains($item->getAttribute('class'), 'vcard')) {
+			$level2 = $item->getElementsByTagName('*');
+			foreach($level2 as $x) {
+				if(attribute_contains($x->getAttribute('class'),'fn'))
+					$ret['fn'] = $x->textContent;
+				if((attribute_contains($x->getAttribute('class'),'photo'))
+					|| (attribute_contains($x->getAttribute('class'),'avatar')))
+					$ret['photo'] = $x->getAttribute('src');
+				if((attribute_contains($x->getAttribute('class'),'nickname'))
+					|| (attribute_contains($x->getAttribute('class'),'uid')))
+					$ret['nick'] = $x->textContent;
+			}
+		}
+	}
+
+	return $ret;
+}}
+
+
+if(! function_exists('scrape_feed')) {
+/**
+ * Fetch $url and report the Atom/RSS feeds it offers.
+ *
+ * When the Content-Type header says the response is itself Atom or RSS,
+ * $url is returned under 'feed_atom' or 'feed_rss'. Otherwise the body is
+ * parsed and the href of each <link rel="alternate"> typed
+ * application/atom+xml or application/rss+xml is returned under those keys.
+ *
+ * @param string $url address of the page or feed to fetch
+ * @return array 'feed_atom' and/or 'feed_rss' when found; empty if the
+ *               fetch or parse fails
+ */
+function scrape_feed($url) {
+
+	$a = get_app();
+
+	$ret = array();
+	$s = fetch_url($url);
+
+	if(! $s)
+		return $ret;
+
+	$headers = $a->get_curl_headers();
+	$lines = explode("\n",$headers);
+	if(count($lines)) {
+		foreach($lines as $line) {
+			if(stristr($line,'content-type:')) {
+				if(stristr($line,'application/atom+xml')) {
+					$ret['feed_atom'] = $url;
+					return $ret;
+				}
+				if(stristr($line,'application/rss+xml')) {
+					$ret['feed_rss'] = $url;
+					return $ret;
+				}
+			}
+		}
+	}
+
+	$dom = HTML5_Parser::parse($s);
+
+	if(! $dom)
+		return $ret;
+
+	$items = $dom->getElementsByTagName('link');
+
+	// get Atom link elements
+
+	foreach($items as $item) {
+		$x = $item->getAttribute('rel');
+		if(($x === 'alternate') && ($item->getAttribute('type') === 'application/atom+xml'))
+			$ret['feed_atom'] = $item->getAttribute('href');
+		if(($x === 'alternate') && ($item->getAttribute('type') === 'application/rss+xml'))
+			$ret['feed_rss'] = $item->getAttribute('href');
+	}
+
+	return $ret;
+}}
\ No newline at end of file