X-Git-Url: https://git.mxchange.org/?a=blobdiff_plain;ds=sidebyside;f=include%2FScrape.php;h=ff98992526314474b6dcc7aa20386a92f114c332;hb=b98b5645e6572a2ee4b00a1d7960b93160f1de7f;hp=0272dde129cd89c83d918ac2f9d6cbd5c3ceb86b;hpb=b8b227b32882fb511c8481a41c53637e7ce7707a;p=friendica.git diff --git a/include/Scrape.php b/include/Scrape.php index 0272dde129..ff98992526 100644 --- a/include/Scrape.php +++ b/include/Scrape.php @@ -2,24 +2,34 @@ require_once('library/HTML5/Parser.php'); -if(! function_exists('attribute_contains')) { -function attribute_contains($attr,$s) { - $a = explode(' ', $attr); - if(count($a) && in_array($s,$a)) - return true; - return false; -}} - - if(! function_exists('scrape_dfrn')) { function scrape_dfrn($url) { + $a = get_app(); + $ret = array(); + + logger('scrape_dfrn: url=' . $url); + $s = fetch_url($url); if(! $s) return $ret; + $headers = $a->get_curl_headers(); + logger('scrape_dfrn: headers=' . $headers, LOGGER_DEBUG); + + + $lines = explode("\n",$headers); + if(count($lines)) { + foreach($lines as $line) { + // don't try and run feeds through the html5 parser + if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml')))) + return ret; + } + } + + $dom = HTML5_Parser::parse($s); if(! $dom) @@ -31,6 +41,8 @@ function scrape_dfrn($url) { foreach($items as $item) { $x = $item->getAttribute('rel'); + if(($x === 'alternate') && ($item->getAttribute('type') === 'application/atom+xml')) + $ret['feed_atom'] = $item->getAttribute('href'); if(substr($x,0,5) == "dfrn-") $ret[$x] = $item->getAttribute('href'); if($x === 'lrdd') { @@ -53,7 +65,7 @@ function scrape_dfrn($url) { $ret['photo'] = $x->getAttribute('src'); if(attribute_contains($x->getAttribute('class'),'key')) $ret['key'] = $x->textContent; - } + } } } @@ -84,12 +96,31 @@ function validate_dfrn($a) { if(! function_exists('scrape_meta')) { function scrape_meta($url) { + $a = get_app(); + $ret = array(); + + logger('scrape_meta: url=' . $url); + $s = fetch_url($url); if(! $s) return $ret; + $headers = $a->get_curl_headers(); + logger('scrape_meta: headers=' . $headers, LOGGER_DEBUG); + + $lines = explode("\n",$headers); + if(count($lines)) { + foreach($lines as $line) { + // don't try and run feeds through the html5 parser + if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml')))) + return ret; + } + } + + + $dom = HTML5_Parser::parse($s); if(! $dom) @@ -107,3 +138,106 @@ function scrape_meta($url) { return $ret; }} + + +if(! function_exists('scrape_vcard')) { +function scrape_vcard($url) { + + $a = get_app(); + + $ret = array(); + + logger('scrape_vcard: url=' . $url); + + $s = fetch_url($url); + + if(! $s) + return $ret; + + $headers = $a->get_curl_headers(); + $lines = explode("\n",$headers); + if(count($lines)) { + foreach($lines as $line) { + // don't try and run feeds through the html5 parser + if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml')))) + return ret; + } + } + + $dom = HTML5_Parser::parse($s); + + if(! $dom) + return $ret; + + // Pull out hCard profile elements + + $items = $dom->getElementsByTagName('*'); + foreach($items as $item) { + if(attribute_contains($item->getAttribute('class'), 'vcard')) { + $level2 = $item->getElementsByTagName('*'); + foreach($level2 as $x) { + if(attribute_contains($x->getAttribute('class'),'fn')) + $ret['fn'] = $x->textContent; + if((attribute_contains($x->getAttribute('class'),'photo')) + || (attribute_contains($x->getAttribute('class'),'avatar'))) + $ret['photo'] = $x->getAttribute('src'); + if((attribute_contains($x->getAttribute('class'),'nickname')) + || (attribute_contains($x->getAttribute('class'),'uid'))) + $ret['nick'] = $x->textContent; + } + } + } + + return $ret; +}} + + +if(! function_exists('scrape_feed')) { +function scrape_feed($url) { + + $a = get_app(); + + $ret = array(); + $s = fetch_url($url); + + if(! $s) + return $ret; + + $headers = $a->get_curl_headers(); + logger('scrape_feed: headers=' . $headers, LOGGER_DEBUG); + + $lines = explode("\n",$headers); + if(count($lines)) { + foreach($lines as $line) { + if(stristr($line,'content-type:')) { + if(stristr($line,'application/atom+xml') || stristr($s,'getElementsByTagName('link'); + + // get Atom link elements + + foreach($items as $item) { + $x = $item->getAttribute('rel'); + if(($x === 'alternate') && ($item->getAttribute('type') === 'application/atom+xml')) + $ret['feed_atom'] = $item->getAttribute('href'); + if(($x === 'alternate') && ($item->getAttribute('type') === 'application/rss+xml')) + $ret['feed_rss'] = $item->getAttribute('href'); + } + + return $ret; +}} \ No newline at end of file