git.mxchange.org Git - friendica.git/commitdiff
suppress some scraping errors when confronted with hybrid/strange
author Friendika <info@friendika.com>
Wed, 2 Feb 2011 22:48:27 +0000 (14:48 -0800)
committer Friendika <info@friendika.com>
Wed, 2 Feb 2011 22:48:27 +0000 (14:48 -0800)
feeds that provide insufficient content-type and choke the html parser.

boot.php
include/Scrape.php

index e47f1834d87ecdcc8cd92780d50450ed343fd4a7..0dd5071685765204c50adf215504d39b43c39a8e 100644 (file)
--- a/boot.php
+++ b/boot.php
@@ -1366,6 +1366,7 @@ function lrdd($uri) {
        else {
                $html = fetch_url($uri);
                $headers = $a->get_curl_headers();
+               logger('lrdd: headers=' . $headers, LOGGER_DEBUG);
                $lines = explode("\n",$headers);
                if(count($lines)) {
                        foreach($lines as $line) {                              
@@ -1377,6 +1378,8 @@ function lrdd($uri) {
                                // don't try and run feeds through the html5 parser
                                if(stristr($line,'content-type:') && ((stristr($line,'application/atom+xml')) || (stristr($line,'application/rss+xml'))))
                                        return array();
+                               if(stristr($html,'<rss') || stristr($html,'<feed'))
+                                       return array();
                        }
                }
                if(! isset($link)) {
index bb42c3bdd3474946db9325ad4fb6846141c5571e..ff98992526314474b6dcc7aa20386a92f114c332 100644 (file)
--- a/include/Scrape.php
+++ b/include/Scrape.php
@@ -8,12 +8,18 @@ function scrape_dfrn($url) {
        $a = get_app();
 
        $ret = array();
+
+       logger('scrape_dfrn: url=' . $url);
+
        $s = fetch_url($url);
 
        if(! $s) 
                return $ret;
 
        $headers = $a->get_curl_headers();
+       logger('scrape_dfrn: headers=' . $headers, LOGGER_DEBUG);
+
+
        $lines = explode("\n",$headers);
        if(count($lines)) {
                foreach($lines as $line) {                              
@@ -93,12 +99,17 @@ function scrape_meta($url) {
        $a = get_app();
 
        $ret = array();
+
+       logger('scrape_meta: url=' . $url);
+
        $s = fetch_url($url);
 
        if(! $s) 
                return $ret;
 
        $headers = $a->get_curl_headers();
+       logger('scrape_meta: headers=' . $headers, LOGGER_DEBUG);
+
        $lines = explode("\n",$headers);
        if(count($lines)) {
                foreach($lines as $line) {                              
@@ -135,6 +146,9 @@ function scrape_vcard($url) {
        $a = get_app();
 
        $ret = array();
+
+       logger('scrape_vcard: url=' . $url);
+
        $s = fetch_url($url);
 
        if(! $s) 
@@ -190,15 +204,17 @@ function scrape_feed($url) {
                return $ret;
 
        $headers = $a->get_curl_headers();
+       logger('scrape_feed: headers=' . $headers, LOGGER_DEBUG);
+
        $lines = explode("\n",$headers);
        if(count($lines)) {
                foreach($lines as $line) {                              
                        if(stristr($line,'content-type:')) {
-                               if(stristr($line,'application/atom+xml')) {
+                               if(stristr($line,'application/atom+xml') || stristr($s,'<feed')) {
                                        $ret['feed_atom'] = $url;
                                        return $ret;
                                }
-                               if(stristr($line,'application/rss+xml')) {
+                               if(stristr($line,'application/rss+xml') || stristr($s,'<rss')) {
                                        $ret['feed_rss'] = $url;
                                        return ret;
                                }