- $stamp1 = microtime(true);
-
- // Now fetch the body as well
- $ch = curl_init();
- curl_setopt($ch, CURLOPT_URL, $url);
- curl_setopt($ch, CURLOPT_HEADER, 1);
- curl_setopt($ch, CURLOPT_NOBODY, 0);
- curl_setopt($ch, CURLOPT_TIMEOUT, 10);
- curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
- curl_setopt($ch, CURLOPT_USERAGENT, $a->get_useragent());
-
- $header = curl_exec($ch);
- $curl_info = @curl_getinfo($ch);
- $http_code = $curl_info['http_code'];
- curl_close($ch);
-
- $a->save_timestamp($stamp1, "network");
-
- // Fetch the first mentioned charset. Can be in body or header
- $charset = "";
- if (preg_match('/charset=(.*?)['."'".'"\s\n]/', $header, $matches))
- $charset = trim(trim(trim(array_pop($matches)), ';,'));
-
- if ($charset == "")
- $charset = "utf-8";
-
- $pos = strpos($header, "\r\n\r\n");
-
- if ($pos)
- $body = trim(substr($header, $pos));
- else
- $body = $header;
-
- if (($charset != '') AND (strtoupper($charset) != "UTF-8")) {
- logger("parseurl_getsiteinfo: detected charset ".$charset, LOGGER_DEBUG);
- //$body = mb_convert_encoding($body, "UTF-8", $charset);
- $body = iconv($charset, "UTF-8//TRANSLIT", $body);
- }
-
- $body = mb_convert_encoding($body, 'HTML-ENTITIES', "UTF-8");
-
- $doc = new DOMDocument();
- @$doc->loadHTML($body);
-
- deletenode($doc, 'style');
- deletenode($doc, 'script');
- deletenode($doc, 'option');
- deletenode($doc, 'h1');
- deletenode($doc, 'h2');
- deletenode($doc, 'h3');
- deletenode($doc, 'h4');
- deletenode($doc, 'h5');
- deletenode($doc, 'h6');
- deletenode($doc, 'ol');
- deletenode($doc, 'ul');
-
- $xpath = new DomXPath($doc);
-
- $list = $xpath->query("//meta[@content]");
- foreach ($list as $node) {
- $attr = array();
- if ($node->attributes->length)
- foreach ($node->attributes as $attribute)
- $attr[$attribute->name] = $attribute->value;
-
- if (@$attr["http-equiv"] == 'refresh') {
- $path = $attr["content"];
- $pathinfo = explode(";", $path);
- $content = "";
- foreach ($pathinfo AS $value) {
- if (substr(strtolower($value), 0, 4) == "url=")
- $content = substr($value, 4);
- }
- if ($content != "") {
- $siteinfo = parseurl_getsiteinfo($content, $no_guessing, $do_oembed, ++$count);
- return($siteinfo);
- }