+ $schemearr = parse_url($scheme);
+
+ $complete = $schemearr["scheme"]."://".$schemearr["host"];
+
+ if ($schemearr["port"] != "")
+ $complete .= ":".$schemearr["port"];
+
+ if(strpos($urlarr['path'],'/') !== 0)
+ $complete .= '/';
+
+ $complete .= $urlarr["path"];
+
+ if ($urlarr["query"] != "")
+ $complete .= "?".$urlarr["query"];
+
+ if ($urlarr["fragment"] != "")
+ $complete .= "#".$urlarr["fragment"];
+
+ return($complete);
+}
+
+function parseurl_getsiteinfo($url) {
+ $siteinfo = array();
+
+ $ch = curl_init();
+ curl_setopt($ch, CURLOPT_URL, $url);
+ curl_setopt($ch, CURLOPT_HEADER, 1);
+ curl_setopt($ch, CURLOPT_NOBODY, 0);
+ curl_setopt($ch, CURLOPT_TIMEOUT, 3);
+ curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
+ curl_setopt($ch,CURLOPT_USERAGENT,'Opera/9.64(Windows NT 5.1; U; de) Presto/2.1.1');
+
+ $header = curl_exec($ch);
+ curl_close($ch);
+
+ // Use the first "charset=" declaration found in the response. $header holds both the HTTP headers and the body, so the match may come from either.
+ if (preg_match('/charset=(.*?)['."'".'"\s\n]/', $header, $matches))
+ $charset = trim(array_pop($matches));
+ else
+ $charset = "utf-8";
+
+ $pos = strpos($header, "\r\n\r\n");
+
+ if ($pos)
+ $body = trim(substr($header, $pos));
+ else
+ $body = $header;
+
+ $body = mb_convert_encoding($body, "UTF-8", $charset);
+ $body = mb_convert_encoding($body, 'HTML-ENTITIES', "UTF-8");
+
+ $doc = new DOMDocument();
+ @$doc->loadHTML($body);
+
+ deletenode($doc, 'style');
+ deletenode($doc, 'script');
+ deletenode($doc, 'option');
+ deletenode($doc, 'h1');
+ deletenode($doc, 'h2');
+ deletenode($doc, 'h3');
+ deletenode($doc, 'h4');
+ deletenode($doc, 'h5');
+ deletenode($doc, 'h6');
+ deletenode($doc, 'ol');
+ deletenode($doc, 'ul');
+
+ $xpath = new DomXPath($doc);