<?php
require_once('library/HTML5/Parser.php');
+require_once('library/HTMLPurifier.auto.php');
+
+function arr_add_hashes(&$item,$k) {
+ $item = '#' . $item;
+}
function parse_url_content(&$a) {
- logger('parse_url: ' . $_GET['url']);
+ $text = null;
+ $str_tags = '';
+
+ $textmode = false;
+ if(local_user() && intval(get_pconfig(local_user(),'system','plaintext')))
+ $textmode = true;
+
+ if($textmode)
+ $br = (($textmode) ? "\n" : '<br /?');
+
+ if(x($_GET,'binurl'))
+ $url = trim(hex2bin($_GET['binurl']));
+ else
+ $url = trim($_GET['url']);
+
+ if($_GET['title'])
+ $title = strip_tags(trim($_GET['title']));
- $url = trim(hex2bin($_GET['url']));
+ if($_GET['description'])
+ $text = strip_tags(trim($_GET['description']));
+
+ if($_GET['tags']) {
+ $arr_tags = str_getcsv($_GET['tags']);
+ if(count($arr_tags)) {
+ array_walk($arr_tags,'arr_add_hashes');
+ $str_tags = $br . implode(' ',$arr_tags) . $br;
+ }
+ }
logger('parse_url: ' . $url);
- $text = null;
- $template = "<a href=\"%s\" >%s</a>\n%s";
+ if($textmode)
+ $template = $br . '[bookmark=%s]%s[/bookmark]%s' . $br;
+ else
+ $template = "<br /><a class=\"bookmark\" href=\"%s\" >%s</a>%s<br />";
$arr = array('url' => $url, 'text' => '');
killme();
}
+
+ if($url && $title && $text) {
+
+ if($textmode)
+ $text = $br . $br . '[quote]' . $text . '[/quote]' . $br;
+ else
+ $text = '<br /><br /><blockquote>' . $text . '</blockquote><br />';
+
+ $title = str_replace(array("\r","\n"),array('',''),$title);
+
+ $result = sprintf($template,$url,($title) ? $title : $url,$text) . $str_tags;
+
+ logger('parse_url (unparsed): returns: ' . $result);
+
+ echo $result;
+ killme();
+ }
+
+
if($url) {
$s = fetch_url($url);
} else {
killme();
}
+// logger('parse_url: data: ' . $s, LOGGER_DATA);
if(! $s) {
- echo sprintf($template,$url,$url,'');
+ echo sprintf($template,$url,$url,'') . $str_tags;
killme();
}
- $dom = @HTML5_Parser::parse($s);
+ $matches = '';
+ $c = preg_match('/\<head(.*?)\>(.*?)\<\/head\>/ism',$s,$matches);
+ if($c) {
+// logger('parse_url: header: ' . $matches[2], LOGGER_DATA);
+ try {
+ $domhead = HTML5_Parser::parse($matches[2]);
+ } catch (DOMException $e) {
+ logger('scrape_dfrn: parse error: ' . $e);
+ }
+ if($domhead)
+ logger('parsed header');
+ }
+
+ if(! $title) {
+ if(strpos($s,'<title>')) {
+ $title = substr($s,strpos($s,'<title>')+7,64);
+ if(strpos($title,'<') !== false)
+ $title = strip_tags(substr($title,0,strpos($title,'<')));
+ }
+ }
- if(! $dom)
- return $ret;
+ $config = HTMLPurifier_Config::createDefault();
+ $config->set('Cache.DefinitionImpl', null);
+ $purifier = new HTMLPurifier($config);
+ $s = $purifier->purify($s);
+
+// logger('purify_output: ' . $s);
+
+ try {
+ $dom = HTML5_Parser::parse($s);
+ } catch (DOMException $e) {
+ logger('scrape_dfrn: parse error: ' . $e);
+ }
+
+ if(! $dom) {
+ echo sprintf($template,$url,$url,'') . $str_tags;
+ killme();
+ }
$items = $dom->getElementsByTagName('title');
}
- $divs = $dom->getElementsByTagName('div');
- if($divs) {
- foreach($divs as $div) {
- $class = $div->getAttribute('class');
- if($class && stristr($class,'article')) {
- $items = $div->getElementsByTagName('p');
- if($items) {
- foreach($items as $item) {
- if($item->getElementsByTagName('script'))
- continue;
- $text = $item->textContent;
- $text = strip_tags($text);
- if(strlen($text) < 100)
- continue;
- $text = substr($text,0,250) . '...' ;
- break;
+ if(! $text) {
+ $divs = $dom->getElementsByTagName('div');
+ if($divs) {
+ foreach($divs as $div) {
+ $class = $div->getAttribute('class');
+ if($class && (stristr($class,'article') || stristr($class,'content'))) {
+ $items = $div->getElementsByTagName('p');
+ if($items) {
+ foreach($items as $item) {
+ $text = $item->textContent;
+ if(stristr($text,'<script')) {
+ $text = '';
+ continue;
+ }
+ $text = strip_tags($text);
+ if(strlen($text) < 100) {
+ $text = '';
+ continue;
+ }
+ $text = substr($text,0,250) . '...' ;
+ break;
+ }
}
}
+ if($text)
+ break;
+ }
+ }
+
+ if(! $text) {
+ $items = $dom->getElementsByTagName('p');
+ if($items) {
+ foreach($items as $item) {
+ $text = $item->textContent;
+ if(stristr($text,'<script'))
+ continue;
+ $text = strip_tags($text);
+ if(strlen($text) < 100) {
+ $text = '';
+ continue;
+ }
+ $text = substr($text,0,250) . '...' ;
+ break;
+ }
}
}
}
if(! $text) {
- $items = $dom->getElementsByTagName('p');
+ logger('parsing meta');
+ $items = $domhead->getElementsByTagName('meta');
if($items) {
foreach($items as $item) {
- if($item->getElementsByTagName('script'))
- continue;
- $text = $item->textContent;
- $text = strip_tags($text);
- if(strlen($text) < 100)
- continue;
- $text = substr($text,0,250) . '...' ;
- break;
+ $property = $item->getAttribute('property');
+ if($property && (stristr($property,':description'))) {
+
+ $text = $item->getAttribute('content');
+ if(stristr($text,'<script')) {
+ $text = '';
+ continue;
+ }
+ $text = strip_tags($text);
+
+
+ $text = substr($text,0,250) . '...' ;
+ }
+ if($property && (stristr($property,':image'))) {
+
+ $image = $item->getAttribute('content');
+ if(stristr($text,'<script')) {
+ $image = '';
+ continue;
+ }
+ $image = strip_tags($image);
+
+ $i = fetch_url($image);
+ if($i) {
+ require_once('include/Photo.php');
+ $ph = new Photo($i);
+ if($ph->is_valid()) {
+ if($ph->getWidth() > 300 || $ph->getHeight() > 300) {
+ $ph->scaleImage(300);
+ $new_width = $ph->getWidth();
+ $new_height = $ph->getHeight();
+ if($textmode)
+ $image = $br . $br . '[img=' . $new_width . 'x' . $new_height . ']' . $image . '[/img]';
+ else
+ $image = '<br /><br /><img height="' . $new_height . '" width="' . $new_width . '" src="' .$image . '" alt="photo" />';
+ }
+ else {
+ if($textmode)
+ $image = $br . $br . '[img]' . $image . '[/img]';
+ else
+ $image = '<br /><br /><img src="' . $image . '" alt="photo" />';
+ }
+ }
+ else
+ $image = '';
+
+ }
+ }
}
}
}
if(strlen($text)) {
- $text = '<br />' . $text;
+ if($textmode)
+ $text = $br .$br . '[quote]' . $text . '[/quote]' . $br ;
+ else
+ $text = '<br /><br /><blockquote>' . $text . '</blockquote><br />';
+ }
+
+ if($image) {
+ $text = $image . $br . $text;
}
+ $title = str_replace(array("\r","\n"),array('',''),$title);
+
+ $result = sprintf($template,$url,($title) ? $title : $url,$text) . $str_tags;
+
+ logger('parse_url: returns: ' . $result);
- echo sprintf($template,$url,$title,$text);
+ echo $result;
killme();
}