]> git.mxchange.org Git - friendica.git/blob - mod/parse_url.php
better handling of remote urls - especially FB opengraph
[friendica.git] / mod / parse_url.php
1 <?php
2
3 require_once('library/HTML5/Parser.php');
4 require_once('library/HTMLPurifier.auto.php');
5
/**
 * array_walk() callback: prefixes a single tag with '#', in place.
 *
 * @param mixed $item Tag value, modified by reference.
 * @param mixed $k    Array key supplied by array_walk(); not used.
 */
function arr_add_hashes(&$item,$k) {
	$item = "#{$item}";
}
9
/**
 * Build an HTML "bookmark" snippet for a URL and echo it.
 *
 * Input is taken from $_GET:
 *   - binurl       hex-encoded URL (used in preference to 'url')
 *   - url          plain URL
 *   - title        optional caller-supplied title
 *   - description  optional caller-supplied description
 *   - tags         optional CSV list of tags; each is emitted with a '#' prefix
 *
 * Processing order:
 *   1. The 'parse_link' hook is called; if it fills in 'text', that text is
 *      echoed as the complete answer.
 *   2. If url, title and description were all supplied, the snippet is built
 *      from them without fetching anything.
 *   3. Otherwise the page is fetched and a title, a text excerpt and
 *      (from <meta property="...:image">) a preview image are extracted.
 *
 * Every exit path echoes its result and then calls killme(); nothing is
 * returned to the caller.
 *
 * @param mixed $a Passed by reference by the caller; not used in this function.
 */
function parse_url_content(&$a) {

	// Initialise everything that is read further down, so that no path
	// touches an undefined variable.
	$text     = '';
	$title    = '';
	$image    = '';
	$str_tags = '';
	$domhead  = null;
	$dom      = null;

	if(x($_GET,'binurl'))
		$url = trim(hex2bin($_GET['binurl']));
	else
		$url = (isset($_GET['url'])) ? trim($_GET['url']) : '';

	if(! empty($_GET['title']))
		$title = strip_tags(trim($_GET['title']));

	if(! empty($_GET['description']))
		$text = strip_tags(trim($_GET['description']));

	if(! empty($_GET['tags'])) {
		// Tags end up in the echoed HTML, so they get the same strip_tags()
		// treatment as title and description.
		$arr_tags = array_map('strip_tags', str_getcsv($_GET['tags']));
		if(count($arr_tags)) {
			array_walk($arr_tags,'arr_add_hashes');
			$str_tags = '<br />' . implode(' ',$arr_tags) . '<br />';
		}
	}

	logger('parse_url: ' . $url);

	// NOTE(review): $url and $title are inserted into this template without
	// attribute/HTML escaping - confirm the consumer of this output copes
	// with quotes and angle brackets in them.
	$template = "<br /><a class=\"bookmark\" href=\"%s\" >%s</a>%s<br />";

	// Give plugins the first chance to produce the complete answer.
	$arr = array('url' => $url, 'text' => '');

	call_hooks('parse_link', $arr);

	if(strlen($arr['text'])) {
		echo $arr['text'];
		killme();
	}

	// Everything was supplied by the caller: answer without fetching the page.
	if($url && $title && $text) {

		$text = '<br /><br /><blockquote>' . $text . '</blockquote><br />';
		$title = str_replace(array("\r","\n"),array('',''),$title);

		$result = sprintf($template,$url,($title) ? $title : $url,$text) . $str_tags;

		logger('parse_url (unparsed): returns: ' . $result);

		echo $result;
		killme();
	}

	if($url) {
		$s = fetch_url($url);
	} else {
		echo '';
		killme();
	}

	// Nothing could be fetched: fall back to a bare link.
	if(! $s) {
		echo sprintf($template,$url,$url,'') . $str_tags;
		killme();
	}

	// Parse the raw <head> separately. It is the only place the <meta> tags
	// are read from (see the meta pass below), and $s is run through
	// HTMLPurifier before the main parse.
	$matches = array();
	if(preg_match('/\<head(.*?)\>(.*?)\<\/head\>/ism',$s,$matches)) {
		try {
			$domhead = HTML5_Parser::parse($matches[2]);
		} catch (DOMException $e) {
			logger('parse_url: parse error: ' . $e);
		}
		if($domhead)
			logger('parsed header');
	}

	// Cheap title extraction from the raw markup: at most 64 bytes after
	// the opening tag, cut at the first '<'.
	if(! $title) {
		$title_pos = strpos($s,'<title>');
		// strpos() returns 0 for a match at the very start, so compare
		// against false explicitly.
		if($title_pos !== false) {
			$title = substr($s,$title_pos + 7,64);
			$cut = strpos($title,'<');
			if($cut !== false)
				$title = strip_tags(substr($title,0,$cut));
		}
	}

	$config = HTMLPurifier_Config::createDefault();
	$config->set('Cache.DefinitionImpl', null);
	$purifier = new HTMLPurifier($config);
	$s = $purifier->purify($s);

	try {
		$dom = HTML5_Parser::parse($s);
	} catch (DOMException $e) {
		logger('parse_url: parse error: ' . $e);
	}

	if(! $dom) {
		echo sprintf($template,$url,$url,'') . $str_tags;
		killme();
	}

	// A <title> element in the parsed document takes precedence over any
	// title found so far; only the first one is used.
	$items = $dom->getElementsByTagName('title');

	if($items) {
		foreach($items as $item) {
			$title = trim($item->textContent);
			break;
		}
	}

	if(! $text) {

		// First choice: a paragraph of at least 100 bytes inside a <div>
		// whose class mentions "article" or "content".
		$divs = $dom->getElementsByTagName('div');
		if($divs) {
			foreach($divs as $div) {
				$class = $div->getAttribute('class');
				if($class && (stristr($class,'article') || stristr($class,'content'))) {
					$items = $div->getElementsByTagName('p');
					if($items) {
						foreach($items as $item) {
							$text = $item->textContent;
							if(stristr($text,'<script')) {
								$text = '';
								continue;
							}
							$text = strip_tags($text);
							if(strlen($text) < 100) {
								$text = '';
								continue;
							}
							$text = substr($text,0,250) . '...' ;
							break;
						}
					}
				}
				if($text)
					break;
			}
		}

		// Second choice: any sufficiently long paragraph in the document.
		if(! $text) {
			$items = $dom->getElementsByTagName('p');
			if($items) {
				foreach($items as $item) {
					$text = $item->textContent;
					if(stristr($text,'<script')) {
						// Reset, as the div pass does, so a rejected
						// paragraph can never leak out as the result.
						$text = '';
						continue;
					}
					$text = strip_tags($text);
					if(strlen($text) < 100) {
						$text = '';
						continue;
					}
					$text = substr($text,0,250) . '...' ;
					break;
				}
			}
		}
	}

	// Last resort: <meta property="...:description"> / "...:image" from the
	// raw head. $domhead is null when no <head> was found or it failed to
	// parse, so it must be checked before use.
	if(! $text) {
		logger('parsing meta');
		$items = ($domhead) ? $domhead->getElementsByTagName('meta') : null;
		if($items) {
			foreach($items as $item) {
				$property = $item->getAttribute('property');
				if(! $property)
					continue;

				if(stristr($property,':description')) {

					$text = $item->getAttribute('content');
					if(stristr($text,'<script')) {
						$text = '';
						continue;
					}
					$text = strip_tags($text);

					$text = substr($text,0,250) . '...' ;
				}

				if(stristr($property,':image')) {

					$img_url = $item->getAttribute('content');
					// Check the image value itself (the original tested
					// $text here).
					if(stristr($img_url,'<script'))
						continue;

					$img_url = strip_tags($img_url);

					// $image is only replaced once the candidate has been
					// fetched and validated, so a failed or non-image
					// candidate neither leaks a raw URL into the output
					// nor discards an image that was already accepted.
					$i = fetch_url($img_url);
					if($i) {
						require_once('include/Photo.php');
						$ph = new Photo($i);
						if($ph->is_valid()) {
							if($ph->getWidth() > 300 || $ph->getHeight() > 300) {
								$ph->scaleImage(300);
								$new_width = $ph->getWidth();
								$new_height = $ph->getHeight();
								$image = '<br /><br /><img height="' . $new_height . '" width="' . $new_width . '" src="' .$img_url . '" alt="photo" />';
							}
							else
								$image = '<br /><br /><img src="' . $img_url . '" alt="photo" />';
						}
					}
				}
			}
		}
	}

	if(strlen($text)) {
		$text = '<br /><br /><blockquote>' . $text . '</blockquote><br />';
	}

	if($image) {
		$text = $image . '<br />' . $text;
	}
	$title = str_replace(array("\r","\n"),array('',''),$title);

	$result = sprintf($template,$url,($title) ? $title : $url,$text) . $str_tags;

	logger('parse_url: returns: ' . $result);

	echo $result;
	killme();
}