]> git.mxchange.org Git - friendica.git/blob - mod/parse_url.php
Supporting Open Graph and Dublin Core when displaying a single item
[friendica.git] / mod / parse_url.php
1 <?php
2 /* To-Do
3 https://developers.google.com/+/plugins/snippet/
4
5 <meta itemprop="name" content="Toller Titel">
6 <meta itemprop="description" content="Eine tolle Beschreibung">
7 <meta itemprop="image" content="http://maple.libertreeproject.org/images/tree-icon.png">
8
9 <body itemscope itemtype="http://schema.org/Product">
10   <h1 itemprop="name">Shiny Trinket</h1>
11   <img itemprop="image" src="{image-url}" />
12   <p itemprop="description">Shiny trinkets are shiny.</p>
13 </body>
14 */
15
if(!function_exists('deletenode')) {
        /**
         * Remove every element with the given tag name from a DOM document.
         *
         * The name is inserted into the XPath expression "//<name>", so all
         * matching elements are removed regardless of their nesting depth.
         *
         * @param DOMDocument $doc  Document to modify in place
         * @param string      $node Element name to strip, e.g. "script"
         */
        function deletenode(&$doc, $node)
        {
                $finder = new DomXPath($doc);
                $matches = $finder->query("//".$node);

                foreach ($matches as $element) {
                        $parent = $element->parentNode;
                        $parent->removeChild($element);
                }
        }
}
25
/**
 * Turn a possibly relative URL into an absolute one.
 *
 * URLs that already carry a scheme are returned unchanged. Protocol relative
 * URLs ("//host/path") keep their own host and only borrow the scheme of the
 * base. Everything else is attached to scheme, host and port of the base.
 * Note that a path without a leading slash is resolved against the root of
 * the base host, not against the directory of the base URL.
 *
 * @param string $url    URL to complete
 * @param string $scheme Absolute base URL providing scheme, host and port
 * @return string The completed URL, or $url unchanged when either argument
 *                cannot be parsed or the base has no scheme/host
 */
function completeurl($url, $scheme) {
        $urlarr = parse_url($url);

        // parse_url() returns false on seriously malformed input
        if (!is_array($urlarr) || isset($urlarr["scheme"]))
                return($url);

        $schemearr = parse_url($scheme);

        // Without scheme and host in the base there is nothing to build upon
        if (!is_array($schemearr) || !isset($schemearr["scheme"]) || !isset($schemearr["host"]))
                return($url);

        // Protocol relative URLs bring their own host (and port)
        if (isset($urlarr["host"]))
                $authority = $urlarr;
        else
                $authority = $schemearr;

        $complete = $schemearr["scheme"]."://".$authority["host"];

        if (isset($authority["port"]) && ($authority["port"] != ""))
                $complete .= ":".$authority["port"];

        // The path is missing for URLs like "?a=1" or "#top"
        $path = (isset($urlarr["path"]) ? $urlarr["path"] : "");

        if (strpos($path, '/') !== 0)
                $complete .= '/';

        $complete .= $path;

        if (isset($urlarr["query"]) && ($urlarr["query"] != ""))
                $complete .= "?".$urlarr["query"];

        if (isset($urlarr["fragment"]) && ($urlarr["fragment"] != ""))
                $complete .= "#".$urlarr["fragment"];

        return($complete);
}
52
/**
 * Fetch a page and collect the data needed to render a link preview.
 *
 * The page is requested with cURL (3 second timeout, redirects are not
 * followed by cURL itself). HTTP redirects and <meta http-equiv="refresh">
 * targets are followed by calling this function again, at most 10 levels
 * deep, so that redirect loops terminate.
 *
 * The title is read from <title> and may be overridden by the "fulltitle" and
 * "dc.title" meta tags and finally by "og:title". The description is read
 * from the "description"/"dc.description" meta tags and "og:description"; if
 * none is present it is guessed from div.article, div.content or <p> content
 * and cut to 350 characters. When "og:image" is present it is the only image
 * reported, otherwise every <img> larger than 150x150 pixels is reported with
 * dimensions scaled down to fit into 300x300.
 *
 * @param string $url   Address of the page to inspect
 * @param int    $count Nesting level, used internally to stop redirect loops
 * @return array Possibly empty array with the optional keys "title", "text"
 *               and "images" (list of arrays with "src", "width", "height")
 */
function parseurl_getsiteinfo($url, $count = 1) {
        $siteinfo = array();

        // Redirects or meta refreshs pointing at each other would recurse forever
        if ($count > 10)
                return($siteinfo);

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, 1);
        curl_setopt($ch, CURLOPT_NOBODY, 0);
        curl_setopt($ch, CURLOPT_TIMEOUT, 3);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        //curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch,CURLOPT_USERAGENT,'Opera/9.64(Windows NT 5.1; U; de) Presto/2.1.1');

        // Because of CURLOPT_HEADER this contains the response headers and the body
        $header = curl_exec($ch);
        $curl_info = @curl_getinfo($ch);
        curl_close($ch);

        // The transfer failed (timeout, DNS error, ...): there is nothing to parse
        if (!is_string($header) || ($header === ""))
                return($siteinfo);

        $pos = strpos($header, "\r\n\r\n");

        $http_code = (isset($curl_info['http_code']) ? intval($curl_info['http_code']) : 0);

        if (in_array($http_code, array(301, 302, 303, 307, 308), true)) {
                $redirect = "";

                if (isset($curl_info['redirect_url']) && ($curl_info['redirect_url'] != ""))
                        $redirect = $curl_info['redirect_url'];
                else {
                        // Fall back to the "Location" header, looking at the header part only
                        $headerpart = (($pos === false) ? $header : substr($header, 0, $pos));
                        if (preg_match('/^Location:\s*(\S+)/mi', $headerpart, $matches))
                                $redirect = completeurl(trim($matches[1]), $url);
                }

                if ($redirect != "")
                        return(parseurl_getsiteinfo($redirect, $count + 1));
        }

        // Fetch the first mentioned charset. Can be in body or header.
        // The value may be quoted, like in <meta charset="utf-8">
        $charset = "";
        if (preg_match('/charset=\s*["\']?([a-z0-9_:.\-]+)/i', $header, $matches))
                $charset = trim($matches[1]);

        if ($charset == "")
                $charset = "utf-8";

        if ($pos)
                $body = trim(substr($header, $pos));
        else
                $body = $header;

        // An empty document cannot be loaded into the DOM parser
        if ($body === "")
                return($siteinfo);

        $body = mb_convert_encoding($body, "UTF-8", $charset);
        $body = mb_convert_encoding($body, 'HTML-ENTITIES', "UTF-8");

        $doc = new DOMDocument();
        @$doc->loadHTML($body);

        // Remove elements whose content should not end up in the guessed description
        deletenode($doc, 'style');
        deletenode($doc, 'script');
        deletenode($doc, 'option');
        deletenode($doc, 'h1');
        deletenode($doc, 'h2');
        deletenode($doc, 'h3');
        deletenode($doc, 'h4');
        deletenode($doc, 'h5');
        deletenode($doc, 'h6');
        deletenode($doc, 'ol');
        deletenode($doc, 'ul');

        $xpath = new DomXPath($doc);

        // Follow a meta refresh that names a target URL
        $list = $xpath->query("//meta[@content]");
        foreach ($list as $node) {
                $attr = array();
                if ($node->attributes->length)
                        foreach ($node->attributes as $attribute)
                                $attr[$attribute->name] = $attribute->value;

                if (isset($attr["http-equiv"]) && ($attr["http-equiv"] == 'refresh')) {
                        $pathinfo = explode(";", $attr["content"]);
                        $content = "";
                        foreach ($pathinfo AS $value) {
                                if (substr(strtolower($value), 0, 4) == "url=")
                                        $content = substr($value, 4);
                        }
                        if ($content != "")
                                return(parseurl_getsiteinfo(completeurl($content, $url), $count + 1));
                }
        }

        //$list = $xpath->query("head/title");
        $list = $xpath->query("//title");
        foreach ($list as $node)
                $siteinfo["title"] =  html_entity_decode($node->nodeValue, ENT_QUOTES, "UTF-8");

        //$list = $xpath->query("head/meta[@name]");
        $list = $xpath->query("//meta[@name]");
        foreach ($list as $node) {
                $attr = array();
                if ($node->attributes->length)
                        foreach ($node->attributes as $attribute)
                                $attr[$attribute->name] = $attribute->value;

                // A meta tag is not required to have a content attribute
                $content = (isset($attr["content"]) ? $attr["content"] : "");
                $content = html_entity_decode($content, ENT_QUOTES, "UTF-8");

                switch (strtolower($attr["name"])) {
                        case "fulltitle":
                        case "dc.title":
                                $siteinfo["title"] = $content;
                                break;
                        case "description":
                        case "dc.description":
                                $siteinfo["text"] = $content;
                                break;
                }
        }

        //$list = $xpath->query("head/meta[@property]");
        $list = $xpath->query("//meta[@property]");
        foreach ($list as $node) {
                $attr = array();
                if ($node->attributes->length)
                        foreach ($node->attributes as $attribute)
                                $attr[$attribute->name] = $attribute->value;

                $content = (isset($attr["content"]) ? $attr["content"] : "");
                $content = html_entity_decode($content, ENT_QUOTES, "UTF-8");

                switch (strtolower($attr["property"])) {
                        case "og:image":
                                $siteinfo["image"] = $content;
                                break;
                        case "og:title":
                                $siteinfo["title"] = $content;
                                break;
                        case "og:description":
                                $siteinfo["text"] = $content;
                                break;
                }
        }

        if (!isset($siteinfo["image"]) || ($siteinfo["image"] == "")) {
                // No Open Graph image: look for sufficiently large images on the page
                $list = $xpath->query("//img[@src]");
                foreach ($list as $node) {
                        $attr = array();
                        if ($node->attributes->length)
                                foreach ($node->attributes as $attribute)
                                        $attr[$attribute->name] = $attribute->value;

                        $src = completeurl($attr["src"], $url);
                        $photodata = @getimagesize($src);

                        if (($photodata) && ($photodata[0] > 150) && ($photodata[1] > 150)) {
                                // Scale the reported dimensions so that they fit into 300x300
                                if ($photodata[0] > 300) {
                                        $photodata[1] = round($photodata[1] * (300 / $photodata[0]));
                                        $photodata[0] = 300;
                                }
                                if ($photodata[1] > 300) {
                                        $photodata[0] = round($photodata[0] * (300 / $photodata[1]));
                                        $photodata[1] = 300;
                                }
                                $siteinfo["images"][] = array("src"=>$src,
                                                                "width"=>$photodata[0],
                                                                "height"=>$photodata[1]);
                        }
                }
        } else {
                $src = completeurl($siteinfo["image"], $url);

                unset($siteinfo["image"]);

                $photodata = @getimagesize($src);

                if (($photodata) && ($photodata[0] > 10) && ($photodata[1] > 10))
                        $siteinfo["images"][] = array("src"=>$src,
                                                        "width"=>$photodata[0],
                                                        "height"=>$photodata[1]);
        }

        if (!isset($siteinfo["text"]) || ($siteinfo["text"] == "")) {
                $text = "";

                $list = $xpath->query("//div[@class='article']");
                foreach ($list as $node)
                        if (strlen($node->nodeValue) > 40)
                                $text .= " ".trim($node->nodeValue);

                if ($text == "") {
                        $list = $xpath->query("//div[@class='content']");
                        foreach ($list as $node)
                                if (strlen($node->nodeValue) > 40)
                                        $text .= " ".trim($node->nodeValue);
                }

                // If no text was found then take the paragraph content
                if ($text == "") {
                        $list = $xpath->query("//p");
                        foreach ($list as $node)
                                if (strlen($node->nodeValue) > 40)
                                        $text .= " ".trim($node->nodeValue);
                }

                if ($text != "") {
                        $text = trim(str_replace(array("\n", "\r"), array(" ", " "), $text));

                        // Collapse runs of blanks into a single one
                        while (strpos($text, "  "))
                                $text = trim(str_replace("  ", " ", $text));

                        // Cut by characters, a byte based cut could split a multibyte character
                        $siteinfo["text"] = html_entity_decode(mb_substr($text, 0, 350, "UTF-8"), ENT_QUOTES, "UTF-8").'...';
                }
        }

        return($siteinfo);
}
261
/**
 * array_walk() callback that turns a tag name into a hashtag.
 *
 * @param string $item Array element, modified in place by prepending "#"
 * @param mixed  $k    Key of the element (not used)
 */
function arr_add_hashes(&$item,$k) {
        $item = "#{$item}";
}
265
/**
 * Prepare a value for insertion into the generated snippet.
 *
 * @param string $value    Raw value
 * @param bool   $textmode True when BBCode is generated, false for HTML
 * @return string The unchanged value in text mode, the HTML escaped value
 *                otherwise (already existing entities are not encoded twice)
 */
function parse_url_escape($value, $textmode) {
        if($textmode)
                return($value);

        return(htmlspecialchars($value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false));
}

/**
 * Module entry point: print a bookmark snippet for a URL and terminate.
 *
 * Reads the request parameters "url" (or the hex encoded "binurl") and the
 * optional "title", "description" and "tags" (comma separated). The snippet
 * is BBCode when a local user has the "richtext" feature disabled, HTML
 * otherwise. Processing order:
 *  1. the "parse_link" hook may supply the complete output,
 *  2. if URL, title and description were all passed in, they are used as is,
 *  3. otherwise the page is fetched with parseurl_getsiteinfo() and title,
 *     description and images are taken from there.
 * Every path ends by echoing the result and calling killme().
 *
 * @param object $a Application object (not used here)
 */
function parse_url_content(&$a) {

        $text = null;
        $title = '';
        $str_tags = '';

        $textmode = false;

        if(local_user() && (! feature_enabled(local_user(),'richtext')))
                $textmode = true;

        //if($textmode)
        $br = (($textmode) ? "\n" : '<br />');

        if(x($_GET,'binurl'))
                $url = trim(hex2bin($_GET['binurl']));
        else
                $url = trim(isset($_GET['url']) ? $_GET['url'] : '');

        if(!empty($_GET['title']))
                $title = strip_tags(trim($_GET['title']));

        if(!empty($_GET['description']))
                $text = strip_tags(trim($_GET['description']));

        if(!empty($_GET['tags'])) {
                $arr_tags = str_getcsv($_GET['tags']);
                if(count($arr_tags)) {
                        array_walk($arr_tags,'arr_add_hashes');
                        // Tags come straight from the request, escape them for HTML output
                        foreach($arr_tags as $key => $tag)
                                $arr_tags[$key] = parse_url_escape($tag, $textmode);
                        $str_tags = $br . implode(' ',$arr_tags) . $br;
                }
        }

        logger('parse_url: ' . $url);

        if($textmode)
                $template = $br . '[bookmark=%s]%s[/bookmark]%s' . $br;
        else
                $template = "<br /><a class=\"bookmark\" href=\"%s\" >%s</a>%s<br />";

        $arr = array('url' => $url, 'text' => '');

        call_hooks('parse_link', $arr);

        if(strlen($arr['text'])) {
                echo $arr['text'];
                killme();
        }

        // The URL as it is placed into the snippet (href attribute or bookmark tag)
        $out_url = parse_url_escape($url, $textmode);

        if($url && $title && $text) {

                if($textmode)
                        $text = $br . '[quote]' . trim($text) . '[/quote]' . $br;
                else
                        $text = '<br /><blockquote>' . parse_url_escape(trim($text), $textmode) . '</blockquote><br />';

                $title = str_replace(array("\r","\n"),array('',''),$title);

                $result = sprintf($template,$out_url,($title) ? parse_url_escape($title, $textmode) : $out_url,$text) . $str_tags;

                logger('parse_url (unparsed): returns: ' . $result);

                echo $result;
                killme();
        }

        $siteinfo = parseurl_getsiteinfo($url);

        if(!isset($siteinfo["title"]) || ($siteinfo["title"] == "")) {
                echo sprintf($template,$out_url,$out_url,'') . $str_tags;
                killme();
        } else {
                $text = (isset($siteinfo["text"]) ? $siteinfo["text"] : '');
                $title = $siteinfo["title"];
        }

        $image = "";

        if(isset($siteinfo["images"]) && is_array($siteinfo["images"]) && (count($siteinfo["images"]) > 0)){
                /* Execute below code only if image is present in siteinfo */

                $total_images = 0;
                $max_images = get_config('system','max_bookmark_images');
                if($max_images === false)
                        $max_images = 2;
                else
                        $max_images = intval($max_images);

                foreach ($siteinfo["images"] as $imagedata) {
                        if($textmode)
                                $image .= '[img='.$imagedata["width"].'x'.$imagedata["height"].']'.$imagedata["src"].'[/img]' . "\n";
                        else
                                $image .= '<img height="'.$imagedata["height"].'" width="'.$imagedata["width"].'" src="'.parse_url_escape($imagedata["src"], $textmode).'" alt="photo" /><br />';
                        $total_images ++;
                        // Stop once the configured number of images is reached (0 means no limit)
                        if($max_images && $total_images >= $max_images)
                                break;
                }
        }

        if(strlen($text)) {
                if($textmode)
                        $text = $br.'[quote]'.trim($text).'[/quote]'.$br ;
                else
                        $text = '<br /><blockquote>'.parse_url_escape(trim($text), $textmode).'</blockquote><br />';
        }

        if($image) {
                $text = $br.$br.$image.$text;
        }
        $title = str_replace(array("\r","\n"),array('',''),$title);

        $result = sprintf($template,$out_url,($title) ? parse_url_escape($title, $textmode) : $out_url,$text) . $str_tags;

        logger('parse_url: returns: ' . $result);

        echo trim($result);
        killme();
}