]> git.mxchange.org Git - friendica.git/blob - mod/parse_url.php
Merge remote branch 'upstream/master'
[friendica.git] / mod / parse_url.php
1 <?php
// Only define the helper when no other module has already provided it.
if (function_exists('deletenode') === false) {
	/**
	 * Strip every element with the given tag name out of a DOM document.
	 *
	 * @param DOMDocument $doc  document that is modified in place
	 * @param string      $node tag name; it is used as the XPath expression "//<name>"
	 */
	function deletenode(&$doc, $node)
	{
		$finder = new DomXPath($doc);
		$expression = "//" . $node;

		// Detach each matching element from whatever parent currently holds it.
		foreach ($finder->query($expression) as $element) {
			$parent = $element->parentNode;
			$parent->removeChild($element);
		}
	}
}
11
/**
 * Turn a possibly relative URL into an absolute one.
 *
 * A URL that already carries a scheme is returned unchanged. Otherwise the
 * missing parts are taken from the base URL passed in $scheme:
 *  - "//host/path"  : only the scheme is borrowed from the base
 *  - "/path"        : scheme, host and port are borrowed from the base
 *  - "path"         : additionally resolved against the directory of the base path
 *  - "" / "?q" / "#f": the base path is used
 *
 * Dot segments ("./", "../") are not normalised.
 *
 * @param string $url    URL to complete
 * @param string $scheme absolute base URL (despite the name, a full URL, not just a scheme)
 * @return string the completed URL, or $url unchanged when either argument
 *                cannot be parsed or the base lacks a scheme or host
 */
function completeurl($url, $scheme) {
	$urlarr = parse_url($url);

	// Seriously malformed input cannot be completed; hand it back untouched.
	if (!is_array($urlarr))
		return($url);

	if (isset($urlarr["scheme"]))
		return($url);

	$schemearr = parse_url($scheme);

	// Without a usable base there is nothing to borrow from.
	if (!is_array($schemearr) || !isset($schemearr["scheme"]) || !isset($schemearr["host"]))
		return($url);

	$path = (isset($urlarr["path"]) ? $urlarr["path"] : "");

	if (isset($urlarr["host"])) {
		// Protocol-relative URL: keep its own host and port, borrow only the scheme.
		$complete = $schemearr["scheme"]."://".$urlarr["host"];

		if (isset($urlarr["port"]))
			$complete .= ":".$urlarr["port"];
	} else {
		$complete = $schemearr["scheme"]."://".$schemearr["host"];

		if (isset($schemearr["port"]) && ($schemearr["port"] != ""))
			$complete .= ":".$schemearr["port"];

		$basepath = ((isset($schemearr["path"]) && ($schemearr["path"] != "")) ? $schemearr["path"] : "/");

		if ($path == "") {
			// Query-only or fragment-only reference: it refers to the base document.
			$path = $basepath;
		} elseif (substr($path, 0, 1) != "/") {
			// Relative path: resolve against the directory part of the base path,
			// otherwise it would be glued directly onto the host name.
			$slash = strrpos($basepath, "/");
			$dir = (($slash === false) ? "/" : substr($basepath, 0, $slash + 1));
			$path = $dir.$path;
		}
	}

	$complete .= $path;

	if (isset($urlarr["query"]) && ($urlarr["query"] != ""))
		$complete .= "?".$urlarr["query"];

	if (isset($urlarr["fragment"]) && ($urlarr["fragment"] != ""))
		$complete .= "#".$urlarr["fragment"];

	return($complete);
}
35
/**
 * Download a web page and extract preview information from its HTML.
 *
 * The page is fetched with cURL (3 second timeout, headers included in the
 * response), converted to UTF-8 and parsed with DOMDocument. Style, script,
 * option, heading and list elements are removed before any text is read.
 *
 * Keys that may be present in the result:
 *  - "title"  : from <title>, overridden by the fulltitle / dc.title / og:title metas
 *  - "text"   : from the description / dc.description / og:description metas, else a
 *               summary of up to 350 characters built from div.article, div.content
 *               or <p> elements, followed by "..."
 *  - "image"  : raw content of the og:image meta
 *  - "images" : list of array("src", "width", "height"); either the og:image
 *               (when larger than 10x10) or every <img> larger than 150x150,
 *               with the reported size scaled down to fit within 300x300
 *
 * Note: getimagesize() is called on every candidate image URL, i.e. each one
 * triggers a further remote request.
 *
 * @param string $url address of the page to inspect
 * @return array the collected information; empty when the page could not be fetched
 */
function parseurl_getsiteinfo($url) {
	$siteinfo = array();

	$ch = curl_init();
	curl_setopt($ch, CURLOPT_URL, $url);
	curl_setopt($ch, CURLOPT_HEADER, 1);
	curl_setopt($ch, CURLOPT_NOBODY, 0);
	curl_setopt($ch, CURLOPT_TIMEOUT, 3);
	curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
	curl_setopt($ch,CURLOPT_USERAGENT,'Opera/9.64(Windows NT 5.1; U; de) Presto/2.1.1');

	$header = curl_exec($ch);
	curl_close($ch);

	// curl_exec() returns false on failure; there is nothing to parse then.
	if (!is_string($header) || ($header == ""))
		return($siteinfo);

	// Fetch the first mentioned charset. Can be in body or header.
	// The optional quote is skipped explicitly so that charset="utf-8"
	// yields "utf-8" and not an empty string.
	if (preg_match('/charset=["\']?([a-z0-9._:-]+)/i', $header, $matches))
		$charset = $matches[1];
	else
		$charset = "utf-8";

	// The response headers end at the first blank line.
	$pos = strpos($header, "\r\n\r\n");

	if ($pos !== false)
		$body = trim(substr($header, $pos));
	else
		$body = $header;

	$body = mb_convert_encoding($body, "UTF-8", $charset);
	// Non-ASCII characters become entities so that loadHTML() cannot misread the encoding.
	$body = mb_convert_encoding($body, 'HTML-ENTITIES', "UTF-8");

	// An empty document cannot contribute anything.
	if (!is_string($body) || (trim($body) == ""))
		return($siteinfo);

	$doc = new DOMDocument();
	@$doc->loadHTML($body);

	// Drop elements whose content should not end up in the text summary.
	$unwanted = array('style', 'script', 'option', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'ul');
	foreach ($unwanted as $tag)
		deletenode($doc, $tag);

	$xpath = new DomXPath($doc);

	$list = $xpath->query("//title");
	foreach ($list as $node)
		$siteinfo["title"] =  html_entity_decode($node->nodeValue, ENT_QUOTES, "UTF-8");

	$list = $xpath->query("//meta[@name]");
	foreach ($list as $node) {
		$attr = array();
		if ($node->attributes->length)
			foreach ($node->attributes as $attribute)
				$attr[$attribute->name] = $attribute->value;

		// A meta element without a content attribute counts as empty content.
		$attr["content"] = (isset($attr["content"]) ? html_entity_decode($attr["content"], ENT_QUOTES, "UTF-8") : "");

		switch (strtolower($attr["name"])) {
			case "fulltitle":
				$siteinfo["title"] = $attr["content"];
				break;
			case "description":
				$siteinfo["text"] = $attr["content"];
				break;
			case "dc.title":
				$siteinfo["title"] = $attr["content"];
				break;
			case "dc.description":
				$siteinfo["text"] = $attr["content"];
				break;
		}
	}

	$list = $xpath->query("//meta[@property]");
	foreach ($list as $node) {
		$attr = array();
		if ($node->attributes->length)
			foreach ($node->attributes as $attribute)
				$attr[$attribute->name] = $attribute->value;

		$attr["content"] = (isset($attr["content"]) ? html_entity_decode($attr["content"], ENT_QUOTES, "UTF-8") : "");

		switch (strtolower($attr["property"])) {
			case "og:image":
				$siteinfo["image"] = $attr["content"];
				break;
			case "og:title":
				$siteinfo["title"] = $attr["content"];
				break;
			case "og:description":
				$siteinfo["text"] = $attr["content"];
				break;
		}
	}

	if (!isset($siteinfo["image"]) || ($siteinfo["image"] == "")) {
		// No og:image given: collect every sufficiently large image of the page.
		$list = $xpath->query("//img[@src]");
		foreach ($list as $node) {
			$attr = array();
			if ($node->attributes->length)
				foreach ($node->attributes as $attribute)
					$attr[$attribute->name] = $attribute->value;

			$src = completeurl($attr["src"], $url);
			$photodata = getimagesize($src);

			// getimagesize() returns false for unreachable or non-image resources.
			if (!is_array($photodata))
				continue;

			if (($photodata[0] > 150) and ($photodata[1] > 150)) {
				// Scale the reported size down to at most 300x300, keeping the aspect ratio.
				if ($photodata[0] > 300) {
					$photodata[1] = round($photodata[1] * (300 / $photodata[0]));
					$photodata[0] = 300;
				}
				if ($photodata[1] > 300) {
					$photodata[0] = round($photodata[0] * (300 / $photodata[1]));
					$photodata[1] = 300;
				}
				$siteinfo["images"][] = array("src"=>$src,
								"width"=>$photodata[0],
								"height"=>$photodata[1]);
			}

		}
	} else {
		$src = completeurl($siteinfo["image"], $url);
		$photodata = getimagesize($src);

		if (is_array($photodata) and ($photodata[0] > 10) and ($photodata[1] > 10))
			$siteinfo["images"][] = array("src"=>$src,
							"width"=>$photodata[0],
							"height"=>$photodata[1]);
	}

	if (!isset($siteinfo["text"]) || ($siteinfo["text"] == "")) {
		$text = "";

		// Try progressively less specific containers until some text is found.
		$list = $xpath->query("//div[@class='article']");
		foreach ($list as $node)
			if (strlen($node->nodeValue) > 40)
				$text .= " ".trim($node->nodeValue);

		if ($text == "") {
			$list = $xpath->query("//div[@class='content']");
			foreach ($list as $node)
				if (strlen($node->nodeValue) > 40)
					$text .= " ".trim($node->nodeValue);
		}

		// If no text was found then take the paragraph content
		if ($text == "") {
			$list = $xpath->query("//p");
			foreach ($list as $node)
				if (strlen($node->nodeValue) > 40)
					$text .= " ".trim($node->nodeValue);
		}

		if ($text != "") {
			$text = trim(str_replace(array("\n", "\r"), array(" ", " "), $text));

			// Collapse runs of spaces into a single space.
			while (strpos($text, "  ") !== false)
				$text = trim(str_replace("  ", " ", $text));

			// Cut by characters, not bytes, so a multibyte UTF-8 sequence is never split.
			$siteinfo["text"] = html_entity_decode(mb_substr($text, 0, 350, "UTF-8"), ENT_QUOTES, "UTF-8").'...';
		}
	}

	return($siteinfo);
}
208
/**
 * array_walk() callback that prefixes an element with "#".
 *
 * @param mixed $item element, modified in place
 * @param mixed $k    key of the element (not used)
 */
function arr_add_hashes(&$item,$k) {
	$prefixed = '#' . $item;
	$item = $prefixed;
}
212
/**
 * Request handler: build a bookmark snippet for a URL and echo it.
 *
 * GET parameters read:
 *  - binurl      : hex encoded URL (takes precedence over "url")
 *  - url         : URL to create the bookmark for
 *  - title       : optional title
 *  - description : optional description
 *  - tags        : optional comma separated tags, each emitted with a "#" prefix
 *
 * The snippet is BBCode when the local user has the "plaintext" setting
 * enabled, HTML otherwise. Processing order:
 *  1. "parse_link" hooks may supply the complete output in $arr['text'].
 *  2. If url, title and description were all given, they are used directly.
 *  3. Otherwise the page is fetched via parseurl_getsiteinfo(); when no title
 *     can be determined a plain link to the URL is emitted.
 *
 * Every output path ends with killme() (presumably terminates the request -
 * defined outside this file).
 *
 * @param object $a application object (not used here)
 */
function parse_url_content(&$a) {

	$text = null;
	$title = '';
	$str_tags = '';

	$textmode = false;
	if(local_user() && intval(get_pconfig(local_user(),'system','plaintext')))
		$textmode = true;

	$br = (($textmode) ? "\n" : '<br />');

	if(x($_GET,'binurl'))
		$url = trim(hex2bin($_GET['binurl']));
	else
		$url = (isset($_GET['url']) ? trim($_GET['url']) : '');

	if(!empty($_GET['title']))
		$title = strip_tags(trim($_GET['title']));

	if(!empty($_GET['description']))
		$text = strip_tags(trim($_GET['description']));

	if(!empty($_GET['tags'])) {
		$arr_tags = str_getcsv($_GET['tags']);
		if(count($arr_tags)) {
			array_walk($arr_tags,'arr_add_hashes');
			$str_tags = $br . implode(' ',$arr_tags) . $br;
		}
	}

	logger('parse_url: ' . $url);

	if($textmode)
		$template = $br . '[bookmark=%s]%s[/bookmark]%s' . $br;
	else
		$template = "<br /><a class=\"bookmark\" href=\"%s\" >%s</a>%s<br />";

	// In HTML mode the URL lands inside an attribute and as link text,
	// so it must not be able to break out of the markup.
	$out_url = (($textmode) ? $url : htmlspecialchars($url, ENT_QUOTES, 'UTF-8'));

	$arr = array('url' => $url, 'text' => '');

	call_hooks('parse_link', $arr);

	// A hook produced the complete result; emit it as is.
	if(strlen($arr['text'])) {
		echo $arr['text'];
		killme();
	}


	if($url && $title && $text) {

		if($textmode)
			$text = $br . '[quote]' . trim($text) . '[/quote]' . $br;
		else
			$text = '<br /><blockquote>' . trim($text) . '</blockquote><br />';

		$title = str_replace(array("\r","\n"),array('',''),$title);

		$result = sprintf($template,$out_url,($title) ? $title : $out_url,$text) . $str_tags;

		logger('parse_url (unparsed): returns: ' . $result);

		echo $result;
		killme();
	}

	$siteinfo = parseurl_getsiteinfo($url);

	if(!isset($siteinfo["title"]) || ($siteinfo["title"] == "")) {
		// Nothing usable was found: fall back to a plain link
		// instead of dumping the internal array to the client.
		echo sprintf($template,$out_url,$out_url,'') . $str_tags;
		killme();
	} else {
		$text = (isset($siteinfo["text"]) ? $siteinfo["text"] : '');
		$title = $siteinfo["title"];
	}

	// Title and text were scraped from a remote page and entity-decoded,
	// so they are plain text and have to be escaped for HTML output.
	if(!$textmode) {
		$text = htmlspecialchars($text, ENT_QUOTES, 'UTF-8');
		$title = htmlspecialchars($title, ENT_QUOTES, 'UTF-8');
	}

	$image = "";

	$images = ((isset($siteinfo["images"]) && is_array($siteinfo["images"])) ? $siteinfo["images"] : array());

	foreach ($images as $imagedata)
		if($textmode)
			$image .= '[img='.$imagedata["width"].'x'.$imagedata["height"].']'.$imagedata["src"].'[/img]';
		else
			$image .= '<img height="'.$imagedata["height"].'" width="'.$imagedata["width"].'" src="'.htmlspecialchars($imagedata["src"], ENT_QUOTES, 'UTF-8').'" alt="photo" />';

	if(strlen($text)) {
		if($textmode)
			$text = $br.'[quote]'.trim($text).'[/quote]'.$br ;
		else
			$text = '<br /><blockquote>'.trim($text).'</blockquote><br />';
	}

	if($image) {
		$text = $br.$br.$image.$text;
	}
	$title = str_replace(array("\r","\n"),array('',''),$title);

	$result = sprintf($template,$out_url,($title) ? $title : $out_url,$text) . $str_tags;

	logger('parse_url: returns: ' . $result);

	echo trim($result);
	killme();
}