git.mxchange.org Git - friendica.git/blob - mod/parse_url.php
parse_url: complete new code for fetching website information
[friendica.git] / mod / parse_url.php
1 <?php
if (!function_exists('deletenode')) {
        /**
         * Strip every element with the given tag name out of a DOM document.
         *
         * The document is modified in place; nothing is returned.
         *
         * @param DOMDocument $doc  document to clean up
         * @param string      $node element name, looked up anywhere in the tree via "//<name>"
         */
        function deletenode(&$doc, $node)
        {
                $finder = new DomXPath($doc);

                // Detach each match from whatever parent currently holds it.
                foreach ($finder->query("//" . $node) as $element) {
                        $parent = $element->parentNode;
                        $parent->removeChild($element);
                }
        }
}
11
/**
 * Fetch a web page and extract a title, a summary text and a preview image.
 *
 * The page is downloaded with cURL (headers included), converted to UTF-8,
 * parsed with DOMDocument and then inspected in this order:
 *   - <title>, then <meta name=...> (fulltitle, description, dc.title,
 *     dc.description), then <meta property=...> (og:image, og:title,
 *     og:description); later matches overwrite earlier ones,
 *   - if no image was found: every <img src> is fetched and one larger than
 *     200x200 is used,
 *   - if no text was found: the text of div.article, then div.content, then
 *     all <p> elements, cut to 350 characters with "..." appended.
 *
 * @param string $url address of the page to inspect
 * @return array may contain the keys "title", "text" and "image"; keys for
 *               which nothing was found are absent, and the array is empty
 *               when the page could not be fetched
 */
function parseurl_getsiteinfo($url) {
        $siteinfo = array();

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, 1);
        curl_setopt($ch, CURLOPT_NOBODY, 0);
        curl_setopt($ch, CURLOPT_TIMEOUT, 3);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch,CURLOPT_USERAGENT,'Opera/9.64(Windows NT 5.1; U; de) Presto/2.1.1');

        $response = curl_exec($ch);
        curl_close($ch);

        // curl_exec() returns false on failure; there is nothing to parse then.
        if (!is_string($response) || $response === '')
                return($siteinfo);

        // Split the raw response into header block and body. Compare against
        // false explicitly so that an offset of 0 is not mistaken for "not found".
        $pos = strpos($response, "\r\n\r\n");

        if ($pos !== false) {
                $header = substr($response, 0, $pos);
                $body = trim(substr($response, $pos));
        } else {
                $header = '';
                $body = $response;
        }

        // Take the charset from the HTTP headers first and fall back to a
        // declaration inside the document. Only characters that can occur in
        // an encoding name are captured, so trailing markup such as '" />'
        // never ends up in the encoding name.
        $charset = "utf-8";
        $charset_pattern = '/charset\s*=\s*["\']?([A-Za-z0-9][A-Za-z0-9._:\-]*)/i';
        if (preg_match($charset_pattern, $header, $matches) || preg_match($charset_pattern, $body, $matches))
                $charset = $matches[1];

        // An unknown encoding name makes mb_convert_encoding() return false
        // (older PHP) or throw a ValueError (PHP 8); treat the body as UTF-8 then.
        try {
                $converted = @mb_convert_encoding($body, "UTF-8", $charset);
        } catch (ValueError $e) {
                $converted = false;
        }

        if (!is_string($converted))
                $converted = mb_convert_encoding($body, "UTF-8", "UTF-8");

        // Turn all non-ASCII characters into numeric entities so that
        // loadHTML() does not have to guess the encoding. This replaces the
        // 'HTML-ENTITIES' pseudo encoding, which is deprecated since PHP 8.2.
        $body = mb_encode_numericentity($converted, array(0x80, 0x10FFFF, 0, 0x1FFFFF), "UTF-8");

        // loadHTML() rejects an empty document.
        if (trim($body) == "")
                return($siteinfo);

        $doc = new DOMDocument();
        @$doc->loadHTML($body);

        // Drop elements whose content should not end up in the summary text.
        $unwanted = array('style', 'script', 'option', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'ul');
        foreach ($unwanted as $tag)
                deletenode($doc, $tag);

        $xpath = new DomXPath($doc);

        $list = $xpath->query("head/title");
        foreach ($list as $node)
                $siteinfo["title"] =  html_entity_decode($node->nodeValue, ENT_QUOTES, "UTF-8");

        $list = $xpath->query("head/meta[@name]");
        foreach ($list as $node) {
                // getAttribute() yields "" for a missing attribute, so a <meta>
                // without "content" no longer reads an undefined index.
                $content = html_entity_decode($node->getAttribute("content"), ENT_QUOTES, "UTF-8");

                switch (strtolower($node->getAttribute("name"))) {
                        case "fulltitle":
                        case "dc.title":
                                $siteinfo["title"] = $content;
                                break;
                        case "description":
                        case "dc.description":
                                $siteinfo["text"] = $content;
                                break;
                }
        }

        $list = $xpath->query("head/meta[@property]");
        foreach ($list as $node) {
                $content = html_entity_decode($node->getAttribute("content"), ENT_QUOTES, "UTF-8");

                switch (strtolower($node->getAttribute("property"))) {
                        case "og:image":
                                $siteinfo["image"] = $content;
                                break;
                        case "og:title":
                                $siteinfo["title"] = $content;
                                break;
                        case "og:description":
                                $siteinfo["text"] = $content;
                                break;
                }
        }

        if (!isset($siteinfo["image"]) || $siteinfo["image"] == "") {
                require_once('include/Photo.php');
                $list = $xpath->query("//img[@src]");
                foreach ($list as $node) {
                        $src = $node->getAttribute("src");
                        if ($src == "")
                                continue;

                        // guess mimetype from headers or filename
                        $type = guess_image_type($src,true);

                        // Skip images that could not be downloaded instead of
                        // building a Photo from an empty result.
                        $i = fetch_url($src);
                        if (!$i)
                                continue;

                        $ph = new Photo($i, $type);

                        // Every candidate is checked, so the last sufficiently
                        // large image in document order is the one that is kept.
                        if($ph->is_valid() and ($ph->getWidth() > 200) and ($ph->getHeight() > 200))
                                $siteinfo["image"] = $src;
                }
        }

        if (!isset($siteinfo["text"]) || $siteinfo["text"] == "") {
                $text = "";

                // Try the most specific containers first; if none of them
                // yields any text then take the paragraph content.
                $queries = array("//div[@class='article']", "//div[@class='content']", "//p");
                foreach ($queries as $query) {
                        foreach ($xpath->query($query) as $node)
                                $text .= " ".trim($node->nodeValue);

                        // Fold line breaks and runs of spaces into single spaces.
                        $text = trim(preg_replace('/[ \r\n]+/', ' ', $text));

                        if ($text != "")
                                break;
                }

                // Cut by characters, not bytes, so that a multibyte UTF-8
                // sequence is never split in the middle.
                if ($text != "")
                        $siteinfo["text"] = html_entity_decode(mb_substr($text, 0, 350, "UTF-8"), ENT_QUOTES, "UTF-8").'...';
        }

        return($siteinfo);
}
161
/**
 * array_walk() callback that turns a tag into a hashtag by prefixing '#'.
 *
 * @param string $item array element, modified in place
 * @param mixed  $k    array key (unused, required by the array_walk signature)
 */
function arr_add_hashes(&$item,$k) {
        $item = implode('', array('#', $item));
}
165
/**
 * Module entry point: print a bookmark snippet for the URL given in the request.
 *
 * Request parameters read from $_GET:
 *   - binurl / url : address of the page (binurl is hex encoded),
 *   - title, description : optional, used instead of fetching the page when
 *     url, title and description are all present,
 *   - tags : optional comma separated list, printed as hashtags.
 *
 * Output is BBCode when the local user has the 'plaintext' setting enabled
 * and HTML otherwise. Addons may supply the complete output through the
 * 'parse_link' hook. Every output path echoes its result and calls killme().
 *
 * NOTE(review): url and the fetched title/text are placed into the HTML
 * output without escaping - confirm that the consumer sanitizes this.
 *
 * @param object $a application object (not used here)
 */
function parse_url_content(&$a) {

        $text = null;
        $title = '';
        $str_tags = '';

        $textmode = false;
        if(local_user() && intval(get_pconfig(local_user(),'system','plaintext')))
                $textmode = true;

        // The line break has to be defined in both modes; it is used for the
        // tags and between image and text further down.
        $br = (($textmode) ? "\n" : '<br />');

        if(x($_GET,'binurl'))
                $url = trim(hex2bin($_GET['binurl']));
        else
                $url = trim(isset($_GET['url']) ? $_GET['url'] : '');

        if(!empty($_GET['title']))
                $title = strip_tags(trim($_GET['title']));

        if(!empty($_GET['description']))
                $text = strip_tags(trim($_GET['description']));

        if(!empty($_GET['tags'])) {
                $arr_tags = str_getcsv($_GET['tags']);
                if(count($arr_tags)) {
                        array_walk($arr_tags,'arr_add_hashes');
                        $str_tags = $br . implode(' ',$arr_tags) . $br;
                }
        }

        logger('parse_url: ' . $url);

        if($textmode)
                $template = $br . '[bookmark=%s]%s[/bookmark]%s' . $br;
        else
                $template = "<br /><a class=\"bookmark\" href=\"%s\" >%s</a>%s<br />";

        $arr = array('url' => $url, 'text' => '');

        // Give addons the chance to provide the whole output themselves.
        call_hooks('parse_link', $arr);

        if(strlen($arr['text'])) {
                echo $arr['text'];
                killme();
        }


        // Everything needed was passed in the request: no need to fetch the page.
        if($url && $title && $text) {

                if($textmode)
                        $text = $br . $br . '[quote]' . trim($text) . '[/quote]' . $br;
                else
                        $text = '<br /><br /><blockquote>' . trim($text) . '</blockquote><br />';

                $title = str_replace(array("\r","\n"),array('',''),$title);

                $result = sprintf($template,$url,($title) ? $title : $url,$text) . $str_tags;

                logger('parse_url (unparsed): returns: ' . $result);

                echo $result;
                killme();
        }

        $siteinfo = parseurl_getsiteinfo($url);

        // parseurl_getsiteinfo() only sets the keys it found something for.
        $title = (isset($siteinfo["title"]) ? $siteinfo["title"] : "");
        $text = (isset($siteinfo["text"]) ? $siteinfo["text"] : "");
        $image = (isset($siteinfo["image"]) ? $siteinfo["image"] : "");

        // Without a title only the bare link is returned.
        if($title == "") {
                echo sprintf($template,$url,$url,'') . $str_tags;
                killme();
        }

        // Markup for the preview image; stays empty unless the image could be
        // fetched and decoded, so that a bare image URL never leaks into the text.
        $image_markup = "";

        if ($image != "") {
                $i = fetch_url($image);
                if($i) {
                        require_once('include/Photo.php');
                        // guess mimetype from headers or filename
                        $type = guess_image_type($image,true);

                        $ph = new Photo($i, $type);
                        if($ph->is_valid()) {
                                if($ph->getWidth() > 300 || $ph->getHeight() > 300) {
                                        // Large image: state the scaled down dimensions.
                                        $ph->scaleImage(300);
                                        $new_width = $ph->getWidth();
                                        $new_height = $ph->getHeight();
                                        if($textmode)
                                                $image_markup = $br . $br . '[img=' . $new_width . 'x' . $new_height . ']' . $image . '[/img]';
                                        else
                                                $image_markup = '<br /><br /><img height="' . $new_height . '" width="' . $new_width . '" src="' .$image . '" alt="photo" />';
                                } else {
                                        if($textmode)
                                                $image_markup = $br.$br.'[img]'.$image.'[/img]';
                                        else
                                                $image_markup = '<br /><br /><img src="'.$image.'" alt="photo" />';
                                }
                        }
                }
        }

        if(strlen($text)) {
                if($textmode)
                        $text = $br.$br.'[quote]'.trim($text).'[/quote]'.$br ;
                else
                        $text = '<br /><br /><blockquote>'.trim($text).'</blockquote><br />';
        }

        if($image_markup != "") {
                $text = $image_markup.$br.$text;
        }
        $title = str_replace(array("\r","\n"),array('',''),$title);

        $result = sprintf($template,$url,($title) ? $title : $url,$text) . $str_tags;

        logger('parse_url: returns: ' . $result);

        echo $result;
        killme();
}