git.mxchange.org Git - friendica.git/blob - mod/parse_url.php
fix various html parse errors
[friendica.git] / mod / parse_url.php
1 <?php
2
3 require_once('library/HTML5/Parser.php');
4 require_once('library/HTMLPurifier.auto.php');
5
/**
 * array_walk() callback that turns a bare tag into a hashtag.
 *
 * The element is rewritten in place with a leading '#'.
 *
 * @param mixed $item array element, received by reference and overwritten
 * @param mixed $k    key of the element as passed by array_walk(); not used
 */
function arr_add_hashes(&$item,$k) {
	$item = "#{$item}";
}
9
/**
 * Picks the first paragraph that is usable as a link excerpt.
 *
 * A paragraph qualifies when its text content holds no '<script' fragment
 * and, after strip_tags(), is at least 100 bytes long. The first match is
 * cut to 250 bytes and suffixed with '...'.
 *
 * @param mixed $paragraphs iterable of nodes exposing ->textContent, e.g. the
 *                          result of getElementsByTagName('p')
 * @return string the excerpt, or '' when no paragraph qualifies
 */
function parse_url_find_excerpt($paragraphs) {
	if(! $paragraphs)
		return '';

	foreach($paragraphs as $paragraph) {
		$candidate = $paragraph->textContent;

		// Never hand script source back to the caller as an excerpt.
		if(stristr($candidate,'<script'))
			continue;

		$candidate = strip_tags($candidate);

		// Too short to be body text (navigation, captions, ...).
		if(strlen($candidate) < 100)
			continue;

		return substr($candidate,0,250) . '...';
	}

	return '';
}

/**
 * Builds an HTML bookmark snippet for a URL and echoes it.
 *
 * Request parameters read from $_GET:
 *   - binurl       hex-encoded URL; used instead of 'url' when present
 *   - url          the URL to describe
 *   - title        optional link title
 *   - description  optional excerpt shown in a blockquote
 *   - tags         optional CSV list of tags, each emitted with a leading '#'
 *
 * Order of processing:
 *   1. The 'parse_link' hook may supply the complete response text.
 *   2. If url, title and description are all given, they are used as-is.
 *   3. Otherwise the page is fetched, purified and parsed; the title and a
 *      paragraph excerpt are taken from the document.
 *
 * Every exit path echoes its result and then calls killme() (presumably
 * ending the request - defined elsewhere).
 *
 * NOTE(review): $url, $title and $text are placed into the markup without
 * HTML-escaping; confirm the consumer of this response sanitises it.
 *
 * @param mixed $a passed by reference; not referenced in this function
 */
function parse_url_content(&$a) {

	// Defaults so that later reads never hit an undefined variable.
	$title = '';
	$text = '';
	$str_tags = '';
	$dom = null;

	if(x($_GET,'binurl'))
		$url = trim(hex2bin($_GET['binurl']));
	else
		$url = (isset($_GET['url']) ? trim($_GET['url']) : '');

	if(! empty($_GET['title']))
		$title = strip_tags(trim($_GET['title']));

	if(! empty($_GET['description']))
		$text = strip_tags(trim($_GET['description']));

	if(! empty($_GET['tags'])) {
		$arr_tags = str_getcsv($_GET['tags']);
		if(count($arr_tags)) {
			array_walk($arr_tags,'arr_add_hashes');
			$str_tags = '<br />' . implode(' ',$arr_tags) . '<br />';
		}
	}

	logger('parse_url: ' . $url);

	// Placeholders: href, link text, optional excerpt markup.
	$template = "<br /><a class=\"bookmark\" href=\"%s\" >%s</a>%s<br />";

	// Give plugins the chance to produce the whole response themselves.
	$arr = array('url' => $url, 'text' => '');

	call_hooks('parse_link', $arr);

	if(strlen($arr['text'])) {
		echo $arr['text'];
		killme();
	}

	// Everything was supplied by the caller: no need to fetch the page.
	if($url && $title && $text) {

		$text = '<br /><br /><blockquote>' . $text . '</blockquote><br />';
		$title = str_replace(array("\r","\n"),array('',''),$title);

		$result = sprintf($template,$url,($title) ? $title : $url,$text) . $str_tags;

		logger('parse_url (unparsed): returns: ' . $result);

		echo $result;
		killme();
	}

	if(! $url) {
		echo '';
		killme();
	}

	$s = fetch_url($url);

	logger('parse_url: data: ' . $s, LOGGER_DATA);

	// Nothing fetched: fall back to a bare link.
	if(! $s) {
		echo sprintf($template,$url,$url,'') . $str_tags;
		killme();
	}

	if(! $title) {
		// Cheap title sniff on the raw markup, before it is purified.
		// Compare against false explicitly: offset 0 is a valid match.
		$title_pos = strpos($s,'<title>');
		if($title_pos !== false) {
			$title = substr($s,$title_pos + 7,64);
			if(strpos($title,'<') !== false)
				$title = strip_tags(substr($title,0,strpos($title,'<')));
		}
	}

	$config = HTMLPurifier_Config::createDefault();
	$config->set('Cache.DefinitionImpl', null);

	$purifier = new HTMLPurifier($config);
	$s = $purifier->purify($s);

	try {
		$dom = HTML5_Parser::parse($s);
	} catch (DOMException $e) {
		logger('parse_url: parse error: ' . $e);
	}

	// Unparseable document: fall back to a bare link.
	if(! $dom) {
		echo sprintf($template,$url,$url,'') . $str_tags;
		killme();
	}

	// A <title> element in the parsed document replaces any earlier title.
	$items = $dom->getElementsByTagName('title');

	if($items) {
		foreach($items as $item) {
			$title = trim($item->textContent);
			break;
		}
	}

	if(! $text) {
		// Prefer paragraphs inside containers that look like article bodies.
		$divs = $dom->getElementsByTagName('div');
		if($divs) {
			foreach($divs as $div) {
				$class = $div->getAttribute('class');
				if($class && (stristr($class,'article') || stristr($class,'content'))) {
					$text = parse_url_find_excerpt($div->getElementsByTagName('p'));
					if($text)
						break;
				}
			}
		}

		// Otherwise take the first suitable paragraph anywhere in the page.
		if(! $text)
			$text = parse_url_find_excerpt($dom->getElementsByTagName('p'));
	}

	if(strlen($text)) {
		$text = '<br /><br /><blockquote>' . $text . '</blockquote><br />';
	}

	$title = str_replace(array("\r","\n"),array('',''),$title);

	$result = sprintf($template,$url,($title) ? $title : $url,$text) . $str_tags;

	logger('parse_url: returns: ' . $result);

	echo $result;
	killme();
}