]> git.mxchange.org Git - quix0rs-gnu-social.git/blob - plugins/OStatus/extlib/hkit/hkit.class.php
Work around weird bug with HTML normalization via PHP DOM module; if source had xmlns...
[quix0rs-gnu-social.git] / plugins / OStatus / extlib / hkit / hkit.class.php
1 <?php
2
3         /* 
4         
5         hKit Library for PHP5 - a generic library for parsing Microformats
6         Copyright (C) 2006  Drew McLellan
7
8         This library is free software; you can redistribute it and/or
9         modify it under the terms of the GNU Lesser General Public
10         License as published by the Free Software Foundation; either
11         version 2.1 of the License, or (at your option) any later version.
12
13         This library is distributed in the hope that it will be useful,
14         but WITHOUT ANY WARRANTY; without even the implied warranty of
15         MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16         Lesser General Public License for more details.
17
18         You should have received a copy of the GNU Lesser General Public
19         License along with this library; if not, write to the Free Software
20         Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
21         
22         Author  
23                 Drew McLellan - http://allinthehead.com/
24                 
25         Contributors:
26                 Scott Reynen - http://www.randomchaos.com/
27                 
28         Version 0.5, 22-Jul-2006
29                 fixed by-ref issue cropping up in PHP 5.0.5
30                 fixed a bug with a@title
31                 added support for new fn=n optimisation
32                 added support for new a.include include-pattern
33         Version 0.4, 23-Jun-2006
34                 prevented nested includes from causing infinite loops
35                 returns false if URL can't be fetched
36                 added pre-flight check for base support level
37                 added deduping of once-only classnames
38                 prevented accumulation of multiple 'value' values
39                 tuned whitespace handling and treatment of DEL elements
40         Version 0.3, 21-Jun-2006
41                 added post-processor callback method into profiles
42                 fixed minor problems raised by hcard testsuite
43                 added support for include-pattern
44                 added support for td@headers pattern
45                 added implied-n optimization into default hcard profile
46         Version 0.2, 20-Jun-2006
47                 added class callback mechanism
48                 added resolvePath & resolveEmail
49                 added basic BASE support
50         Version 0.1.1, 19-Jun-2006 (different timezone, no time machine)
51                 added external Tidy option
52         Version 0.1, 20-Jun-2006
53                 initial release
54                 
55         
56         
57         
58         */
59
60         class hKit
61         {
62                 
63                 public $tidy_mode       = 'proxy'; // 'proxy', 'exec', 'php' or 'none'
64                 public $tidy_proxy      = 'http://cgi.w3.org/cgi-bin/tidy?forceXML=on&docAddr='; // required only for tidy_mode=proxy
65                 public $tmp_dir         = '/path/to/writable/dir/'; // required only for tidy_mode=exec
66                 
67                 private $root_class = '';
68                 private $classes        = '';
69                 private $singles        = '';
70                 private $required       = '';
71                 private $att_map        = '';
72                 private $callbacks      = '';
73                 private $processor      = '';
74                 
75                 private $url            = '';
76                 private $base           = '';
77                 private $doc            = '';
78                 
79                 
80                 public function hKit()
81                 {
82                         // pre-flight checks
83                         $pass           = true; 
84                         $required       = array('dom_import_simplexml', 'file_get_contents', 'simplexml_load_string');
85                         $missing        = array();
86                         
87                         foreach ($required as $f){
88                                 if (!function_exists($f)){
89                                         $pass           = false;
90                                         $missing[]      = $f . '()';
91                                 }
92                         }
93                         
94                         if (!$pass)
95                                 die('hKit error: these required functions are not available: <strong>' . implode(', ', $missing) . '</strong>');
96                         
97                 }
98                 
99
100                 public function getByURL($profile='', $url='')
101                 {
102                         
103                         if ($profile=='' || $url == '') return false;
104                         
105                         $this->loadProfile($profile);
106                         
107                         $source         = $this->loadURL($url);
108                         
109                         if ($source){
110                                 $tidy_xhtml     = $this->tidyThis($source);
111
112                                 $fragment       = false;
113                         
114                                 if (strrchr($url, '#'))
115                                 $fragment       = array_pop(explode('#', $url));
116                         
117                                 $doc            = $this->loadDoc($tidy_xhtml, $fragment);
118                                 $s                      = $this->processNodes($doc, $this->classes);
119                                 $s                      = $this->postProcess($profile, $s);
120                         
121                                 return $s;
122                         }else{
123                                 return false;
124                         }
125                 }
126                 
127                 public function getByString($profile='', $input_xml='')
128                 {
129                         if ($profile=='' || $input_xml == '') return false;
130                         
131                         $this->loadProfile($profile);
132
133                         $doc    = $this->loadDoc($input_xml);
134                         $s              = $this->processNodes($doc, $this->classes);
135                         $s              = $this->postProcess($profile, $s);
136                         
137                         return $s;
138                         
139                 }
140                 
141                 private function processNodes($items, $classes, $allow_includes=true){
142
143                         $out    = array();
144
145                         foreach($items as $item){
146                                 $data   = array();
147
148                                 for ($i=0; $i<sizeof($classes); $i++){
149                                         
150                                         if (!is_array($classes[$i])){
151
152                                                 $xpath                  = ".//*[contains(concat(' ',normalize-space(@class),' '),' " . $classes[$i] . " ')]";
153                                                 $results                = $item->xpath($xpath);
154                                                 
155                                                 if ($results){
156                                                         foreach ($results as $result){ 
157                                                                 if (isset($classes[$i+1]) && is_array($classes[$i+1])){
158                                                                         $nodes                          = $this->processNodes($results, $classes[$i+1]);
159                                                                         if (sizeof($nodes) > 0){
160                                                                                 $nodes = array_merge(array('text'=>$this->getNodeValue($result, $classes[$i])), $nodes);
161                                                                                 $data[$classes[$i]]     = $nodes;
162                                                                         }else{
163                                                                                 $data[$classes[$i]]     = $this->getNodeValue($result, $classes[$i]);
164                                                                         }
165                                                                         
166                                                                 }else{                                                          
167                                                                         if (isset($data[$classes[$i]])){
168                                                                                 if (is_array($data[$classes[$i]])){
169                                                                                         // is already an array - append
170                                                                                         $data[$classes[$i]][]   = $this->getNodeValue($result, $classes[$i]);
171
172                                                                                 }else{
173                                                                                         // make it an array
174                                                                                         if ($classes[$i] == 'value'){ // unless it's the 'value' of a type/value pattern
175                                                                                                 $data[$classes[$i]] .= $this->getNodeValue($result, $classes[$i]);
176                                                                                         }else{
177                                                                                                 $old_val                        = $data[$classes[$i]];
178                                                                                                 $data[$classes[$i]]     = array($old_val, $this->getNodeValue($result, $classes[$i]));
179                                                                                                 $old_val                        = false;
180                                                                                         }
181                                                                                 }
182                                                                         }else{                                                                          
183                                                                                 // set as normal value
184                                                                                 $data[$classes[$i]]     = $this->getNodeValue($result, $classes[$i]);
185
186                                                                         }
187                                                                 }
188                                                         
189                                                                 // td@headers pattern
190                                                                 if (strtoupper(dom_import_simplexml($result)->tagName)== "TD" && $result['headers']){
191                                                                         $include_ids    = explode(' ', $result['headers']);
192                                                                         $doc                    = $this->doc;
193                                                                         foreach ($include_ids as $id){
194                                                                                 $xpath                  = "//*[@id='$id']/..";
195                                                                                 $includes               = $doc->xpath($xpath);
196                                                                                 foreach ($includes as $include){
197                                                                                         $tmp = $this->processNodes($include, $this->classes);
198                                                                                         if (is_array($tmp)) $data = array_merge($data, $tmp);
199                                                                                 }
200                                                                         }
201                                                                 }
202                                                         }                                       
203                                                 }                               
204                                         }
205                                         $result = false;
206                                 }
207                                 
208                                 // include-pattern
209                                 if ($allow_includes){
210                                         $xpath                  = ".//*[contains(concat(' ',normalize-space(@class),' '),' include ')]";
211                                         $results                = $item->xpath($xpath);
212                                 
213                                         if ($results){
214                                                 foreach ($results as $result){
215                                                         $tagName = strtoupper(dom_import_simplexml($result)->tagName);
216                                                         if ((($tagName == "OBJECT" && $result['data']) || ($tagName == "A" && $result['href'])) 
217                                                                         && preg_match('/\binclude\b/', $result['class'])){      
218                                                                 $att            = ($tagName == "OBJECT" ? 'data' : 'href');                                             
219                                                                 $id                     = str_replace('#', '', $result[$att]);
220                                                                 $doc            = $this->doc;
221                                                                 $xpath          = "//*[@id='$id']";
222                                                                 $includes       = $doc->xpath($xpath);
223                                                                 foreach ($includes as $include){
224                                                                         $include        = simplexml_load_string('<root1><root2>'.$include->asXML().'</root2></root1>'); // don't ask.
225                                                                         $tmp            = $this->processNodes($include, $this->classes, false);
226                                                                         if (is_array($tmp)) $data = array_merge($data, $tmp);
227                                                                 }
228                                                         }
229                                                 }
230                                         }
231                                 }
232                                 $out[]  = $data;
233                         }
234                         
235                         if (sizeof($out) > 1){
236                                 return $out;
237                         }else if (isset($data)){
238                                 return $data;
239                         }else{
240                                 return array();
241                         }
242                 }
243
244
245                 private function getNodeValue($node, $className)
246                 {
247
248                         $tag_name       = strtoupper(dom_import_simplexml($node)->tagName);
249                         $s                      = false;
250                         
251                         // ignore DEL tags
252                         if ($tag_name == 'DEL') return $s;
253                         
254                         // look up att map values
255                         if (array_key_exists($className, $this->att_map)){
256                                 
257                                 foreach ($this->att_map[$className] as $map){                                   
258                                         if (preg_match("/$tag_name\|/", $map)){
259                                                 $s      = ''.$node[array_pop($foo = explode('|', $map))];
260                                         }
261                                 }
262                         }
263                         
264                         // if nothing and OBJ, try data.
265                         if (!$s && $tag_name=='OBJECT' && $node['data'])        $s      = ''.$node['data'];
266                         
267                         // if nothing and IMG, try alt.
268                         if (!$s && $tag_name=='IMG' && $node['alt'])    $s      = ''.$node['alt'];
269                         
270                         // if nothing and AREA, try alt.
271                         if (!$s && $tag_name=='AREA' && $node['alt'])   $s      = ''.$node['alt'];
272                         
273                         //if nothing and not A, try title.
274                         if (!$s && $tag_name!='A' && $node['title'])    $s      = ''.$node['title'];
275                                 
276                         
277                         // if nothing found, go with node text
278                         $s      = ($s ? $s : implode(array_filter($node->xpath('child::node()'), array(&$this, "filterBlankValues")), ' '));                    
279
280                         // callbacks                    
281                         if (array_key_exists($className, $this->callbacks)){
282                                 $s      = preg_replace_callback('/.*/', $this->callbacks[$className], $s, 1);
283                         }
284                         
285                         // trim and remove line breaks
286                         if ($tag_name != 'PRE'){
287                                 $s      = trim(preg_replace('/[\r\n\t]+/', '', $s));
288                                 $s      = trim(preg_replace('/(\s{2})+/', ' ', $s));
289                         }
290                         
291                         return $s;
292                 }
293
294                 private function filterBlankValues($s){
295                         return preg_match("/\w+/", $s);
296                 }
297                 
298                 
299                 private function tidyThis($source)
300                 {
301                         switch ( $this->tidy_mode )
302                         {
303                                 case 'exec':
304                                         $tmp_file       = $this->tmp_dir.md5($source).'.txt';
305                                         file_put_contents($tmp_file, $source);
306                                         exec("tidy -utf8 -indent -asxhtml -numeric -bare -quiet $tmp_file", $tidy);
307                                         unlink($tmp_file);
308                                         return implode("\n", $tidy);
309                                 break;
310                                 
311                                 case 'php':
312                                         $tidy   = tidy_parse_string($source);
313                                         return tidy_clean_repair($tidy);
314                                 break;
315                                                 
316                                 default:
317                                         return $source;
318                                 break;
319                         }
320                         
321                 }
322                 
323                 
324                 private function loadProfile($profile)
325                 {
326                         require_once("$profile.profile.php");
327                 }
328                 
329                 
330                 private function loadDoc($input_xml, $fragment=false)
331                 {
332                         $xml            = simplexml_load_string($input_xml);
333                         
334                         $this->doc      = $xml;
335                         
336                         if ($fragment){
337                                 $doc    = $xml->xpath("//*[@id='$fragment']");
338                                 $xml    = simplexml_load_string($doc[0]->asXML());
339                                 $doc    = null;
340                         }
341                         
342                         // base tag
343                         if ($xml->head->base['href']) $this->base = $xml->head->base['href'];                   
344
345                         // xml:base attribute - PITA with SimpleXML
346                         preg_match('/xml:base="(.*)"/', $xml->asXML(), $matches);
347                         if (is_array($matches) && sizeof($matches)>1) $this->base = $matches[1];
348                                                                 
349                         return  $xml->xpath("//*[contains(concat(' ',normalize-space(@class),' '),' $this->root_class ')]");
350                         
351                 }
352                 
353                 
354                 private function loadURL($url)
355                 {
356                         $this->url      = $url;
357                         
358                         if ($this->tidy_mode == 'proxy' && $this->tidy_proxy != ''){
359                                 $url    = $this->tidy_proxy . $url;
360                         }
361                 
362                         return @file_get_contents($url);
363                         
364                 }
365                 
366                 
367                 private function postProcess($profile, $s)
368                 {
369                         $required       = $this->required;
370                         
371                         if (is_array($s) && array_key_exists($required[0], $s)){
372                                 $s      = array($s);
373                         }
374                         
375                         $s      = $this->dedupeSingles($s);
376                         
377                         if (function_exists('hKit_'.$profile.'_post')){
378                                 $s              = call_user_func('hKit_'.$profile.'_post', $s);
379                         }
380                         
381                         $s      = $this->removeTextVals($s);
382                         
383                         return $s;
384                 }
385                 
386                 
387                 private function resolvePath($filepath)
388                 {       // ugly code ahoy: needs a serious tidy up
389                                         
390                         $filepath       = $filepath[0];
391                         
392                         $base   = $this->base;
393                         $url    = $this->url;
394                         
395                         if ($base != '' &&  strpos($base, '://') !== false)
396                                 $url    = $base;
397                         
398                         $r              = parse_url($url);
399                         $domain = $r['scheme'] . '://' . $r['host'];
400
401                         if (!isset($r['path'])) $r['path'] = '/';
402                         $path   = explode('/', $r['path']);
403                         $file   = explode('/', $filepath);
404                         $new    = array('');
405
406                         if (strpos($filepath, '://') !== false || strpos($filepath, 'data:') !== false){
407                                 return $filepath;
408                         }
409
410                         if ($file[0] == ''){
411                                 // absolute path
412                                 return ''.$domain . implode('/', $file);
413                         }else{
414                                 // relative path
415                                 if ($path[sizeof($path)-1] == '') array_pop($path);
416                                 if (strpos($path[sizeof($path)-1], '.') !== false) array_pop($path);
417
418                                 foreach ($file as $segment){
419                                         if ($segment == '..'){
420                                                 array_pop($path);
421                                         }else{
422                                                 $new[]  = $segment;
423                                         }
424                                 }
425                                 return ''.$domain . implode('/', $path) . implode('/', $new);
426                         }       
427                 }
428                 
429                 private function resolveEmail($v)
430                 {
431                         $parts  = parse_url($v[0]);
432                         return ($parts['path']);
433                 }
434                 
435                 
436                 private function dedupeSingles($s)
437                 {
438                         $singles        = $this->singles;
439                         
440                         foreach ($s as &$item){
441                                 foreach ($singles as $classname){
442                                         if (array_key_exists($classname, $item) && is_array($item[$classname])){
443                                                 if (isset($item[$classname][0])) $item[$classname]      = $item[$classname][0];
444                                         }
445                                 }
446                         }
447                         
448                         return $s;
449                 }
450                 
451                 private function removeTextVals($s)
452                 {
453                         foreach ($s as $key => &$val){
454                                 if ($key){
455                                         $k = $key;
456                                 }else{
457                                         $k = '';
458                                 }
459                                 
460                                 if (is_array($val)){
461                                         $val = $this->removeTextVals($val);
462                                 }else{
463                                         if ($k == 'text'){
464                                                 $val = '';
465                                         }
466                                 }
467                         }
468                         
469                         return array_filter($s);
470                 }
471
472         }
473
474
475 ?>