5 hKit Library for PHP5 - a generic library for parsing Microformats
6 Copyright (C) 2006 Drew McLellan
8 This library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public
10 License as published by the Free Software Foundation; either
11 version 2.1 of the License, or (at your option) any later version.
13 This library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public
19 License along with this library; if not, write to the Free Software
20 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 Drew McLellan - http://allinthehead.com/
26 Scott Reynen - http://www.randomchaos.com/
28 Version 0.5, 22-Jul-2006
29 fixed by-ref issue cropping up in PHP 5.0.5
30 fixed a bug with a@title
31 added support for new fn=n optimisation
32 added support for new a.include include-pattern
33 Version 0.4, 23-Jun-2006
34 prevented nested includes from causing infinite loops
35 returns false if URL can't be fetched
36 added pre-flight check for base support level
37 added deduping of once-only classnames
38 prevented accumulation of multiple 'value' values
39 tuned whitespace handling and treatment of DEL elements
40 Version 0.3, 21-Jun-2006
41 added post-processor callback method into profiles
42 fixed minor problems raised by hcard testsuite
43 added support for include-pattern
44 added support for td@headers pattern
45 added implied-n optimization into default hcard profile
46 Version 0.2, 20-Jun-2006
47 added class callback mechanism
48 added resolvePath & resolveEmail
49 added basic BASE support
50 Version 0.1.1, 19-Jun-2006 (different timezone, no time machine)
51 added external Tidy option
52 Version 0.1, 20-Jun-2006
/** @var string How fetched markup is tidied: 'proxy', 'exec', 'php' or 'none'. */
public $tidy_mode = 'proxy';

/** @var string URL prefix of the tidy proxy; required only for tidy_mode=proxy. */
public $tidy_proxy = 'http://cgi.w3.org/cgi-bin/tidy?forceXML=on&docAddr=';

/** @var string Writable directory for temp files; required only for tidy_mode=exec. */
public $tmp_dir = '/path/to/writable/dir/';

// Profile settings (presumably assigned by the profile file pulled in by
// loadProfile() - confirm against the profiles). The list/map properties
// default to empty arrays instead of '' because the class hands them to
// array_key_exists(), foreach and sizeof(); those reject or mis-handle a
// string when a profile leaves one of them unset.

/** @var string Class name of the microformat's root element (interpolated into an XPath query). */
private $root_class = '';

/** @var array Class names to extract; an array entry holds the classes nested under the previous name. */
private $classes = array();

/** @var array Class names that are reduced to their first value by dedupeSingles(). */
private $singles = array();

/** @var array Required class names; postProcess() reads the first entry. */
private $required = array();

/** @var array Map of class name => list of 'TAG|attribute' strings used by getNodeValue(). */
private $att_map = array();

/** @var array Map of class name => callback applied to the extracted value by getNodeValue(). */
private $callbacks = array();

/** @var string Not referenced by the reviewed code; kept unchanged. */
private $processor = '';
/**
 * PHP 4-style constructor name, kept so code that calls hKit() explicitly
 * keeps working. It is declared before __construct() so that PHP 5 does not
 * raise its "Redefining already defined constructor" strict notice.
 */
public function hKit()
{
    $this->__construct();
}

/**
 * Pre-flight check: verifies that the functions the library depends on are
 * available and terminates the script with an error message if any is not.
 *
 * A method named after the class is no longer treated as a constructor from
 * PHP 8.0 on, so the check lives in __construct() to make sure it still runs
 * when the object is created.
 */
public function __construct()
{
    $required = array('dom_import_simplexml', 'file_get_contents', 'simplexml_load_string');
    $missing = array();

    foreach ($required as $f) {
        if (!function_exists($f)) {
            $missing[] = $f . '()';
        }
    }

    if ($missing) {
        die('hKit error: these required functions are not available: <strong>' . implode(', ', $missing) . '</strong>');
    }
}
/**
 * Fetches a document by URL, tidies it and extracts the data described by
 * the given profile.
 *
 * If the URL carries a #fragment, the fragment is handed to loadDoc() so
 * parsing can be limited to the element with that id.
 *
 * @param string $profile Profile name, passed to loadProfile() and postProcess().
 * @param string $url     Address of the document to fetch.
 * @return mixed The post-processed result, or false when an argument is
 *               empty or the URL could not be fetched.
 */
public function getByURL($profile='', $url='')
{
    if ($profile == '' || $url == '') return false;

    $this->loadProfile($profile);

    $source = $this->loadURL($url);
    if (!$source) {
        // nothing could be fetched
        return false;
    }

    $tidy_xhtml = $this->tidyThis($source);

    // Everything after the last '#' is the fragment. array_pop() takes its
    // argument by reference, so feeding it explode()'s return value directly
    // raises "Only variables should be passed by reference"; strrchr() and
    // substr() give the same text without that.
    $hash = strrchr($url, '#');
    $fragment = ($hash === false) ? false : substr($hash, 1);

    $doc = $this->loadDoc($tidy_xhtml, $fragment);
    $s = $this->processNodes($doc, $this->classes);
    $s = $this->postProcess($profile, $s);

    return $s;
}
/**
 * Extracts the data described by a profile from markup supplied as a string.
 *
 * The string is passed to loadDoc() as-is; unlike getByURL() no tidy step is
 * applied here.
 *
 * @param string $profile   Profile name, passed to loadProfile() and postProcess().
 * @param string $input_xml Markup to parse.
 * @return mixed The post-processed result, or false when either argument is empty.
 */
public function getByString($profile='', $input_xml='')
{
    $missing_argument = ($profile == '' || $input_xml == '');
    if ($missing_argument) {
        return false;
    }

    $this->loadProfile($profile);

    $root_nodes = $this->loadDoc($input_xml);
    $extracted = $this->processNodes($root_nodes, $this->classes);

    return $this->postProcess($profile, $extracted);
}
/**
 * Collects the values of the given class names found beneath each node of
 * $items.
 *
 * For every non-array entry of $classes the descendants of the current item
 * carrying that class are located with XPath and their values (obtained via
 * getNodeValue()) are stored in $data under the class name. An array entry
 * that directly follows a class name is processed recursively as the class
 * list for that match. The td@headers pattern and the include-pattern pull
 * in data from other elements of the document, looked up by id.
 *
 * NOTE(review): a number of short lines of this method are not visible in
 * the reviewed excerpt (the initialisation of $out and $data, the assignment
 * of $doc, else-branches, closing braces and the return statements). The
 * comments below describe the visible statements only.
 *
 * @param mixed $items          Nodes to search; each one is queried with ->xpath().
 * @param mixed $classes        Class names; an array entry holds the classes nested
 *                              under the preceding name (presumably the profile's
 *                              class list - confirm against the profiles).
 * @param bool  $allow_includes When false the include-pattern lookup is skipped.
 */
141 private function processNodes($items, $classes, $allow_includes=true){
145 foreach($items as $item){
148 for ($i=0; $i<sizeof($classes); $i++){
// array entries are nested class lists; they are consumed via $classes[$i+1] below
150 if (!is_array($classes[$i])){
// descendants of $item whose class attribute contains the name as a whole space-separated token
152 $xpath = ".//*[contains(concat(' ',normalize-space(@class),' '),' " . $classes[$i] . " ')]";
153 $results = $item->xpath($xpath);
156 foreach ($results as $result){
// the following entry is an array: recurse with it as the class list
157 if (isset($classes[$i+1]) && is_array($classes[$i+1])){
// note: the recursion receives the whole $results list, not only the current $result
158 $nodes = $this->processNodes($results, $classes[$i+1]);
159 if (sizeof($nodes) > 0){
// keep the node's own value under 'text' in front of the nested values
160 $nodes = array_merge(array('text'=>$this->getNodeValue($result, $classes[$i])), $nodes);
161 $data[$classes[$i]] = $nodes;
// presumably the else-branch for "no nested values found" (the else line is not visible)
163 $data[$classes[$i]] = $this->getNodeValue($result, $classes[$i]);
// a value for this class name was already stored by an earlier match
167 if (isset($data[$classes[$i]])){
168 if (is_array($data[$classes[$i]])){
169 // is already an array - append
170 $data[$classes[$i]][] = $this->getNodeValue($result, $classes[$i]);
// repeated 'value' matches are concatenated into one string instead of being collected
174 if ($classes[$i] == 'value'){ // unless it's the 'value' of a type/value pattern
175 $data[$classes[$i]] .= $this->getNodeValue($result, $classes[$i]);
// turn the existing single value into a two-element list
177 $old_val = $data[$classes[$i]];
178 $data[$classes[$i]] = array($old_val, $this->getNodeValue($result, $classes[$i]));
183 // set as normal value
184 $data[$classes[$i]] = $this->getNodeValue($result, $classes[$i]);
189 // td@headers pattern
// a TD with a headers attribute: each space-separated id names another element to pull data from
190 if (strtoupper(dom_import_simplexml($result)->tagName)== "TD" && $result['headers']){
191 $include_ids = explode(' ', $result['headers']);
193 foreach ($include_ids as $id){
// the trailing "/.." selects the parent of the element carrying the id
194 $xpath = "//*[@id='$id']/..";
// NOTE(review): $doc is not assigned in the visible lines of this method - confirm where it is set
195 $includes = $doc->xpath($xpath);
196 foreach ($includes as $include){
// re-run the extraction with the full class list of the profile and merge the result in
197 $tmp = $this->processNodes($include, $this->classes);
198 if (is_array($tmp)) $data = array_merge($data, $tmp);
// include-pattern: descendants of $item whose class attribute contains 'include'
209 if ($allow_includes){
210 $xpath = ".//*[contains(concat(' ',normalize-space(@class),' '),' include ')]";
211 $results = $item->xpath($xpath);
214 foreach ($results as $result){
215 $tagName = strtoupper(dom_import_simplexml($result)->tagName);
// only OBJECT elements with a data attribute and A elements with an href attribute qualify
216 if ((($tagName == "OBJECT" && $result['data']) || ($tagName == "A" && $result['href']))
217 && preg_match('/\binclude\b/', $result['class'])){
218 $att = ($tagName == "OBJECT" ? 'data' : 'href');
// the attribute value with every '#' removed is used as the id of the included element
219 $id = str_replace('#', '', $result[$att]);
221 $xpath = "//*[@id='$id']";
// NOTE(review): $doc is not assigned in the visible lines of this method - confirm where it is set
222 $includes = $doc->xpath($xpath);
223 foreach ($includes as $include){
// the included element is re-parsed inside two wrapper elements before being processed
224 $include = simplexml_load_string('<root1><root2>'.$include->asXML().'</root2></root1>'); // don't ask.
// $allow_includes=false: includes found inside included content are not followed
225 $tmp = $this->processNodes($include, $this->classes, false);
226 if (is_array($tmp)) $data = array_merge($data, $tmp);
// the result depends on how many entries $out holds ($out is filled in lines not visible here)
235 if (sizeof($out) > 1){
237 }else if (isset($data)){
/**
 * Works out the value a node contributes for the given class name.
 *
 * Sources are tried in this order until one yields a non-empty value: the
 * attribute named by the att_map entry for the class and tag, OBJECT@data,
 * IMG@alt, AREA@alt, the title attribute (except on A elements) and finally
 * the node's non-blank child nodes joined with single spaces. A callback
 * registered for the class name is then applied, and whitespace is
 * normalised unless the element is a PRE.
 *
 * @param SimpleXMLElement $node      Node to read.
 * @param string           $className Class name the value is extracted for.
 * @return mixed The value as a string; a falsy value for DEL elements.
 */
private function getNodeValue($node, $className)
{
    $tag_name = strtoupper(dom_import_simplexml($node)->tagName);

    // NOTE(review): the original initialisation of $s is on a line that is
    // not visible in the reviewed excerpt; it has to be falsy for the
    // "if nothing ..." checks below, so false is assumed here - confirm.
    $s = false;

    // DEL elements contribute no value
    if ($tag_name == 'DEL') return $s;

    // look up att map values
    if (is_array($this->att_map) && array_key_exists($className, $this->att_map)) {
        // Entries have the form 'TAG|attribute'. The tag has to match a whole
        // '|'-separated segment: an unanchored "A|" would also match inside
        // "AREA|href". preg_quote() keeps tag names containing regex
        // metacharacters from altering the pattern.
        $tag_pattern = '/(^|\|)' . preg_quote($tag_name, '/') . '\|/';

        foreach ($this->att_map[$className] as $map) {
            if (preg_match($tag_pattern, $map)) {
                // array_pop() takes its argument by reference, so it needs a real variable
                $map_parts = explode('|', $map);
                $s = '' . $node[array_pop($map_parts)];
            }
        }
    }

    // if nothing and OBJ, try data.
    if (!$s && $tag_name == 'OBJECT' && $node['data']) $s = '' . $node['data'];

    // if nothing and IMG, try alt.
    if (!$s && $tag_name == 'IMG' && $node['alt']) $s = '' . $node['alt'];

    // if nothing and AREA, try alt.
    if (!$s && $tag_name == 'AREA' && $node['alt']) $s = '' . $node['alt'];

    // if nothing and not A, try title.
    if (!$s && $tag_name != 'A' && $node['title']) $s = '' . $node['title'];

    // if nothing found, go with node text
    if (!$s) {
        $children = $node->xpath('child::node()');
        if (!is_array($children)) {
            // xpath() does not return an array on failure
            $children = array();
        }
        // implode() takes the separator first; the array-first order was removed in PHP 8.0
        $s = implode(' ', array_filter($children, array($this, 'filterBlankValues')));
    }

    // callbacks
    if (is_array($this->callbacks) && array_key_exists($className, $this->callbacks)) {
        // the callback receives the match array of '/.*/', i.e. the value up to the first line break
        $s = preg_replace_callback('/.*/', $this->callbacks[$className], $s, 1);
    }

    // trim and remove line breaks
    if ($tag_name != 'PRE') {
        $s = trim(preg_replace('/[\r\n\t]+/', '', $s));
        $s = trim(preg_replace('/(\s{2})+/', ' ', $s));
    }

    return $s;
}
/**
 * array_filter() callback used by getNodeValue(): tells whether a value
 * contains at least one word character.
 *
 * @param mixed $s Value to test; it is used as the subject of preg_match().
 * @return int|false The preg_match() result: 1 on a match, 0 otherwise, false on error.
 */
private function filterBlankValues($s)
{
    $has_word_character = preg_match("/\w+/", $s);

    return $has_word_character;
}
/**
 * Cleans up fetched markup according to $this->tidy_mode.
 *
 *  - 'exec': writes the markup to a temp file in $this->tmp_dir and runs the
 *            external tidy binary on it.
 *  - 'php':  uses the tidy extension.
 *  - anything else: returns the markup unchanged (in 'proxy' mode loadURL()
 *            already fetched the document through the tidy proxy).
 *
 * @param string $source Markup to tidy.
 * @return string The tidied markup.
 */
private function tidyThis($source)
{
    switch ($this->tidy_mode) {
        case 'exec':
            $tmp_file = $this->tmp_dir . md5($source) . '.txt';
            file_put_contents($tmp_file, $source);

            // exec() appends to an existing array, so start from an empty one
            $tidy = array();
            // the path is built from configuration; quote it before it reaches the shell
            exec('tidy -utf8 -indent -asxhtml -numeric -bare -quiet ' . escapeshellarg($tmp_file), $tidy);

            // do not leave the temp file behind
            if (is_file($tmp_file)) {
                unlink($tmp_file);
            }

            return implode("\n", $tidy);

        case 'php':
            // Mirror the -utf8 -asxhtml -numeric switches of the exec branch
            // so the output can be parsed as XML by loadDoc().
            $tidy = tidy_parse_string(
                $source,
                array('output-xhtml' => true, 'numeric-entities' => true),
                'utf8'
            );
            if (!$tidy) {
                return $source;
            }

            // tidy_clean_repair() only reports success as a boolean; the
            // repaired markup has to be read back with tidy_get_output().
            tidy_clean_repair($tidy);

            return tidy_get_output($tidy);

        default:
            return $source;
    }
}
/**
 * Includes the profile file "<profile>.profile.php".
 *
 * The file is included with require_once from inside this method, so it is
 * loaded at most once per request and executes in this method's scope.
 *
 * @param string $profile Profile name, used as the file name prefix.
 */
private function loadProfile($profile)
{
    require_once $profile . '.profile.php';
}
/**
 * Parses the markup and returns the elements that carry the profile's root
 * class.
 *
 * Along the way the document's base URL is recorded in $this->base, taken
 * from the BASE element's href and then from an xml:base attribute if one is
 * present.
 *
 * NOTE(review): a few short lines of this method are not visible in the
 * reviewed excerpt (between the first parse and the fragment lookup, and
 * around it); presumably the fragment lookup only runs when $fragment is
 * set - confirm.
 *
 * @param string $input_xml Markup to parse with simplexml_load_string().
 * @param mixed  $fragment  Id of the element to restrict parsing to; defaults to false.
 * @return mixed Result of the XPath query for the root class.
 */
330 private function loadDoc($input_xml, $fragment=false)
332 $xml = simplexml_load_string($input_xml);
// look up the element whose id equals the fragment ...
337 $doc = $xml->xpath("//*[@id='$fragment']");
// ... and re-parse just that element, so it becomes the document that is searched
338 $xml = simplexml_load_string($doc[0]->asXML());
// base URL from <head><base href="...">
343 if ($xml->head->base['href']) $this->base = $xml->head->base['href'];
345 // xml:base attribute - PITA with SimpleXML
// NOTE(review): "(.*)" is greedy, so when another double quote follows on the same line
// the captured value runs up to that last quote
346 preg_match('/xml:base="(.*)"/', $xml->asXML(), $matches);
// an xml:base match overrides the value taken from the BASE element
347 if (is_array($matches) && sizeof($matches)>1) $this->base = $matches[1];
// all elements whose class attribute contains the root class as a whole token
349 return $xml->xpath("//*[contains(concat(' ',normalize-space(@class),' '),' $this->root_class ')]");
/**
 * Fetches the contents of a URL, going through the tidy proxy when
 * tidy_mode is 'proxy'.
 *
 * NOTE(review): the lines between the signature and the first visible
 * statement are not part of the reviewed excerpt; they may contain a
 * further statement - confirm against the full file.
 *
 * @param string $url Address to fetch.
 * @return mixed Whatever file_get_contents() returns: the content, or false on failure.
 */
354 private function loadURL($url)
// in proxy mode the target URL is appended to the proxy address
358 if ($this->tidy_mode == 'proxy' && $this->tidy_proxy != ''){
359 $url = $this->tidy_proxy . $url;
// '@' suppresses the warning of a failed fetch; the caller receives false instead
362 return @file_get_contents($url);
/**
 * Post-processes extracted data: reduces once-only classes to a single
 * value, runs the profile's optional post-processing function and strips
 * empty values.
 *
 * NOTE(review): the body of the first if-statement and the return statement
 * are not visible in the reviewed excerpt.
 *
 * @param string $profile Profile name; selects the function hKit_<profile>_post.
 * @param mixed  $s       Data as returned by processNodes().
 */
367 private function postProcess($profile, $s)
369 $required = $this->required;
// true when $s itself has the first required class name as a key
// (presumably a single record rather than a list of records - confirm)
371 if (is_array($s) && array_key_exists($required[0], $s)){
375 $s = $this->dedupeSingles($s);
// optional hook: a global function named hKit_<profile>_post, if one is defined
377 if (function_exists('hKit_'.$profile.'_post')){
378 $s = call_user_func('hKit_'.$profile.'_post', $s);
381 $s = $this->removeTextVals($s);
/**
 * Turns a path into an absolute URL, using the scheme and host (and, for
 * relative paths, the directory) of a reference URL.
 *
 * NOTE(review): many short lines of this method are not visible in the
 * reviewed excerpt, among them the assignments of $base, $url and $new and
 * the bodies of several branches. The comments below describe the visible
 * statements only.
 *
 * @param mixed $filepath Array whose first element is the path to resolve
 *                        (presumably the match array handed over by
 *                        preg_replace_callback() in getNodeValue() - confirm).
 */
387 private function resolvePath($filepath)
388 { // ugly code ahoy: needs a serious tidy up
// only the first element of the argument is used
390 $filepath = $filepath[0];
// a base value that contains '://' is treated as an absolute URL
395 if ($base != '' && strpos($base, '://') !== false)
398 $r = parse_url($url);
399 $domain = $r['scheme'] . '://' . $r['host'];
// a URL without a path component is treated as pointing at the root
401 if (!isset($r['path'])) $r['path'] = '/';
402 $path = explode('/', $r['path']);
403 $file = explode('/', $filepath);
// paths containing '://' or 'data:' are handled separately (the branch body is not visible)
406 if (strpos($filepath, '://') !== false || strpos($filepath, 'data:') !== false){
// scheme and host followed by the segments of the given path
412 return ''.$domain . implode('/', $file);
// drop an empty last segment of the reference path (it ended in '/')
415 if ($path[sizeof($path)-1] == '') array_pop($path);
// drop a last segment containing a dot (treated as a file name)
416 if (strpos($path[sizeof($path)-1], '.') !== false) array_pop($path);
// walk the segments of the given path; '..' gets special treatment (the body is not visible)
418 foreach ($file as $segment){
419 if ($segment == '..'){
// scheme and host, the remaining reference directory, then the segments collected in $new
425 return ''.$domain . implode('/', $path) . implode('/', $new);
/**
 * Returns the path component of a URL, e.g. the address part of a
 * "mailto:" link.
 *
 * @param array $v Array whose first element is the URL (presumably the
 *                 match array handed over by preg_replace_callback() in
 *                 getNodeValue() - confirm).
 * @return string The path component, or '' when the value cannot be parsed
 *                or has no path.
 */
private function resolveEmail($v)
{
    $parts = parse_url($v[0]);

    // parse_url() returns false for seriously malformed input and leaves out
    // the 'path' key when the URL has none
    if (!is_array($parts) || !isset($parts['path'])) {
        return '';
    }

    return $parts['path'];
}
/**
 * Reduces once-only class names to a single value: where a record holds a
 * list of values for a class name listed in $this->singles, only the first
 * value of that list is kept.
 *
 * @param mixed $s List of records (arrays keyed by class name).
 * @return mixed $s with the affected entries replaced; anything that is not
 *               an array is returned unchanged.
 */
private function dedupeSingles($s)
{
    $singles = $this->singles;

    if (!is_array($s) || !is_array($singles)) {
        return $s;
    }

    // Results are written through $s[$key] rather than a by-reference loop
    // variable, so no reference is left dangling after the loop.
    foreach ($s as $key => $item) {
        if (!is_array($item)) {
            // array_key_exists() accepts arrays only
            continue;
        }

        foreach ($singles as $classname) {
            if (array_key_exists($classname, $item)
                && is_array($item[$classname])
                && isset($item[$classname][0])) {
                $s[$key][$classname] = $item[$classname][0];
            }
        }
    }

    return $s;
}
/**
 * Recursively cleans up an array of extracted data and drops its empty
 * entries.
 *
 * NOTE(review): most of the loop body is not visible in the reviewed
 * excerpt, so the condition under which a value is processed recursively
 * cannot be described here.
 *
 * @param mixed $s Data to clean up; it is iterated with foreach.
 * @return array $s without the entries that array_filter() treats as empty.
 */
451 private function removeTextVals($s)
// $val is taken by reference, so assigning to it changes the entry inside $s
453 foreach ($s as $key => &$val){
// nested values go through the same clean-up
461 $val = $this->removeTextVals($val);
// without a callback array_filter() removes every entry that is empty (falsy)
469 return array_filter($s);