[ Index ]

PHP Cross Reference of vtigercrm-6.1.0

title

Body

[close]

/include/simplehtmldom/ -> simple_html_dom.php (source)

   1  <?php
   2  /**

   3   * Website: http://sourceforge.net/projects/simplehtmldom/

   4   * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)

   5   * Contributions by:

   6   *     Yousuke Kumakura (Attribute filters)

   7   *     Vadim Voituk (Negative indexes supports of "find" method)

   8   *     Antcs (Constructor with automatically load contents either text or file/url)

   9   *

  10   * all affected sections have comments starting with "PaperG"

  11   *

  12   * Paperg - Added case insensitive testing of the value of the selector.

  13   * Paperg - Added tag_start for the starting index of tags - NOTE: This works but not accurately.

  14   *  This tag_start gets counted AFTER \r\n have been crushed out, and after the remove_noice calls so it will not reflect the REAL position of the tag in the source,

  15   *  it will almost always be smaller by some amount.

  16   *  We use this to determine how far into the file the tag in question is.  This "percentage" will never be accurate, as $dom->size is the "real" number of bytes the dom was created from,

  17   *  but for most purposes, it's a really good estimation.

  18   * Paperg - Added the forceTagsClosed to the dom constructor.  Forcing tags closed is great for malformed html, but it CAN lead to parsing errors.

  19   * Allow the user to tell us how much they trust the html.

  20   * Paperg add the text and plaintext to the selectors for the find syntax.  plaintext implies text in the innertext of a node.  text implies that the tag is a text node.

  21   * This allows for us to find tags based on the text they contain.

  22   * Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag.

  23   * Paperg: added parse_charset so that we know about the character set of the source document.

  24   *  NOTE:  If the user's system has a routine called get_last_retrieve_url_contents_content_type available, we will assume it's returning the content-type header from the

  25   *  last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection.

  26   *

  27   * Found infinite loop in the case of broken html in restore_noise.  Rewrote to protect from that.

  28   * PaperG (John Schlick) Added get_display_size for "IMG" tags.

  29   *

  30   * Licensed under The MIT License

  31   * Redistributions of files must retain the above copyright notice.

  32   *

  33   * @author S.C. Chen <[email protected]>

  34   * @author John Schlick

  35   * @author Rus Carroll

  36   * @version 1.5 ($Rev: 196 $)

  37   * @package PlaceLocalInclude

  38   * @subpackage simple_html_dom

  39   */
  40  
  41  /**

  42   * All of the Defines for the classes below.

  43   * @author S.C. Chen <[email protected]>

  44   */
      // Node kinds; a node stores one of these in its $nodetype property.
  45  define('HDOM_TYPE_ELEMENT', 1);
  46  define('HDOM_TYPE_COMMENT', 2);
  47  define('HDOM_TYPE_TEXT',    3);
  48  define('HDOM_TYPE_ENDTAG',  4);
  49  define('HDOM_TYPE_ROOT',    5);
  50  define('HDOM_TYPE_UNKNOWN', 6);
      // Attribute quoting styles recorded per attribute and read back by makeup().
      // Any value other than DOUBLE or SINGLE is rendered without quotes.
      // Note that HDOM_QUOTE_NO is 3; the value 2 is not used.
  51  define('HDOM_QUOTE_DOUBLE', 0);
  52  define('HDOM_QUOTE_SINGLE', 1);
  53  define('HDOM_QUOTE_NO',     3);
      // Keys of a node's "info" array (the $_ property).
  54  define('HDOM_INFO_BEGIN',   0);
  55  define('HDOM_INFO_END',     1);
  56  define('HDOM_INFO_QUOTE',   2);
  57  define('HDOM_INFO_SPACE',   3);
  58  define('HDOM_INFO_TEXT',    4);
  59  define('HDOM_INFO_INNER',   5);
  60  define('HDOM_INFO_OUTER',   6);
  61  define('HDOM_INFO_ENDSPACE',7);
      // Default values for the optional arguments of file_get_html() and str_get_html().
  62  define('DEFAULT_TARGET_CHARSET', 'UTF-8');
  63  define('DEFAULT_BR_TEXT', "\r\n");
  64  define('DEFAULT_SPAN_TEXT', " ");
      // Inputs whose strlen() exceeds this value are rejected by file_get_html() and str_get_html().
  65  define('MAX_FILE_SIZE', 600000);
  66  // helper functions

  67  // -----------------------------------------------------------------------------

  68  // get html dom from file

  69  // $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1.

  70  function file_get_html($url, $use_include_path = false, $context=null, $offset = -1, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
  71  {
  72      // We DO force the tags to be terminated.

  73      $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
  74      // For sourceforge users: uncomment the next line and comment the retreive_url_contents line 2 lines down if it is not already done.

  75      $contents = file_get_contents($url, $use_include_path, $context, $offset);
  76      // Paperg - use our own mechanism for getting the contents as we want to control the timeout.

  77      //$contents = retrieve_url_contents($url);

  78      if (empty($contents) || strlen($contents) > MAX_FILE_SIZE)
  79      {
  80          return false;
  81      }
  82      // The second parameter can force the selectors to all be lowercase.

  83      $dom->load($contents, $lowercase, $stripRN);
  84      return $dom;
  85  }
  86  
  87  // get html dom from string

  88  function str_get_html($str, $lowercase=true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
  89  {
  90      $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
  91      if (empty($str) || strlen($str) > MAX_FILE_SIZE)
  92      {
  93          $dom->clear();
  94          return false;
  95      }
  96      $dom->load($str, $lowercase, $stripRN);
  97      return $dom;
  98  }
  99  
 100  // dump html dom tree

 101  function dump_html_tree($node, $show_attr=true, $deep=0)
 102  {
 103      $node->dump($node);
 104  }
 105  
 106  
 107  /**

 108   * simple html dom node

 109   * PaperG - added ability for "find" routine to lowercase the value of the selector.

 110   * PaperG - added $tag_start to track the start position of the tag in the total byte index

 111   *

 112   * @package PlaceLocalInclude

 113   */
 114  class simple_html_dom_node
 115  {
 116      public $nodetype = HDOM_TYPE_TEXT;
 117      public $tag = 'text';
 118      public $attr = array();
 119      public $children = array();
 120      public $nodes = array();
 121      public $parent = null;
 122      // The "info" array - see HDOM_INFO_... for what each element contains.

 123      public $_ = array();
 124      public $tag_start = 0;
 125      private $dom = null;
 126  
 127      function __construct($dom)
 128      {
 129          $this->dom = $dom;
 130          $dom->nodes[] = $this;
 131      }
 132  
 133      function __destruct()
 134      {
 135          $this->clear();
 136      }
 137  
 138      function __toString()
 139      {
 140          return $this->outertext();
 141      }
 142  
 143      // clean up memory due to php5 circular references memory leak...

 144      function clear()
 145      {
 146          $this->dom = null;
 147          $this->nodes = null;
 148          $this->parent = null;
 149          $this->children = null;
 150      }
 151  
 152      // dump node's tree

 153      function dump($show_attr=true, $deep=0)
 154      {
 155          $lead = str_repeat('    ', $deep);
 156  
 157          echo $lead.$this->tag;
 158          if ($show_attr && count($this->attr)>0)
 159          {
 160              echo '(';
 161              foreach ($this->attr as $k=>$v)
 162                  echo "[$k]=>\"".$this->$k.'", ';
 163              echo ')';
 164          }
 165          echo "\n";
 166  
 167          if ($this->nodes)
 168          {
 169              foreach ($this->nodes as $c)
 170              {
 171                  $c->dump($show_attr, $deep+1);
 172              }
 173          }
 174      }
 175  
 176  
 177      // Debugging function to dump a single dom node with a bunch of information about it.

 178      function dump_node($echo=true)
 179      {
 180  
 181          $string = $this->tag;
 182          if (count($this->attr)>0)
 183          {
 184              $string .= '(';
 185              foreach ($this->attr as $k=>$v)
 186              {
 187                  $string .= "[$k]=>\"".$this->$k.'", ';
 188              }
 189              $string .= ')';
 190          }
 191          if (count($this->_)>0)
 192          {
 193              $string .= ' $_ (';
 194              foreach ($this->_ as $k=>$v)
 195              {
 196                  if (is_array($v))
 197                  {
 198                      $string .= "[$k]=>(";
 199                      foreach ($v as $k2=>$v2)
 200                      {
 201                          $string .= "[$k2]=>\"".$v2.'", ';
 202                      }
 203                      $string .= ")";
 204                  } else {
 205                      $string .= "[$k]=>\"".$v.'", ';
 206                  }
 207              }
 208              $string .= ")";
 209          }
 210  
 211          if (isset($this->text))
 212          {
 213              $string .= " text: (" . $this->text . ")";
 214          }
 215  
 216          $string .= " HDOM_INNER_INFO: '";
 217          if (isset($node->_[HDOM_INFO_INNER]))
 218          {
 219              $string .= $node->_[HDOM_INFO_INNER] . "'";
 220          }
 221          else
 222          {
 223              $string .= ' NULL ';
 224          }
 225  
 226          $string .= " children: " . count($this->children);
 227          $string .= " nodes: " . count($this->nodes);
 228          $string .= " tag_start: " . $this->tag_start;
 229          $string .= "\n";
 230  
 231          if ($echo)
 232          {
 233              echo $string;
 234              return;
 235          }
 236          else
 237          {
 238              return $string;
 239          }
 240      }
 241  
 242      // returns the parent of node

 243      // If a node is passed in, it will reset the parent of the current node to that one.

 244      function parent($parent=null)
 245      {
 246          // I am SURE that this doesn't work properly.

 247          // It fails to unset the current node from it's current parents nodes or children list first.

 248          if ($parent !== null)
 249          {
 250              $this->parent = $parent;
 251              $this->parent->nodes[] = $this;
 252              $this->parent->children[] = $this;
 253          }
 254  
 255          return $this->parent;
 256      }
 257  
 258      // verify that node has children

 259      function has_child()
 260      {
 261          return !empty($this->children);
 262      }
 263  
 264      // returns children of node

 265      function children($idx=-1)
 266      {
 267          if ($idx===-1)
 268          {
 269              return $this->children;
 270          }
 271          if (isset($this->children[$idx])) return $this->children[$idx];
 272          return null;
 273      }
 274  
 275      // returns the first child of node

 276      function first_child()
 277      {
 278          if (count($this->children)>0)
 279          {
 280              return $this->children[0];
 281          }
 282          return null;
 283      }
 284  
 285      // returns the last child of node

 286      function last_child()
 287      {
 288          if (($count=count($this->children))>0)
 289          {
 290              return $this->children[$count-1];
 291          }
 292          return null;
 293      }
 294  
 295      // returns the next sibling of node

 296      function next_sibling()
 297      {
 298          if ($this->parent===null)
 299          {
 300              return null;
 301          }
 302  
 303          $idx = 0;
 304          $count = count($this->parent->children);
 305          while ($idx<$count && $this!==$this->parent->children[$idx])
 306          {
 307              ++$idx;
 308          }
 309          if (++$idx>=$count)
 310          {
 311              return null;
 312          }
 313          return $this->parent->children[$idx];
 314      }
 315  
 316      // returns the previous sibling of node

 317      function prev_sibling()
 318      {
 319          if ($this->parent===null) return null;
 320          $idx = 0;
 321          $count = count($this->parent->children);
 322          while ($idx<$count && $this!==$this->parent->children[$idx])
 323              ++$idx;
 324          if (--$idx<0) return null;
 325          return $this->parent->children[$idx];
 326      }
 327  
 328      // function to locate a specific ancestor tag in the path to the root.

 329      function find_ancestor_tag($tag)
 330      {
 331          global $debugObject;
 332          if (is_object($debugObject)) { $debugObject->debugLogEntry(1); }
 333  
 334          // Start by including ourselves in the comparison.

 335          $returnDom = $this;
 336  
 337          while (!is_null($returnDom))
 338          {
 339              if (is_object($debugObject)) { $debugObject->debugLog(2, "Current tag is: " . $returnDom->tag); }
 340  
 341              if ($returnDom->tag == $tag)
 342              {
 343                  break;
 344              }
 345              $returnDom = $returnDom->parent;
 346          }
 347          return $returnDom;
 348      }
 349  
 350      // get dom node's inner html

 351      function innertext()
 352      {
 353          if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
 354          if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
 355  
 356          $ret = '';
 357          foreach ($this->nodes as $n)
 358              $ret .= $n->outertext();
 359          return $ret;
 360      }
 361  
 362      // get dom node's outer text (with tag)

 363      function outertext()
 364      {
 365          global $debugObject;
 366          if (is_object($debugObject))
 367          {
 368              $text = '';
 369              if ($this->tag == 'text')
 370              {
 371                  if (!empty($this->text))
 372                  {
 373                      $text = " with text: " . $this->text;
 374                  }
 375              }
 376              $debugObject->debugLog(1, 'Innertext of tag: ' . $this->tag . $text);
 377          }
 378  
 379          if ($this->tag==='root') return $this->innertext();
 380  
 381          // trigger callback

 382          if ($this->dom && $this->dom->callback!==null)
 383          {
 384              call_user_func_array($this->dom->callback, array($this));
 385          }
 386  
 387          if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER];
 388          if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
 389  
 390          // render begin tag

 391          if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]])
 392          {
 393              $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
 394          } else {
 395              $ret = "";
 396          }
 397  
 398          // render inner text

 399          if (isset($this->_[HDOM_INFO_INNER]))
 400          {
 401              // If it's a br tag...  don't return the HDOM_INNER_INFO that we may or may not have added.

 402              if ($this->tag != "br")
 403              {
 404                  $ret .= $this->_[HDOM_INFO_INNER];
 405              }
 406          } else {
 407              if ($this->nodes)
 408              {
 409                  foreach ($this->nodes as $n)
 410                  {
 411                      $ret .= $this->convert_text($n->outertext());
 412                  }
 413              }
 414          }
 415  
 416          // render end tag

 417          if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0)
 418              $ret .= '</'.$this->tag.'>';
 419          return $ret;
 420      }
 421  
 422      // get dom node's plain text

 423      function text()
 424      {
 425          if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
 426          switch ($this->nodetype)
 427          {
 428              case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
 429              case HDOM_TYPE_COMMENT: return '';
 430              case HDOM_TYPE_UNKNOWN: return '';
 431          }
 432          if (strcasecmp($this->tag, 'script')===0) return '';
 433          if (strcasecmp($this->tag, 'style')===0) return '';
 434  
 435          $ret = '';
 436          // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed for some span tags, and some p tags) $this->nodes is set to NULL.

 437          // NOTE: This indicates that there is a problem where it's set to NULL without a clear happening.

 438          // WHY is this happening?

 439          if (!is_null($this->nodes))
 440          {
 441              foreach ($this->nodes as $n)
 442              {
 443                  $ret .= $this->convert_text($n->text());
 444              }
 445  
 446              // If this node is a span... add a space at the end of it so multiple spans don't run into each other.  This is plaintext after all.

 447              if ($this->tag == "span")
 448              {
 449                  $ret .= $this->dom->default_span_text;
 450              }
 451  
 452  
 453          }
 454          return $ret;
 455      }
 456  
 457      function xmltext()
 458      {
 459          $ret = $this->innertext();
 460          $ret = str_ireplace('<![CDATA[', '', $ret);
 461          $ret = str_replace(']]>', '', $ret);
 462          return $ret;
 463      }
 464  
 465      // build node's text with tag

 466      function makeup()
 467      {
 468          // text, comment, unknown

 469          if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
 470  
 471          $ret = '<'.$this->tag;
 472          $i = -1;
 473  
 474          foreach ($this->attr as $key=>$val)
 475          {
 476              ++$i;
 477  
 478              // skip removed attribute

 479              if ($val===null || $val===false)
 480                  continue;
 481  
 482              $ret .= $this->_[HDOM_INFO_SPACE][$i][0];
 483              //no value attr: nowrap, checked selected...

 484              if ($val===true)
 485                  $ret .= $key;
 486              else {
 487                  switch ($this->_[HDOM_INFO_QUOTE][$i])
 488                  {
 489                      case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
 490                      case HDOM_QUOTE_SINGLE: $quote = '\''; break;
 491                      default: $quote = '';
 492                  }
 493                  $ret .= $key.$this->_[HDOM_INFO_SPACE][$i][1].'='.$this->_[HDOM_INFO_SPACE][$i][2].$quote.$val.$quote;
 494              }
 495          }
 496          $ret = $this->dom->restore_noise($ret);
 497          return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
 498      }
 499  
 500      // find elements by css selector

 501      //PaperG - added ability for find to lowercase the value of the selector.

 502      function find($selector, $idx=null, $lowercase=false)
 503      {
 504          $selectors = $this->parse_selector($selector);
 505          if (($count=count($selectors))===0) return array();
 506          $found_keys = array();
 507  
 508          // find each selector

 509          for ($c=0; $c<$count; ++$c)
 510          {
 511              // The change on the below line was documented on the sourceforge code tracker id 2788009

 512              // used to be: if (($levle=count($selectors[0]))===0) return array();

 513              if (($levle=count($selectors[$c]))===0) return array();
 514              if (!isset($this->_[HDOM_INFO_BEGIN])) return array();
 515  
 516              $head = array($this->_[HDOM_INFO_BEGIN]=>1);
 517  
 518              // handle descendant selectors, no recursive!

 519              for ($l=0; $l<$levle; ++$l)
 520              {
 521                  $ret = array();
 522                  foreach ($head as $k=>$v)
 523                  {
 524                      $n = ($k===-1) ? $this->dom->root : $this->dom->nodes[$k];
 525                      //PaperG - Pass this optional parameter on to the seek function.

 526                      $n->seek($selectors[$c][$l], $ret, $lowercase);
 527                  }
 528                  $head = $ret;
 529              }
 530  
 531              foreach ($head as $k=>$v)
 532              {
 533                  if (!isset($found_keys[$k]))
 534                      $found_keys[$k] = 1;
 535              }
 536          }
 537  
 538          // sort keys

 539          ksort($found_keys);
 540  
 541          $found = array();
 542          foreach ($found_keys as $k=>$v)
 543              $found[] = $this->dom->nodes[$k];
 544  
 545          // return nth-element or array

 546          if (is_null($idx)) return $found;
 547          else if ($idx<0) $idx = count($found) + $idx;
 548          return (isset($found[$idx])) ? $found[$idx] : null;
 549      }
 550  
 551      // seek for given conditions

 552      // PaperG - added parameter to allow for case insensitive testing of the value of a selector.

 553      protected function seek($selector, &$ret, $lowercase=false)
 554      {
 555          global $debugObject;
 556          if (is_object($debugObject)) { $debugObject->debugLogEntry(1); }
 557  
 558          list($tag, $key, $val, $exp, $no_key) = $selector;
 559  
 560          // xpath index

 561          if ($tag && $key && is_numeric($key))
 562          {
 563              $count = 0;
 564              foreach ($this->children as $c)
 565              {
 566                  if ($tag==='*' || $tag===$c->tag) {
 567                      if (++$count==$key) {
 568                          $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
 569                          return;
 570                      }
 571                  }
 572              }
 573              return;
 574          }
 575  
 576          $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
 577          if ($end==0) {
 578              $parent = $this->parent;
 579              while (!isset($parent->_[HDOM_INFO_END]) && $parent!==null) {
 580                  $end -= 1;
 581                  $parent = $parent->parent;
 582              }
 583              $end += $parent->_[HDOM_INFO_END];
 584          }
 585  
 586          for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
 587              $node = $this->dom->nodes[$i];
 588  
 589              $pass = true;
 590  
 591              if ($tag==='*' && !$key) {
 592                  if (in_array($node, $this->children, true))
 593                      $ret[$i] = 1;
 594                  continue;
 595              }
 596  
 597              // compare tag

 598              if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;}
 599              // compare key

 600              if ($pass && $key) {
 601                  if ($no_key) {
 602                      if (isset($node->attr[$key])) $pass=false;
 603                  } else {
 604                      if (($key != "plaintext") && !isset($node->attr[$key])) $pass=false;
 605                  }
 606              }
 607              // compare value

 608              if ($pass && $key && $val  && $val!=='*') {
 609                  // If they have told us that this is a "plaintext" search then we want the plaintext of the node - right?

 610                  if ($key == "plaintext") {
 611                      // $node->plaintext actually returns $node->text();

 612                      $nodeKeyValue = $node->text();
 613                  } else {
 614                      // this is a normal search, we want the value of that attribute of the tag.

 615                      $nodeKeyValue = $node->attr[$key];
 616                  }
 617                  if (is_object($debugObject)) {$debugObject->debugLog(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);}
 618  
 619                  //PaperG - If lowercase is set, do a case insensitive test of the value of the selector.

 620                  if ($lowercase) {
 621                      $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue));
 622                  } else {
 623                      $check = $this->match($exp, $val, $nodeKeyValue);
 624                  }
 625                  if (is_object($debugObject)) {$debugObject->debugLog(2, "after match: " . ($check ? "true" : "false"));}
 626  
 627                  // handle multiple class

 628                  if (!$check && strcasecmp($key, 'class')===0) {
 629                      foreach (explode(' ',$node->attr[$key]) as $k) {
 630                          // Without this, there were cases where leading, trailing, or double spaces lead to our comparing blanks - bad form.

 631                          if (!empty($k)) {
 632                              if ($lowercase) {
 633                                  $check = $this->match($exp, strtolower($val), strtolower($k));
 634                              } else {
 635                                  $check = $this->match($exp, $val, $k);
 636                              }
 637                              if ($check) break;
 638                          }
 639                      }
 640                  }
 641                  if (!$check) $pass = false;
 642              }
 643              if ($pass) $ret[$i] = 1;
 644              unset($node);
 645          }
 646          // It's passed by reference so this is actually what this function returns.

 647          if (is_object($debugObject)) {$debugObject->debugLog(1, "EXIT - ret: ", $ret);}
 648      }
 649  
 650      protected function match($exp, $pattern, $value) {
 651          global $debugObject;
 652          if (is_object($debugObject)) {$debugObject->debugLogEntry(1);}
 653  
 654          switch ($exp) {
 655              case '=':
 656                  return ($value===$pattern);
 657              case '!=':
 658                  return ($value!==$pattern);
 659              case '^=':
 660                  return preg_match("/^".preg_quote($pattern,'/')."/", $value);
 661              case '$=':
 662                  return preg_match("/".preg_quote($pattern,'/')."$/", $value);
 663              case '*=':
 664                  if ($pattern[0]=='/') {
 665                      return preg_match($pattern, $value);
 666                  }
 667                  return preg_match("/".$pattern."/i", $value);
 668          }
 669          return false;
 670      }
 671  
 672      protected function parse_selector($selector_string) {
 673          global $debugObject;
 674          if (is_object($debugObject)) {$debugObject->debugLogEntry(1);}
 675  
 676          // pattern of CSS selectors, modified from mootools

 677          // Paperg: Add the colon to the attrbute, so that it properly finds <tag attr:ibute="something" > like google does.

 678          // Note: if you try to look at this attribute, yo MUST use getAttribute since $dom->x:y will fail the php syntax check.

 679  // Notice the \[ starting the attbute?  and the @? following?  This implies that an attribute can begin with an @ sign that is not captured.

 680  // This implies that an html attribute specifier may start with an @ sign that is NOT captured by the expression.

 681  // farther study is required to determine of this should be documented or removed.

 682  //        $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";

 683          $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
 684          preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
 685          if (is_object($debugObject)) {$debugObject->debugLog(2, "Matches Array: ", $matches);}
 686  
 687          $selectors = array();
 688          $result = array();
 689          //print_r($matches);

 690  
 691          foreach ($matches as $m) {
 692              $m[0] = trim($m[0]);
 693              if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') continue;
 694              // for browser generated xpath

 695              if ($m[1]==='tbody') continue;
 696  
 697              list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false);
 698              if (!empty($m[2])) {$key='id'; $val=$m[2];}
 699              if (!empty($m[3])) {$key='class'; $val=$m[3];}
 700              if (!empty($m[4])) {$key=$m[4];}
 701              if (!empty($m[5])) {$exp=$m[5];}
 702              if (!empty($m[6])) {$val=$m[6];}
 703  
 704              // convert to lowercase

 705              if ($this->dom->lowercase) {$tag=strtolower($tag); $key=strtolower($key);}
 706              //elements that do NOT have the specified attribute

 707              if (isset($key[0]) && $key[0]==='!') {$key=substr($key, 1); $no_key=true;}
 708  
 709              $result[] = array($tag, $key, $val, $exp, $no_key);
 710              if (trim($m[7])===',') {
 711                  $selectors[] = $result;
 712                  $result = array();
 713              }
 714          }
 715          if (count($result)>0)
 716              $selectors[] = $result;
 717          return $selectors;
 718      }
 719  
 720      function __get($name) {
 721          if (isset($this->attr[$name]))
 722          {
 723              return $this->convert_text($this->attr[$name]);
 724          }
 725          switch ($name) {
 726              case 'outertext': return $this->outertext();
 727              case 'innertext': return $this->innertext();
 728              case 'plaintext': return $this->text();
 729              case 'xmltext': return $this->xmltext();
 730              default: return array_key_exists($name, $this->attr);
 731          }
 732      }
 733  
 734      function __set($name, $value) {
 735          switch ($name) {
 736              case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;
 737              case 'innertext':
 738                  if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value;
 739                  return $this->_[HDOM_INFO_INNER] = $value;
 740          }
 741          if (!isset($this->attr[$name])) {
 742              $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
 743              $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
 744          }
 745          $this->attr[$name] = $value;
 746      }
 747  
 748      function __isset($name) {
 749          switch ($name) {
 750              case 'outertext': return true;
 751              case 'innertext': return true;
 752              case 'plaintext': return true;
 753          }
 754          //no value attr: nowrap, checked selected...

 755          return (array_key_exists($name, $this->attr)) ? true : isset($this->attr[$name]);
 756      }
 757  
 758      function __unset($name) {
 759          if (isset($this->attr[$name]))
 760              unset($this->attr[$name]);
 761      }
 762  
 763      // PaperG - Function to convert the text from one character set to another if the two sets are not the same.

 /**
  * Converts text from the owning document's detected charset to its target
  * charset.
  *
  * No conversion happens when the node has no document, when either charset
  * is empty, or when both are the same (case-insensitive). When the target is
  * UTF-8 and the text already validates as UTF-8, it is passed through
  * unchanged on the assumption that the reported source charset was wrong.
  * For a UTF-8 target a byte order mark is stripped from both ends.
  *
  * @param string $text Text to convert.
  * @return string|false The converted text; iconv()'s result is returned as is.
  */
 function convert_text($text)
 {
     global $debugObject;
     if (is_object($debugObject)) {
         $debugObject->debugLogEntry(1);
     }

     $from = "";
     $to = "";
     if ($this->dom) {
         $from = strtoupper($this->dom->_charset);
         $to = strtoupper($this->dom->_target_charset);
     }
     if (is_object($debugObject)) {
         $debugObject->debugLog(3, "source charset: " . $from . " target charaset: " . $to);
     }

     $result = $text;

     $charsetsDiffer = !empty($from) && !empty($to) && (strcasecmp($from, $to) != 0);
     if ($charsetsDiffer) {
         // Check if the reported encoding could have been incorrect and the text is actually already UTF-8
         $alreadyUtf8 = (strcasecmp($to, 'UTF-8') == 0) && ($this->is_utf8($text));
         if (!$alreadyUtf8) {
             $result = iconv($from, $to, $text);
         }
     }

     // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
     if ($to == 'UTF-8') {
         $bom = "\xef\xbb\xbf";
         if (substr($result, 0, 3) == $bom) {
             $result = substr($result, 3);
         }
         if (substr($result, -3) == $bom) {
             $result = substr($result, 0, -3);
         }
     }

     return $result;
 }
 809  
 810      /**

 811      * Returns true if $string is valid UTF-8 and false otherwise.

 812      *

 813      * @param mixed $str String to be tested

 814      * @return boolean

 815      */
 static function is_utf8($str)
 {
     // Structural check only: every lead byte must be followed by the number of
     // continuation bytes (0x80-0xBF) that its high bits announce. Legacy 5- and
     // 6-byte sequences are still accepted and overlong forms are not rejected,
     // exactly as before.
     $len = strlen($str);
     for ($i = 0; $i < $len; $i++) {
         $c = ord($str[$i]);

         // Plain ASCII needs no further inspection.
         // NOTE: the boundary is 128 inclusive. 0x80 is a continuation byte and
         // must not be accepted on its own (it used to slip through as ASCII).
         if ($c < 128) {
             continue;
         }

         // Work out the total sequence length announced by the lead byte.
         if ($c >= 254) {
             return false;
         } elseif ($c >= 252) {
             $bits = 6;
         } elseif ($c >= 248) {
             $bits = 5;
         } elseif ($c >= 240) {
             $bits = 4;
         } elseif ($c >= 224) {
             $bits = 3;
         } elseif ($c >= 192) {
             $bits = 2;
         } else {
             // 0x80-0xBF: a continuation byte where a lead byte is expected.
             return false;
         }

         // The sequence must fit inside the string.
         if (($i + $bits) > $len) {
             return false;
         }

         // Consume the continuation bytes.
         while ($bits > 1) {
             $i++;
             $b = ord($str[$i]);
             if ($b < 128 || $b > 191) {
                 return false;
             }
             $bits--;
         }
     }

     return true;
 }
 845      /*

 846      function is_utf8($string)

 847      {

 848          //this is buggy

 849          return (utf8_encode(utf8_decode($string)) == $string);

 850      }

 851      */
 852  
 853      /**

 854       * Function to try a few tricks to determine the displayed size of an img on the page.

 855       * NOTE: This will ONLY work on an IMG tag. Returns FALSE on all other tag types.

 856       *

 857       * @author John Schlick

 858       * @version April 19 2012

 859       * @return array an array containing the 'height' and 'width' of the image on the page or -1 if we can't figure it out.

 860       */
 function get_display_size()
 {
     // Only <img> nodes have a display size; every other tag reports false.
     if ($this->tag !== 'img') {
         return false;
     }

     // -1 means "could not be determined". Key order matches the documented result.
     $size = array('height' => -1, 'width' => -1);
     $dimensions = array('width', 'height');

     // See if there is a height or width attribute in the tag itself.
     // These take precedence over anything found in the inline style.
     foreach ($dimensions as $dimension) {
         if (isset($this->attr[$dimension])) {
             $size[$dimension] = $this->attr[$dimension];
         }
     }

     // Now look for an inline style.
     if (isset($this->attr['style'])) {
         // Thanks to user gnarf from stackoverflow for this regular expression.
         $declarations = array();
         preg_match_all("/([\w-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER);
         foreach ($matches as $match) {
             // The value group is greedy and keeps whitespace that precedes the
             // ';', so trim it or the "px" suffix test below fails. Property
             // names are case-insensitive in CSS, hence the lowercase key.
             $declarations[strtolower($match[1])] = trim($match[2]);
         }

         foreach ($dimensions as $dimension) {
             // Skip dimensions the style does not mention or the tag already fixed.
             if (!isset($declarations[$dimension]) || $size[$dimension] != -1) {
                 continue;
             }

             // check that the last two characters are px (pixels)
             $value = $declarations[$dimension];
             if (strtolower(substr($value, -2)) != 'px') {
                 continue;
             }

             // Now make sure that it's an integer and not something stupid.
             // Compare with false explicitly: "0" validates to int 0, which is
             // falsy and used to be rejected.
             $pixels = trim(substr($value, 0, -2));
             if (filter_var($pixels, FILTER_VALIDATE_INT) !== false) {
                 $size[$dimension] = $pixels;
             }
         }
     }

     // Future enhancement:
     // Look in the tag to see if there is a class or id specified that has a height or width attribute to it.

     // Far future enhancement
     // Look at all the parent tags of this image to see if they specify a class or id that has an img selector that specifies a height or width
     // Note that in this case, the class or id will have the img subselector for it to apply to the image.

     // ridiculously far future development
     // If the class or id is specified in a SEPARATE css file thats not on the page, go get it and do what we were just doing for the ones on the page.

     return $size;
 }
 940  
 941      // camel naming conventions

 /** Returns the node's attribute array. */
 function getAllAttributes()
 {
     return $this->attr;
 }

 /** Reads an attribute (or virtual property) through __get(). */
 function getAttribute($name)
 {
     return $this->__get($name);
 }

 /** Writes an attribute (or virtual property) through __set(). */
 function setAttribute($name, $value)
 {
     $this->__set($name, $value);
 }

 /** Reports whether an attribute (or virtual property) exists, via __isset(). */
 function hasAttribute($name)
 {
     return $this->__isset($name);
 }

 /** Sets the attribute to null through __set(); the key itself is not unset. */
 function removeAttribute($name)
 {
     $this->__set($name, null);
 }

 /** Finds the first descendant matching the "#id" selector. */
 function getElementById($id)
 {
     return $this->find("#$id", 0);
 }

 /** Finds descendants matching the "#id" selector; $idx is passed straight to find(). */
 function getElementsById($id, $idx=null)
 {
     return $this->find("#$id", $idx);
 }

 /** Finds the first descendant matching the tag-name selector. */
 function getElementByTagName($name)
 {
     return $this->find($name, 0);
 }

 /** Finds descendants matching the tag-name selector; $idx is passed straight to find(). */
 function getElementsByTagName($name, $idx=null)
 {
     return $this->find($name, $idx);
 }

 /** Alias of parent(). */
 function parentNode()
 {
     return $this->parent();
 }

 /** Alias of children(); $idx is passed through unchanged. */
 function childNodes($idx=-1)
 {
     return $this->children($idx);
 }

 /** Alias of first_child(). */
 function firstChild()
 {
     return $this->first_child();
 }

 /** Alias of last_child(). */
 function lastChild()
 {
     return $this->last_child();
 }

 /** Alias of next_sibling(). */
 function nextSibling()
 {
     return $this->next_sibling();
 }

 /** Alias of prev_sibling(). */
 function previousSibling()
 {
     return $this->prev_sibling();
 }

 /** Alias of has_child(). */
 function hasChildNodes()
 {
     return $this->has_child();
 }

 /** Returns the node's tag name. */
 function nodeName()
 {
     return $this->tag;
 }

 /** Hands this node to $node->parent() as the new parent, then returns $node. */
 function appendChild($node)
 {
     $node->parent($this);
     return $node;
 }
 960  
 961  }
 962  
 963  /**

 964   * simple html dom parser

 965   * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector.

 966   * Paperg - change $size from protected to public so we can easily access it

 967   * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not.  Default is to NOT trust it.

 968   *

 969   * @package PlaceLocalInclude

 970   */
 971  class simple_html_dom
 972  {
      // Root node of the parsed tree; prepare() creates it with the tag 'root'.
 973      public $root = null;
      // Node list; reset by prepare() and walked by clear() to release every node.
 974      public $nodes = array();
      // Callback name stored by set_callback() and reset by remove_callback().
 975      public $callback = null;
      // When true, read_tag() lowercases tag and attribute names.
 976      public $lowercase = false;
 977      // Used to keep track of how large the text was when we started.

 978      public $original_size;
      // Byte length of $doc as it is being parsed (recomputed after \r and \n are replaced).
 979      public $size;
      // Parser state: $pos is the current byte offset into $doc, $char the byte at
      // that offset (null once past the end), $cursor a counter that is incremented
      // for every node created, and $parent the node new nodes are attached to.
 980      protected $pos;
 981      protected $doc;
 982      protected $char;
 983      protected $cursor;
 984      protected $parent;
      // Reset by prepare(); presumably filled by remove_noise() (not shown here) - TODO confirm.
 985      protected $noise = array();
      // Delimiter sets used by read_tag()/parse_attr(): whitespace to skip, end of an
      // attribute name, end of a tag name, and end of an unquoted attribute value.
 986      protected $token_blank = " \t\r\n";
 987      protected $token_equal = ' =/>';
 988      protected $token_slash = " />\r\n\t";
 989      protected $token_attr = ' >';
 990      // Note that this is referenced by a child node, and so it needs to be public for that node to see this information.

 991      public $_charset = '';
 992      public $_target_charset = '';
      // Text stored as the inner text of every <br> node by read_tag().
 993      protected $default_br_text = "";
 994      public $default_span_text = "";
 995  
 996      // use isset instead of in_array, performance boost about 30%...

      // Tags that read_tag() never makes the current parent, even without a trailing '/'.
 997      protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1);
      // Tags whose end tag may close several still-open ancestors at once in read_tag().
 998      protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1);
 999      // Known sourceforge issue #2977341

1000      // B tags that are not closed cause us to return everything to the end of the document.

      // Keyed by the tag being opened; the value is the set of parent tags that are
      // implicitly closed when that tag starts (see read_tag()).
1001      protected $optional_closing_tags = array(
1002          'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1),
1003          'th'=>array('th'=>1),
1004          'td'=>array('td'=>1),
1005          'li'=>array('li'=>1),
1006          'dt'=>array('dt'=>1, 'dd'=>1),
1007          'dd'=>array('dd'=>1, 'dt'=>1),
1008          'dl'=>array('dd'=>1, 'dt'=>1),
1009          'p'=>array('p'=>1),
1010          'nobr'=>array('nobr'=>1),
1011          'b'=>array('b'=>1),
1012          'option'=>array('option'=>1),
1013      );
1014  
/**
 * Creates the parser and optionally loads a document straight away.
 *
 * @param string|null $str             HTML text, or an http:// URL / existing file path to read.
 * @param bool        $lowercase       Lowercase tag and attribute names (string input only).
 * @param bool        $forceTagsClosed When false the HTML is trusted and the implicit
 *                                     closing of optional tags is switched off.
 * @param string      $target_charset  Charset node text is converted to on output.
 * @param bool        $stripRN         Replace \r and \n with spaces before parsing (string input only).
 * @param string      $defaultBRText   Text used for <br> nodes (string input only).
 * @param string      $defaultSpanText Stored as the default span text (string input only).
 */
function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    // Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html.
    // This has to be applied BEFORE the document is parsed, and to the property
    // read_tag() actually consults. It used to be written to a non-existent
    // "optional_closing_array" after loading, so the flag had no effect at all.
    if (!$forceTagsClosed) {
        $this->optional_closing_tags = array();
    }
    $this->_target_charset = $target_charset;

    if ($str) {
        if (preg_match("/^http:\/\//i", $str) || is_file($str)) {
            $this->load_file($str);
        } else {
            $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
        }
    }
}
1034  
/**
 * Releases the node tree when the parser object is destroyed by delegating
 * to clear().
 */
function __destruct()
{
    $this->clear();
}
1039  
1040      // load html from string

/**
 * Parses an HTML string into this object.
 *
 * The input is prepared, a fixed sequence of noise patterns is handed to
 * remove_noise(), the parser is run until parse() reports there is nothing
 * left, and finally the charset is detected.
 *
 * @param string $str             HTML text.
 * @param bool   $lowercase       Lowercase tag and attribute names.
 * @param bool   $stripRN         Replace \r and \n with spaces before parsing.
 * @param string $defaultBRText   Text used for <br> nodes.
 * @param string $defaultSpanText Stored as the default span text.
 * @return simple_html_dom This object, so that calls can be chained.
 */
function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    global $debugObject;

    // prepare
    $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);

    // Order matters. Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
    // script tag removal precedes style tag removal.
    // Each entry is the argument list for remove_noise(); a second element is
    // only present where the call needs it.
    $noisePatterns = array(
        // comments
        array("'<!--(.*?)-->'is"),
        // cdata
        array("'<!\[CDATA\[(.*?)\]\]>'is", true),
        // <script> tags
        array("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is"),
        array("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is"),
        // <style> tags
        array("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is"),
        array("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is"),
        // preformatted tags
        array("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is"),
        // server side scripts
        array("'(<\?)(.*?)(\?>)'s", true),
        // smarty scripts
        array("'(\{\w)(.*?)(\})'s", true),
    );
    foreach ($noisePatterns as $arguments) {
        if (count($arguments) > 1) {
            $this->remove_noise($arguments[0], $arguments[1]);
        } else {
            $this->remove_noise($arguments[0]);
        }
    }

    // parsing: parse() consumes one piece of the document per call
    while ($this->parse()) {
        // nothing to do here, the work happens inside parse()
    }

    // end
    $this->root->_[HDOM_INFO_END] = $this->cursor;
    $this->parse_charset();

    // make load function chainable
    return $this;
}
1076  
1077      // load html from file

/**
 * Reads a file or URL with file_get_contents() and parses the result.
 *
 * All arguments are forwarded to file_get_contents() unchanged.
 *
 * @return false|null False when the content could not be read (the object is
 *                    cleared in that case); nothing on success.
 */
function load_file()
{
    $args = func_get_args();
    $contents = call_user_func_array('file_get_contents', $args);

    // Decide on the return value of the read itself. error_get_last() also
    // reports errors raised long before this call, which made a perfectly
    // good load look like a failure and threw the parsed document away.
    if ($contents === false) {
        $this->clear();
        return false;
    }

    $this->load($contents, true);
}
1088  
1089      // set callback function

/**
 * Stores the name of the callback function on this object.
 *
 * @param string $function_name Callback to remember.
 */
function set_callback($function_name)
{
    $this->callback = $function_name;
}
1094  
1095      // remove callback function

/**
 * Forgets the stored callback by resetting it to null.
 */
function remove_callback()
{
    $this->callback = null;
}
1100  
1101      // save dom as string

/**
 * Serialises the document (the root node's inner text) and optionally writes
 * it to a file under an exclusive lock.
 *
 * @param string $filepath Target file; the empty string means "do not write".
 * @return string The serialised document.
 */
function save($filepath='')
{
    $markup = $this->root->innertext();

    if ($filepath === '') {
        return $markup;
    }

    file_put_contents($filepath, $markup, LOCK_EX);
    return $markup;
}
1108  
1109      // find dom node by css selector

1110      // Paperg - allow us to specify that we want case insensitive testing of the value of the selector.

/**
 * Runs a CSS selector against the whole document by delegating to the root
 * node's find().
 *
 * @param string   $selector  CSS selector.
 * @param int|null $idx       Passed through to the root node's find().
 * @param bool     $lowercase Passed through; requests case insensitive testing of the selector value.
 * @return mixed Whatever the root node's find() returns.
 */
function find($selector, $idx=null, $lowercase=false)
{
    $root = $this->root;
    return $root->find($selector, $idx, $lowercase);
}
1115  
1116      // clean up memory due to php5 circular references memory leak...

/**
 * Breaks the references between the parser and its nodes so the memory can
 * be reclaimed (works around the php5 circular reference leak).
 *
 * Every node gets clear() called on it, the parent and root links are
 * removed, and the document text and noise table are dropped.
 */
function clear()
{
    foreach ($this->nodes as $node) {
        $node->clear();
        $node = null;
    }

    // This next step is documented in the sourceforge repository. 2977248 as a fix for ongoing memory leaks that occur even with the use of clear.
    if (isset($this->children)) {
        foreach ($this->children as $child) {
            $child->clear();
            $child = null;
        }
    }

    // Same treatment for both links, parent first.
    foreach (array('parent', 'root') as $link) {
        if (isset($this->$link)) {
            $this->$link->clear();
            unset($this->$link);
        }
    }

    unset($this->doc, $this->noise);
}
1127  
/**
 * Dumps the node tree by delegating to the root node's dump().
 *
 * @param bool $show_attr Passed through to the root node's dump().
 */
function dump($show_attr=true)
{
    $root = $this->root;
    $root->dump($show_attr);
}
1132  
1133      // prepare HTML data and init everything

/**
 * Resets the parser and installs a new document to be parsed.
 *
 * Clears any previous state, records the input size, optionally replaces
 * \r and \n with spaces, resets position/cursor/noise/nodes, and creates the
 * root node, which also becomes the current parent.
 *
 * @param string $str             HTML text.
 * @param bool   $lowercase       Lowercase tag and attribute names.
 * @param bool   $stripRN         Replace \r and \n with spaces.
 * @param string $defaultBRText   Text used for <br> nodes.
 * @param string $defaultSpanText Stored as the default span text.
 */
protected function prepare($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    $this->clear();

    // Record the length before anything is done to the text. The original
    // size is kept separately because it might be useful to someone.
    $this->original_size = $this->size = strlen($str);

    // before we save the string as the doc...  strip out the \r \n's if we are told to.
    if ($stripRN) {
        $str = str_replace(array("\r", "\n"), " ", $str);

        // set the length of content since we have changed it.
        $this->size = strlen($str);
    }

    // Parser state.
    $this->doc = $str;
    $this->pos = 0;
    $this->cursor = 1;
    $this->noise = array();
    $this->nodes = array();

    // Options.
    $this->lowercase = $lowercase;
    $this->default_br_text = $defaultBRText;
    $this->default_span_text = $defaultSpanText;

    // The root node is created only after $nodes has been reset.
    $root = new simple_html_dom_node($this);
    $root->tag = 'root';
    $root->_[HDOM_INFO_BEGIN] = -1;
    $root->nodetype = HDOM_TYPE_ROOT;
    $this->root = $root;
    $this->parent = $root;

    if ($this->size > 0) {
        $this->char = $this->doc[0];
    }
}
1167  
1168      // parse html content

/**
 * Consumes the next piece of the document.
 *
 * Everything up to the next '<' becomes a text node. When there is no such
 * text, the work is handed to read_tag().
 *
 * @return mixed True after a text node was added, otherwise read_tag()'s result.
 */
protected function parse()
{
    $text = $this->copy_until_char('<');
    if ($text === '') {
        return $this->read_tag();
    }

    // text
    $textNode = new simple_html_dom_node($this);
    ++$this->cursor;
    $textNode->_[HDOM_INFO_TEXT] = $text;
    $this->link_nodes($textNode, false);

    return true;
}
1183  
1184      // PAPERG - dkchou - added this to try to identify the character set of the page we have just parsed so we know better how to spit it out later.

1185      // NOTE:  IF you provide a routine called get_last_retrieve_url_contents_content_type which returns the CURLINFO_CONTENT_TYPE from the last curl_exec

1186      // (or the content_type header from the last transfer), we will parse THAT, and if a charset is specified, we will use it over any other mechanism.

/**
 * Works out the character set of the parsed document and stores it in
 * $this->_charset.
 *
 * Sources are tried in this order:
 *  1. the content-type reported by get_last_retrieve_url_contents_content_type(),
 *     when such a function exists;
 *  2. the content of a <meta http-equiv=Content-Type> tag (ISO-8859-1 when
 *     the tag has content but names no charset);
 *  3. mb_detect_encoding() on the document's plain text, when mbstring is
 *     available;
 *  4. UTF-8 as the last resort.
 * ISO-8859-1 and its Latin1 aliases are replaced by their superset CP1252.
 *
 * @return string The detected charset.
 */
protected function parse_charset()
{
    global $debugObject;

    $charset = null;

    if (function_exists('get_last_retrieve_url_contents_content_type')) {
        $contentTypeHeader = get_last_retrieve_url_contents_content_type();
        $success = preg_match('/charset=(.+)/', $contentTypeHeader, $matches);
        if ($success) {
            $charset = $matches[1];
            if (is_object($debugObject)) {$debugObject->debugLog(2, 'header content-type found charset of: ' . $charset);}
        }
    }

    if (empty($charset)) {
        $el = $this->root->find('meta[http-equiv=Content-Type]',0);
        if (!empty($el)) {
            $fullvalue = $el->content;
            if (is_object($debugObject)) {$debugObject->debugLog(2, 'meta content-type tag found' . $fullvalue);}

            if (!empty($fullvalue)) {
                $success = preg_match('/charset=(.+)/', $fullvalue, $matches);
                if ($success) {
                    $charset = $matches[1];
                } else {
                    // If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1
                    if (is_object($debugObject)) {$debugObject->debugLog(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');}
                    $charset = 'ISO-8859-1';
                }
            }
        }
    }

    // If we couldn't find a charset above, then lets try to detect one based on the text we got...
    if (empty($charset)) {
        // mbstring is an optional extension: calling mb_detect_encoding()
        // without it is a fatal error, so treat its absence as "not detected".
        $charset = false;
        if (function_exists('mb_detect_encoding')) {
            // Have php try to detect the encoding from the text given to us.
            $charset = mb_detect_encoding($this->root->plaintext . "ascii", array( "UTF-8", "CP1252" ) );
            if (is_object($debugObject)) {$debugObject->debugLog(2, 'mb_detect found: ' . $charset);}
        }

        // and if this doesn't work...  then we need to just wrongheadedly assume it's UTF-8 so that we can move on - cause this will usually give us most of what we need...
        if ($charset === false) {
            if (is_object($debugObject)) {$debugObject->debugLog(2, 'since mb_detect failed - using default of utf-8');}
            $charset = 'UTF-8';
        }
    }

    // Since CP1252 is a superset, if we get one of it's subsets, we want it instead.
    if ((strtolower($charset) == strtolower('ISO-8859-1')) || (strtolower($charset) == strtolower('Latin1')) || (strtolower($charset) == strtolower('Latin-1'))) {
        if (is_object($debugObject)) {$debugObject->debugLog(2, 'replacing ' . $charset . ' with CP1252 as its a superset');}
        $charset = 'CP1252';
    }

    if (is_object($debugObject)) {$debugObject->debugLog(1, 'EXIT - ' . $charset);}

    return $this->_charset = $charset;
}
1256  
1257      // read tag info

/**
 * Reads one tag starting at the current position and updates the tree.
 *
 * Handles, in this order: end tags (closing the matching open ancestor or
 * degrading to text), "<!...>" constructs (comments and unknown
 * declarations), stray '<' characters that turn out to be text, names that
 * are not valid tag names, and finally regular start tags including their
 * attributes, self-closing detection and the default text of <br>.
 *
 * @return mixed False when the current character is not '<' (the root node is
 *               closed at that point); true after a tag or text fragment was
 *               consumed; for unmatched end tags the result of as_text_node().
 */
protected function read_tag()
{
    if ($this->char !== '<') {
        $this->root->_[HDOM_INFO_END] = $this->cursor;
        return false;
    }
    $begin_tag_pos = $this->pos;
    $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    // end tag
    if ($this->char === '/') {
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        // This represents the change in the simple_html_dom trunk from revision 180 to 181.
        // $this->skip($this->token_blank_t);
        $this->skip($this->token_blank);
        $tag = $this->copy_until_char('>');

        // skip attributes in end tag
        if (($pos = strpos($tag, ' ')) !== false) {
            $tag = substr($tag, 0, $pos);
        }

        $parent_lower = strtolower($this->parent->tag);
        $tag_lower = strtolower($tag);

        if ($parent_lower !== $tag_lower) {
            if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower])) {
                // The open parent may be closed implicitly: walk up until the
                // ancestor this end tag belongs to is found.
                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                while (($this->parent->parent) && strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $this->parent->parent;
                }

                if (strtolower($this->parent->tag) !== $tag_lower) {
                    // No such ancestor: restore the original parent, step out
                    // of it once and keep the end tag as text.
                    $this->parent = $org_parent;
                    if ($this->parent->parent) {
                        $this->parent = $this->parent->parent;
                    }
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            } elseif (($this->parent->parent) && isset($this->block_tags[$tag_lower])) {
                // A block-level end tag closes every open descendant on the way up.
                $this->parent->_[HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                while (($this->parent->parent) && strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $this->parent->parent;
                }

                if (strtolower($this->parent->tag) !== $tag_lower) {
                    // No matching ancestor: restore the original parent and keep the end tag as text.
                    $this->parent = $org_parent;
                    $this->parent->_[HDOM_INFO_END] = $this->cursor;
                    return $this->as_text_node($tag);
                }
            } elseif (($this->parent->parent) && strtolower($this->parent->parent->tag) === $tag_lower) {
                // The end tag matches the grandparent: the parent was left unclosed.
                $this->parent->_[HDOM_INFO_END] = 0;
                $this->parent = $this->parent->parent;
            } else {
                return $this->as_text_node($tag);
            }
        }

        $this->parent->_[HDOM_INFO_END] = $this->cursor;
        if ($this->parent->parent) {
            $this->parent = $this->parent->parent;
        }

        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    $node = new simple_html_dom_node($this);
    $node->_[HDOM_INFO_BEGIN] = $this->cursor;
    ++$this->cursor;
    $tag = $this->copy_until($this->token_slash);
    $node->tag_start = $begin_tag_pos;

    // doctype, cdata & comments...
    if (isset($tag[0]) && $tag[0] === '!') {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

        if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') {
            $node->nodetype = HDOM_TYPE_COMMENT;
            $node->tag = 'comment';
        } else {
            $node->nodetype = HDOM_TYPE_UNKNOWN;
            $node->tag = 'unknown';
        }
        if ($this->char === '>') {
            $node->_[HDOM_INFO_TEXT] .= '>';
        }
        $this->link_nodes($node, true);
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // text: another '<' inside the "tag name" means the first one was literal text
    if (strpos($tag, '<') !== false) {
        $tag = '<' . substr($tag, 0, -1);
        $node->_[HDOM_INFO_TEXT] = $tag;
        $this->link_nodes($node, false);
        $this->char = $this->doc[--$this->pos]; // prev
        return true;
    }

    // Not a valid tag name: keep it as text.
    // The hyphen is escaped on purpose. "[\w-:]" is rejected by PCRE2
    // (PHP 7.3+) as an invalid range, which made this check fail to compile.
    if (!preg_match('/^[\w\-:]+$/', $tag)) {
        $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
        if ($this->char === '<') {
            $this->link_nodes($node, false);
            return true;
        }

        if ($this->char === '>') {
            $node->_[HDOM_INFO_TEXT] .= '>';
        }
        $this->link_nodes($node, false);
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // begin tag
    $node->nodetype = HDOM_TYPE_ELEMENT;
    $tag_lower = strtolower($tag);
    $node->tag = ($this->lowercase) ? $tag_lower : $tag;

    // handle optional closing tags: opening this tag implicitly closes any
    // open parent listed for it
    if (isset($this->optional_closing_tags[$tag_lower])) {
        while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
            $this->parent->_[HDOM_INFO_END] = 0;
            $this->parent = $this->parent->parent;
        }
        $node->parent = $this->parent;
    }

    $guard = 0; // prevent infinity loop
    $space = array($this->copy_skip($this->token_blank), '', '');

    // attributes
    do {
        if ($this->char !== null && $space[0] === '') {
            break;
        }
        $name = $this->copy_until($this->token_equal);
        if ($guard === $this->pos) {
            // No progress since the previous round: force the position forward.
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            continue;
        }
        $guard = $this->pos;

        // handle endless '<'
        if ($this->pos >= $this->size - 1 && $this->char !== '>') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;
            $node->tag = 'text';
            $this->link_nodes($node, false);
            return true;
        }

        // handle mismatch '<'
        if ($this->doc[$this->pos - 1] == '<') {
            $node->nodetype = HDOM_TYPE_TEXT;
            $node->tag = 'text';
            $node->attr = array();
            $node->_[HDOM_INFO_END] = 0;
            $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos - $begin_tag_pos - 1);
            $this->pos -= 2;
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            $this->link_nodes($node, false);
            return true;
        }

        if ($name !== '/' && $name !== '') {
            $space[1] = $this->copy_skip($this->token_blank);
            $name = $this->restore_noise($name);
            if ($this->lowercase) {
                $name = strtolower($name);
            }
            if ($this->char === '=') {
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
                $this->parse_attr($node, $name, $space);
            } else {
                //no value attr: nowrap, checked selected...
                $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                $node->attr[$name] = true;
                if ($this->char != '>') {
                    $this->char = $this->doc[--$this->pos]; // prev
                }
            }
            $node->_[HDOM_INFO_SPACE][] = $space;
            $space = array($this->copy_skip($this->token_blank), '', '');
        } else {
            break;
        }
    } while ($this->char !== '>' && $this->char !== '/');

    $this->link_nodes($node, true);
    $node->_[HDOM_INFO_ENDSPACE] = $space[0];

    // check self closing
    if ($this->copy_until_char_escape('>') === '/') {
        $node->_[HDOM_INFO_ENDSPACE] .= '/';
        $node->_[HDOM_INFO_END] = 0;
    } else {
        // reset parent: the new element stays open unless it is a known self-closing tag
        if (!isset($this->self_closing_tags[strtolower($node->tag)])) {
            $this->parent = $node;
        }
    }
    $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    // If it's a BR tag, we need to set it's text to the default text.
    // This way when we see it in plaintext, we can generate formatting that the user wants.
    // since a br tag never has sub nodes, this works well.
    if ($node->tag == "br") {
        $node->_[HDOM_INFO_INNER] = $this->default_br_text;
    }

    return true;
}
1481  
1482      // parse attributes

/**
 * Reads the value of one attribute at the current position and stores it on
 * the node.
 *
 * Double-quoted, single-quoted and unquoted values are supported; the quote
 * style is recorded alongside the value. \r and \n are removed from the
 * value, and a "class" value is trimmed.
 *
 * @param simple_html_dom_node $node  Node that receives the attribute.
 * @param string               $name  Attribute name.
 * @param array                $space Spacing record, by reference; index 2
 *                                    receives the whitespace skipped before the value.
 */
protected function parse_attr($node, $name, &$space)
{
    // Per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037
    // If the attribute is already defined inside a tag, only pay attention to the first one as opposed to the last one.
    if (isset($node->attr[$name])) {
        return;
    }

    $space[2] = $this->copy_skip($this->token_blank);

    $quote = $this->char;
    if ($quote === '"' || $quote === '\'') {
        // Quoted value: step over the opening quote, copy up to the matching
        // quote, then step over the closing one.
        $node->_[HDOM_INFO_QUOTE][] = ($quote === '"') ? HDOM_QUOTE_DOUBLE : HDOM_QUOTE_SINGLE;
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        $node->attr[$name] = $this->restore_noise($this->copy_until_char_escape($quote));
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
    } else {
        // Unquoted value: it ends at the first attribute delimiter.
        $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
        $node->attr[$name] = $this->restore_noise($this->copy_until($this->token_attr));
    }

    // PaperG: Attributes should not have \r or \n in them, that counts as html whitespace.
    $node->attr[$name] = str_replace(array("\r", "\n"), "", $node->attr[$name]);

    // PaperG: If this is a "class" selector, lets get rid of the preceding and trailing space since some people leave it in the multi class case.
    if ($name == "class") {
        $node->attr[$name] = trim($node->attr[$name]);
    }
}
1518  
1519      // link node's parent

/**
 * Attach a node to the parent currently being parsed.
 *
 * The node always gets its ->parent set and is appended to the parent's
 * ->nodes list; it is appended to ->children as well only when $is_child
 * is truthy.
 *
 * @param object $node     node to attach (passed by reference)
 * @param bool   $is_child whether the node also counts as a child element
 * @return void
 */
protected function link_nodes(&$node, $is_child)
{
    $node->parent = $this->parent;
    $this->parent->nodes[] = $node;

    if (!$is_child) {
        return;
    }

    $this->parent->children[] = $node;
}
1529  
1530      // as a text node

/**
 * Record a closing tag as plain text.
 *
 * Creates a new node whose text is "</$tag>", links it under the current
 * parent as a non-child node, bumps the cursor and advances one character.
 *
 * @param string $tag tag name to wrap as "</tag>"
 * @return bool always true
 */
protected function as_text_node($tag)
{
    $text_node = new simple_html_dom_node($this);
    ++$this->cursor;

    $text_node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
    $this->link_nodes($text_node, false);

    // Move to the next character; null marks the end of the document.
    ++$this->pos;
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }

    return true;
}
1540  
/**
 * Advance the position past the leading run of characters contained in $chars.
 *
 * Afterwards ->char holds the character at the new position, or null when
 * the position has reached the end of the document.
 *
 * @param string $chars set of characters to skip
 * @return void
 */
protected function skip($chars)
{
    $run = strspn($this->doc, $chars, $this->pos);
    $this->pos += $run;

    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }
}
1546  
/**
 * Advance past the leading run of characters contained in $chars and return that run.
 *
 * ->char is refreshed to the character at the new position (null at end of
 * document).
 *
 * @param string $chars set of characters to skip
 * @return string the skipped characters, or '' when nothing was skipped
 */
protected function copy_skip($chars)
{
    $begin = $this->pos;
    $run = strspn($this->doc, $chars, $begin);
    $this->pos += $run;

    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }

    return ($run === 0) ? '' : substr($this->doc, $begin, $run);
}
1556  
/**
 * Advance up to (not including) the first character contained in $chars and
 * return the text that was stepped over.
 *
 * ->char is refreshed to the character at the new position (null at end of
 * document).
 *
 * @param string $chars set of stop characters
 * @return string the copied text, or '' when nothing was copied
 */
protected function copy_until($chars)
{
    $pos = $this->pos;
    $len = strcspn($this->doc, $chars, $pos);
    $this->pos += $len;
    $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next

    // Same guard as copy_skip(): before PHP 8, substr() could return false
    // instead of '' for an empty span at the end of the string.
    if ($len===0) return '';
    return substr($this->doc, $pos, $len);
}
1565  
/**
 * Advance to the next occurrence of $char and return the text in between.
 *
 * - At end of document (->char is null) nothing happens and '' is returned.
 * - When $char is not found, the rest of the document is consumed and returned.
 * - When $char is at the current position, '' is returned and nothing moves.
 *
 * @param string $char text to search for
 * @return string text between the old position and the match
 */
protected function copy_until_char($char)
{
    if ($this->char === null) {
        return '';
    }

    $from = $this->pos;
    $hit = strpos($this->doc, $char, $from);

    if ($hit === false) {
        // No match: hand back everything that is left.
        $rest = substr($this->doc, $from, $this->size - $from);
        $this->pos = $this->size;
        $this->char = null;
        return $rest;
    }

    if ($hit === $from) {
        return '';
    }

    $this->pos = $hit;
    $this->char = $this->doc[$hit];

    return substr($this->doc, $from, $hit - $from);
}
1583  
/**
 * Like copy_until_char(), but an occurrence of $char that is directly
 * preceded by a backslash is not treated as a match.
 *
 * - At end of document (->char is null) nothing happens and '' is returned.
 * - When no unescaped $char is found, the rest of the document is consumed
 *   and returned.
 * - When $char is at the current position, '' is returned and nothing moves.
 *
 * @param string $char text to search for
 * @return string text between the old position and the match
 */
protected function copy_until_char_escape($char)
{
    if ($this->char === null) {
        return '';
    }

    $origin = $this->pos;
    $search_from = $origin;

    for (;;) {
        $hit = strpos($this->doc, $char, $search_from);

        if ($hit === false) {
            // No usable match: consume everything that is left.
            $rest = substr($this->doc, $origin, $this->size - $origin);
            $this->char = null;
            $this->pos = $this->size;
            return $rest;
        }

        if ($hit === $origin) {
            return '';
        }

        if ($this->doc[$hit - 1] !== '\\') {
            $this->char = $this->doc[$hit];
            $this->pos = $hit;
            return substr($this->doc, $origin, $hit - $origin);
        }

        // Escaped occurrence: keep looking right after it.
        $search_from = $hit + 1;
    }
}
1612  
1613      // remove noise from html content

1614      // save the noise in the $this->noise array.

/**
 * Replace every match of $pattern in the document with a placeholder key
 * and remember the removed text in ->noise under that key.
 *
 * Keys have the form "___noise___" followed by a 5-character, space-padded
 * number (current noise count + 1000). Matches are processed from last to
 * first so that the captured offsets of earlier matches remain valid while
 * the document is edited.
 *
 * @param string $pattern    PCRE pattern locating the noise
 * @param bool   $remove_tag true replaces the whole match (group 0),
 *                           false replaces only capture group 1
 * @return void
 */
protected function remove_noise($pattern, $remove_tag=false)
{
    global $debugObject;
    if (is_object($debugObject)) { $debugObject->debugLogEntry(1); }

    $total = preg_match_all($pattern, $this->doc, $hits, PREG_SET_ORDER|PREG_OFFSET_CAPTURE);
    $group = $remove_tag ? 0 : 1;

    $i = $total - 1;
    while ($i >= 0)
    {
        $placeholder = '___noise___'.sprintf('% 5d', count($this->noise)+1000);
        if (is_object($debugObject)) { $debugObject->debugLog(2, 'key is: ' . $placeholder); }

        $this->noise[$placeholder] = $hits[$i][$group][0];
        $this->doc = substr_replace($this->doc, $placeholder, $hits[$i][$group][1], strlen($hits[$i][$group][0]));

        --$i;
    }

    // The document changed, so refresh its cached length and first character.
    $this->size = strlen($this->doc);
    if ($this->size > 0)
    {
        $this->char = $this->doc[0];
    }
}
1638  
1639      // restore noise to html content

/**
 * Put previously removed noise back into a piece of text.
 *
 * Every "___noise___" marker followed by its 5-character number is replaced
 * by the text stored under that key in ->noise. Restored text is scanned
 * again, so markers contained in restored noise are resolved too.
 *
 * A marker whose key is unknown is replaced by
 * "UNDEFINED NOISE FOR KEY: <key>". That message itself contains the marker,
 * so the search offset is moved past it; searching from the start again would
 * find the same marker on every pass and never terminate. A marker that is
 * too short to carry a number is replaced by "NO NUMERIC NOISE KEY".
 *
 * @param string $text text possibly containing noise markers
 * @return string text with markers replaced
 */
function restore_noise($text)
{
    global $debugObject;
    if (is_object($debugObject)) { $debugObject->debugLogEntry(1); }

    // Everything before $offset has already been dealt with for good.
    $offset = 0;

    while (($pos=strpos($text, '___noise___', $offset))!==false)
    {
        // Sometimes there is a broken piece of markup, and we don't GET the pos+11 etc... token which indicates a problem outside of us...
        if (strlen($text) > $pos+15)
        {
            $key = '___noise___'.$text[$pos+11].$text[$pos+12].$text[$pos+13].$text[$pos+14].$text[$pos+15];
            if (is_object($debugObject)) { $debugObject->debugLog(2, 'located key of: ' . $key); }

            if (isset($this->noise[$key]))
            {
                // $offset stays put so markers inside the restored noise are resolved as well.
                $text = substr($text, 0, $pos).$this->noise[$key].substr($text, $pos+16);
            }
            else
            {
                // Skip past the inserted message to prevent an infinite loop.
                $message = 'UNDEFINED NOISE FOR KEY: '.$key;
                $text = substr($text, 0, $pos).$message.substr($text, $pos+16);
                $offset = $pos + strlen($message);
            }
        }
        else
        {
            // There is no valid key being given back to us... We must get rid of the ___noise___ or we will have a problem.
            $text = substr($text, 0, $pos).'NO NUMERIC NOISE KEY' . substr($text, $pos+11);
        }
    }
    return $text;
}
1671  
1672      // Sometimes we NEED one of the noise elements.

/**
 * Look through the stored noise for an entry containing $text.
 *
 * @param string $text text to look for
 * @return string|null the first stored noise element containing $text;
 *                     nothing (null) is returned when there is none
 */
function search_noise($text)
{
    global $debugObject;
    if (is_object($debugObject)) { $debugObject->debugLogEntry(1); }

    foreach ($this->noise as $candidate)
    {
        if (strpos($candidate, $text) === false)
        {
            continue;
        }

        return $candidate;
    }
}
/**
 * String conversion: yields the inner text of the root node.
 *
 * @return string
 */
function __toString()
{
    $markup = $this->root->innertext();

    return $markup;
}
1690  
/**
 * Magic read access to a handful of virtual properties.
 *
 * - outertext, innertext: inner text of the root node
 * - plaintext: text() of the root node
 * - charset: ->_charset
 * - target_charset: ->_target_charset
 *
 * Any other name yields no value (null).
 *
 * @param string $name property being read
 * @return mixed
 */
function __get($name)
{
    if ($name == 'outertext' || $name == 'innertext')
    {
        return $this->root->innertext();
    }

    if ($name == 'plaintext')
    {
        return $this->root->text();
    }

    if ($name == 'charset')
    {
        return $this->_charset;
    }

    if ($name == 'target_charset')
    {
        return $this->_target_charset;
    }
}
1707  
1708      // camel naming conventions

/**
 * Camel-case alias: forwards to childNodes() on the root node.
 *
 * @param int $idx index forwarded unchanged (default -1)
 * @return mixed whatever the root node's childNodes() returns
 */
function childNodes($idx=-1)
{
    return $this->root->childNodes($idx);
}
/**
 * Camel-case alias: forwards to first_child() on the root node.
 *
 * @return mixed whatever the root node's first_child() returns
 */
function firstChild()
{
    return $this->root->first_child();
}
/**
 * Camel-case alias: forwards to last_child() on the root node.
 *
 * @return mixed whatever the root node's last_child() returns
 */
function lastChild()
{
    return $this->root->last_child();
}
/**
 * Build an element by parsing "<$name>$value</$name>" with str_get_html()
 * and returning the first child of the result. Errors raised while doing so
 * are suppressed with "@".
 *
 * @param string      $name  tag name
 * @param string|null $value content placed between the tags
 * @return mixed first child of the parsed snippet
 */
function createElement($name, $value=null)
{
    return @str_get_html("<$name>$value</$name>")->first_child();
}
/**
 * Build a text node by parsing $value with str_get_html() and returning the
 * last entry of the resulting ->nodes list. Errors raised while doing so are
 * suppressed with "@".
 *
 * @param string $value text to parse
 * @return mixed last node of the parsed snippet
 */
function createTextNode($value)
{
    return @end(str_get_html($value)->nodes);
}
/**
 * Camel-case helper: find() with the selector "#<id>", asking for index 0.
 *
 * @param string $id element id
 * @return mixed whatever find() returns for index 0
 */
function getElementById($id)
{
    return $this->find("#$id", 0);
}
/**
 * Camel-case helper: find() with the selector "#<id>".
 *
 * @param string   $id  element id
 * @param int|null $idx index forwarded unchanged to find() (default null)
 * @return mixed whatever find() returns
 */
function getElementsById($id, $idx=null)
{
    return $this->find("#$id", $idx);
}
/**
 * Camel-case helper: find() with $name as the selector, asking for index 0.
 *
 * @param string $name tag name used as selector
 * @return mixed whatever find() returns for index 0
 */
function getElementByTagName($name)
{
    return $this->find($name, 0);
}
/**
 * Camel-case helper: find() with $name as the selector.
 *
 * @param string $name tag name used as selector
 * @param int    $idx  index forwarded unchanged to find() (default -1)
 * @return mixed whatever find() returns
 */
function getElementsByTagName($name, $idx=-1)
{
    return $this->find($name, $idx);
}
/**
 * Camel-case alias for load_file().
 *
 * The arguments are forwarded one by one, like the other camel-case aliases
 * do; previously the whole argument list was handed over as a single array
 * parameter. The result of load_file() is passed back to the caller.
 *
 * NOTE(review): load_file() is defined outside this section; this assumes it
 * accepts the same positional arguments as this alias - confirm.
 *
 * @return mixed whatever load_file() returns
 */
function loadFile()
{
    $args = func_get_args();

    return call_user_func_array(array($this, 'load_file'), $args);
}
1719  }
1720  
1721  ?>


Generated: Fri Nov 28 20:08:37 2014 Cross-referenced by PHPXref 0.7.1