PHPXRef 0.7.1 : moodle-2.8 : /lib/html2text.php source

[Summary view] [Print] [Text view]
   1  <?php
   2  
   3  /*************************************************************************
   4   *                                                                       *
   5   * class.html2text.inc                                                   *
   6   *                                                                       *
   7   *************************************************************************
   8   *                                                                       *
   9   * Converts HTML to formatted plain text                                 *
  10   *                                                                       *
  11   * Copyright (c) 2005-2007 Jon Abernathy <[email protected]>             *
  12   * All rights reserved.                                                  *
  13   *                                                                       *
  14   * This script is free software; you can redistribute it and/or modify   *
  15   * it under the terms of the GNU General Public License as published by  *
  16   * the Free Software Foundation; either version 2 of the License, or     *
  17   * (at your option) any later version.                                   *
  18   *                                                                       *
  19   * The GNU General Public License can be found at                        *
  20   * http://www.gnu.org/copyleft/gpl.html.                                 *
  21   *                                                                       *
  22   * This script is distributed in the hope that it will be useful,        *
  23   * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
  24   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          *
  25   * GNU General Public License for more details.                          *
  26   *                                                                       *
  27   * Author(s): Jon Abernathy <[email protected]>                          *
  28   *                                                                       *
  29   * Last modified: 08/08/07                                               *
  30   *                                                                       *
  31   *************************************************************************/
  32  
  33  if (!defined('RCMAIL_CHARSET')) {
  34      define('RCMAIL_CHARSET', 'UTF-8');
  35  }
  36  
  37  /**
  38   *  Takes HTML and converts it to formatted, plain text.
  39   *
  40   *  Thanks to Alexander Krug (http://www.krugar.de/) for pointing out and
  41   *  correcting an error in the regexp search array. Fixed 7/30/03.
  42   *
  43   *  Updated set_html() function's file reading mechanism, 9/25/03.
  44   *
  45   *  Thanks to Joss Sanglier (http://www.dancingbear.co.uk/) for adding
  46   *  several more HTML entity codes to the $search and $replace arrays.
  47   *  Updated 11/7/03.
  48   *
  49   *  Thanks to Darius Kasperavicius (http://www.dar.dar.lt/) for
  50   *  suggesting the addition of $allowed_tags and its supporting function
  51   *  (which I slightly modified). Updated 3/12/04.
  52   *
  53   *  Thanks to Justin Dearing for pointing out that a replacement for the
  54   *  <TH> tag was missing, and suggesting an appropriate fix.
  55   *  Updated 8/25/04.
  56   *
  57   *  Thanks to Mathieu Collas (http://www.myefarm.com/) for finding a
  58   *  display/formatting bug in the _build_link_list() function: email
  59   *  readers would show the left bracket and number ("[1") as part of the
  60   *  rendered email address.
  61   *  Updated 12/16/04.
  62   *
  63   *  Thanks to Wojciech Bajon (http://histeria.pl/) for submitting code
  64   *  to handle relative links, which I hadn't considered. I modified his
  65   *  code a bit to handle normal HTTP links and MAILTO links. Also for
  66   *  suggesting three additional HTML entity codes to search for.
  67   *  Updated 03/02/05.
  68   *
  69   *  Thanks to Jacob Chandler for pointing out another link condition
  70   *  for the _build_link_list() function: "https".
  71   *  Updated 04/06/05.
  72   *
  73   *  Thanks to Marc Bertrand (http://www.dresdensky.com/) for
  74   *  suggesting a revision to the word wrapping functionality; if you
  75   *  specify a $width of 0 or less, word wrapping will be ignored.
  76   *  Updated 11/02/06.
  77   *
  78   *  *** Big housecleaning updates below:
  79   *
  80   *  Thanks to Colin Brown (http://www.sparkdriver.co.uk/) for
  81   *  suggesting the fix to handle </li> and blank lines (whitespace).
  82   *  Christian Basedau (http://www.movetheweb.de/) also suggested the
  83   *  blank lines fix.
  84   *
  85   *  Special thanks to Marcus Bointon (http://www.synchromedia.co.uk/),
  86   *  Christian Basedau, Norbert Laposa (http://ln5.co.uk/),
  87   *  Bas van de Weijer, and Marijn van Butselaar
  88   *  for pointing out my glaring error in the <th> handling. Marcus also
  89   *  supplied a host of fixes.
  90   *
  91   *  Thanks to Jeffrey Silverman (http://www.newtnotes.com/) for pointing
  92   *  out that extra spaces should be compressed--a problem addressed with
  93   *  Marcus Bointon's fixes but that I had not yet incorporated.
  94   *
  95   *    Thanks to Daniel Schledermann (http://www.typoconsult.dk/) for
  96   *  suggesting a valuable fix with <a> tag handling.
  97   *
  98   *  Thanks to Wojciech Bajon (again!) for suggesting fixes and additions,
  99   *  including the <a> tag handling that Daniel Schledermann pointed
 100   *  out but that I had not yet incorporated. I haven't (yet)
 101   *  incorporated all of Wojciech's changes, though I may at some
 102   *  future time.
 103   *
 104   *  *** End of the housecleaning updates. Updated 08/08/07.
 105   *
 106   *  @author Jon Abernathy <[email protected]>
 107   *  @version 1.0.0
 108   *  @since PHP 4.0.2
 109   */
 110  class html2text
 111  {
 112  
 113      /**
 114       *  Contains the HTML content to convert.
 115       *
 116       *  @var string $html
 117       *  @access public
 118       */
 119      var $html;
 120  
 121      /**
 122       *  Contains the converted, formatted text.
 123       *
 124       *  @var string $text
 125       *  @access public
 126       */
 127      var $text;
 128  
 129      /**
 130       *  Maximum width of the formatted text, in columns.
 131       *
 132       *  Set this value to 0 (or less) to ignore word wrapping
 133       *  and not constrain text to a fixed-width column.
 134       *
 135       *  @var integer $width
 136       *  @access public
 137       */
 138      var $width = 70;
 139  
 140      /**
 141       *  List of preg* regular expression patterns to search for,
 142       *  used in conjunction with $replace.
 143       *
 144       *  @var array $search
 145       *  @access public
 146       *  @see $replace
 147       */
 148      var $search = array(
 149          "/\r/",                                  // Non-legal carriage return
 150          "/[\n\t]+/",                             // Newlines and tabs
 151          '/<script[^>]*>.*?<\/script>/i',         // <script>s -- which strip_tags supposedly has problems with
 152          '/<style[^>]*>.*?<\/style>/i',           // <style>s -- which strip_tags supposedly has problems with
 153          '/<p[^>]*>/i',                           // <P>
 154          '/<br[^>]*>/i',                          // <br>
 155          '/<i[^>]*>(.*?)<\/i>/i',                 // <i>
 156          '/<em[^>]*>(.*?)<\/em>/i',               // <em>
 157          '/(<ul[^>]*>|<\/ul>)/i',                 // <ul> and </ul>
 158          '/(<ol[^>]*>|<\/ol>)/i',                 // <ol> and </ol>
 159          '/<li[^>]*>(.*?)<\/li>/i',               // <li> and </li>
 160          '/<li[^>]*>/i',                          // <li>
 161          '/<hr[^>]*>/i',                          // <hr>
 162          '/<div[^>]*>/i',                         // <div>
 163          '/(<table[^>]*>|<\/table>)/i',           // <table> and </table>
 164          '/(<tr[^>]*>|<\/tr>)/i',                 // <tr> and </tr>
 165          '/<td[^>]*>(.*?)<\/td>/i',               // <td> and </td>
 166      );
 167  
 168      /**
 169       *  List of pattern replacements corresponding to patterns searched.
 170       *
 171       *  @var array $replace
 172       *  @access public
 173       *  @see $search
 174       */
 175      var $replace = array(
 176          '',                                     // Non-legal carriage return
 177          ' ',                                    // Newlines and tabs
 178          '',                                     // <script>s -- which strip_tags supposedly has problems with
 179          '',                                     // <style>s -- which strip_tags supposedly has problems with
 180          "\n\n",                                 // <P>
 181          "\n",                                   // <br>
 182          '_\\1_',                                // <i>
 183          '_\\1_',                                // <em>
 184          "\n\n",                                 // <ul> and </ul>
 185          "\n\n",                                 // <ol> and </ol>
 186          "\t* \\1\n",                            // <li> and </li>
 187          "\n\t* ",                               // <li>
 188          "\n-------------------------\n",        // <hr>
 189          "<div>\n",                              // <div>
 190          "\n\n",                                 // <table> and </table>
 191          "\n",                                   // <tr> and </tr>
 192          "\t\t\\1\n",                            // <td> and </td>
 193      );
 194  
 195      /**
 196       *  List of preg* regular expression patterns to search for,
 197       *  used in conjunction with $ent_replace.
 198       *
 199       *  @var array $ent_search
 200       *  @access public
 201       *  @see $ent_replace
 202       */
 203      var $ent_search = array(
 204          '/&(nbsp|#160);/i',                      // Non-breaking space
 205          '/&(quot|rdquo|ldquo|#8220|#8221|#147|#148);/i',
 206          // Double quotes
 207          '/&(apos|rsquo|lsquo|#8216|#8217);/i',   // Single quotes
 208          '/&gt;/i',                               // Greater-than
 209          '/&lt;/i',                               // Less-than
 210          '/&(copy|#169);/i',                      // Copyright
 211          '/&(trade|#8482|#153);/i',               // Trademark
 212          '/&(reg|#174);/i',                       // Registered
 213          '/&(mdash|#151|#8212);/i',               // mdash
 214          '/&(ndash|minus|#8211|#8722);/i',        // ndash
 215          '/&(bull|#149|#8226);/i',                // Bullet
 216          '/&(pound|#163);/i',                     // Pound sign
 217          '/&(euro|#8364);/i',                     // Euro sign
 218          '/&(amp|#38);/i',                        // Ampersand: see _converter()
 219          '/[ ]{2,}/',                             // Runs of spaces, post-handling
 220      );
 221  
 222      /**
 223       *  List of pattern replacements corresponding to patterns searched.
 224       *
 225       *  @var array $ent_replace
 226       *  @access public
 227       *  @see $ent_search
 228       */
 229      var $ent_replace = array(
 230          ' ',                                    // Non-breaking space
 231          '"',                                    // Double quotes
 232          "'",                                    // Single quotes
 233          '>',
 234          '<',
 235          '(c)',
 236          '(tm)',
 237          '(R)',
 238          '--',
 239          '-',
 240          '*',
 241          '£',
 242          'EUR',                                  // Euro sign. € ?
 243          '|+|amp|+|',                            // Ampersand: see _converter()
 244          ' ',                                    // Runs of spaces, post-handling
 245      );
 246  
 247      /**
 248       *  List of preg* regular expression patterns to search for
 249       *  and replace using callback function.
 250       *
 251       *  @var array $callback_search
 252       *  @access public
 253       */
 254      var $callback_search = array(
 255          '/<(a) [^>]*href=("|\')([^"\']+)\2[^>]*>(.*?)<\/a>/i',
 256          // <a href="">
 257          '/<(h)[123456][^>]*>(.*?)<\/h[123456]>/i', // H1 - H3
 258          '/<(b)[^>]*>(.*?)<\/b>/i',                 // <b>
 259          '/<(strong)[^>]*>(.*?)<\/strong>/i',       // <strong>
 260          '/<(th)[^>]*>(.*?)<\/th>/i',               // <th> and </th>
 261          '/<(img)[^>]*alt=\"([^>"]+)\"[^>]*>/i',    // <img> with alt
 262      );
 263  
 264      /**
 265       *  List of preg* regular expression patterns to search for in PRE body,
 266       *  used in conjunction with $pre_replace.
 267       *
 268       *  @var array $pre_search
 269       *  @access public
 270       *  @see $pre_replace
 271       */
 272      var $pre_search = array(
 273          "/\n/",
 274          "/\t/",
 275          '/ /',
 276          '/<pre[^>]*>/',
 277          '/<\/pre>/'
 278      );
 279  
 280      /**
 281       *  List of pattern replacements corresponding to patterns searched for PRE body.
 282       *
 283       *  @var array $pre_replace
 284       *  @access public
 285       *  @see $pre_search
 286       */
 287      var $pre_replace = array(
 288          '<br>',
 289          '&nbsp;&nbsp;&nbsp;&nbsp;',
 290          '&nbsp;',
 291          '',
 292          ''
 293      );
 294  
 295      /**
 296       *  Contains a list of HTML tags to allow in the resulting text.
 297       *
 298       *  @var string $allowed_tags
 299       *  @access public
 300       *  @see set_allowed_tags()
 301       */
 302      var $allowed_tags = '';
 303  
 304      /**
 305       *  Contains the base URL that relative links should resolve to.
 306       *
 307       *  @var string $url
 308       *  @access public
 309       */
 310      var $url;
 311  
 312      /**
 313       *  Indicates whether content in the $html variable has been converted yet.
 314       *
 315       *  @var boolean $_converted
 316       *  @access private
 317       *  @see $html, $text
 318       */
 319      var $_converted = false;
 320  
 321      /**
 322       *  Contains URL addresses from links to be rendered in plain text.
 323       *
 324       *  @var array $_link_list
 325       *  @access private
 326       *  @see _build_link_list()
 327       */
 328      var $_link_list = array();
 329  
 330      /**
 331       * Boolean flag, true if a table of link URLs should be listed after the text.
 332       *
 333       * @var boolean $_do_links
 334       * @access private
 335       * @see html2text()
 336       */
 337      var $_do_links = true;
 338  
 339      /**
 340       *  Constructor.
 341       *
 342       *  If the HTML source string (or file) is supplied, the class
 343       *  will instantiate with that source propagated, all that has
 344       *  to be done it to call get_text().
 345       *
 346       *  @param string $source HTML content
 347       *  @param boolean $from_file Indicates $source is a file to pull content from
 348       *  @param boolean $do_links Indicate whether a table of link URLs is desired
 349       *  @param integer $width Maximum width of the formatted text, 0 for no limit
 350       *  @access public
 351       *  @return void
 352       */
 353      function html2text( $source = '', $from_file = false, $do_links = true, $width = 75 )
 354      {
 355          if ( $source !== '' ) {
 356              $this->set_html($source, $from_file);
 357          }
 358  
 359          $this->set_base_url();
 360          $this->_do_links = $do_links;
 361          $this->width = $width;
 362      }
 363  
 364      /**
 365       *  Loads source HTML into memory, either from $source string or a file.
 366       *
 367       *  @param string $source HTML content
 368       *  @param boolean $from_file Indicates $source is a file to pull content from
 369       *  @access public
 370       *  @return void
 371       */
 372      function set_html( $source, $from_file = false )
 373      {
 374          if ( $from_file && file_exists($source) ) {
 375              $this->html = file_get_contents($source);
 376          }
 377          else
 378              $this->html = $source;
 379  
 380          $this->_converted = false;
 381      }
 382  
 383      /**
 384       *  Returns the text, converted from HTML.
 385       *
 386       *  @access public
 387       *  @return string
 388       */
 389      function get_text()
 390      {
 391          if ( !$this->_converted ) {
 392              $this->_convert();
 393          }
 394  
 395          return $this->text;
 396      }
 397  
 398      /**
 399       *  Prints the text, converted from HTML.
 400       *
 401       *  @access public
 402       *  @return void
 403       */
 404      function print_text()
 405      {
 406          print $this->get_text();
 407      }
 408  
 409      /**
 410       *  Alias to print_text(), operates identically.
 411       *
 412       *  @access public
 413       *  @return void
 414       *  @see print_text()
 415       */
 416      function p()
 417      {
 418          print $this->get_text();
 419      }
 420  
 421      /**
 422       *  Sets the allowed HTML tags to pass through to the resulting text.
 423       *
 424       *  Tags should be in the form "<p>", with no corresponding closing tag.
 425       *
 426       *  @access public
 427       *  @return void
 428       */
 429      function set_allowed_tags( $allowed_tags = '' )
 430      {
 431          if ( !empty($allowed_tags) ) {
 432              $this->allowed_tags = $allowed_tags;
 433          }
 434      }
 435  
 436      /**
 437       *  Sets a base URL to handle relative links.
 438       *
 439       *  @access public
 440       *  @return void
 441       */
 442      function set_base_url( $url = '' )
 443      {
 444          if ( empty($url) ) {
 445              if ( !empty($_SERVER['HTTP_HOST']) ) {
 446                  $this->url = 'http://' . $_SERVER['HTTP_HOST'];
 447              } else {
 448                  $this->url = '';
 449              }
 450          } else {
 451              // Strip any trailing slashes for consistency (relative
 452              // URLs may already start with a slash like "/file.html")
 453              if ( substr($url, -1) == '/' ) {
 454                  $url = substr($url, 0, -1);
 455              }
 456              $this->url = $url;
 457          }
 458      }
 459  
 460      /**
 461       *  Workhorse function that does actual conversion (calls _converter() method).
 462       *
 463       *  @access private
 464       *  @return void
 465       */
 466      function _convert()
 467      {
 468          // Variables used for building the link list
 469          $this->_link_list = array();
 470  
 471          $text = trim($this->html);
 472  
 473          // Convert HTML to TXT
 474          $this->_converter($text);
 475  
 476          // Add link list
 477          if (!empty($this->_link_list)) {
 478              $text .= "\n\nLinks:\n------\n";
 479              foreach ($this->_link_list as $idx => $url) {
 480                  $text .= '[' . ($idx+1) . '] ' . $url . "\n";
 481              }
 482          }
 483  
 484          $this->text = $text;
 485  
 486          $this->_converted = true;
 487      }
 488  
 489      /**
 490       *  Workhorse function that does actual conversion.
 491       *
 492       *  First performs custom tag replacement specified by $search and
 493       *  $replace arrays. Then strips any remaining HTML tags, reduces whitespace
 494       *  and newlines to a readable format, and word wraps the text to
 495       *  $width characters.
 496       *
 497       *  @param string Reference to HTML content string
 498       *
 499       *  @access private
 500       *  @return void
 501       */
 502      function _converter(&$text)
 503      {
 504          // Convert <BLOCKQUOTE> (before PRE!)
 505          $this->_convert_blockquotes($text);
 506  
 507          // Convert <PRE>
 508          $this->_convert_pre($text);
 509  
 510          // Run our defined tags search-and-replace
 511          $text = preg_replace($this->search, $this->replace, $text);
 512  
 513          // Run our defined tags search-and-replace with callback
 514          $text = preg_replace_callback($this->callback_search, array('html2text', '_preg_callback'), $text);
 515  
 516          // Strip any other HTML tags
 517          $text = strip_tags($text, $this->allowed_tags);
 518  
 519          // Run our defined entities/characters search-and-replace
 520          $text = preg_replace($this->ent_search, $this->ent_replace, $text);
 521  
 522          // Replace known html entities
 523          $text = html_entity_decode($text, ENT_COMPAT, 'UTF-8');
 524  
 525          // Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
 526          $text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text);
 527  
 528          // Convert "|+|amp|+|" into "&", need to be done after handling of unknown entities
 529          // This properly handles situation of "&amp;quot;" in input string
 530          $text = str_replace('|+|amp|+|', '&', $text);
 531  
 532          // Bring down number of empty lines to 2 max
 533          $text = preg_replace("/\n\s+\n/", "\n\n", $text);
 534          $text = preg_replace("/[\n]{3,}/", "\n\n", $text);
 535  
 536          // remove leading empty lines (can be produced by eg. P tag on the beginning)
 537          $text = ltrim($text, "\n");
 538  
 539          // Wrap the text to a readable format
 540          // for PHP versions >= 4.0.2. Default width is 75
 541          // If width is 0 or less, don't wrap the text.
 542          if ( $this->width > 0 ) {
 543              $text = wordwrap($text, $this->width);
 544          }
 545      }
 546  
 547      /**
 548       *  Helper function called by preg_replace() on link replacement.
 549       *
 550       *  Maintains an internal list of links to be displayed at the end of the
 551       *  text, with numeric indices to the original point in the text they
 552       *  appeared. Also makes an effort at identifying and handling absolute
 553       *  and relative links.
 554       *
 555       *  @param string $link URL of the link
 556       *  @param string $display Part of the text to associate number with
 557       *  @access private
 558       *  @return string
 559       */
 560      function _build_link_list( $link, $display )
 561      {
 562          if (!$this->_do_links || empty($link)) {
 563              return $display;
 564          }
 565  
 566          // Ignored link types
 567          if (preg_match('!^(javascript|mailto|#):!i', $link)) {
 568              return $display;
 569          }
 570  
 571          if (preg_match('!^(https?://)!i', $link)) {
 572              $url = $link;
 573          }
 574          else {
 575              $url = $this->url;
 576              if (substr($link, 0, 1) != '/') {
 577                  $url .= '/';
 578              }
 579              $url .= "$link";
 580          }
 581  
 582          if (($index = array_search($url, $this->_link_list)) === false) {
 583              // Add the link to the list.
 584              $this->_link_list[] = $url;
 585              $index = count($this->_link_list);
 586          }
 587          else {
 588              // Use the index of the existing link in the list (we enumerate from 1, not from 0).
 589              $index++;
 590          }
 591  
 592          return $display . ' [' . ($index) . ']';
 593      }
 594  
 595      /**
 596       *  Helper function for PRE body conversion.
 597       *
 598       *  @param string HTML content
 599       *  @access private
 600       */
 601      function _convert_pre(&$text)
 602      {
 603          // get the content of PRE element
 604          while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
 605              // convert the content
 606              $this->pre_content = sprintf('<div><br>%s<br></div>',
 607                  preg_replace($this->pre_search, $this->pre_replace, $matches[1]));
 608              // replace the content (use callback because content can contain $0 variable)
 609              $text = preg_replace_callback('/<pre[^>]*>.*<\/pre>/ismU',
 610                  array('html2text', '_preg_pre_callback'), $text, 1);
 611              // free memory
 612              $this->pre_content = '';
 613          }
 614      }
 615  
 616      /**
 617       *  Helper function for BLOCKQUOTE body conversion.
 618       *
 619       *  @param string HTML content
 620       *  @access private
 621       */
 622      function _convert_blockquotes(&$text)
 623      {
 624          if (preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) {
 625              $level = 0;
 626              $diff = 0;
 627              foreach ($matches[0] as $m) {
 628                  if ($m[0][0] == '<' && $m[0][1] == '/') {
 629                      $level--;
 630                      if ($level < 0) {
 631                          $level = 0; // malformed HTML: go to next blockquote
 632                      }
 633                      else if ($level > 0) {
 634                          // skip inner blockquote
 635                      }
 636                      else {
 637                          $end  = $m[1];
 638                          $len  = $end - $taglen - $start;
 639                          // Get blockquote content
 640                          $body = substr($text, $start + $taglen - $diff, $len);
 641  
 642                          // Set text width
 643                          $p_width = $this->width;
 644                          if ($this->width > 0) $this->width -= 2;
 645                          // Convert blockquote content
 646                          $body = trim($body);
 647                          $this->_converter($body);
 648                          // Add citation markers and create PRE block
 649                          $body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body));
 650                          $body = '<pre>' . htmlspecialchars($body) . '</pre>';
 651                          // Re-set text width
 652                          $this->width = $p_width;
 653                          // Replace content
 654                          $text = substr($text, 0, $start - $diff)
 655                              . $body . substr($text, $end + strlen($m[0]) - $diff);
 656  
 657                          $diff = $len + $taglen + strlen($m[0]) - strlen($body);
 658                          unset($body);
 659                      }
 660                  }
 661                  else {
 662                      if ($level == 0) {
 663                          $start = $m[1];
 664                          $taglen = strlen($m[0]);
 665                      }
 666                      $level ++;
 667                  }
 668              }
 669          }
 670      }
 671  
 672      /**
 673       *  Callback function for preg_replace_callback use.
 674       *
 675       *  @param  array PREG matches
 676       *  @return string
 677       */
 678      private function _preg_callback($matches)
 679      {
 680          switch($matches[1]) {
 681              case 'b':
 682              case 'strong':
 683                  return $this->_toupper($matches[2]);
 684              case 'th':
 685                  return $this->_toupper("\t\t". $matches[2] ."\n");
 686              case 'h':
 687                  return $this->_toupper("\n\n". $matches[2] ."\n\n");
 688              case 'a':
 689                  // Remove spaces in URL (#1487805)
 690                  $url = str_replace(' ', '', $matches[3]);
 691                  return $this->_build_link_list($url, $matches[4]);
 692              case 'img':
 693                  return '[' . $matches[2] . ']';
 694          }
 695      }
 696  
 697      /**
 698       *  Callback function for preg_replace_callback use in PRE content handler.
 699       *
 700       *  @param  array PREG matches
 701       *  @return string
 702       */
 703      private function _preg_pre_callback($matches)
 704      {
 705          return $this->pre_content;
 706      }
 707  
 708      /**
 709       * Strtoupper function with HTML tags and entities handling.
 710       *
 711       * @param string $str Text to convert
 712       * @return string Converted text
 713       */
 714      private function _toupper($str)
 715      {
 716          // string can containg HTML tags
 717          $chunks = preg_split('/(<[^>]*>)/', $str, null, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
 718  
 719          // convert toupper only the text between HTML tags
 720          foreach ($chunks as $idx => $chunk) {
 721              if ($chunk[0] != '<') {
 722                  $chunks[$idx] = $this->_strtoupper($chunk);
 723              }
 724          }
 725  
 726          return implode($chunks);
 727      }
 728  
 729      /**
 730       * Strtoupper multibyte wrapper function with HTML entities handling.
 731       *
 732       * @param string $str Text to convert
 733       * @return string Converted text
 734       */
 735      private function _strtoupper($str)
 736      {
 737          $str = html_entity_decode($str, ENT_COMPAT, RCMAIL_CHARSET);
 738  
 739          if (class_exists('core_text'))
 740              $str = core_text::strtoupper($str);
 741          else if (function_exists('mb_strtoupper'))
 742              $str = mb_strtoupper($str);
 743          else
 744              $str = strtoupper($str);
 745  
 746          $str = htmlspecialchars($str, ENT_COMPAT, RCMAIL_CHARSET);
 747  
 748          return $str;
 749      }
 750  }
PHP Cross Reference of moodle-2.8

/lib/ -> html2text.php (source)