[ Index ] |
PHP Cross Reference of moodle-2.8 |
[Summary view] [Print] [Text view]
<?php

/*************************************************************************
 *                                                                       *
 * class.html2text.inc                                                   *
 *                                                                       *
 *************************************************************************
 *                                                                       *
 * Converts HTML to formatted plain text                                 *
 *                                                                       *
 * Copyright (c) 2005-2007 Jon Abernathy <jon@chuggnutt.com>             *
 * All rights reserved.                                                  *
 *                                                                       *
 * This script is free software; you can redistribute it and/or modify   *
 * it under the terms of the GNU General Public License as published by  *
 * the Free Software Foundation; either version 2 of the License, or     *
 * (at your option) any later version.                                   *
 *                                                                       *
 * The GNU General Public License can be found at                        *
 * http://www.gnu.org/copyleft/gpl.html.                                 *
 *                                                                       *
 * This script is distributed in the hope that it will be useful,        *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          *
 * GNU General Public License for more details.                          *
 *                                                                       *
 * Author(s): Jon Abernathy <jon@chuggnutt.com>                          *
 *                                                                       *
 * Last modified: 08/08/07                                               *
 *                                                                       *
 *************************************************************************/

if (!defined('RCMAIL_CHARSET')) {
    define('RCMAIL_CHARSET', 'UTF-8');
}

/**
 * Takes HTML and converts it to formatted, plain text.
 *
 * Thanks to Alexander Krug (http://www.krugar.de/) for pointing out and
 * correcting an error in the regexp search array. Fixed 7/30/03.
 *
 * Updated set_html() function's file reading mechanism, 9/25/03.
 *
 * Thanks to Joss Sanglier (http://www.dancingbear.co.uk/) for adding
 * several more HTML entity codes to the $search and $replace arrays.
 * Updated 11/7/03.
 *
 * Thanks to Darius Kasperavicius (http://www.dar.dar.lt/) for
 * suggesting the addition of $allowed_tags and its supporting function
 * (which I slightly modified). Updated 3/12/04.
 *
 * Thanks to Justin Dearing for pointing out that a replacement for the
 * <TH> tag was missing, and suggesting an appropriate fix.
 * Updated 8/25/04.
 *
 * Thanks to Mathieu Collas (http://www.myefarm.com/) for finding a
 * display/formatting bug in the _build_link_list() function: email
 * readers would show the left bracket and number ("[1") as part of the
 * rendered email address.
 * Updated 12/16/04.
 *
 * Thanks to Wojciech Bajon (http://histeria.pl/) for submitting code
 * to handle relative links, which I hadn't considered. I modified his
 * code a bit to handle normal HTTP links and MAILTO links. Also for
 * suggesting three additional HTML entity codes to search for.
 * Updated 03/02/05.
 *
 * Thanks to Jacob Chandler for pointing out another link condition
 * for the _build_link_list() function: "https".
 * Updated 04/06/05.
 *
 * Thanks to Marc Bertrand (http://www.dresdensky.com/) for
 * suggesting a revision to the word wrapping functionality; if you
 * specify a $width of 0 or less, word wrapping will be ignored.
 * Updated 11/02/06.
 *
 * *** Big housecleaning updates below:
 *
 * Thanks to Colin Brown (http://www.sparkdriver.co.uk/) for
 * suggesting the fix to handle </li> and blank lines (whitespace).
 * Christian Basedau (http://www.movetheweb.de/) also suggested the
 * blank lines fix.
 *
 * Special thanks to Marcus Bointon (http://www.synchromedia.co.uk/),
 * Christian Basedau, Norbert Laposa (http://ln5.co.uk/),
 * Bas van de Weijer, and Marijn van Butselaar
 * for pointing out my glaring error in the <th> handling. Marcus also
 * supplied a host of fixes.
 *
 * Thanks to Jeffrey Silverman (http://www.newtnotes.com/) for pointing
 * out that extra spaces should be compressed--a problem addressed with
 * Marcus Bointon's fixes but that I had not yet incorporated.
 *
 * Thanks to Daniel Schledermann (http://www.typoconsult.dk/) for
 * suggesting a valuable fix with <a> tag handling.
 *
 * Thanks to Wojciech Bajon (again!) for suggesting fixes and additions,
 * including the <a> tag handling that Daniel Schledermann pointed
 * out but that I had not yet incorporated. I haven't (yet)
 * incorporated all of Wojciech's changes, though I may at some
 * future time.
 *
 * *** End of the housecleaning updates. Updated 08/08/07.
 *
 * @author Jon Abernathy <jon@chuggnutt.com>
 * @version 1.0.0
 * @since PHP 4.0.2
 */
class html2text
{

    /**
     * Contains the HTML content to convert.
     *
     * @var string $html
     * @access public
     */
    public $html;

    /**
     * Contains the converted, formatted text.
     *
     * @var string $text
     * @access public
     */
    public $text;

    /**
     * Maximum width of the formatted text, in columns.
     *
     * Set this value to 0 (or less) to ignore word wrapping
     * and not constrain text to a fixed-width column.
     *
     * @var integer $width
     * @access public
     */
    public $width = 70;

    /**
     * List of preg* regular expression patterns to search for,
     * used in conjunction with $replace.
     *
     * Every tag name is followed by a word boundary (\b) so that a short
     * tag name never swallows a longer one (e.g. <i> vs. <img>, <li> vs.
     * <link>, <p> vs. <param>).
     *
     * @var array $search
     * @access public
     * @see $replace
     */
    public $search = array(
        "/\r/",                                  // Non-legal carriage return
        "/[\n\t]+/",                             // Newlines and tabs
        '/<script\b[^>]*>.*?<\/script>/i',       // <script>s -- which strip_tags supposedly has problems with
        '/<style\b[^>]*>.*?<\/style>/i',         // <style>s -- which strip_tags supposedly has problems with
        '/<p\b[^>]*>/i',                         // <P>
        '/<br\b[^>]*>/i',                        // <br>
        '/<i\b[^>]*>(.*?)<\/i>/i',               // <i>
        '/<em\b[^>]*>(.*?)<\/em>/i',             // <em>
        '/(<ul\b[^>]*>|<\/ul>)/i',               // <ul> and </ul>
        '/(<ol\b[^>]*>|<\/ol>)/i',               // <ol> and </ol>
        '/<li\b[^>]*>(.*?)<\/li>/i',             // <li> and </li>
        '/<li\b[^>]*>/i',                        // <li>
        '/<hr\b[^>]*>/i',                        // <hr>
        '/<div\b[^>]*>/i',                       // <div>
        '/(<table\b[^>]*>|<\/table>)/i',         // <table> and </table>
        '/(<tr\b[^>]*>|<\/tr>)/i',               // <tr> and </tr>
        '/<td\b[^>]*>(.*?)<\/td>/i',             // <td> and </td>
    );

    /**
     * List of pattern replacements corresponding to patterns searched.
     *
     * @var array $replace
     * @access public
     * @see $search
     */
    public $replace = array(
        '',                                     // Non-legal carriage return
        ' ',                                    // Newlines and tabs
        '',                                     // <script>s -- which strip_tags supposedly has problems with
        '',                                     // <style>s -- which strip_tags supposedly has problems with
        "\n\n",                                 // <P>
        "\n",                                   // <br>
        '_\\1_',                                // <i>
        '_\\1_',                                // <em>
        "\n\n",                                 // <ul> and </ul>
        "\n\n",                                 // <ol> and </ol>
        "\t* \\1\n",                            // <li> and </li>
        "\n\t* ",                               // <li>
        "\n-------------------------\n",        // <hr>
        "<div>\n",                              // <div>
        "\n\n",                                 // <table> and </table>
        "\n",                                   // <tr> and </tr>
        "\t\t\\1\n",                            // <td> and </td>
    );

    /**
     * List of preg* regular expression patterns to search for,
     * used in conjunction with $ent_replace.
     *
     * @var array $ent_search
     * @access public
     * @see $ent_replace
     */
    public $ent_search = array(
        '/&(nbsp|#160);/i',                      // Non-breaking space
        '/&(quot|rdquo|ldquo|#8220|#8221|#147|#148);/i',
                                                 // Double quotes
        '/&(apos|rsquo|lsquo|#8216|#8217);/i',   // Single quotes
        '/&gt;/i',                               // Greater-than
        '/&lt;/i',                               // Less-than
        '/&(copy|#169);/i',                      // Copyright
        '/&(trade|#8482|#153);/i',               // Trademark
        '/&(reg|#174);/i',                       // Registered
        '/&(mdash|#151|#8212);/i',               // mdash
        '/&(ndash|minus|#8211|#8722);/i',        // ndash
        '/&(bull|#149|#8226);/i',                // Bullet
        '/&(pound|#163);/i',                     // Pound sign
        '/&(euro|#8364);/i',                     // Euro sign
        '/&(amp|#38);/i',                        // Ampersand: see _converter()
        '/[ ]{2,}/',                             // Runs of spaces, post-handling
    );

    /**
     * List of pattern replacements corresponding to patterns searched.
     *
     * @var array $ent_replace
     * @access public
     * @see $ent_search
     */
    public $ent_replace = array(
        ' ',                                    // Non-breaking space
        '"',                                    // Double quotes
        "'",                                    // Single quotes
        '>',                                    // Greater-than
        '<',                                    // Less-than
        '(c)',                                  // Copyright
        '(tm)',                                 // Trademark
        '(R)',                                  // Registered
        '--',                                   // mdash
        '-',                                    // ndash
        '*',                                    // Bullet
        '£',                                    // Pound sign
        'EUR',                                  // Euro sign. € ?
        '|+|amp|+|',                            // Ampersand: see _converter()
        ' ',                                    // Runs of spaces, post-handling
    );

    /**
     * List of preg* regular expression patterns to search for
     * and replace using callback function.
     *
     * The first capture group of every pattern is the tag name; it is what
     * _preg_callback() dispatches on.
     *
     * @var array $callback_search
     * @access public
     */
    public $callback_search = array(
        '/<(a) [^>]*href=("|\')([^"\']+)\2[^>]*>(.*?)<\/a>/i',
                                                   // <a href="">
        '/<(h)[123456]\b[^>]*>(.*?)<\/h[123456]>/i', // H1 - H6
        '/<(b)\b[^>]*>(.*?)<\/b>/i',               // <b>
        '/<(strong)\b[^>]*>(.*?)<\/strong>/i',     // <strong>
        '/<(th)\b[^>]*>(.*?)<\/th>/i',             // <th> and </th>
        '/<(img)\b[^>]*alt=\"([^>"]+)\"[^>]*>/i',  // <img> with alt
    );

    /**
     * List of preg* regular expression patterns to search for in PRE body,
     * used in conjunction with $pre_replace.
     *
     * @var array $pre_search
     * @access public
     * @see $pre_replace
     */
    public $pre_search = array(
        "/\n/",
        "/\t/",
        '/ /',
        '/<pre[^>]*>/',
        '/<\/pre>/'
    );

    /**
     * List of pattern replacements corresponding to patterns searched for PRE body.
     *
     * Whitespace is turned into markup/entities so that it is not touched by
     * the newline/tab handling in $search; it is converted back to plain
     * whitespace by the <br> rule and the entity pass.
     *
     * @var array $pre_replace
     * @access public
     * @see $pre_search
     */
    public $pre_replace = array(
        '<br>',
        '&nbsp;&nbsp;&nbsp;&nbsp;',
        '&nbsp;',
        '',
        ''
    );

    /**
     * Temporary holder for the converted body of the PRE element that is
     * currently being substituted by _convert_pre().
     *
     * @var string $pre_content
     * @access private
     * @see _convert_pre(), _preg_pre_callback()
     */
    public $pre_content = '';

    /**
     * Contains a list of HTML tags to allow in the resulting text.
     *
     * @var string $allowed_tags
     * @access public
     * @see set_allowed_tags()
     */
    public $allowed_tags = '';

    /**
     * Contains the base URL that relative links should resolve to.
     *
     * @var string $url
     * @access public
     */
    public $url;

    /**
     * Indicates whether content in the $html variable has been converted yet.
     *
     * @var boolean $_converted
     * @access private
     * @see $html, $text
     */
    public $_converted = false;

    /**
     * Contains URL addresses from links to be rendered in plain text.
     *
     * @var array $_link_list
     * @access private
     * @see _build_link_list()
     */
    public $_link_list = array();

    /**
     * Boolean flag, true if a table of link URLs should be listed after the text.
     *
     * @var boolean $_do_links
     * @access private
     * @see __construct()
     */
    public $_do_links = true;

    /**
     * Constructor.
     *
     * If the HTML source string (or file) is supplied, the class
     * will instantiate with that source propagated, all that has
     * to be done is to call get_text().
     *
     * @param string $source HTML content
     * @param boolean $from_file Indicates $source is a file to pull content from
     * @param boolean $do_links Indicate whether a table of link URLs is desired
     * @param integer $width Maximum width of the formatted text, 0 for no limit
     * @access public
     * @return void
     */
    public function __construct($source = '', $from_file = false, $do_links = true, $width = 75)
    {
        if ($source !== '') {
            $this->set_html($source, $from_file);
        }

        $this->set_base_url();
        $this->_do_links = $do_links;
        $this->width = $width;
    }

    /**
     * Old (PHP 4 style) constructor, kept so that existing code calling
     * html2text() explicitly (e.g. parent::html2text() in a subclass)
     * keeps working. PHP 8 no longer treats a method named after the class
     * as a constructor, so the real work lives in __construct().
     *
     * @param string $source HTML content
     * @param boolean $from_file Indicates $source is a file to pull content from
     * @param boolean $do_links Indicate whether a table of link URLs is desired
     * @param integer $width Maximum width of the formatted text, 0 for no limit
     * @access public
     * @return void
     * @deprecated Use __construct() (i.e. "new html2text(...)") instead.
     */
    public function html2text($source = '', $from_file = false, $do_links = true, $width = 75)
    {
        $this->__construct($source, $from_file, $do_links, $width);
    }

    /**
     * Loads source HTML into memory, either from $source string or a file.
     *
     * When $from_file is set but the file does not exist, $source itself is
     * stored as the HTML content.
     *
     * @param string $source HTML content
     * @param boolean $from_file Indicates $source is a file to pull content from
     * @access public
     * @return void
     */
    public function set_html($source, $from_file = false)
    {
        if ($from_file && file_exists($source)) {
            $this->html = file_get_contents($source);
        } else {
            $this->html = $source;
        }

        $this->_converted = false;
    }

    /**
     * Returns the text, converted from HTML.
     *
     * The conversion is performed lazily on first access and cached until
     * set_html() is called again.
     *
     * @access public
     * @return string
     */
    public function get_text()
    {
        if (!$this->_converted) {
            $this->_convert();
        }

        return $this->text;
    }

    /**
     * Prints the text, converted from HTML.
     *
     * @access public
     * @return void
     */
    public function print_text()
    {
        print $this->get_text();
    }

    /**
     * Alias to print_text(), operates identically.
     *
     * @access public
     * @return void
     * @see print_text()
     */
    public function p()
    {
        print $this->get_text();
    }

    /**
     * Sets the allowed HTML tags to pass through to the resulting text.
     *
     * Tags should be in the form "<p>", with no corresponding closing tag.
     * An empty argument leaves the current setting untouched.
     *
     * @param string $allowed_tags Tags to keep, e.g. "<p><a>"
     * @access public
     * @return void
     */
    public function set_allowed_tags($allowed_tags = '')
    {
        if (!empty($allowed_tags)) {
            $this->allowed_tags = $allowed_tags;
        }
    }

    /**
     * Sets a base URL to handle relative links.
     *
     * With no argument the base URL is derived from $_SERVER['HTTP_HOST']
     * (or left empty when that is not available, e.g. on the command line).
     *
     * @param string $url Base URL, with or without a trailing slash
     * @access public
     * @return void
     */
    public function set_base_url($url = '')
    {
        if (empty($url)) {
            if (!empty($_SERVER['HTTP_HOST'])) {
                $this->url = 'http://' . $_SERVER['HTTP_HOST'];
            } else {
                $this->url = '';
            }
        } else {
            // Strip any trailing slashes for consistency (relative
            // URLs may already start with a slash like "/file.html")
            if (substr($url, -1) == '/') {
                $url = substr($url, 0, -1);
            }
            $this->url = $url;
        }
    }

    /**
     * Workhorse function that does actual conversion (calls _converter() method).
     *
     * Stores the result in $text, appending the collected link table if any
     * links were registered during the conversion.
     *
     * @access private
     * @return void
     */
    public function _convert()
    {
        // Variables used for building the link list
        $this->_link_list = array();

        $text = trim($this->html);

        // Convert HTML to TXT
        $this->_converter($text);

        // Add link list
        if (!empty($this->_link_list)) {
            $text .= "\n\nLinks:\n------\n";
            foreach ($this->_link_list as $idx => $url) {
                $text .= '[' . ($idx + 1) . '] ' . $url . "\n";
            }
        }

        $this->text = $text;

        $this->_converted = true;
    }

    /**
     * Workhorse function that does actual conversion.
     *
     * First performs custom tag replacement specified by $search and
     * $replace arrays. Then strips any remaining HTML tags, reduces whitespace
     * and newlines to a readable format, and word wraps the text to
     * $width characters.
     *
     * @param string Reference to HTML content string
     *
     * @access private
     * @return void
     */
    public function _converter(&$text)
    {
        // Convert <BLOCKQUOTE> (before PRE!)
        $this->_convert_blockquotes($text);

        // Convert <PRE>
        $this->_convert_pre($text);

        // Run our defined tags search-and-replace
        $text = preg_replace($this->search, $this->replace, $text);

        // Run our defined tags search-and-replace with callback.
        // The callback is an instance method (it needs $this for the link
        // list), so it must be passed bound to this object.
        $text = preg_replace_callback($this->callback_search, array($this, '_preg_callback'), $text);

        // Strip any other HTML tags
        $text = strip_tags($text, $this->allowed_tags);

        // Run our defined entities/characters search-and-replace
        $text = preg_replace($this->ent_search, $this->ent_replace, $text);

        // Replace known html entities
        $text = html_entity_decode($text, ENT_COMPAT, 'UTF-8');

        // Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
        $text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text);

        // Convert "|+|amp|+|" into "&", need to be done after handling of unknown entities
        // This properly handles situation of "&amp;quot;" in input string
        $text = str_replace('|+|amp|+|', '&', $text);

        // Bring down number of empty lines to 2 max
        $text = preg_replace("/\n\s+\n/", "\n\n", $text);
        $text = preg_replace("/[\n]{3,}/", "\n\n", $text);

        // remove leading empty lines (can be produced by eg. P tag on the beginning)
        $text = ltrim($text, "\n");

        // Wrap the text to a readable format.
        // If width is 0 or less, don't wrap the text.
        if ($this->width > 0) {
            $text = wordwrap($text, $this->width);
        }
    }

    /**
     * Helper function called by preg_replace() on link replacement.
     *
     * Maintains an internal list of links to be displayed at the end of the
     * text, with numeric indices to the original point in the text they
     * appeared. Also makes an effort at identifying and handling absolute
     * and relative links.
     *
     * @param string $link URL of the link
     * @param string $display Part of the text to associate number with
     * @access private
     * @return string
     */
    public function _build_link_list($link, $display)
    {
        if (!$this->_do_links || empty($link)) {
            return $display;
        }

        // Ignored link types: javascript: and mailto: URLs and in-page
        // anchors ("#name"), none of which are useful in a link table.
        if (preg_match('!^(javascript:|mailto:|#)!i', $link)) {
            return $display;
        }

        if (preg_match('!^(https?://)!i', $link)) {
            $url = $link;
        } else {
            // Relative link: resolve against the base URL.
            $url = $this->url;
            if (substr($link, 0, 1) != '/') {
                $url .= '/';
            }
            $url .= "$link";
        }

        if (($index = array_search($url, $this->_link_list, true)) === false) {
            // Add the link to the list.
            $this->_link_list[] = $url;
            $index = count($this->_link_list);
        } else {
            // Use the index of the existing link in the list (we enumerate from 1, not from 0).
            $index++;
        }

        return $display . ' [' . ($index) . ']';
    }

    /**
     * Helper function for PRE body conversion.
     *
     * Each PRE element is replaced, one at a time, by a DIV whose content has
     * its newlines, tabs and spaces turned into <br> and &nbsp; markup.
     *
     * @param string HTML content
     * @access private
     */
    public function _convert_pre(&$text)
    {
        // get the content of PRE element
        while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
            // convert the content
            $this->pre_content = sprintf('<div><br>%s<br></div>',
                preg_replace($this->pre_search, $this->pre_replace, $matches[1]));
            // replace the content (use callback because content can contain $0 variable)
            $text = preg_replace_callback('/<pre[^>]*>.*<\/pre>/ismU',
                array($this, '_preg_pre_callback'), $text, 1);
            // free memory
            $this->pre_content = '';
        }
    }

    /**
     * Helper function for BLOCKQUOTE body conversion.
     *
     * Every top-level blockquote is converted to text on its own (nested
     * blockquotes are handled by the recursive _converter() call), prefixed
     * with "> " citation markers and wrapped in a PRE element so that the
     * layout survives the rest of the conversion.
     *
     * @param string HTML content
     * @access private
     */
    public function _convert_blockquotes(&$text)
    {
        if (!preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) {
            return;
        }

        $level  = 0;
        // Offsets reported by preg_match_all() refer to the ORIGINAL text;
        // $diff is the total number of bytes removed by the replacements
        // done so far and is used to translate them to the current text.
        $diff   = 0;
        $start  = 0;
        $taglen = 0;

        foreach ($matches[0] as $m) {
            if (strncmp($m[0], '</', 2) === 0) {
                $level--;
                if ($level < 0) {
                    $level = 0; // malformed HTML: go to next blockquote
                } else if ($level > 0) {
                    // skip inner blockquote
                } else {
                    $end = $m[1];
                    $len = $end - $taglen - $start;
                    // Get blockquote content
                    $body = substr($text, $start + $taglen - $diff, $len);

                    // Set text width (leave room for the "> " marker)
                    $p_width = $this->width;
                    if ($this->width > 0) {
                        $this->width -= 2;
                    }
                    // Convert blockquote content
                    $body = trim($body);
                    $this->_converter($body);
                    // Add citation markers and create PRE block
                    $body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body));
                    // Explicit flags: the default changed to ENT_QUOTES in
                    // PHP 8.1, which would emit &#039; for apostrophes that
                    // the later ENT_COMPAT decode never converts back.
                    $body = '<pre>' . htmlspecialchars($body, ENT_COMPAT, RCMAIL_CHARSET) . '</pre>';
                    // Re-set text width
                    $this->width = $p_width;
                    // Replace content
                    $text = substr($text, 0, $start - $diff)
                        . $body . substr($text, $end + strlen($m[0]) - $diff);

                    // Accumulate the shift, so that the third and any later
                    // top-level blockquote is still located correctly.
                    $diff += $len + $taglen + strlen($m[0]) - strlen($body);
                    unset($body);
                }
            } else {
                if ($level == 0) {
                    $start  = $m[1];
                    $taglen = strlen($m[0]);
                }
                $level++;
            }
        }
    }

    /**
     * Callback function for preg_replace_callback use.
     *
     * Dispatches on the tag name captured as the first group of the
     * $callback_search patterns. The patterns are case-insensitive, so the
     * name is lower-cased before comparison.
     *
     * @param array PREG matches
     * @return string
     */
    private function _preg_callback($matches)
    {
        switch (strtolower($matches[1])) {
            case 'b':
            case 'strong':
                return $this->_toupper($matches[2]);
            case 'th':
                return $this->_toupper("\t\t" . $matches[2] . "\n");
            case 'h':
                return $this->_toupper("\n\n" . $matches[2] . "\n\n");
            case 'a':
                // Remove spaces in URL (#1487805)
                $url = str_replace(' ', '', $matches[3]);
                return $this->_build_link_list($url, $matches[4]);
            case 'img':
                return '[' . $matches[2] . ']';
            default:
                // Unknown tag (only possible with a customised
                // $callback_search): drop the match, as before.
                return '';
        }
    }

    /**
     * Callback function for preg_replace_callback use in PRE content handler.
     *
     * @param array PREG matches
     * @return string
     */
    private function _preg_pre_callback($matches)
    {
        return $this->pre_content;
    }

    /**
     * Strtoupper function with HTML tags and entities handling.
     *
     * @param string $str Text to convert
     * @return string Converted text
     */
    private function _toupper($str)
    {
        // string can contain HTML tags
        $chunks = preg_split('/(<[^>]*>)/', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);

        // convert toupper only the text between HTML tags
        foreach ($chunks as $idx => $chunk) {
            if ($chunk[0] !== '<') {
                $chunks[$idx] = $this->_strtoupper($chunk);
            }
        }

        return implode($chunks);
    }

    /**
     * Strtoupper multibyte wrapper function with HTML entities handling.
     *
     * Entities are decoded before the case conversion (so that e.g.
     * "&eacute;" is upper-cased as a letter and "&amp;" is not turned into
     * "&AMP;") and special characters are re-encoded afterwards.
     *
     * @param string $str Text to convert
     * @return string Converted text
     */
    private function _strtoupper($str)
    {
        $str = html_entity_decode($str, ENT_COMPAT, RCMAIL_CHARSET);

        if (class_exists('core_text')) {
            $str = core_text::strtoupper($str);
        } else if (function_exists('mb_strtoupper')) {
            $str = mb_strtoupper($str);
        } else {
            $str = strtoupper($str);
        }

        $str = htmlspecialchars($str, ENT_COMPAT, RCMAIL_CHARSET);

        return $str;
    }
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Fri Nov 28 20:29:05 2014 | Cross-referenced by PHPXref 0.7.1 |