Support Joomla!

Joomla! 1.5 Documentation

Packages

Package: utf8

Developer Network License

The Joomla! Developer Network content is © copyright 2006 by the individual contributors and can be used in accordance with the Creative Commons License, Attribution-NonCommercial-ShareAlike 2.5.
Source code for file /phputf8/utils/specials.php

Documentation is available at specials.php

  1. <?php
  2. /**
  3. * Utilities for processing "special" characters in UTF-8. "Special" largely means anything which would
  4. * be regarded as a non-word character, like ASCII control characters and punctuation. This has a "Roman"
  5. * bias - it would be unaware of modern Chinese "punctuation" characters for example.
  6. * Note: requires utils/unicode.php to be loaded
  7. * @version $Id: specials.php,v 1.1 2006/02/26 13:17:29 harryf Exp $
  8. * @package utf8
  9. * @subpackage utils
  10. * @see utf8_is_valid
  11. */
  12.  
  13. //--------------------------------------------------------------------
  /**
   * Used internally. Builds a PCRE pattern from the $UTF8_SPECIAL_CHARS
   * array defined in this file and returns it.
   *
   * The pattern is a single character class made of the literal range
   * \x00-\x19 followed by every character in $UTF8_SPECIAL_CHARS; those
   * control characters are added here because they are not part of the
   * array itself. The pattern uses the "/" delimiter and the "u" (UTF-8)
   * modifier.
   *
   * The result is built on the first call and kept in a static variable,
   * so later calls return the same string without rebuilding it.
   *
   * Note: relies on utf8_from_unicode(), which is not defined in this file
   * (the file header says utils/unicode.php must be loaded); it is assumed
   * to turn the array of code points into a UTF-8 string - confirm there.
   *
   * @package utf8
   * @subpackage utils
   * @return string PCRE pattern matching one "special" character
   * @see utf8_from_unicode
   * @see utf8_is_word_chars
   * @see utf8_strip_specials
   */
  function utf8_specials_pattern() {
      static $pattern = NULL;

      // Build only once per request; every later call reuses the cached string.
      if ($pattern === NULL) {
          global $UTF8_SPECIAL_CHARS;

          // Quote for use inside a "/"-delimited pattern so that characters such
          // as "]", "^", "\" and "/" are taken literally in the character class.
          $specials = preg_quote(utf8_from_unicode($UTF8_SPECIAL_CHARS), '/');

          $pattern = '/[\x00-\x19'.$specials.']/u';
      }

      return $pattern;
  }
  37.  
  38. //--------------------------------------------------------------------
  /**
   * Checks a string for whether it contains only word characters, i.e.
   * none of the characters matched by utf8_specials_pattern().
   *
   * This is intended to be comparable to the \w PCRE meta character. Note
   * that this is not a 100% guarantee that the string only contains alpha /
   * numeric characters but just that common non-alphanumeric are not
   * in the string, including ASCII device control characters.
   *
   * If the match itself fails (preg_match() returns FALSE, which can happen
   * when $str is not valid UTF-8 because the pattern uses the "u" modifier),
   * this function returns FALSE: a string that could not be checked is not
   * reported as clean.
   *
   * @package utf8
   * @subpackage utils
   * @param string $str string to check
   * @return boolean TRUE if the string only contains word characters
   * @see utf8_specials_pattern
   */
  function utf8_is_word_chars($str) {
      // preg_match() yields 1 (special found), 0 (none found) or FALSE (error);
      // only a clean "0" means the string consists of word characters.
      return preg_match(utf8_specials_pattern(), $str) === 0;
  }
  54.  
  55. //--------------------------------------------------------------------
  /**
   * Removes special characters (non-alphanumeric) from a UTF-8 string.
   *
   * Every character matched by utf8_specials_pattern() is replaced with
   * $repl (nothing by default).
   *
   * This can be useful as a helper for sanitizing a string for use as
   * something like a file name or a unique identifier. Be warned though
   * it does not handle all possible non-alphanumeric characters and is
   * not intended as some kind of security / injection filter.
   *
   * The value comes straight from preg_replace(), so it is NULL if the
   * replacement fails (for example on a subject that is not valid UTF-8).
   *
   * @package utf8
   * @subpackage utils
   * @author Andreas Gohr
   * @param string $string The UTF8 string to strip of special chars
   * @param string $repl   (optional) Replace special with this string
   * @return string with common non-alphanumeric characters removed
   * @see utf8_specials_pattern
   */
  function utf8_strip_specials($string, $repl = '') {
      return preg_replace(utf8_specials_pattern(), $repl, $string);
  }
  75.  
  76. //--------------------------------------------------------------------
  /**
   * UTF-8 array of common special characters, given as Unicode code points.
   *
   * This array should contain all special characters (not a letter or digit)
   * defined in the various local charsets - it's not a complete list of
   * non-alphanum characters in UTF-8. It's not perfect but should match most
   * cases of special chars.
   *
   * The control chars 0x00 to 0x19 are _not_ included in this array (the
   * list starts at 0x1a); utf8_specials_pattern() adds that range itself.
   * The space 0x20 is included. These chars are _not_ in the array either:
   * _ (0x5f), : (0x3a), . (0x2e), - (0x2d)
   *
   * @package utf8
   * @subpackage utils
   * @author Andreas Gohr
   * @see utf8_specials_pattern
   */
  $UTF8_SPECIAL_CHARS = array(
      0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
      0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c,
      0x002f, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
      0x005c, 0x005d, 0x005e, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
      0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
      0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
      0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
      0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
      0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
      0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
      0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
      0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
      0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
      0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
      0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
      0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
      0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
      0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
      0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
      0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
      0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
      0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
      0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
      0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
      0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
      0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
      0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
      0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
      0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
      0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
      0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
      0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
      0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
      0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
      0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
      0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
      0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
      0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
      0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
      0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
      0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
      0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
      0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
      0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
      0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
      0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
      0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
      0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
      0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
      0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
  );

Documentation generated on Mon, 05 Mar 2007 21:26:22 +0000 by phpDocumentor 1.3.1