MediaWiki  REL1_24
UtfNormalUtil.php
Go to the documentation of this file.
00001 <?php
/**
 * Encode a single Unicode code point as a UTF-8 byte sequence.
 *
 * No check is made for surrogates (U+D800..U+DFFF) or non-characters; any
 * value in the range 0 .. 0x10FFFF is encoded as-is.
 *
 * @param int $codepoint Code point in the range 0 .. 0x10FFFF
 * @return string The 1 to 4 byte UTF-8 encoding of $codepoint
 * @throws InvalidArgumentException If $codepoint is negative or above 0x10FFFF
 */
function codepointToUtf8( $codepoint ) {
    # Reject out-of-range input up front. chr() wraps its argument modulo 256,
    # so a negative value would otherwise silently produce one garbage byte.
    # Throwing (rather than echo + die) lets callers and tests handle the error.
    if ( $codepoint < 0 || $codepoint >= 0x110000 ) {
        throw new \InvalidArgumentException( "Asked for code outside of range ($codepoint)" );
    }

    # 1 byte: 0xxxxxxx
    if ( $codepoint < 0x80 ) {
        return chr( $codepoint );
    }

    # 2 bytes: 110xxxxx 10xxxxxx
    if ( $codepoint < 0x800 ) {
        return chr( ( ( $codepoint >> 6 ) & 0x3f ) | 0xc0 ) .
            chr( ( $codepoint & 0x3f ) | 0x80 );
    }

    # 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
    if ( $codepoint < 0x10000 ) {
        return chr( ( ( $codepoint >> 12 ) & 0x0f ) | 0xe0 ) .
            chr( ( ( $codepoint >> 6 ) & 0x3f ) | 0x80 ) .
            chr( ( $codepoint & 0x3f ) | 0x80 );
    }

    # 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    return chr( ( ( $codepoint >> 18 ) & 0x07 ) | 0xf0 ) .
        chr( ( ( $codepoint >> 12 ) & 0x3f ) | 0x80 ) .
        chr( ( ( $codepoint >> 6 ) & 0x3f ) | 0x80 ) .
        chr( ( $codepoint & 0x3f ) | 0x80 );
}
00062 
/**
 * Convert a whitespace-separated list of hexadecimal code points into the
 * UTF-8 string they encode.
 *
 * @param string $sequence Hex code points, e.g. "0041 030A"
 * @return string Concatenated UTF-8 encoding of every code point in order;
 *   an empty string when $sequence contains no tokens
 */
function hexSequenceToUtf8( $sequence ) {
    $utf = '';

    # Split on runs of whitespace and discard empty tokens. A plain
    # explode( ' ', ... ) produces an empty token for '' and for doubled or
    # trailing spaces, and hexdec( '' ) is 0, which injected stray NUL
    # characters into the result.
    foreach ( preg_split( '/\s+/', $sequence, -1, PREG_SPLIT_NO_EMPTY ) as $hex ) {
        $utf .= codepointToUtf8( hexdec( $hex ) );
    }

    return $utf;
}
00081 
/**
 * Render a UTF-8 string as a space-separated list of code points, each
 * written as lowercase hexadecimal zero-padded to at least four digits.
 *
 * @param string $str UTF-8 text
 * @return string Hex code points separated by single spaces, with no
 *   trailing separator
 */
function utf8ToHexSequence( $str ) {
    $hexParts = array();

    # The empty pattern with the /u modifier splits between UTF-8 characters
    foreach ( preg_split( '//u', $str, -1, PREG_SPLIT_NO_EMPTY ) as $character ) {
        $hexParts[] = sprintf( '%04x', utf8ToCodepoint( $character ) );
    }

    return implode( ' ', $hexParts );
}
00098 
/**
 * Determine the Unicode code point encoded by a single-character UTF-8
 * sequence.
 *
 * The sequence length is taken from the lead byte and must equal the byte
 * length of $char. Continuation bytes are not validated and overlong
 * encodings are not rejected.
 *
 * @param string $char Exactly one UTF-8 encoded character
 * @return int|bool The code point, or false when $char is empty, starts with
 *   a continuation byte, or its byte length does not match the lead byte
 */
function utf8ToCodepoint( $char ) {
    # An empty string has no lead byte; bail out before indexing into it,
    # which would raise an "uninitialized string offset" notice/warning.
    if ( $char === '' ) {
        return false;
    }

    # Find the length: the number of leading 1 bits in the first byte
    $z = ord( $char[0] );
    if ( $z & 0x80 ) {
        $length = 0;
        while ( $z & 0x80 ) {
            $length++;
            $z <<= 1;
        }

        # A single leading 1 bit (10xxxxxx) marks a continuation byte, which
        # cannot start a character and must not be reported as a code point.
        if ( $length === 1 ) {
            return false;
        }
    } else {
        $length = 1;
    }

    if ( $length !== strlen( $char ) ) {
        return false;
    }

    if ( $length === 1 ) {
        return ord( $char );
    }

    # Mask off the length-determining bits and shift back to the original location
    $z &= 0xff;
    $z >>= $length;

    # Add in the free bits (low six) from each subsequent byte
    for ( $i = 1; $i < $length; $i++ ) {
        $z <<= 6;
        $z |= ord( $char[$i] ) & 0x3f;
    }

    return $z;
}
00140 
/**
 * Escape a string so it can be embedded inside a single-quoted PHP string
 * literal: every backslash and every single quote gets a backslash prefix.
 *
 * @param string $string Raw text
 * @return string Text with backslashes and single quotes escaped
 */
function escapeSingleString( $string ) {
    # strtr() with a map scans the input once, so backslashes introduced by
    # one replacement are never escaped a second time.
    $escapeMap = array(
        '\'' => '\\\'',
        '\\' => '\\\\',
    );

    return strtr( $string, $escapeMap );
}