Source code for file /phputf8/utils/ascii.php
Documentation is available at ascii.php
* Tools to help with ASCII in UTF-8
* @version $Id: ascii.php,v 1.1 2006/02/26 13:17:29 harryf Exp $
//---------------------------------------------------------------
* UTF-8 lookup table for lower case accented letters
* This lookup table maps accented characters to replacements taken from the
* 7-bit ASCII range. These are lower case letters only.
$GLOBALS['UTF8_LOWER_ACCENTS'] =
array(
'Ã ' =>
'a', 'ô' =>
'o', '�' =>
'd', 'ḟ' =>
'f', 'ë' =>
'e', 'Å¡' =>
's', 'Æ¡' =>
'o',
'ß' =>
'ss', 'ă' =>
'a', 'Ã…â„¢' =>
'r', 'È›' =>
't', 'ň' =>
'n', '�' =>
'a', 'Ä·' =>
'k',
'Ã…Â?' =>
's', 'ỳ' =>
'y', 'ņ' =>
'n', 'ĺ' =>
'l', 'ħ' =>
'h', 'á¹—' =>
'p', 'ó' =>
'o',
'ú' =>
'u', 'Ä›' =>
'e', 'é' =>
'e', 'ç' =>
'c', '�' =>
'w', 'Ä‹' =>
'c', 'õ' =>
'o',
'ṡ' =>
's', 'ø' =>
'o', 'Ä£' =>
'g', 'ŧ' =>
't', 'È™' =>
's', 'Ä—' =>
'e', 'ĉ' =>
'c',
'Å›' =>
's', 'î' =>
'i', 'ű' =>
'u', 'ć' =>
'c', 'Ä™' =>
'e', 'ŵ' =>
'w', 'ṫ' =>
't',
'Å«' =>
'u', '�' =>
'c', 'ö' =>
'oe', 'è' =>
'e', 'Å·' =>
'y', 'Ä…' =>
'a', 'Å‚' =>
'l',
'ų' =>
'u', 'ů' =>
'u', 'ÅŸ' =>
's', 'ÄŸ' =>
'g', 'ļ' =>
'l', 'Æ’' =>
'f', 'ž' =>
'z',
'ẃ' =>
'w', 'ḃ' =>
'b', 'Ã¥' =>
'a', 'ì' =>
'i', 'ï' =>
'i', 'ḋ' =>
'd', 'Ã…Â¥' =>
't',
'Å—' =>
'r', 'ä' =>
'ae', 'ÃÂ' =>
'i', 'Å•' =>
'r', 'ê' =>
'e', 'ü' =>
'ue', 'ò' =>
'o',
'Ä“' =>
'e', 'ñ' =>
'n', 'Å„' =>
'n', 'Ä¥' =>
'h', '�' =>
'g', 'Ä‘' =>
'd', 'ĵ' =>
'j',
'ÿ' =>
'y', 'Å©' =>
'u', 'Ã…Â' =>
'u', 'Æ°' =>
'u', 'Å£' =>
't', 'ý' =>
'y', 'Å‘' =>
'o',
'â' =>
'a', 'ľ' =>
'l', 'ẅ' =>
'w', 'ż' =>
'z', 'Ä«' =>
'i', 'ã' =>
'a', 'Ä¡' =>
'g',
'�' =>
'm', 'Ã…Â?' =>
'o', 'Ä©' =>
'i', 'ù' =>
'u', 'į' =>
'i', 'ź' =>
'z', 'á' =>
'a',
'û' =>
'u', 'þ' =>
'th', 'ð' =>
'dh', 'æ' =>
'ae', 'µ' =>
'u',
//---------------------------------------------------------------
* UTF-8 lookup table for upper case accented letters
* This lookup table maps accented characters to replacements taken from the
* 7-bit ASCII range. These are upper case letters only.
$GLOBALS['UTF8_UPPER_ACCENTS'] =
array(
'Ã ' =>
'A', 'ô' =>
'O', '�' =>
'D', 'ḟ' =>
'F', 'ë' =>
'E', 'Å¡' =>
'S', 'Æ¡' =>
'O',
'ß' =>
'Ss', 'ă' =>
'A', 'Ã…â„¢' =>
'R', 'È›' =>
'T', 'ň' =>
'N', '�' =>
'A', 'Ä·' =>
'K',
'Ã…Â?' =>
'S', 'ỳ' =>
'Y', 'ņ' =>
'N', 'ĺ' =>
'L', 'ħ' =>
'H', 'á¹—' =>
'P', 'ó' =>
'O',
'ú' =>
'U', 'Ä›' =>
'E', 'é' =>
'E', 'ç' =>
'C', '�' =>
'W', 'Ä‹' =>
'C', 'õ' =>
'O',
'ṡ' =>
'S', 'ø' =>
'O', 'Ä£' =>
'G', 'ŧ' =>
'T', 'È™' =>
'S', 'Ä—' =>
'E', 'ĉ' =>
'C',
'Å›' =>
'S', 'î' =>
'I', 'ű' =>
'U', 'ć' =>
'C', 'Ä™' =>
'E', 'ŵ' =>
'W', 'ṫ' =>
'T',
'Å«' =>
'U', '�' =>
'C', 'ö' =>
'Oe', 'è' =>
'E', 'Å·' =>
'Y', 'Ä…' =>
'A', 'Å‚' =>
'L',
'ų' =>
'U', 'ů' =>
'U', 'ÅŸ' =>
'S', 'ÄŸ' =>
'G', 'ļ' =>
'L', 'Æ’' =>
'F', 'ž' =>
'Z',
'ẃ' =>
'W', 'ḃ' =>
'B', 'Ã¥' =>
'A', 'ì' =>
'I', 'ï' =>
'I', 'ḋ' =>
'D', 'Ã…Â¥' =>
'T',
'Å—' =>
'R', 'ä' =>
'Ae', 'ÃÂ' =>
'I', 'Å•' =>
'R', 'ê' =>
'E', 'ü' =>
'Ue', 'ò' =>
'O',
'Ä“' =>
'E', 'ñ' =>
'N', 'Å„' =>
'N', 'Ä¥' =>
'H', '�' =>
'G', 'Ä‘' =>
'D', 'ĵ' =>
'J',
'ÿ' =>
'Y', 'Å©' =>
'U', 'Ã…Â' =>
'U', 'Æ°' =>
'U', 'Å£' =>
'T', 'ý' =>
'Y', 'Å‘' =>
'O',
'â' =>
'A', 'ľ' =>
'L', 'ẅ' =>
'W', 'ż' =>
'Z', 'Ä«' =>
'I', 'ã' =>
'A', 'Ä¡' =>
'G',
'�' =>
'M', 'Ã…Â?' =>
'O', 'Ä©' =>
'I', 'ù' =>
'U', 'į' =>
'I', 'ź' =>
'Z', 'á' =>
'A',
'û' =>
'U', 'Þ' =>
'Th', '�' =>
'Dh', 'Æ' =>
'Ae',
//--------------------------------------------------------------------
* Tests whether a string contains only 7bit ASCII bytes.
* You might use this to conditionally check whether a string
* needs handling as UTF-8 or not, potentially offering performance
* benefits by using the native PHP equivalent if it's just ASCII e.g.;
* if ( utf8_is_ascii($someString) ) {
*     // It's just ASCII - use the native PHP version
*     $someString = strtolower($someString);
* } else {
*     $someString = utf8_strtolower($someString);
* }
* @return boolean TRUE if it's all ASCII
* @see utf8_is_ascii_ctrl
// Search for any bytes which are outside the ASCII range...
//--------------------------------------------------------------------
* Tests whether a string contains only 7bit ASCII bytes with device
* control codes omitted. The device control codes can be found on the
* second table here: http://www.w3schools.com/tags/ref_ascii.asp
* @return boolean TRUE if it's all ASCII without device control codes
// Search for any bytes which are outside the ASCII range,
// or are device control codes
return (preg_match('/[^\x09\x0A\x0D\x20-\x7E]/',$str) !==
1);
//--------------------------------------------------------------------
* Strip out all non-7bit ASCII bytes
* If you need to transmit a string to a system which you know can only
* support 7bit ASCII, you could use this function.
* @return string with non ASCII bytes removed
* @see utf8_strip_non_ascii_ctrl
'/^([\x00-\x7F]+)|([^\x00-\x7F]+)/S',
if ( !isset
($matches[2]) ) {
//--------------------------------------------------------------------
* Strip out all non 7bit ASCII bytes and ASCII device control codes.
* For a list of ASCII device control codes see the 2nd table here:
* http://www.w3schools.com/tags/ref_ascii.asp
* @return string with non ASCII bytes and ASCII device control codes removed
'/^([\x09\x0A\x0D\x20-\x7E]+)|([^\x09\x0A\x0D\x20-\x7E]+)/S',
if ( !isset
($matches[2]) ) {
//---------------------------------------------------------------
* Replace accented UTF-8 characters by unaccented ASCII-7 "equivalents".
* The purpose of this function is to replace characters commonly found in Latin
* alphabets with something more or less equivalent from the ASCII range. This can
* be useful for converting a UTF-8 string to something ready for a filename, for example.
* Following the use of this function, you would probably also pass the string
* through utf8_strip_non_ascii to clean out any other non-ASCII chars
* Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
* letters. Default is to deaccent both cases ($case = 0)
* @param string UTF-8 string
* @param int (optional) -1 lowercase only, +1 uppercase only, 0 both cases (default)
* @param string UTF-8 with accented characters replaced by ASCII chars
* @return string accented chars replaced with ascii equivalents
global $UTF8_LOWER_ACCENTS;
global $UTF8_UPPER_ACCENTS;