MediaWiki
REL1_22
|
<?php
/**
 * Command-line generator for the UtfNormal data tables.
 *
 * Reads three Unicode Character Database files from the current directory
 * (DerivedNormalizationProps.txt, CompositionExclusions.txt, UnicodeData.txt),
 * builds the combining-class, canonical/compatibility decomposition,
 * canonical composition and NFC quick-check tables, and writes them out as
 * serialized arrays to UtfNormalData.inc and UtfNormalDataK.inc.
 *
 * Exits with 0 on success and -1 when an input file cannot be opened or an
 * output file cannot be created.
 *
 * NOTE(review): this text was reconstructed from a rendered source listing
 * whose embedded line numbers skip over ranges (presumably stripped comment
 * lines, including the original file header). Confirm against the original
 * file before applying.
 */

if ( PHP_SAPI !== 'cli' ) {
	die( "Run me from the command line please.\n" );
}

require_once 'UtfNormalDefines.php';
require_once 'UtfNormalUtil.php';

# --- NFC quick-check table ---------------------------------------------

$in = openUnicodeDataFile( 'DerivedNormalizationProps.txt' );
print "Initializing normalization quick check tables...\n";
$checkNFC = array();
while ( false !== ( $line = fgets( $in ) ) ) {
	$matches = array();
	# "XXXX ; NFC_QC ; V" or "XXXX..YYYY ; NFC_QC ; V" with V in {M, N}.
	# The range separator is a literal "..", so the dots must be escaped.
	if ( !preg_match(
		'/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(NFC_QC)\s*;\s*([MN])/',
		$line,
		$matches
	) ) {
		continue;
	}
	$firstCode = hexdec( $matches[1] );
	# A single-codepoint entry leaves the optional range-end group empty.
	$lastCode = ( $matches[2] !== '' ) ? hexdec( $matches[2] ) : $firstCode;
	$value = $matches[4];
	for ( $i = $firstCode; $i <= $lastCode; $i++ ) {
		$checkNFC[codepointToUtf8( $i )] = $value;
	}
}
fclose( $in );

# --- Composition exclusions --------------------------------------------

$in = openUnicodeDataFile( 'CompositionExclusions.txt' );
$exclude = array();
while ( false !== ( $line = fgets( $in ) ) ) {
	# Only lines that begin with a hex codepoint are entries.
	if ( preg_match( '/^([0-9A-F]+)/i', $line, $matches ) ) {
		$codepoint = $matches[1];
		$source = codepointToUtf8( hexdec( $codepoint ) );
		$exclude[$source] = true;
	}
}
fclose( $in );

# --- Character definitions ---------------------------------------------

$in = openUnicodeDataFile( 'UnicodeData.txt' );

$compatibilityDecomp = array();
$canonicalDecomp = array();
$canonicalComp = array();
$combiningClass = array();
$total = 0;
$compat = 0;
$canon = 0;

print "Reading character definitions...\n";
while ( false !== ( $line = fgets( $in ) ) ) {
	$columns = explode( ';', $line );
	# Skip blank or truncated lines. Without this guard the missing
	# decomposition column is null rather than '', so the line would fall
	# through the empty-mapping test below and be counted and stored as a
	# canonical decomposition.
	if ( count( $columns ) < 6 ) {
		continue;
	}
	$codepoint = $columns[0];
	$canonicalCombiningClass = intval( $columns[3] );
	$decompositionMapping = $columns[5];

	$source = codepointToUtf8( hexdec( $codepoint ) );

	# Only non-zero combining classes are stored.
	if ( $canonicalCombiningClass !== 0 ) {
		$combiningClass[$source] = $canonicalCombiningClass;
	}

	if ( $decompositionMapping === '' ) {
		continue;
	}
	if ( preg_match( '/^<(.+)> (.*)$/', $decompositionMapping, $matches ) ) {
		# Compatibility decomposition: "<tag> XXXX YYYY ..."
		$canonical = false;
		$decompositionMapping = $matches[2];
		$compat++;
	} else {
		$canonical = true;
		$canon++;
	}
	$total++;
	$dest = hexSequenceToUtf8( $decompositionMapping );

	# Every mapping goes into the compatibility table; only untagged
	# mappings go into the canonical tables.
	$compatibilityDecomp[$source] = $dest;
	if ( $canonical ) {
		$canonicalDecomp[$source] = $dest;
		# Excluded characters decompose but are never composed back.
		if ( empty( $exclude[$source] ) ) {
			$canonicalComp[$dest] = $source;
		}
	}
}
fclose( $in );

# --- Recursive expansion -----------------------------------------------
# Repeat until a full pass changes nothing, so that every stored mapping is
# fully decomposed. The callbacks look characters up in the live global
# tables, so updates made earlier in a pass are visible later in it.

print "Recursively expanding canonical mappings...\n";
$pass = 1;
do {
	print "pass $pass\n";
	$changed = 0;
	foreach ( $canonicalDecomp as $source => $dest ) {
		# Match each multi-byte sequence (lead byte + continuation bytes).
		$newDest = preg_replace_callback(
			'/([\xc0-\xff][\x80-\xbf]+)/',
			'callbackCanonical',
			$dest
		);
		if ( $newDest === $dest ) {
			continue;
		}
		$changed++;
		$canonicalDecomp[$source] = $newDest;
	}
	$pass++;
} while ( $changed > 0 );

print "Recursively expanding compatibility mappings...\n";
$pass = 1;
do {
	print "pass $pass\n";
	$changed = 0;
	foreach ( $compatibilityDecomp as $source => $dest ) {
		$newDest = preg_replace_callback(
			'/([\xc0-\xff][\x80-\xbf]+)/',
			'callbackCompat',
			$dest
		);
		if ( $newDest === $dest ) {
			continue;
		}
		$changed++;
		$compatibilityDecomp[$source] = $newDest;
	}
	$pass++;
} while ( $changed > 0 );

print "$total decomposition mappings ($canon canonical, $compat compatibility)\n";

# --- Output ------------------------------------------------------------

$out = fopen( "UtfNormalData.inc", "wt" );
if ( $out ) {
	$serCombining = escapeSingleString( serialize( $combiningClass ) );
	$serComp = escapeSingleString( serialize( $canonicalComp ) );
	$serCanon = escapeSingleString( serialize( $canonicalDecomp ) );
	$serCheckNFC = escapeSingleString( serialize( $checkNFC ) );
	# NOTE(review): the listing's embedded line numbers skip a range between
	# the opening tag and the first assignment below; any header text that
	# sat there in the generated file is not visible and is not reproduced.
	# Confirm against the original.
	$outdata = "<" . "?php
UtfNormal::\$utfCombiningClass = unserialize( '$serCombining' );
UtfNormal::\$utfCanonicalComp = unserialize( '$serComp' );
UtfNormal::\$utfCanonicalDecomp = unserialize( '$serCanon' );
UtfNormal::\$utfCheckNFC = unserialize( '$serCheckNFC' );
\n";
	fputs( $out, $outdata );
	fclose( $out );
	print "Wrote out UtfNormalData.inc\n";
} else {
	print "Can't create file UtfNormalData.inc\n";
	exit( -1 );
}

$out = fopen( "UtfNormalDataK.inc", "wt" );
if ( $out ) {
	$serCompat = escapeSingleString( serialize( $compatibilityDecomp ) );
	# NOTE(review): same numbering gap as above between the opening tag and
	# the assignment; confirm the generated header against the original.
	$outdata = "<" . "?php
UtfNormal::\$utfCompatibilityDecomp = unserialize( '$serCompat' );
\n";
	fputs( $out, $outdata );
	fclose( $out );
	print "Wrote out UtfNormalDataK.inc\n";
	exit( 0 );
} else {
	print "Can't create file UtfNormalDataK.inc\n";
	exit( -1 );
}

# ---------------
# Functions below are declared unconditionally at file level, so they are
# available to the code above even though the script exits before this
# point is reached.

/**
 * Open a Unicode data file for reading, or print download instructions and
 * terminate the script with exit status -1.
 *
 * @param string $filename File name, relative to the current directory; also
 *   used as the last path component of the suggested download URL.
 * @return resource Open read handle.
 */
function openUnicodeDataFile( $filename ) {
	$handle = fopen( $filename, "rt" );
	if ( !$handle ) {
		print "Can't open $filename for reading.\n";
		print "If necessary, fetch this file from the internet:\n";
		print "http://www.unicode.org/Public/UNIDATA/$filename\n";
		exit( -1 );
	}
	return $handle;
}

/**
 * preg_replace_callback handler: replace a matched character with its entry
 * in the global canonical decomposition table, if it has one.
 *
 * @param array $matches Regex match; index 1 holds the matched character.
 * @return string The table entry, or the character unchanged.
 */
function callbackCanonical( $matches ) {
	global $canonicalDecomp;
	if ( isset( $canonicalDecomp[$matches[1]] ) ) {
		return $canonicalDecomp[$matches[1]];
	}
	return $matches[1];
}

/**
 * preg_replace_callback handler: replace a matched character with its entry
 * in the global compatibility decomposition table, if it has one.
 *
 * @param array $matches Regex match; index 1 holds the matched character.
 * @return string The table entry, or the character unchanged.
 */
function callbackCompat( $matches ) {
	global $compatibilityDecomp;
	if ( isset( $compatibilityDecomp[$matches[1]] ) ) {
		return $compatibilityDecomp[$matches[1]];
	}
	return $matches[1];
}