MediaWiki
REL1_24
|
<?php
/**
 * Command-line generator for the UtfNormal data tables.
 *
 * Reads three Unicode Character Database files from the current directory:
 *   - DerivedNormalizationProps.txt (NFC quick-check values)
 *   - CompositionExclusions.txt     (code points excluded from composition)
 *   - UnicodeData.txt               (combining classes and decomposition mappings)
 *
 * and writes two PHP include files, also into the current directory:
 *   - UtfNormalData.inc  (combining classes, canonical composition and
 *                         decomposition tables, NFC quick-check table)
 *   - UtfNormalDataK.inc (compatibility decomposition table)
 *
 * All tables are keyed by the UTF-8 encoding of the code point, as produced
 * by codepointToUtf8() / hexSequenceToUtf8() from UtfNormalUtil.php.
 *
 * @file
 */

if ( PHP_SAPI != 'cli' ) {
	die( "Run me from the command line please.\n" );
}

require_once 'UtfNormalDefines.php';
require_once 'UtfNormalUtil.php';

# ---------------
# NFC quick-check table

$in = fopen( "DerivedNormalizationProps.txt", "rt" );
if ( !$in ) {
	print "Can't open DerivedNormalizationProps.txt for reading.\n";
	print "If necessary, fetch this file from the internet:\n";
	print "http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt\n";
	exit( -1 );
}
print "Initializing normalization quick check tables...\n";
$checkNFC = array();
while ( false !== ( $line = fgets( $in ) ) ) {
	$matches = array();
	// Matches "XXXX ; NFC_QC ; V" and "XXXX..YYYY ; NFC_QC ; V" where V is M or N.
	// The range separator is a literal "..", so the dots must be escaped;
	// unescaped they would accept any two characters.
	if ( !preg_match(
		'/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(NFC_QC)\s*;\s*([MN])/',
		$line,
		$matches )
	) {
		continue;
	}

	$first = $matches[1];
	$last = $matches[2];
	$value = $matches[4];
	if ( !$last ) {
		// Single code point rather than a range
		$last = $first;
	}

	$lastInDecimal = hexdec( $last );
	for ( $i = hexdec( $first ); $i <= $lastInDecimal; $i++ ) {
		$char = codepointToUtf8( $i );
		$checkNFC[$char] = $value;
	}
}
fclose( $in );

# ---------------
# Composition exclusions

$in = fopen( "CompositionExclusions.txt", "rt" );
if ( !$in ) {
	print "Can't open CompositionExclusions.txt for reading.\n";
	print "If necessary, fetch this file from the internet:\n";
	print "http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt\n";
	exit( -1 );
}
$exclude = array();
while ( false !== ( $line = fgets( $in ) ) ) {
	// Only lines that start with a hex code point are data; anything else is skipped.
	if ( preg_match( '/^([0-9A-F]+)/i', $line, $matches ) ) {
		$codepoint = $matches[1];
		$source = codepointToUtf8( hexdec( $codepoint ) );
		$exclude[$source] = true;
	}
}
fclose( $in );

# ---------------
# Character definitions

$in = fopen( "UnicodeData.txt", "rt" );
if ( !$in ) {
	print "Can't open UnicodeData.txt for reading.\n";
	print "If necessary, fetch this file from the internet:\n";
	print "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n";
	exit( -1 );
}

$compatibilityDecomp = array();
$canonicalDecomp = array();
$canonicalComp = array();
$combiningClass = array();
$total = 0;
$compat = 0;
$canon = 0;

print "Reading character definitions...\n";
while ( false !== ( $line = fgets( $in ) ) ) {
	$columns = explode( ';', $line );
	if ( count( $columns ) < 6 ) {
		// Blank or truncated line: without this guard the missing fields would
		// be read as null and a bogus mapping for U+0000 would be recorded.
		continue;
	}
	$codepoint = $columns[0];
	$canonicalCombiningClass = $columns[3];
	$decompositionMapping = $columns[5];

	$source = codepointToUtf8( hexdec( $codepoint ) );

	// Only non-zero combining classes are stored.
	if ( $canonicalCombiningClass != 0 ) {
		$combiningClass[$source] = intval( $canonicalCombiningClass );
	}

	if ( $decompositionMapping === '' ) {
		continue;
	}
	if ( preg_match( '/^<(.+)> (.*)$/', $decompositionMapping, $matches ) ) {
		# Compatibility decomposition: strip the leading "<tag> " and keep the sequence
		$canonical = false;
		$decompositionMapping = $matches[2];
		$compat++;
	} else {
		$canonical = true;
		$canon++;
	}
	$total++;
	$dest = hexSequenceToUtf8( $decompositionMapping );

	// Every mapping goes into the compatibility table; canonical mappings
	// additionally go into the canonical table and, unless excluded, into
	// the reverse (composition) table.
	$compatibilityDecomp[$source] = $dest;
	if ( $canonical ) {
		$canonicalDecomp[$source] = $dest;
		if ( empty( $exclude[$source] ) ) {
			$canonicalComp[$dest] = $source;
		}
	}
}
fclose( $in );

# ---------------
# Recursive expansion
#
# Each multi-byte UTF-8 sequence in a mapping's target is replaced by that
# character's own mapping, if it has one. Passes repeat until a full pass
# changes nothing.

print "Recursively expanding canonical mappings...\n";
$pass = 1;
do {
	print "pass $pass\n";
	$changed = 0;
	foreach ( $canonicalDecomp as $source => $dest ) {
		$newDest = preg_replace_callback(
			'/([\xc0-\xff][\x80-\xbf]+)/',
			'callbackCanonical',
			$dest );
		if ( $newDest === $dest ) {
			continue;
		}
		$changed++;
		$canonicalDecomp[$source] = $newDest;
	}
	$pass++;
} while ( $changed > 0 );

print "Recursively expanding compatibility mappings...\n";
$pass = 1;
do {
	print "pass $pass\n";
	$changed = 0;
	foreach ( $compatibilityDecomp as $source => $dest ) {
		$newDest = preg_replace_callback(
			'/([\xc0-\xff][\x80-\xbf]+)/',
			'callbackCompat',
			$dest );
		if ( $newDest === $dest ) {
			continue;
		}
		$changed++;
		$compatibilityDecomp[$source] = $newDest;
	}
	$pass++;
} while ( $changed > 0 );

print "$total decomposition mappings ($canon canonical, $compat compatibility)\n";

# ---------------
# Output
#
# NOTE(review): the listing this script was recovered from skips several of
# its own line numbers directly after the "?php" opener in both generated-file
# templates below, so a header comment may originally have been emitted
# there -- confirm against the upstream file.

$out = fopen( "UtfNormalData.inc", "wt" );
if ( $out ) {
	$serCombining = escapeSingleString( serialize( $combiningClass ) );
	$serComp = escapeSingleString( serialize( $canonicalComp ) );
	$serCanon = escapeSingleString( serialize( $canonicalDecomp ) );
	$serCheckNFC = escapeSingleString( serialize( $checkNFC ) );
	// The opening tag is split in two so it is not a literal PHP open tag in this source.
	$outdata = "<" . "?php
// @codingStandardsIgnoreFile

UtfNormal::\$utfCombiningClass = unserialize( '$serCombining' );
UtfNormal::\$utfCanonicalComp = unserialize( '$serComp' );
UtfNormal::\$utfCanonicalDecomp = unserialize( '$serCanon' );
UtfNormal::\$utfCheckNFC = unserialize( '$serCheckNFC' );
\n";
	fputs( $out, $outdata );
	fclose( $out );
	print "Wrote out UtfNormalData.inc\n";
} else {
	print "Can't create file UtfNormalData.inc\n";
	exit( -1 );
}

$out = fopen( "UtfNormalDataK.inc", "wt" );
if ( $out ) {
	$serCompat = escapeSingleString( serialize( $compatibilityDecomp ) );
	$outdata = "<" . "?php
// @codingStandardsIgnoreFile

UtfNormal::\$utfCompatibilityDecomp = unserialize( '$serCompat' );
\n";
	fputs( $out, $outdata );
	fclose( $out );
	print "Wrote out UtfNormalDataK.inc\n";
	exit( 0 );
} else {
	print "Can't create file UtfNormalDataK.inc\n";
	exit( -1 );
}

# ---------------

/**
 * preg_replace_callback() handler for the canonical expansion pass.
 *
 * @param array $matches Match array; index 1 holds the matched byte sequence
 * @return string The canonical decomposition recorded for the matched
 *   sequence, or the sequence itself if none is recorded
 */
function callbackCanonical( $matches ) {
	// @codingStandardsIgnoreStart MediaWiki.NamingConventions.ValidGlobalName.wgPrefix
	global $canonicalDecomp;
	// @codingStandardsIgnoreEnd

	if ( isset( $canonicalDecomp[$matches[1]] ) ) {
		return $canonicalDecomp[$matches[1]];
	}

	return $matches[1];
}

/**
 * preg_replace_callback() handler for the compatibility expansion pass.
 *
 * @param array $matches Match array; index 1 holds the matched byte sequence
 * @return string The compatibility decomposition recorded for the matched
 *   sequence, or the sequence itself if none is recorded
 */
function callbackCompat( $matches ) {
	// @codingStandardsIgnoreStart MediaWiki.NamingConventions.ValidGlobalName.wgPrefix
	global $compatibilityDecomp;
	// @codingStandardsIgnoreEnd

	if ( isset( $compatibilityDecomp[$matches[1]] ) ) {
		return $compatibilityDecomp[$matches[1]];
	}

	return $matches[1];
}