MediaWiki  REL1_24
UtfNormalGenerate.php
Go to the documentation of this file.
00001 <?php
// This is a maintenance script: stop immediately unless PHP is running
// under the command-line SAPI.
if ( PHP_SAPI !== 'cli' ) {
    exit( "Run me from the command line please.\n" );
}
00031 
00032 require_once 'UtfNormalDefines.php';
00033 require_once 'UtfNormalUtil.php';
00034 
// Build $checkNFC: for every code point that DerivedNormalizationProps.txt
// lists with an NFC_QC value of "M" or "N", store that value keyed by the
// code point's UTF-8 encoding.
$in = fopen( "DerivedNormalizationProps.txt", "rt" );
if ( !$in ) {
    print "Can't open DerivedNormalizationProps.txt for reading.\n";
    print "If necessary, fetch this file from the internet:\n";
    print "http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt\n";
    exit( -1 );
}
print "Initializing normalization quick check tables...\n";
$checkNFC = array();
while ( false !== ( $line = fgets( $in ) ) ) {
    $matches = array();
    // An entry is either a single code point or a "first..last" range.
    // The range separator is two literal dots, so the dots are escaped;
    // unescaped they would match any two characters.
    if ( !preg_match(
        '/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(NFC_QC)\s*;\s*([MN])/',
        $line,
        $matches )
    ) {
        continue;
    }

    $first = $matches[1];
    // The optional range group is an empty string when the line names a
    // single code point; treat that as a one-element range.
    $last = ( $matches[2] === '' ) ? $first : $matches[2];
    $value = $matches[4];

    $lastInDecimal = hexdec( $last );
    for ( $i = hexdec( $first ); $i <= $lastInDecimal; $i++ ) {
        $char = codepointToUtf8( $i );
        $checkNFC[$char] = $value;
    }
}
fclose( $in );
00065 
// Collect the code points listed in CompositionExclusions.txt, keyed by
// their UTF-8 encoding, so they can be left out of the composition table.
$exclude = array();
$in = fopen( "CompositionExclusions.txt", "rt" );
if ( !$in ) {
    print "Can't open CompositionExclusions.txt for reading.\n";
    print "If necessary, fetch this file from the internet:\n";
    print "http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt\n";
    exit( -1 );
}
while ( ( $line = fgets( $in ) ) !== false ) {
    // Only lines that begin with a hexadecimal code point are entries.
    if ( !preg_match( '/^([0-9A-F]+)/i', $line, $matches ) ) {
        continue;
    }
    $excluded = codepointToUtf8( hexdec( $matches[1] ) );
    $exclude[$excluded] = true;
}
fclose( $in );
00082 
// Parse UnicodeData.txt (semicolon-separated fields) and fill:
//   $combiningClass      - UTF-8 char => non-zero canonical combining class
//   $compatibilityDecomp - UTF-8 char => decomposition (canonical or tagged)
//   $canonicalDecomp     - UTF-8 char => canonical decomposition only
//   $canonicalComp       - canonical decomposition => UTF-8 char, skipping
//                          characters present in $exclude
$in = fopen( "UnicodeData.txt", "rt" );
if ( !$in ) {
    print "Can't open UnicodeData.txt for reading.\n";
    print "If necessary, fetch this file from the internet:\n";
    print "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n";
    exit( -1 );
}

$compatibilityDecomp = array();
$canonicalDecomp = array();
$canonicalComp = array();
$combiningClass = array();
$total = 0;
$compat = 0;
$canon = 0;

print "Reading character definitions...\n";
while ( false !== ( $line = fgets( $in ) ) ) {
    $columns = explode( ';', $line );
    if ( count( $columns ) < 6 ) {
        // Blank or truncated line: the fields read below do not exist.
        // Skipping avoids undefined-offset notices and, worse, a bogus
        // entry being recorded for an empty mapping.
        continue;
    }

    $codepoint = $columns[0];
    $canonicalCombiningClass = intval( $columns[3] );
    $decompositionMapping = $columns[5];

    $source = codepointToUtf8( hexdec( $codepoint ) );

    // Only non-zero combining classes are stored.
    if ( $canonicalCombiningClass !== 0 ) {
        $combiningClass[$source] = $canonicalCombiningClass;
    }

    if ( $decompositionMapping === '' ) {
        continue;
    }

    if ( preg_match( '/^<(.+)> (.*)$/', $decompositionMapping, $matches ) ) {
        # Compatibility decomposition: drop the leading <tag> and keep
        # only the code point sequence.
        $canonical = false;
        $decompositionMapping = $matches[2];
        $compat++;
    } else {
        $canonical = true;
        $canon++;
    }
    $total++;
    $dest = hexSequenceToUtf8( $decompositionMapping );

    // Every mapping goes into the compatibility table; canonical ones are
    // additionally recorded in the canonical tables.
    $compatibilityDecomp[$source] = $dest;
    if ( $canonical ) {
        $canonicalDecomp[$source] = $dest;
        if ( empty( $exclude[$source] ) ) {
            $canonicalComp[$dest] = $source;
        }
    }
}
fclose( $in );
00136 
// A decomposition may itself contain characters that decompose further.
// Re-run the substitution over the whole table until a full pass leaves
// every entry untouched. Entries are updated in place, so the callback
// sees replacements made earlier in the same pass.
print "Recursively expanding canonical mappings...\n";
$pass = 1;
do {
    print "pass $pass\n";
    $changed = 0;
    foreach ( $canonicalDecomp as $source => $dest ) {
        $expanded = preg_replace_callback(
            '/([\xc0-\xff][\x80-\xbf]+)/',
            'callbackCanonical',
            $dest );
        if ( $expanded !== $dest ) {
            $canonicalDecomp[$source] = $expanded;
            $changed++;
        }
    }
    $pass++;
} while ( $changed > 0 );
00154 
// Same fixed-point expansion as for the canonical table, applied to the
// compatibility table: repeat until a full pass changes nothing.
print "Recursively expanding compatibility mappings...\n";
$pass = 1;
do {
    print "pass $pass\n";
    $changed = 0;
    foreach ( $compatibilityDecomp as $source => $dest ) {
        $expanded = preg_replace_callback(
            '/([\xc0-\xff][\x80-\xbf]+)/',
            'callbackCompat',
            $dest );
        if ( $expanded !== $dest ) {
            $compatibilityDecomp[$source] = $expanded;
            $changed++;
        }
    }
    $pass++;
} while ( $changed > 0 );

// Summary of what was read from UnicodeData.txt.
print "$total decomposition mappings ($canon canonical, $compat compatibility)\n";
00174 
// Write the combining-class, composition, canonical-decomposition and NFC
// quick-check tables to UtfNormalData.inc as serialized PHP arrays.
$out = fopen( "UtfNormalData.inc", "wt" );
if ( !$out ) {
    print "Can't create file UtfNormalData.inc\n";
    exit( -1 );
}

$serCombining = escapeSingleString( serialize( $combiningClass ) );
$serComp = escapeSingleString( serialize( $canonicalComp ) );
$serCanon = escapeSingleString( serialize( $canonicalDecomp ) );
$serCheckNFC = escapeSingleString( serialize( $checkNFC ) );
// The opening tag is split in two so this literal never contains a PHP
// open tag itself.
$outdata = "<" . "?php
// @codingStandardsIgnoreFile

UtfNormal::\$utfCombiningClass = unserialize( '$serCombining' );
UtfNormal::\$utfCanonicalComp = unserialize( '$serComp' );
UtfNormal::\$utfCanonicalDecomp = unserialize( '$serCanon' );
UtfNormal::\$utfCheckNFC = unserialize( '$serCheckNFC' );
\n";

// Do not report success unless the whole payload was written: fputs()
// yields the number of bytes written, or false on error.
$written = fputs( $out, $outdata );
fclose( $out );
if ( $written !== strlen( $outdata ) ) {
    print "Can't write to file UtfNormalData.inc\n";
    exit( -1 );
}
print "Wrote out UtfNormalData.inc\n";
00202 
// Write the compatibility-decomposition table to UtfNormalDataK.inc as a
// serialized PHP array, then end the script.
$out = fopen( "UtfNormalDataK.inc", "wt" );
if ( !$out ) {
    print "Can't create file UtfNormalDataK.inc\n";
    exit( -1 );
}

$serCompat = escapeSingleString( serialize( $compatibilityDecomp ) );
// The opening tag is split in two so this literal never contains a PHP
// open tag itself.
$outdata = "<" . "?php
// @codingStandardsIgnoreFile

UtfNormal::\$utfCompatibilityDecomp = unserialize( '$serCompat' );
\n";

// Do not report success unless the whole payload was written: fputs()
// yields the number of bytes written, or false on error.
$written = fputs( $out, $outdata );
fclose( $out );
if ( $written !== strlen( $outdata ) ) {
    print "Can't write to file UtfNormalDataK.inc\n";
    exit( -1 );
}
print "Wrote out UtfNormalDataK.inc\n";
exit( 0 );
00225 
00226 # ---------------
00227 
/**
 * preg_replace_callback() handler used while expanding canonical mappings.
 *
 * Looks the captured byte sequence up in the global $canonicalDecomp table
 * and returns its decomposition; a sequence without an entry is returned
 * unchanged.
 *
 * @param array $matches Match array; index 1 holds the captured sequence
 * @return string
 */
function callbackCanonical( $matches ) {
    // @codingStandardsIgnoreStart MediaWiki.NamingConventions.ValidGlobalName.wgPrefix
    global $canonicalDecomp;
    // @codingStandardsIgnoreEnd

    $sequence = $matches[1];

    return isset( $canonicalDecomp[$sequence] )
        ? $canonicalDecomp[$sequence]
        : $sequence;
}
00239 
/**
 * preg_replace_callback() handler used while expanding compatibility
 * mappings.
 *
 * Looks the captured byte sequence up in the global $compatibilityDecomp
 * table and returns its decomposition; a sequence without an entry is
 * returned unchanged.
 *
 * @param array $matches Match array; index 1 holds the captured sequence
 * @return string
 */
function callbackCompat( $matches ) {
    // @codingStandardsIgnoreStart MediaWiki.NamingConventions.ValidGlobalName.wgPrefix
    global $compatibilityDecomp;
    // @codingStandardsIgnoreEnd

    $sequence = $matches[1];

    return isset( $compatibilityDecomp[$sequence] )
        ? $compatibilityDecomp[$sequence]
        : $sequence;
}