MediaWiki  REL1_22
UtfNormalGenerate.php
Go to the documentation of this file.
00001 <?php
# This generator reads and writes files in the working directory, so it is
# only meant to be started from a shell; bail out under any other SAPI.
if ( 'cli' !== PHP_SAPI ) {
    exit( "Run me from the command line please.\n" );
}
00031 
00032 require_once 'UtfNormalDefines.php';
00033 require_once 'UtfNormalUtil.php';
00034 
# Build $checkNFC: a map from a UTF-8 encoded character to its NFC_QC value
# ("M" or "N") as listed in DerivedNormalizationProps.txt. Characters that
# are not listed with one of those two values get no entry.
$in = fopen( "DerivedNormalizationProps.txt", "rt" );
if ( !$in ) {
    print "Can't open DerivedNormalizationProps.txt for reading.\n";
    print "If necessary, fetch this file from the internet:\n";
    print "http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt\n";
    exit( -1 );
}
print "Initializing normalization quick check tables...\n";
$checkNFC = array();
while ( false !== ( $line = fgets( $in ) ) ) {
    $matches = array();
    # Accepts "XXXX ; NFC_QC ; V" and "XXXX..YYYY ; NFC_QC ; V".
    # The range separator is a literal "..", so both dots are escaped;
    # unescaped they would match any two characters.
    if ( !preg_match( '/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*NFC_QC\s*;\s*([MN])/', $line, $matches ) ) {
        continue;
    }
    $first = hexdec( $matches[1] );
    # For a single code point the optional group is reported as an empty
    # string, in which case the range collapses to that one code point.
    $last = ( $matches[2] === '' ) ? $first : hexdec( $matches[2] );
    $value = $matches[3];
    # Bounds are converted once above instead of on every iteration.
    for ( $i = $first; $i <= $last; $i++ ) {
        $checkNFC[codepointToUtf8( $i )] = $value;
    }
}
fclose( $in );
00057 
# Build $exclude: a set (UTF-8 character => true) of every code point that
# CompositionExclusions.txt lists at the start of a line.
$in = fopen( "CompositionExclusions.txt", "rt" );
if ( $in === false ) {
    print "Can't open CompositionExclusions.txt for reading.\n"
        . "If necessary, fetch this file from the internet:\n"
        . "http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt\n";
    exit( -1 );
}
$exclude = array();
for ( $line = fgets( $in ); $line !== false; $line = fgets( $in ) ) {
    # Only lines that open with a hexadecimal code point are entries;
    # anything else (comments, blank lines) is passed over.
    if ( !preg_match( '/^([0-9A-F]+)/i', $line, $matches ) ) {
        continue;
    }
    $exclude[codepointToUtf8( hexdec( $matches[1] ) )] = true;
}
fclose( $in );
00074 
# Parse UnicodeData.txt (semicolon-separated fields) into four tables, all
# keyed by UTF-8 encoded strings:
#   $combiningClass      char => non-zero canonical combining class (int)
#   $canonicalDecomp     char => canonical decomposition
#   $compatibilityDecomp char => decomposition (canonical and compatibility)
#   $canonicalComp       canonical decomposition => char, skipping characters
#                        present in $exclude
# $total, $canon and $compat count the mappings seen.
$in = fopen( "UnicodeData.txt", "rt" );
if ( !$in ) {
    print "Can't open UnicodeData.txt for reading.\n";
    print "If necessary, fetch this file from the internet:\n";
    print "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n";
    exit( -1 );
}

$compatibilityDecomp = array();
$canonicalDecomp = array();
$canonicalComp = array();
$combiningClass = array();
$total = 0;
$compat = 0;
$canon = 0;

print "Reading character definitions...\n";
while ( false !== ( $line = fgets( $in ) ) ) {
    $columns = explode( ';', $line );
    if ( count( $columns ) < 6 ) {
        # Blank or truncated line. Without this guard the missing fields
        # read as null, the null mapping is not === '' and a bogus entry
        # would be recorded for U+0000.
        continue;
    }
    $codepoint = $columns[0];
    $decompositionMapping = $columns[5];

    $source = codepointToUtf8( hexdec( $codepoint ) );

    # Convert first, then compare as an integer, so the result does not
    # depend on PHP's string-to-number comparison rules.
    $canonicalCombiningClass = intval( $columns[3] );
    if ( $canonicalCombiningClass !== 0 ) {
        $combiningClass[$source] = $canonicalCombiningClass;
    }

    if ( $decompositionMapping === '' ) {
        continue;
    }
    if ( preg_match( '/^<(.+)> (.*)$/', $decompositionMapping, $matches ) ) {
        # Compatibility decomposition: drop the leading <tag>.
        $canonical = false;
        $decompositionMapping = $matches[2];
        $compat++;
    } else {
        $canonical = true;
        $canon++;
    }
    $total++;
    $dest = hexSequenceToUtf8( $decompositionMapping );

    # The compatibility table receives every mapping, canonical ones too.
    $compatibilityDecomp[$source] = $dest;
    if ( $canonical ) {
        $canonicalDecomp[$source] = $dest;
        if ( empty( $exclude[$source] ) ) {
            $canonicalComp[$dest] = $source;
        }
    }
}
fclose( $in );
00128 
# Expand $canonicalDecomp in place: every multi-byte UTF-8 sequence inside a
# mapping is handed to callbackCanonical(), which substitutes that
# character's own mapping when it has one. Passes repeat until a complete
# pass leaves every entry untouched.
print "Recursively expanding canonical mappings...\n";
$pass = 1;
do {
    print "pass $pass\n";
    $changed = 0;
    foreach ( $canonicalDecomp as $char => $mapping ) {
        $expanded = preg_replace_callback(
            '/([\xc0-\xff][\x80-\xbf]+)/',
            'callbackCanonical',
            $mapping
        );
        if ( $expanded !== $mapping ) {
            $canonicalDecomp[$char] = $expanded;
            $changed++;
        }
    }
    $pass++;
} while ( $changed > 0 );
00146 
# Expand $compatibilityDecomp in place: every multi-byte UTF-8 sequence
# inside a mapping is handed to callbackCompat(), which substitutes that
# character's own mapping when it has one. Passes repeat until a complete
# pass leaves every entry untouched.
print "Recursively expanding compatibility mappings...\n";
$pass = 1;
do {
    print "pass $pass\n";
    $changed = 0;
    foreach ( $compatibilityDecomp as $char => $mapping ) {
        $expanded = preg_replace_callback(
            '/([\xc0-\xff][\x80-\xbf]+)/',
            'callbackCompat',
            $mapping
        );
        if ( $expanded !== $mapping ) {
            $compatibilityDecomp[$char] = $expanded;
            $changed++;
        }
    }
    $pass++;
} while ( $changed > 0 );

# Summary of the mappings counted while the character data was read.
print "$total decomposition mappings ($canon canonical, $compat compatibility)\n";
00166 
# Write UtfNormalData.inc: a PHP file that assigns four UtfNormal static
# properties by unserializing the tables built above.
00167 $out = fopen("UtfNormalData.inc", "wt");
00168 if( $out ) {
    # Each table is serialized and then passed through escapeSingleString()
    # (not defined in this file; presumably it makes the text safe inside
    # the single-quoted literals below -- confirm against its definition).
00169     $serCombining = escapeSingleString( serialize( $combiningClass ) );
00170     $serComp = escapeSingleString( serialize( $canonicalComp ) );
00171     $serCanon = escapeSingleString( serialize( $canonicalDecomp ) );
00172     $serCheckNFC = escapeSingleString( serialize( $checkNFC ) );
    # The dollar signs are escaped as \$ so the property names are emitted
    # literally while the $ser* variables are interpolated.
    # NOTE(review): the listing's line numbers jump from 00173 to 00181
    # inside this string literal, so part of the literal appears to be
    # missing from this view -- check the real source before editing it.
00173     $outdata = "<" . "?php
00181 UtfNormal::\$utfCombiningClass = unserialize( '$serCombining' );
00182 UtfNormal::\$utfCanonicalComp = unserialize( '$serComp' );
00183 UtfNormal::\$utfCanonicalDecomp = unserialize( '$serCanon' );
00184 UtfNormal::\$utfCheckNFC = unserialize( '$serCheckNFC' );
00185 \n";
    # NOTE(review): the return value of fputs() is not checked, so a short
    # or failed write is still reported as success.
00186     fputs( $out, $outdata );
00187     fclose( $out );
00188     print "Wrote out UtfNormalData.inc\n";
00189 } else {
    # The file could not be opened for writing: report it and stop.
00190     print "Can't create file UtfNormalData.inc\n";
00191     exit(-1);
00192 }
00193 
00194 
# Write UtfNormalDataK.inc: a PHP file that assigns
# UtfNormal::$utfCompatibilityDecomp by unserializing $compatibilityDecomp.
00195 $out = fopen("UtfNormalDataK.inc", "wt");
00196 if( $out ) {
    # escapeSingleString() is not defined in this file; presumably it makes
    # the serialized text safe inside the single-quoted literal below --
    # confirm against its definition.
00197     $serCompat = escapeSingleString( serialize( $compatibilityDecomp ) );
    # NOTE(review): the listing's line numbers jump from 00198 to 00206
    # inside this string literal, so part of the literal appears to be
    # missing from this view -- check the real source before editing it.
00198     $outdata = "<" . "?php
00206 UtfNormal::\$utfCompatibilityDecomp = unserialize( '$serCompat' );
00207 \n";
    # NOTE(review): the return value of fputs() is not checked.
00208     fputs( $out, $outdata );
00209     fclose( $out );
00210     print "Wrote out UtfNormalDataK.inc\n";
    # Both data files have been written; end the script with status 0.
00211     exit(0);
00212 } else {
    # The file could not be opened for writing: report it and stop.
00213     print "Can't create file UtfNormalDataK.inc\n";
00214     exit(-1);
00215 }
00216 
00217 # ---------------
00218 
/**
 * preg_replace_callback() handler used while expanding canonical mappings.
 *
 * Looks the captured sequence up in the global $canonicalDecomp table.
 *
 * @param array $matches Match array; index 1 holds the captured sequence.
 * @return string The table entry for the sequence, or the sequence itself
 *                when the table has no entry for it.
 */
function callbackCanonical( $matches ) {
    global $canonicalDecomp;
    $sequence = $matches[1];
    return isset( $canonicalDecomp[$sequence] )
        ? $canonicalDecomp[$sequence]
        : $sequence;
}
00226 
/**
 * preg_replace_callback() handler used while expanding compatibility
 * mappings.
 *
 * Looks the captured sequence up in the global $compatibilityDecomp table.
 *
 * @param array $matches Match array; index 1 holds the captured sequence.
 * @return string The table entry for the sequence, or the sequence itself
 *                when the table has no entry for it.
 */
function callbackCompat( $matches ) {
    global $compatibilityDecomp;
    $sequence = $matches[1];
    return isset( $compatibilityDecomp[$sequence] )
        ? $compatibilityDecomp[$sequence]
        : $sequence;
}
00233 }