MediaWiki  REL1_19
UtfNormalGenerate.php
Go to the documentation of this file.
00001 <?php
# This generator is a maintenance script; refuse to run under anything
# other than the command-line SAPI.
if( php_sapi_name() !== 'cli' ) {
	exit( "Run me from the command line please.\n" );
}
00031 
00032 require_once 'UtfNormalDefines.php';
00033 require_once 'UtfNormalUtil.php';
00034 
# Build $checkNFC: a map from UTF-8 encoded character to its NFC_QC value
# ('M' or 'N'), read from DerivedNormalizationProps.txt. Lines describe
# either a single code point ("XXXX") or an inclusive range ("XXXX..YYYY").
$in = fopen( "DerivedNormalizationProps.txt", "rt" );
if( !$in ) {
	print "Can't open DerivedNormalizationProps.txt for reading.\n";
	print "If necessary, fetch this file from the internet:\n";
	print "http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt\n";
	exit(-1);
}
print "Initializing normalization quick check tables...\n";
$checkNFC = array();
while( false !== ( $line = fgets( $in ) ) ) {
	$matches = array();
	# The range separator is a literal "..", so both dots are escaped;
	# an unescaped "." would accept any two characters there.
	if( !preg_match( '/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(NFC_QC)\s*;\s*([MN])/', $line, $matches ) ) {
		continue;
	}
	# Convert the bounds once instead of calling hexdec() on every iteration.
	# Group 2 is the empty string when the line names a single code point.
	$first = hexdec( $matches[1] );
	$last = ( $matches[2] === '' ) ? $first : hexdec( $matches[2] );
	$value = $matches[4];
	for( $i = $first; $i <= $last; $i++ ) {
		$checkNFC[codepointToUtf8( $i )] = $value;
	}
}
fclose( $in );
00057 
# Build $exclude: a set (UTF-8 character => true) of every code point that
# starts a line of CompositionExclusions.txt. Lines that do not begin with
# a hex number are ignored.
$in = fopen( "CompositionExclusions.txt", "rt" );
if( !$in ) {
	print "Can't open CompositionExclusions.txt for reading.\n";
	print "If necessary, fetch this file from the internet:\n";
	print "http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt\n";
	exit(-1);
}
$exclude = array();
for( $line = fgets( $in ); $line !== false; $line = fgets( $in ) ) {
	if( !preg_match( '/^([0-9A-F]+)/i', $line, $matches ) ) {
		continue;
	}
	$exclude[codepointToUtf8( hexdec( $matches[1] ) )] = true;
}
fclose( $in );
00074 
# Read UnicodeData.txt and fill the decomposition / composition tables.
# Each line is a ';'-separated record; this script uses field 0 (code point,
# hex), field 3 (canonical combining class) and field 5 (decomposition
# mapping, optionally prefixed with a "<tag>" for compatibility mappings).
$in = fopen( "UnicodeData.txt", "rt" );
if( !$in ) {
	print "Can't open UnicodeData.txt for reading.\n";
	print "If necessary, fetch this file from the internet:\n";
	print "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n";
	exit(-1);
}

$compatibilityDecomp = array();
$canonicalDecomp = array();
$canonicalComp = array();
$combiningClass = array();
$total = 0;
$compat = 0;
$canon = 0;

print "Reading character definitions...\n";
while( false !== ( $line = fgets( $in ) ) ) {
	$columns = explode( ';', $line );
	if( count( $columns ) < 6 ) {
		# Blank or truncated line. Without this guard the missing fields
		# read as null, the "no mapping" check below does not fire, and a
		# bogus canonical mapping for U+0000 would be recorded and counted.
		continue;
	}
	$codepoint = $columns[0];
	$canonicalCombiningClass = $columns[3];
	$decompositionMapping = $columns[5];

	$source = codepointToUtf8( hexdec( $codepoint ) );

	# Only non-zero combining classes are stored. Comparing the converted
	# integer avoids loose string/number comparison on an empty field.
	if( intval( $canonicalCombiningClass ) !== 0 ) {
		$combiningClass[$source] = intval( $canonicalCombiningClass );
	}

	if( $decompositionMapping === '' ) continue;
	if( preg_match( '/^<(.+)> (.*)$/', $decompositionMapping, $matches ) ) {
		# Compatibility decomposition: drop the "<tag> " prefix
		$canonical = false;
		$decompositionMapping = $matches[2];
		$compat++;
	} else {
		$canonical = true;
		$canon++;
	}
	$total++;
	$dest = hexSequenceToUtf8( $decompositionMapping );

	# Every mapping (canonical or not) goes into the compatibility table;
	# only canonical ones go into the canonical tables.
	$compatibilityDecomp[$source] = $dest;
	if( $canonical ) {
		$canonicalDecomp[$source] = $dest;
		# Characters listed in $exclude must not be produced by composition
		if( empty( $exclude[$source] ) ) {
			$canonicalComp[$dest] = $source;
		}
	}
}
fclose( $in );
00128 
# A decomposition may itself contain characters that have a decomposition.
# Re-run the substitution over the whole table until a full pass changes
# nothing, so every entry ends up fully expanded.
print "Recursively expanding canonical mappings...\n";
$pass = 1;
do {
	print "pass $pass\n";
	$changed = 0;
	foreach( $canonicalDecomp as $source => $dest ) {
		# Replace every multi-byte sequence that has its own table entry
		$expanded = preg_replace_callback(
			'/([\xc0-\xff][\x80-\xbf]+)/',
			'callbackCanonical',
			$dest );
		if( $expanded !== $dest ) {
			$canonicalDecomp[$source] = $expanded;
			$changed++;
		}
	}
	$pass++;
} while( $changed > 0 );
00146 
# Same fixed-point expansion as for the canonical table, applied to the
# compatibility table: repeat until a full pass leaves every entry unchanged.
print "Recursively expanding compatibility mappings...\n";
$pass = 1;
do {
	print "pass $pass\n";
	$changed = 0;
	foreach( $compatibilityDecomp as $source => $dest ) {
		# Replace every multi-byte sequence that has its own table entry
		$expanded = preg_replace_callback(
			'/([\xc0-\xff][\x80-\xbf]+)/',
			'callbackCompat',
			$dest );
		if( $expanded !== $dest ) {
			$compatibilityDecomp[$source] = $expanded;
			$changed++;
		}
	}
	$pass++;
} while( $changed > 0 );
00164 
print "$total decomposition mappings ($canon canonical, $compat compatibility)\n";

# Write the combining-class, composition, canonical decomposition and NFC
# quick-check tables to UtfNormalData.inc as serialized PHP arrays.
$out = fopen( "UtfNormalData.inc", "wt" );
if( !$out ) {
	print "Can't create file UtfNormalData.inc\n";
	exit(-1);
}

$serCombining = escapeSingleString( serialize( $combiningClass ) );
$serComp = escapeSingleString( serialize( $canonicalComp ) );
$serCanon = escapeSingleString( serialize( $canonicalDecomp ) );
$serCheckNFC = escapeSingleString( serialize( $checkNFC ) );
# The opening tag is split in two so this literal does not itself contain it.
$outdata = "<" . "?php
UtfNormal::\$utfCombiningClass = unserialize( '$serCombining' );
UtfNormal::\$utfCanonicalComp = unserialize( '$serComp' );
UtfNormal::\$utfCanonicalDecomp = unserialize( '$serCanon' );
UtfNormal::\$utfCheckNFC = unserialize( '$serCheckNFC' );
\n";

# fwrite() returns false or a short count on failure (e.g. a full disk);
# do not claim success unless the whole payload was written.
$written = fwrite( $out, $outdata );
fclose( $out );
if( $written !== strlen( $outdata ) ) {
	print "Can't write to file UtfNormalData.inc\n";
	exit(-1);
}
print "Wrote out UtfNormalData.inc\n";
00193 
00194 
# Write the compatibility decomposition table to UtfNormalDataK.inc as a
# serialized PHP array, then end the script.
$out = fopen( "UtfNormalDataK.inc", "wt" );
if( !$out ) {
	print "Can't create file UtfNormalDataK.inc\n";
	exit(-1);
}

$serCompat = escapeSingleString( serialize( $compatibilityDecomp ) );
# The opening tag is split in two so this literal does not itself contain it.
$outdata = "<" . "?php
UtfNormal::\$utfCompatibilityDecomp = unserialize( '$serCompat' );
\n";

# fwrite() returns false or a short count on failure (e.g. a full disk);
# do not claim success unless the whole payload was written.
$written = fwrite( $out, $outdata );
fclose( $out );
if( $written !== strlen( $outdata ) ) {
	print "Can't write to file UtfNormalDataK.inc\n";
	exit(-1);
}
print "Wrote out UtfNormalDataK.inc\n";
exit(0);
00216 
00217 # ---------------
00218 
/**
 * preg_replace_callback() handler used while expanding the canonical table.
 *
 * @param $matches Array: regex match; index 1 holds the captured sequence
 * @return String: the global $canonicalDecomp entry keyed by that sequence,
 *   or the sequence itself when no entry is set
 */
function callbackCanonical( $matches ) {
	global $canonicalDecomp;
	$sequence = $matches[1];
	return isset( $canonicalDecomp[$sequence] )
		? $canonicalDecomp[$sequence]
		: $sequence;
}
00226 
/**
 * preg_replace_callback() handler used while expanding the compatibility table.
 *
 * @param $matches Array: regex match; index 1 holds the captured sequence
 * @return String: the global $compatibilityDecomp entry keyed by that
 *   sequence, or the sequence itself when no entry is set
 */
function callbackCompat( $matches ) {
	global $compatibilityDecomp;
	$sequence = $matches[1];
	return isset( $compatibilityDecomp[$sequence] )
		? $compatibilityDecomp[$sequence]
		: $sequence;
}