MediaWiki  REL1_24
generateUtf8Case.php
Go to the documentation of this file.
00001 <?php
00028 require_once __DIR__ . '/../Maintenance.php';
00029 
/**
 * Maintenance script that builds serialized/Utf8Case.ser from a local copy
 * of UnicodeData.txt.
 *
 * The output is a serialized array with two maps keyed by the UTF-8 form of
 * each code point: 'wikiUpperChars' (simple uppercase mappings) and
 * 'wikiLowerChars' (simple lowercase mappings).
 */
class GenerateUtf8Case extends Maintenance {

    /**
     * Set the script description and register the optional
     * --unicode-data-file option (takes an argument, not required).
     */
    public function __construct() {
        parent::__construct();
        $this->mDescription = 'Generate Utf8Case.ser from the Unicode Character Database ' .
            'and supplementary files';
        $this->addOption( 'unicode-data-file', 'The local location of the data file ' .
            'from http://unicode.org/Public/UNIDATA/UnicodeData.txt', false, true );
    }

    /**
     * This script only reads and writes files, so no database is requested.
     *
     * @return int Maintenance::DB_NONE
     */
    public function getDbType() {
        return Maintenance::DB_NONE;
    }

    /**
     * Read the data file, collect the simple upper/lower case mappings and
     * write them to "$IP/serialized/Utf8Case.ser".
     *
     * Exits with status 1 when the data file cannot be found or opened, or
     * when the output file cannot be written. Lines with fewer fields than
     * expected are reported (with their line number) and skipped.
     */
    public function execute() {
        // Resolve the input path: explicit option, or UnicodeData.txt in the
        // current working directory.
        if ( $this->hasOption( 'unicode-data-file' ) ) {
            $dataFile = $this->getOption( 'unicode-data-file' );
            $notFoundMessage = 'Unable to find the specified data file.';
        } else {
            $dataFile = 'UnicodeData.txt';
            $notFoundMessage = "Unable to find UnicodeData.txt. Please specify " .
                "its location with --unicode-data-file=<FILE>";
        }

        if ( !file_exists( $dataFile ) ) {
            $this->error( $notFoundMessage );
            exit( 1 );
        }

        $file = fopen( $dataFile, 'r' );
        if ( !$file ) {
            $this->error( 'Unable to open the data file.' );
            exit( 1 );
        }

        // For the file format, see http://www.unicode.org/reports/tr44/
        $fieldNames = array(
            'Code',
            'Name',
            'General_Category',
            'Canonical_Combining_Class',
            'Bidi_Class',
            'Decomposition_Type_Mapping',
            'Numeric_Type_Value_6',
            'Numeric_Type_Value_7',
            'Numeric_Type_Value_8',
            'Bidi_Mirrored',
            'Unicode_1_Name',
            'ISO_Comment',
            'Simple_Uppercase_Mapping',
            'Simple_Lowercase_Mapping',
            'Simple_Titlecase_Mapping'
        );
        $fieldCount = count( $fieldNames );

        $upper = array();
        $lower = array();

        $lineNum = 0;
        while ( false !== ( $line = fgets( $file ) ) ) {
            ++$lineNum;

            # Strip comments
            $line = trim( substr( $line, 0, strcspn( $line, '#' ) ) );
            if ( $line === '' ) {
                continue;
            }

            # Split fields
            $numberedData = explode( ';', $line );
            if ( count( $numberedData ) < $fieldCount ) {
                // A short line would otherwise trigger undefined-offset
                // notices and yield null fields; report it and move on.
                $this->error( "Skipping malformed line $lineNum of $dataFile: expected " .
                    "$fieldCount fields, found " . count( $numberedData ) . '.' );
                continue;
            }
            $data = array_combine( $fieldNames, array_slice( $numberedData, 0, $fieldCount ) );

            $source = hexSequenceToUtf8( $data['Code'] );
            // Compare against '' explicitly rather than relying on string
            // truthiness, which would also reject the literal field "0".
            if ( $data['Simple_Uppercase_Mapping'] !== '' ) {
                $upper[$source] = hexSequenceToUtf8( $data['Simple_Uppercase_Mapping'] );
            }
            if ( $data['Simple_Lowercase_Mapping'] !== '' ) {
                $lower[$source] = hexSequenceToUtf8( $data['Simple_Lowercase_Mapping'] );
            }
        }

        // Release the input handle before writing the output.
        fclose( $file );

        global $IP;
        $outputFile = "$IP/serialized/Utf8Case.ser";
        $serialized = serialize( array(
            'wikiUpperChars' => $upper,
            'wikiLowerChars' => $lower,
        ) );
        if ( file_put_contents( $outputFile, $serialized ) === false ) {
            // Do not report success when nothing was written.
            $this->error( "Unable to write $outputFile." );
            exit( 1 );
        }
    }
}
00127 
// Name the maintenance class declared in this file, then include the file
// named by the RUN_MAINTENANCE_IF_MAIN constant.
// NOTE(review): presumably that included file reads $maintClass to decide
// which class to instantiate and run when this script is the entry point;
// the constant and that behaviour are defined outside this file, so confirm
// against Maintenance.php.
00128 $maintClass = 'GenerateUtf8Case';
00129 require_once RUN_MAINTENANCE_IF_MAIN;