MediaWiki  REL1_24
generateNormalizerDataAr.php
Go to the documentation of this file.
00001 <?php
00024 require_once __DIR__ . '/../Maintenance.php';
00025 
/**
 * Maintenance script that builds the Arabic normalizer data file.
 *
 * Reads a local copy of UnicodeData.txt, collects the decomposition mapping
 * of every code point in the Arabic Presentation Forms A and B ranges, and
 * writes the resulting source => destination map (UTF-8 strings) in PHP
 * serialized form to "$IP/serialized/normalize-ar.ser".
 */
class GenerateNormalizerDataAr extends Maintenance {
    /**
     * Set the script description and register the --unicode-data-file option.
     */
    public function __construct() {
        parent::__construct();
        $this->mDescription = 'Generate the normalizer data file for Arabic';
        $this->addOption( 'unicode-data-file', 'The local location of the data file ' .
            'from http://unicode.org/Public/UNIDATA/UnicodeData.txt', false, true );
    }

    /**
     * @return int Maintenance::DB_NONE (presumably signals that no database
     *   connection is needed -- confirm against Maintenance)
     */
    public function getDbType() {
        return Maintenance::DB_NONE;
    }

    /**
     * Parse the Unicode data file and write the serialized mapping.
     *
     * Exits with status 1 if the data file cannot be found or opened, or if
     * the output file cannot be written. Lines that cannot be parsed are
     * reported through error() and skipped.
     */
    public function execute() {
        if ( $this->hasOption( 'unicode-data-file' ) ) {
            $dataFile = $this->getOption( 'unicode-data-file' );
            if ( !file_exists( $dataFile ) ) {
                $this->error( 'Unable to find the specified data file.' );
                exit( 1 );
            }
        } else {
            // Fall back to a copy in the current working directory
            $dataFile = 'UnicodeData.txt';
            if ( !file_exists( $dataFile ) ) {
                $this->error( "Unable to find UnicodeData.txt. Please specify " .
                    "its location with --unicode-data-file=<FILE>" );
                exit( 1 );
            }
        }

        $file = fopen( $dataFile, 'r' );
        if ( !$file ) {
            $this->error( 'Unable to open the data file.' );
            exit( 1 );
        }

        // For the file format, see http://www.unicode.org/reports/tr44/
        $fieldNames = array(
            'Code',
            'Name',
            'General_Category',
            'Canonical_Combining_Class',
            'Bidi_Class',
            'Decomposition_Type_Mapping',
            'Numeric_Type_Value_6',
            'Numeric_Type_Value_7',
            'Numeric_Type_Value_8',
            'Bidi_Mirrored',
            'Unicode_1_Name',
            'ISO_Comment',
            'Simple_Uppercase_Mapping',
            'Simple_Lowercase_Mapping',
            'Simple_Titlecase_Mapping'
        );
        $fieldCount = count( $fieldNames );

        $pairs = array();

        $lineNum = 0;
        while ( false !== ( $line = fgets( $file ) ) ) {
            ++$lineNum;

            # Strip comments
            $line = trim( substr( $line, 0, strcspn( $line, '#' ) ) );
            if ( $line === '' ) {
                continue;
            }

            # Split fields; a truncated line would otherwise cause
            # undefined-offset notices and null field values below
            $numberedData = explode( ';', $line );
            if ( count( $numberedData ) < $fieldCount ) {
                $this->error( "Too few fields on line $lineNum" );
                $this->error( $line );
                continue;
            }
            $data = array_combine(
                $fieldNames,
                array_slice( $numberedData, 0, $fieldCount )
            );

            // hexdec() yields a number, so the range checks below compare
            // numbers with numbers instead of a decimal string with ints
            $code = hexdec( $data['Code'] );
            if ( ( $code >= 0xFB50 && $code <= 0xFDFF ) # Arabic presentation forms A
                || ( $code >= 0xFE70 && $code <= 0xFEFF ) # Arabic presentation forms B
            ) {
                if ( $data['Decomposition_Type_Mapping'] === '' ) {
                    // No decomposition
                    continue;
                }
                // Expect "<tag> HEX HEX ...": $m[1] is the tag, $m[2] the
                // space-separated code point sequence
                if ( !preg_match( '/^ *(<\w*>) +([0-9A-F ]*)$/',
                    $data['Decomposition_Type_Mapping'], $m )
                ) {
                    $this->error( "Can't parse Decomposition_Type/Mapping on line $lineNum" );
                    $this->error( $line );
                    continue;
                }

                // hexSequenceToUtf8() is defined outside this file; presumably it
                // turns a hex code point sequence into a UTF-8 string
                $source = hexSequenceToUtf8( $data['Code'] );
                $dest = hexSequenceToUtf8( $m[2] );
                $pairs[$source] = $dest;
            }
        }

        // Release the input handle now that parsing is finished
        fclose( $file );

        global $IP;
        $outFile = "$IP/serialized/normalize-ar.ser";
        // Do not claim success if the output could not be written
        if ( file_put_contents( $outFile, serialize( $pairs ) ) === false ) {
            $this->error( "Unable to write $outFile" );
            exit( 1 );
        }
        echo "ar: " . count( $pairs ) . " pairs written.\n";
    }
}
00131 
00132 $maintClass = 'GenerateNormalizerDataAr';
00133 require_once RUN_MAINTENANCE_IF_MAIN;