MediaWiki  REL1_22
generateNormalizerData.php
Go to the documentation of this file.
00001 <?php
00024 require_once __DIR__ . '/../../includes/normal/UtfNormalUtil.php';
00025 
00026 require_once __DIR__ . '/../Maintenance.php';
00027 
/**
 * Maintenance script that generates the serialized normalization tables
 * for Arabic and Malayalam.
 *
 * The Arabic table is derived from a local copy of UnicodeData.txt; the
 * Malayalam table is a fixed list of chillu sequences. Both are written
 * below $IP/serialized/.
 */
class GenerateNormalizerData extends Maintenance {
    /** @var string Path of the UnicodeData.txt file that is read */
    public $dataFile;

    public function __construct() {
        parent::__construct();
        $this->addOption( 'unicode-data-file', 'The local location of the data file ' .
            'from http://unicode.org/Public/UNIDATA/UnicodeData.txt', false, true );
    }

    /**
     * Locate the Unicode data file, then generate both tables.
     *
     * Without --unicode-data-file, UnicodeData.txt in the current working
     * directory is used. The script exits with status 1 when the file
     * cannot be found.
     */
    public function execute() {
        if ( $this->hasOption( 'unicode-data-file' ) ) {
            $this->dataFile = $this->getOption( 'unicode-data-file' );
            $notFoundMessage = 'Unable to find the specified data file.';
        } else {
            $this->dataFile = 'UnicodeData.txt';
            $notFoundMessage = "Unable to find UnicodeData.txt. Please specify its location with --unicode-data-file=<FILE>";
        }

        if ( !file_exists( $this->dataFile ) ) {
            $this->error( $notFoundMessage );
            exit( 1 );
        }

        $this->generateArabic();
        $this->generateMalayalam();
    }

    /**
     * Build the Arabic table: every code point in the Arabic Presentation
     * Forms A/B blocks that has a decomposition is mapped to the sequence
     * it decomposes to. The result is written to normalize-ar.ser.
     *
     * Lines that cannot be parsed are reported and skipped.
     */
    public function generateArabic() {
        $file = fopen( $this->dataFile, 'r' );
        if ( !$file ) {
            $this->error( 'Unable to open the data file.' );
            exit( 1 );
        }

        // For the file format, see http://www.unicode.org/reports/tr44/
        // UnicodeData.txt is a semicolon-separated table; only two of its
        // fields are needed here:
        //   0 - code point (hexadecimal)
        //   5 - Decomposition_Type and Decomposition_Mapping
        $codeField = 0;
        $decompositionField = 5;

        $pairs = array();

        $lineNum = 0;
        while ( false !== ( $line = fgets( $file ) ) ) {
            ++$lineNum;

            # Strip comments
            $line = trim( substr( $line, 0, strcspn( $line, '#' ) ) );
            if ( $line === '' ) {
                continue;
            }

            # Split fields
            $fields = explode( ';', $line );
            if ( !isset( $fields[$decompositionField] ) ) {
                // Truncated line: report it instead of reading missing offsets
                $this->error( "Too few fields on line $lineNum" );
                $this->error( $line );
                continue;
            }

            $hexCode = $fields[$codeField];
            $code = hexdec( $hexCode );
            $isPresentationForm =
                ( $code >= 0xFB50 && $code <= 0xFDFF ) # Arabic presentation forms A
                || ( $code >= 0xFE70 && $code <= 0xFEFF ); # Arabic presentation forms B
            if ( !$isPresentationForm ) {
                continue;
            }

            $decomposition = $fields[$decompositionField];
            if ( $decomposition === '' ) {
                // No decomposition
                continue;
            }

            // Expected shape: "<tag> XXXX XXXX ..."; $m[2] receives the hex sequence
            if ( !preg_match( '/^ *(<\w*>) +([0-9A-F ]*)$/', $decomposition, $m ) ) {
                $this->error( "Can't parse Decomposition_Type/Mapping on line $lineNum" );
                $this->error( $line );
                continue;
            }

            // hexSequenceToUtf8() comes from UtfNormalUtil.php; presumably it
            // turns space-separated hex code points into a UTF-8 string.
            $pairs[hexSequenceToUtf8( $hexCode )] = hexSequenceToUtf8( $m[2] );
        }

        fclose( $file );

        $this->writePairs( 'ar', $pairs );
    }

    /**
     * Build the Malayalam table, mapping the legacy
     * "consonant + U+0D4D + U+200D" sequences to the atomic chillu
     * characters. The result is written to normalize-ml.ser.
     */
    public function generateMalayalam() {
        $hexPairs = array(
            # From http://unicode.org/versions/Unicode5.1.0/#Malayalam_Chillu_Characters
            '0D23 0D4D 200D' => '0D7A',
            '0D28 0D4D 200D' => '0D7B',
            '0D30 0D4D 200D' => '0D7C',
            '0D32 0D4D 200D' => '0D7D',
            '0D33 0D4D 200D' => '0D7E',

            # From http://permalink.gmane.org/gmane.science.linguistics.wikipedia.technical/46413
            '0D15 0D4D 200D' => '0D7F',
        );

        $pairs = array();
        foreach ( $hexPairs as $hexSource => $hexDest ) {
            $pairs[hexSequenceToUtf8( $hexSource )] = hexSequenceToUtf8( $hexDest );
        }

        $this->writePairs( 'ml', $pairs );
    }

    /**
     * Serialize a replacement table to $IP/serialized/normalize-<lang>.ser
     * and print how many pairs were written.
     *
     * Exits with status 1 when the file cannot be written, so that a failed
     * write is never reported as a success.
     *
     * @param string $lang Language code used in the file name and the report
     * @param array $pairs Map of source string => replacement string
     */
    private function writePairs( $lang, array $pairs ) {
        global $IP;

        $path = "$IP/serialized/normalize-$lang.ser";
        if ( file_put_contents( $path, serialize( $pairs ) ) === false ) {
            $this->error( "Unable to write $path" );
            exit( 1 );
        }

        echo "$lang: " . count( $pairs ) . " pairs written.\n";
    }
}
00157 
// Name of the maintenance class declared in this file. Presumably read by the
// runner that the require below pulls in — confirm against Maintenance.php.
00158 $maintClass = 'GenerateNormalizerData';
// RUN_MAINTENANCE_IF_MAIN is not defined in this file; it is presumably a
// constant set by the Maintenance.php required above, pointing at the script
// that runs $maintClass when this file is the entry point — TODO confirm.
00159 require_once RUN_MAINTENANCE_IF_MAIN;