MediaWiki
REL1_24
|
<?php
/**
 * Maintenance script that generates the normalizer data file for Arabic
 * from a local copy of UnicodeData.txt.
 *
 * @file
 */

require_once __DIR__ . '/../Maintenance.php';

/**
 * Reads UnicodeData.txt, collects the decomposition mappings of the
 * characters in the Arabic presentation forms A (U+FB50..U+FDFF) and
 * B (U+FE70..U+FEFF) ranges, and writes them as a serialized
 * "source => destination" array to $IP/serialized/normalize-ar.ser.
 */
class GenerateNormalizerDataAr extends Maintenance {
	public function __construct() {
		parent::__construct();
		$this->mDescription = 'Generate the normalizer data file for Arabic';
		$this->addOption( 'unicode-data-file', 'The local location of the data file ' .
			'from http://unicode.org/Public/UNIDATA/UnicodeData.txt', false, true );
	}

	/**
	 * This script only reads and writes files, so no database is requested.
	 *
	 * @return int Maintenance::DB_NONE
	 */
	public function getDbType() {
		return Maintenance::DB_NONE;
	}

	/**
	 * Parse the Unicode data file and write the serialized pair table.
	 *
	 * Exits with status 1 when the data file cannot be found or opened, or
	 * when the output file cannot be written. Lines whose decomposition
	 * field cannot be parsed are reported and skipped.
	 */
	public function execute() {
		if ( !$this->hasOption( 'unicode-data-file' ) ) {
			// No explicit path given: fall back to the current directory.
			$dataFile = 'UnicodeData.txt';
			if ( !file_exists( $dataFile ) ) {
				$this->error( "Unable to find UnicodeData.txt. Please specify " .
					"its location with --unicode-data-file=<FILE>" );
				exit( 1 );
			}
		} else {
			$dataFile = $this->getOption( 'unicode-data-file' );
			if ( !file_exists( $dataFile ) ) {
				$this->error( 'Unable to find the specified data file.' );
				exit( 1 );
			}
		}

		$file = fopen( $dataFile, 'r' );
		if ( !$file ) {
			$this->error( 'Unable to open the data file.' );
			exit( 1 );
		}

		// For the file format, see http://www.unicode.org/reports/tr44/
		$fieldNames = array(
			'Code',
			'Name',
			'General_Category',
			'Canonical_Combining_Class',
			'Bidi_Class',
			'Decomposition_Type_Mapping',
			'Numeric_Type_Value_6',
			'Numeric_Type_Value_7',
			'Numeric_Type_Value_8',
			'Bidi_Mirrored',
			'Unicode_1_Name',
			'ISO_Comment',
			'Simple_Uppercase_Mapping',
			'Simple_Lowercase_Mapping',
			'Simple_Titlecase_Mapping'
		);
		$fieldCount = count( $fieldNames );

		$pairs = array();

		$lineNum = 0;
		while ( false !== ( $line = fgets( $file ) ) ) {
			++$lineNum;

			# Strip comments
			$line = trim( substr( $line, 0, strcspn( $line, '#' ) ) );
			if ( $line === '' ) {
				continue;
			}

			# Split fields. Short records are padded with empty strings so a
			# truncated line cannot trigger undefined-offset notices; surplus
			# fields are ignored.
			$numberedData = array_pad( explode( ';', $line ), $fieldCount, '' );
			$data = array_combine( $fieldNames, array_slice( $numberedData, 0, $fieldCount ) );

			// hexdec() yields a number, so the range checks below compare
			// numbers with numbers instead of a numeric string with integers.
			$code = hexdec( $data['Code'] );
			if ( ( $code >= 0xFB50 && $code <= 0xFDFF ) # Arabic presentation forms A
				|| ( $code >= 0xFE70 && $code <= 0xFEFF ) # Arabic presentation forms B
			) {
				if ( $data['Decomposition_Type_Mapping'] === '' ) {
					// No decomposition
					continue;
				}
				// Expected shape: "<tag> XXXX XXXX ..."; $m[2] is the hex sequence.
				$m = array();
				if ( !preg_match( '/^ *(<\w*>) +([0-9A-F ]*)$/',
					$data['Decomposition_Type_Mapping'], $m )
				) {
					$this->error( "Can't parse Decomposition_Type/Mapping on line $lineNum" );
					$this->error( $line );
					continue;
				}

				$source = hexSequenceToUtf8( $data['Code'] );
				$dest = hexSequenceToUtf8( $m[2] );
				$pairs[$source] = $dest;
			}
		}

		// The input has been fully consumed; release the handle.
		fclose( $file );

		global $IP;
		$written = file_put_contents( "$IP/serialized/normalize-ar.ser", serialize( $pairs ) );
		if ( $written === false ) {
			// Do not claim success when nothing reached the disk.
			$this->error( "Unable to write $IP/serialized/normalize-ar.ser" );
			exit( 1 );
		}
		echo "ar: " . count( $pairs ) . " pairs written.\n";
	}
}

$maintClass = 'GenerateNormalizerDataAr';
require_once RUN_MAINTENANCE_IF_MAIN;