MediaWiki
REL1_22
|
<?php

require_once __DIR__ . '/../../includes/normal/UtfNormalUtil.php';

require_once __DIR__ . '/../Maintenance.php';

/**
 * Maintenance script that builds the serialized normalization tables
 * "normalize-ar.ser" (Arabic) and "normalize-ml.ser" (Malayalam) under
 * "$IP/serialized".
 *
 * The Arabic table is derived from a local copy of UnicodeData.txt; the
 * Malayalam table is a fixed list embedded in this script.
 */
class GenerateNormalizerData extends Maintenance {
	/** Index of the code point column in a UnicodeData.txt record. */
	const FIELD_CODE = 0;

	/** Index of the Decomposition_Type / Decomposition_Mapping column. */
	const FIELD_DECOMPOSITION = 5;

	/** @var string Path of the UnicodeData.txt file to read; set by execute() */
	public $dataFile;

	/**
	 * Registers the --unicode-data-file option.
	 */
	public function __construct() {
		parent::__construct();
		$this->addOption( 'unicode-data-file', 'The local location of the data file ' .
			'from http://unicode.org/Public/UNIDATA/UnicodeData.txt', false, true );
	}

	/**
	 * Resolves the data file location, then generates both tables.
	 *
	 * Without --unicode-data-file, "UnicodeData.txt" relative to the current
	 * working directory is used. Exits with status 1 when the file is missing.
	 */
	public function execute() {
		if ( $this->hasOption( 'unicode-data-file' ) ) {
			$this->dataFile = $this->getOption( 'unicode-data-file' );
			$notFoundMessage = 'Unable to find the specified data file.';
		} else {
			$this->dataFile = 'UnicodeData.txt';
			$notFoundMessage = "Unable to find UnicodeData.txt. Please specify its location with --unicode-data-file=<FILE>";
		}

		if ( !file_exists( $this->dataFile ) ) {
			$this->error( $notFoundMessage );
			exit( 1 );
		}

		$this->generateArabic();
		$this->generateMalayalam();
	}

	/**
	 * Builds the Arabic table: every code point in the Arabic Presentation
	 * Forms A/B ranges that carries a tagged decomposition is mapped to that
	 * decomposition. Both sides are converted with hexSequenceToUtf8().
	 *
	 * Records whose decomposition cannot be parsed are reported and skipped.
	 * Exits with status 1 when the data file cannot be opened or the result
	 * cannot be written.
	 */
	public function generateArabic() {
		$file = fopen( $this->dataFile, 'r' );
		if ( !$file ) {
			$this->error( 'Unable to open the data file.' );
			exit( 1 );
		}

		// For the file format, see http://www.unicode.org/reports/tr44/
		// Each record is a ';'-separated list of 15 fields. Only the code point
		// (field 0) and the decomposition (field 5) are needed here, so they are
		// read by index instead of naming every column.
		$pairs = array();
		$lineNum = 0;

		while ( ( $line = fgets( $file ) ) !== false ) {
			++$lineNum;

			# Strip comments
			$line = trim( substr( $line, 0, strcspn( $line, '#' ) ) );
			if ( $line === '' ) {
				continue;
			}

			# Split fields
			$fields = explode( ';', $line );
			$hexCode = $fields[self::FIELD_CODE];
			// A truncated record is treated as having no decomposition rather
			// than raising an undefined-offset notice.
			$decomposition = isset( $fields[self::FIELD_DECOMPOSITION] )
				? $fields[self::FIELD_DECOMPOSITION]
				: '';

			$code = hexdec( $hexCode );
			$inFormsA = $code >= 0xFB50 && $code <= 0xFDFF; # Arabic presentation forms A
			$inFormsB = $code >= 0xFE70 && $code <= 0xFEFF; # Arabic presentation forms B
			if ( !$inFormsA && !$inFormsB ) {
				continue;
			}

			if ( $decomposition === '' ) {
				// No decomposition
				continue;
			}

			// Expected shape: "<tag> XXXX XXXX ..."; $m[2] is the hex sequence.
			if ( !preg_match( '/^ *(<\w*>) +([0-9A-F ]*)$/', $decomposition, $m ) ) {
				$this->error( "Can't parse Decomposition_Type/Mapping on line $lineNum" );
				$this->error( $line );
				continue;
			}

			$source = hexSequenceToUtf8( $hexCode );
			$dest = hexSequenceToUtf8( $m[2] );
			$pairs[$source] = $dest;
		}

		// Release the handle; the original left it open until script exit.
		fclose( $file );

		$this->writePairs( 'ar', $pairs );
	}

	/**
	 * Builds the Malayalam table from a fixed list that maps
	 * "consonant + U+0D4D + U+200D" sequences to single chillu code points.
	 *
	 * Exits with status 1 when the result cannot be written.
	 */
	public function generateMalayalam() {
		$hexPairs = array(
			# From http://unicode.org/versions/Unicode5.1.0/#Malayalam_Chillu_Characters
			'0D23 0D4D 200D' => '0D7A',
			'0D28 0D4D 200D' => '0D7B',
			'0D30 0D4D 200D' => '0D7C',
			'0D32 0D4D 200D' => '0D7D',
			'0D33 0D4D 200D' => '0D7E',

			# From http://permalink.gmane.org/gmane.science.linguistics.wikipedia.technical/46413
			'0D15 0D4D 200D' => '0D7F',
		);

		$pairs = array();
		foreach ( $hexPairs as $hexSource => $hexDest ) {
			$source = hexSequenceToUtf8( $hexSource );
			$dest = hexSequenceToUtf8( $hexDest );
			$pairs[$source] = $dest;
		}

		$this->writePairs( 'ml', $pairs );
	}

	/**
	 * Serializes a table to "$IP/serialized/normalize-<code>.ser" and prints
	 * the number of pairs written.
	 *
	 * Exits with status 1 when the write fails, so that a failed run is never
	 * reported as a success.
	 *
	 * @param string $langCode Short code used in the file name and the report line
	 * @param array $pairs Map of source string => replacement string
	 */
	private function writePairs( $langCode, array $pairs ) {
		global $IP;

		$path = "$IP/serialized/normalize-$langCode.ser";
		if ( file_put_contents( $path, serialize( $pairs ) ) === false ) {
			$this->error( "Unable to write $path" );
			exit( 1 );
		}

		echo "$langCode: " . count( $pairs ) . " pairs written.\n";
	}
}

$maintClass = 'GenerateNormalizerData';
require_once RUN_MAINTENANCE_IF_MAIN;