MediaWiki
REL1_19
|
<?php

require_once( dirname( __FILE__ ) . '/../Maintenance.php' );

require_once( dirname( __FILE__ ) . '/../../includes/normal/UtfNormalUtil.php' );

/**
 * Maintenance script that builds the serialized normalization tables
 * "$IP/serialized/normalize-ar.ser" and "$IP/serialized/normalize-ml.ser".
 *
 * The Arabic table is derived from a local copy of UnicodeData.txt; the
 * Malayalam table is built from a fixed list embedded in this file.
 */
class GenerateNormalizerData extends Maintenance {
	/** Path of the UnicodeData.txt file to read; assigned by execute(). */
	public $dataFile;

	/**
	 * Registers the optional --unicode-data-file option, which takes an argument.
	 */
	public function __construct() {
		parent::__construct();
		$this->addOption( 'unicode-data-file', 'The local location of the data file ' .
			'from http://unicode.org/Public/UNIDATA/UnicodeData.txt', false, true );
	}

	/**
	 * Resolves the data file location, then generates both tables.
	 *
	 * Without --unicode-data-file, "UnicodeData.txt" in the current working
	 * directory is used. Exits with status 1 if the chosen file does not exist.
	 */
	public function execute() {
		if ( $this->hasOption( 'unicode-data-file' ) ) {
			$this->dataFile = $this->getOption( 'unicode-data-file' );
			$notFoundMessage = 'Unable to find the specified data file.';
		} else {
			$this->dataFile = 'UnicodeData.txt';
			$notFoundMessage = "Unable to find UnicodeData.txt. Please specify its location with --unicode-data-file=<FILE>";
		}

		if ( !file_exists( $this->dataFile ) ) {
			$this->error( $notFoundMessage );
			exit( 1 );
		}

		$this->generateArabic();
		$this->generateMalayalam();
	}

	/**
	 * Builds the Arabic table: every code point in the Arabic Presentation
	 * Forms A/B ranges that has a tagged decomposition is mapped to that
	 * decomposition, and the result is written to normalize-ar.ser.
	 *
	 * Lines that cannot be parsed are reported via error() and skipped.
	 * Exits with status 1 if the data file cannot be opened or the output
	 * cannot be written.
	 */
	public function generateArabic() {
		$file = fopen( $this->dataFile, 'r' );
		if ( !$file ) {
			$this->error( 'Unable to open the data file.' );
			exit( 1 );
		}

		// For the file format, see http://www.unicode.org/reports/tr44/
		// UnicodeData.txt has 15 semicolon-separated fields; the numeric
		// information occupies three of them (indexes 6 to 8).
		$fieldNames = array(
			'Code',
			'Name',
			'General_Category',
			'Canonical_Combining_Class',
			'Bidi_Class',
			'Decomposition_Type_Mapping',
			'Numeric_Decimal_Value',
			'Numeric_Digit_Value',
			'Numeric_Value',
			'Bidi_Mirrored',
			'Unicode_1_Name',
			'ISO_Comment',
			'Simple_Uppercase_Mapping',
			'Simple_Lowercase_Mapping',
			'Simple_Titlecase_Mapping'
		);
		$fieldCount = count( $fieldNames );

		$pairs = array();

		$lineNum = 0;
		while ( false !== ( $line = fgets( $file ) ) ) {
			++$lineNum;

			# Strip comments
			$line = trim( substr( $line, 0, strcspn( $line, '#' ) ) );
			if ( $line === '' ) {
				continue;
			}

			# Split fields
			$numberedData = explode( ';', $line );
			if ( count( $numberedData ) < $fieldCount ) {
				// A truncated line would otherwise trigger undefined-offset notices.
				$this->error( "Too few fields on line $lineNum" );
				$this->error( $line );
				continue;
			}
			$data = array();
			foreach ( $fieldNames as $number => $name ) {
				$data[$name] = $numberedData[$number];
			}

			// hexdec() gives a real number, so the range checks below compare
			// numbers with numbers rather than a numeric string with integers.
			$code = hexdec( $data['Code'] );
			$isPresentationForm =
				( $code >= 0xFB50 && $code <= 0xFDFF ) # Arabic presentation forms A
				|| ( $code >= 0xFE70 && $code <= 0xFEFF ); # Arabic presentation forms B
			if ( !$isPresentationForm ) {
				continue;
			}

			if ( $data['Decomposition_Type_Mapping'] === '' ) {
				// No decomposition
				continue;
			}

			// Expected form: "<tag> XXXX XXXX ..."; only the hex sequence is kept.
			if ( !preg_match( '/^ *(<\w*>) +([0-9A-F ]*)$/',
				$data['Decomposition_Type_Mapping'], $m ) )
			{
				$this->error( "Can't parse Decomposition_Type/Mapping on line $lineNum" );
				$this->error( $line );
				continue;
			}

			// hexSequenceToUtf8() is not defined in this file; presumably it is
			// provided by the UtfNormalUtil.php required above.
			$source = hexSequenceToUtf8( $data['Code'] );
			$dest = hexSequenceToUtf8( $m[2] );
			$pairs[$source] = $dest;
		}

		// Release the handle; the original left it open until script exit.
		fclose( $file );

		$this->writePairs( 'ar', $pairs );
	}

	/**
	 * Builds the Malayalam table from a fixed list of hex sequences and
	 * writes it to normalize-ml.ser.
	 *
	 * Exits with status 1 if the output cannot be written.
	 */
	public function generateMalayalam() {
		$hexPairs = array(
			# From http://unicode.org/versions/Unicode5.1.0/#Malayalam_Chillu_Characters
			'0D23 0D4D 200D' => '0D7A',
			'0D28 0D4D 200D' => '0D7B',
			'0D30 0D4D 200D' => '0D7C',
			'0D32 0D4D 200D' => '0D7D',
			'0D33 0D4D 200D' => '0D7E',

			# From http://permalink.gmane.org/gmane.science.linguistics.wikipedia.technical/46413
			'0D15 0D4D 200D' => '0D7F',
		);

		$pairs = array();
		foreach ( $hexPairs as $hexSource => $hexDest ) {
			$source = hexSequenceToUtf8( $hexSource );
			$dest = hexSequenceToUtf8( $hexDest );
			$pairs[$source] = $dest;
		}

		$this->writePairs( 'ml', $pairs );
	}

	/**
	 * Serializes a table to "$IP/serialized/normalize-<langCode>.ser" and
	 * prints how many pairs were written.
	 *
	 * @param $langCode String: short code used in the file name and the report
	 * @param $pairs Array: map of source string => replacement string
	 */
	private function writePairs( $langCode, $pairs ) {
		global $IP;

		$path = "$IP/serialized/normalize-$langCode.ser";
		if ( file_put_contents( $path, serialize( $pairs ) ) === false ) {
			// Do not claim success when nothing reached the disk.
			$this->error( "Unable to write $path" );
			exit( 1 );
		}

		echo "$langCode: " . count( $pairs ) . " pairs written.\n";
	}
}

$maintClass = 'GenerateNormalizerData';
require_once( RUN_MAINTENANCE_IF_MAIN );