[ Index ]

PHP Cross Reference of MediaWiki-1.24.0

title

Body

[close]

/maintenance/language/ -> generateNormalizerDataAr.php (source)

   1  <?php
   2  /**
   3   * Generates the normalizer data file for Arabic.
   4   *
   5   * This program is free software; you can redistribute it and/or modify
   6   * it under the terms of the GNU General Public License as published by
   7   * the Free Software Foundation; either version 2 of the License, or
   8   * (at your option) any later version.
   9   *
  10   * This program is distributed in the hope that it will be useful,
  11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13   * GNU General Public License for more details.
  14   *
  15   * You should have received a copy of the GNU General Public License along
  16   * with this program; if not, write to the Free Software Foundation, Inc.,
  17   * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18   * http://www.gnu.org/copyleft/gpl.html
  19   *
  20   * @file
  21   * @ingroup MaintenanceLanguage
  22   */
  23  
  24  require_once  __DIR__ . '/../Maintenance.php';
  25  
  /**
   * Generates the normalizer data file for Arabic.
   * For NFC see includes/normal.
   *
   * Reads UnicodeData.txt, collects the decomposition mapping of every
   * code point in the Arabic Presentation Forms A and B ranges, and writes
   * the resulting source => destination map (UTF-8 strings) in serialized
   * form to $IP/serialized/normalize-ar.ser.
   *
   * @ingroup MaintenanceLanguage
   */
  class GenerateNormalizerDataAr extends Maintenance {
      public function __construct() {
          parent::__construct();
          $this->mDescription = 'Generate the normalizer data file for Arabic';
          $this->addOption( 'unicode-data-file', 'The local location of the data file ' .
              'from http://unicode.org/Public/UNIDATA/UnicodeData.txt', false, true );
      }

      /**
       * This script does not need a database connection.
       *
       * @return int Maintenance::DB_NONE
       */
      public function getDbType() {
          return Maintenance::DB_NONE;
      }

      /**
       * Entry point: parse the Unicode data file and write the serialized map.
       *
       * Terminates the process with exit status 1 when the data file cannot
       * be found or opened, or when the output file cannot be written.
       */
      public function execute() {
          global $IP;

          $dataFile = $this->locateDataFile();

          $file = fopen( $dataFile, 'r' );
          if ( !$file ) {
              $this->error( 'Unable to open the data file.' );
              exit( 1 );
          }

          $pairs = $this->readPairs( $file );
          // Release the handle as soon as parsing is done.
          fclose( $file );

          $target = "$IP/serialized/normalize-ar.ser";
          // Do not report success if nothing could be written.
          if ( file_put_contents( $target, serialize( $pairs ) ) === false ) {
              $this->error( "Unable to write $target" );
              exit( 1 );
          }
          echo "ar: " . count( $pairs ) . " pairs written.\n";
      }

      /**
       * Determine which data file to read and make sure it exists.
       *
       * Uses the --unicode-data-file option when given, otherwise falls back
       * to "UnicodeData.txt" relative to the current working directory.
       * Exits with status 1 if the chosen file does not exist.
       *
       * @return string Path of the existing data file
       */
      private function locateDataFile() {
          if ( $this->hasOption( 'unicode-data-file' ) ) {
              $dataFile = $this->getOption( 'unicode-data-file' );
              $notFound = 'Unable to find the specified data file.';
          } else {
              $dataFile = 'UnicodeData.txt';
              $notFound = "Unable to find UnicodeData.txt. Please specify " .
                  "its location with --unicode-data-file=<FILE>";
          }

          if ( !file_exists( $dataFile ) ) {
              $this->error( $notFound );
              exit( 1 );
          }

          return $dataFile;
      }

      /**
       * Read the open data file and build the decomposition map.
       *
       * Lines whose decomposition field cannot be parsed are reported via
       * error() and skipped.
       *
       * @param resource $file Handle opened for reading on UnicodeData.txt
       * @return array Map of source string => decomposed string, both produced
       *   by hexSequenceToUtf8() (defined outside this file)
       */
      private function readPairs( $file ) {
          // For the file format, see http://www.unicode.org/reports/tr44/
          $fieldNames = array(
              'Code',
              'Name',
              'General_Category',
              'Canonical_Combining_Class',
              'Bidi_Class',
              'Decomposition_Type_Mapping',
              'Numeric_Type_Value_6',
              'Numeric_Type_Value_7',
              'Numeric_Type_Value_8',
              'Bidi_Mirrored',
              'Unicode_1_Name',
              'ISO_Comment',
              'Simple_Uppercase_Mapping',
              'Simple_Lowercase_Mapping',
              'Simple_Titlecase_Mapping'
          );
          $fieldCount = count( $fieldNames );

          $pairs = array();
          $lineNum = 0;

          while ( ( $line = fgets( $file ) ) !== false ) {
              ++$lineNum;

              # Strip comments
              $line = trim( substr( $line, 0, strcspn( $line, '#' ) ) );
              if ( $line === '' ) {
                  continue;
              }

              # Split fields. Pad short lines with empty strings (and drop any
              # surplus fields) so that every named field is always defined.
              $fields = array_slice(
                  array_pad( explode( ';', $line ), $fieldCount, '' ),
                  0,
                  $fieldCount
              );
              $data = array_combine( $fieldNames, $fields );

              $code = hexdec( $data['Code'] );
              $isPresentationForm =
                  ( $code >= 0xFB50 && $code <= 0xFDFF ) # Arabic presentation forms A
                  || ( $code >= 0xFE70 && $code <= 0xFEFF ); # Arabic presentation forms B
              if ( !$isPresentationForm ) {
                  continue;
              }

              if ( $data['Decomposition_Type_Mapping'] === '' ) {
                  // No decomposition
                  continue;
              }

              // Expected shape: "<type> HEX HEX ..."; $m[2] is the hex sequence.
              if ( !preg_match( '/^ *(<\w*>) +([0-9A-F ]*)$/',
                  $data['Decomposition_Type_Mapping'], $m )
              ) {
                  $this->error( "Can't parse Decomposition_Type/Mapping on line $lineNum" );
                  $this->error( $line );
                  continue;
              }

              $source = hexSequenceToUtf8( $data['Code'] );
              $dest = hexSequenceToUtf8( $m[2] );
              $pairs[$source] = $dest;
          }

          return $pairs;
      }
  }
 131  
 // Register this script's class name, then include the runner constant's target.
 // NOTE(review): presumably RUN_MAINTENANCE_IF_MAIN reads $maintClass and runs it
 // only when this file is the entry script — confirm in Maintenance.php.
 132  $maintClass = 'GenerateNormalizerDataAr';
 133  require_once RUN_MAINTENANCE_IF_MAIN;


Generated: Fri Nov 28 14:03:12 2014 Cross-referenced by PHPXref 0.7.1