MediaWiki  REL1_19
testCompression.php
Go to the documentation of this file.
<?php
/**
 * Command-line test of history blob compression.
 *
 * Loads revisions of a single page, adds the text of each one to a blob
 * object of the requested class, and then reports the compression ratio
 * together with the time spent compressing and decompressing. Every item
 * read back from the unserialized blob is verified against an MD5 hash of
 * the text that was originally added.
 *
 * Usage: php testCompression.php [--type=<type>] [--start=<start-date>] [--limit=<num-revs>] <page-title>
 *
 * Options:
 *   --type   Name of the blob class to instantiate
 *            (default: ConcatenatedGzipHistoryBlob).
 *   --start  Only revisions with a timestamp later than this date are used;
 *            the value is parsed with strtotime().
 *   --limit  Maximum number of revisions to load. When given, all loaded
 *            revisions are added. When omitted, at most 1000 revisions are
 *            loaded and adding stops as soon as the blob's isHappy() returns
 *            false.
 */

$optionsWithArgs = array( 'start', 'limit', 'type' );
require( dirname( __FILE__ ) . '/../commandLine.inc' );

if ( !isset( $args[0] ) ) {
	echo "Usage: php testCompression.php [--type=<type>] [--start=<start-date>] [--limit=<num-revs>] <page-title>\n";
	exit( 1 );
}

$title = Title::newFromText( $args[0] );
if ( !$title ) {
	// An unusable title would otherwise only surface as a fatal error when
	// the query conditions are built below.
	echo "Invalid page title: {$args[0]}\n";
	exit( 1 );
}

if ( isset( $options['start'] ) ) {
	$startUnix = strtotime( $options['start'] );
	if ( $startUnix === false ) {
		// strtotime() returns false for input it cannot parse; report that
		// instead of silently continuing with a meaningless start time.
		echo "Unable to parse start date: {$options['start']}\n";
		exit( 1 );
	}
	$start = wfTimestamp( TS_MW, $startUnix );
	echo "Starting from " . $wgLang->timeanddate( $start ) . "\n";
} else {
	$start = '19700101000000';
}

if ( isset( $options['limit'] ) ) {
	$limit = intval( $options['limit'] );
	if ( $limit <= 0 ) {
		echo "--limit must be a positive integer\n";
		exit( 1 );
	}
	$untilHappy = false;
} else {
	$limit = 1000;
	$untilHappy = true;
}

$type = isset( $options['type'] ) ? $options['type'] : 'ConcatenatedGzipHistoryBlob';
if ( !class_exists( $type ) ) {
	// The class name comes straight from the command line; fail with a
	// readable message rather than a fatal error from "new $type".
	echo "Unknown blob class: $type\n";
	exit( 1 );
}

$dbr = wfGetDB( DB_SLAVE );
$res = $dbr->select(
	array( 'page', 'revision', 'text' ),
	'*',
	array(
		'page_namespace' => $title->getNamespace(),
		'page_title' => $title->getDBkey(),
		'page_id=rev_page',
		'rev_timestamp > ' . $dbr->addQuotes( $dbr->timestamp( $start ) ),
		'rev_text_id=old_id'
	),
	__FILE__,
	array(
		// Without an explicit order, LIMIT would select an arbitrary subset
		// of the matching rows in an arbitrary sequence, making the reported
		// figures irreproducible.
		'ORDER BY' => 'rev_timestamp',
		'LIMIT' => $limit
	)
);

$blob = new $type;
$hashes = array(); // rev_id => MD5 of the text that was added
$keys = array();   // rev_id => value returned by addItem(), later passed to getItem()
$uncompressedSize = 0;

// Timing idiom: start from the negated start time and add the end time
// afterwards, leaving the elapsed seconds in $t.
$t = -microtime( true );
foreach ( $res as $row ) {
	$revision = new Revision( $row );
	$text = $revision->getText();
	$uncompressedSize += strlen( $text );
	$hashes[$row->rev_id] = md5( $text );
	$keys[$row->rev_id] = $blob->addItem( $text );
	if ( $untilHappy && !$blob->isHappy() ) {
		break;
	}
}

// Serialization is deliberately inside the timed section.
$serialized = serialize( $blob );
$t += microtime( true );
# Debugging aid:
# print_r( $blob->mDiffMap );

printf( "%s\nCompression ratio for %d revisions: %5.2f, %s -> %d\n",
	$type,
	count( $hashes ),
	$uncompressedSize / strlen( $serialized ),
	$wgLang->formatSize( $uncompressedSize ),
	strlen( $serialized )
);
printf( "Compression time: %5.2f ms\n", $t * 1000 );

$t = -microtime( true );
$blob = unserialize( $serialized );
foreach ( $keys as $id => $key ) {
	$text = $blob->getItem( $key );
	// Strict comparison: with "!=" two different hex digests that both look
	// like numbers (e.g. "0e123..." and "0e456...") would compare as equal
	// and a real mismatch would go unreported.
	if ( md5( $text ) !== $hashes[$id] ) {
		echo "Content hash mismatch for rev_id $id\n";
		# Debugging aid:
		# var_dump( $text );
	}
}
$t += microtime( true );
printf( "Decompression time: %5.2f ms\n", $t * 1000 );
00101