MediaWiki  REL1_22
testCompression.php
Go to the documentation of this file.
00001 <?php
/**
 * Measure how well a history blob class compresses the revisions of one page.
 *
 * The script loads revisions of the given page, adds their serialized text to
 * a blob object of the requested class, serializes the blob, and prints the
 * compression ratio and the time taken. It then unserializes the blob, reads
 * every item back and reports any item whose md5 differs from the original.
 *
 * Options:
 *   --type=<type>        Blob class to instantiate
 *                        (default: ConcatenatedGzipHistoryBlob).
 *   --start=<start-date> Only use revisions with a timestamp after this date
 *                        (parsed with strtotime()).
 *   --limit=<num-revs>   Maximum number of revisions to fetch. When omitted,
 *                        up to 1000 rows are fetched and the loop stops as
 *                        soon as the blob's isHappy() returns false.
 */
$optionsWithArgs = array( 'start', 'limit', 'type' );
require __DIR__ . '/../commandLine.inc';

if ( !isset( $args[0] ) ) {
    echo "Usage: php testCompression.php [--type=<type>] [--start=<start-date>] [--limit=<num-revs>] <page-title>\n";
    exit( 1 );
}

// Title::newFromText() may not return a Title object for invalid input;
// bail out here instead of failing on the method calls in the query below.
$title = Title::newFromText( $args[0] );
if ( !( $title instanceof Title ) ) {
    echo "Invalid page title: {$args[0]}\n";
    exit( 1 );
}

if ( isset( $options['start'] ) ) {
    // strtotime() returns false for input it cannot parse; reject that
    // explicitly rather than handing false on to wfTimestamp().
    $startUnix = strtotime( $options['start'] );
    if ( $startUnix === false ) {
        echo "Could not parse --start value: {$options['start']}\n";
        exit( 1 );
    }
    $start = wfTimestamp( TS_MW, $startUnix );
    echo "Starting from " . $wgLang->timeanddate( $start ) . "\n";
} else {
    $start = '19700101000000';
}

if ( isset( $options['limit'] ) ) {
    // The value ends up in the LIMIT clause, so insist on a positive integer.
    if ( !ctype_digit( (string)$options['limit'] ) || (int)$options['limit'] < 1 ) {
        echo "--limit must be a positive integer\n";
        exit( 1 );
    }
    $limit = (int)$options['limit'];
    $untilHappy = false;
} else {
    $limit = 1000;
    $untilHappy = true;
}

$type = isset( $options['type'] ) ? $options['type'] : 'ConcatenatedGzipHistoryBlob';

// The class name comes from the command line: make sure it exists and offers
// every method this script is going to call before instantiating it.
$requiredMethods = array( 'addItem', 'getItem' );
if ( $untilHappy ) {
    $requiredMethods[] = 'isHappy';
}
if ( !class_exists( $type ) ) {
    echo "Unknown blob class: $type\n";
    exit( 1 );
}
foreach ( $requiredMethods as $method ) {
    if ( !method_exists( $type, $method ) ) {
        echo "Class $type has no $method() method\n";
        exit( 1 );
    }
}

// NOTE(review): the query has no ORDER BY, so the order (and, together with
// LIMIT, the choice) of revisions is left to the database.
$dbr = wfGetDB( DB_SLAVE );
$res = $dbr->select(
    array( 'page', 'revision', 'text' ),
    '*',
    array(
        'page_namespace' => $title->getNamespace(),
        'page_title' => $title->getDBkey(),
        'page_id=rev_page',
        'rev_timestamp > ' . $dbr->addQuotes( $dbr->timestamp( $start ) ),
        'rev_text_id=old_id'
    ), __FILE__, array( 'LIMIT' => $limit )
);

$blob = new $type;
$hashes = array(); // rev_id => md5 of the original serialized text
$keys = array();   // rev_id => key returned by addItem()
$uncompressedSize = 0;

// Start the timer at -now; adding the end time later yields the elapsed time.
$t = -microtime( true );
foreach ( $res as $row ) {
    $revision = new Revision( $row );
    $text = $revision->getSerializedData();
    $uncompressedSize += strlen( $text );
    $hashes[$row->rev_id] = md5( $text );
    $keys[$row->rev_id] = $blob->addItem( $text );
    if ( $untilHappy && !$blob->isHappy() ) {
        break;
    }
}

// Serialization is included in the measured compression time.
$serialized = serialize( $blob );
$t += microtime( true );
# print_r( $blob->mDiffMap );

printf( "%s\nCompression ratio for %d revisions: %5.2f, %s -> %d\n",
    $type,
    count( $hashes ),
    $uncompressedSize / strlen( $serialized ),
    $wgLang->formatSize( $uncompressedSize ),
    strlen( $serialized )
);
printf( "Compression time: %5.2f ms\n", $t * 1000 );

// Round trip: rebuild the blob from its serialized form and verify each item.
$t = -microtime( true );
$blob = unserialize( $serialized );
foreach ( $keys as $id => $key ) {
    $text = $blob->getItem( $key );
    // Strict comparison: with != two different digests of the form
    // "0e<digits>" would be compared as numbers and considered equal.
    if ( md5( $text ) !== $hashes[$id] ) {
        echo "Content hash mismatch for rev_id $id\n";
        # var_dump( $text );
    }
}
$t += microtime( true );
printf( "Decompression time: %5.2f ms\n", $t * 1000 );