[ Index ]

PHP Cross Reference of Phabricator

title

Body

[close]

/src/applications/repository/graphcache/ -> PhabricatorRepositoryGraphCache.php (source)

   1  <?php
   2  
   3  /**
   4   * Given a commit and a path, efficiently determine the most recent ancestor
   5   * commit where the path was touched.
   6   *
   7   * In Git and Mercurial, log operations with a path are relatively slow. For
   8   * example:
   9   *
  10   *    git log -n1 <commit> -- <path>
  11   *
  12   * ...routinely takes several hundred milliseconds, and equivalent requests
  13   * often take longer in Mercurial.
  14   *
  15   * Unfortunately, this operation is fundamental to rendering a repository for
  16   * the web, and essentially everything else that's slow can be reduced to this
  17   * plus some trivial work afterward. Making this fast is desirable and powerful,
  18   * and allows us to make other things fast by expressing them in terms of this
  19   * query.
  20   *
  21   * Because the query is fundamentally a graph query, it isn't easy to express
  22   * in a reasonable way in MySQL, and we can't do round trips to the server to
  23   * walk the graph without incurring huge performance penalties.
  24   *
  25   * However, the total amount of data in the graph is relatively small. By
  26   * caching it in chunks and keeping it in APC, we can reasonably load and walk
  27   * the graph in PHP quickly.
  28   *
  29   * For more context, see T2683.
  30   *
  31   * Structure of the Cache
  32   * ======================
  33   *
  34   * The cache divides commits into buckets (see @{method:getBucketSize}). To
  35   * walk the graph, we pull a commit's bucket. The bucket is a map from commit
  36   * IDs to a list of parents and changed paths, separated by `null`. For
  37   * example, a bucket might look like this:
  38   *
  39   *   array(
  40   *     1 => array(0, null, 17, 18),
  41   *     2 => array(1, null, 4),
  42   *     // ...
  43   *   )
  44   *
  45   * This means that commit ID 1 has parent commit 0 (a special value meaning
  46   * no parents) and affected path IDs 17 and 18. Commit ID 2 has parent commit 1,
  47   * and affected path 4.
  48   *
  49   * This data structure attempts to balance compactness, ease of construction,
  50   * simplicity of cache semantics, and lookup performance. In the average case,
  51   * it appears to do a reasonable job at this.
  52   *
  53   * @task query Querying the Graph Cache
  54   * @task cache Cache Internals
  55   */
  56  final class PhabricatorRepositoryGraphCache {
  57  
  58    private $rebuiltKeys = array();
  59  
  60  
  61  /* -(  Querying the Graph Cache  )------------------------------------------- */
  62  
  63  
  64    /**
  65     * Search the graph cache for the most modification to a path.
  66     *
  67     * @param int     The commit ID to search ancestors of.
  68     * @param int     The path ID to search for changes to.
  69     * @param float   Maximum number of seconds to spend trying to satisfy this
  70     *                query using the graph cache. By default, `0.5` (500ms).
  71     * @return mixed  Commit ID, or `null` if no ancestors exist, or `false` if
  72     *                the graph cache was unable to determine the answer.
  73     * @task query
  74     */
  75    public function loadLastModifiedCommitID($commit_id, $path_id, $time = 0.5) {
  76      $commit_id = (int)$commit_id;
  77      $path_id = (int)$path_id;
  78  
  79      $bucket_data = null;
  80      $data_key = null;
  81      $seen = array();
  82  
  83      $t_start = microtime(true);
  84      $iterations = 0;
  85      while (true) {
  86        $bucket_key = $this->getBucketKey($commit_id);
  87  
  88        if (($data_key != $bucket_key) || $bucket_data === null) {
  89          $bucket_data = $this->getBucketData($bucket_key);
  90          $data_key = $bucket_key;
  91        }
  92  
  93        if (empty($bucket_data[$commit_id])) {
  94          // Rebuild the cache bucket, since the commit might be a very recent
  95          // one that we'll pick up by rebuilding.
  96  
  97          $bucket_data = $this->getBucketData($bucket_key, $bucket_data);
  98          if (empty($bucket_data[$commit_id])) {
  99            // A rebuild didn't help. This can occur legitimately if the commit
 100            // is new and hasn't parsed yet.
 101            return false;
 102          }
 103  
 104          // Otherwise, the rebuild gave us the data, so we can keep going.
 105        }
 106  
 107        // Sanity check so we can survive and recover from bad data.
 108        if (isset($seen[$commit_id])) {
 109          phlog(pht('Unexpected infinite loop in RepositoryGraphCache!'));
 110          return false;
 111        } else {
 112          $seen[$commit_id] = true;
 113        }
 114  
 115        // `$data` is a list: the commit's parent IDs, followed by `null`,
 116        // followed by the modified paths in ascending order. We figure out the
 117        // first parent first, then check if the path was touched. If the path
 118        // was touched, this is the commit we're after. If not, walk backward
 119        // in the tree.
 120  
 121        $items = $bucket_data[$commit_id];
 122        $size = count($items);
 123  
 124        // Walk past the parent information.
 125        $parent_id = null;
 126        for ($ii = 0;; ++$ii) {
 127          if ($items[$ii] === null) {
 128            break;
 129          }
 130          if ($parent_id === null) {
 131            $parent_id = $items[$ii];
 132          }
 133        }
 134  
 135        // Look for a modification to the path.
 136        for (; $ii < $size; ++$ii) {
 137          $item = $items[$ii];
 138          if ($item > $path_id) {
 139            break;
 140          }
 141          if ($item === $path_id) {
 142            return $commit_id;
 143          }
 144        }
 145  
 146        if ($parent_id) {
 147          $commit_id = $parent_id;
 148  
 149          // Periodically check if we've spent too long looking for a result
 150          // in the cache, and return so we can fall back to a VCS operation. This
 151          // keeps us from having a degenerate worst case if, e.g., the cache
 152          // is cold and we need to inspect a very large number of blocks
 153          // to satisfy the query.
 154  
 155          if (((++$iterations) % 64) === 0) {
 156            $t_end = microtime(true);
 157            if (($t_end - $t_start) > $time) {
 158              return false;
 159            }
 160          }
 161          continue;
 162        }
 163  
 164        // If we have an explicit 0, that means this commit really has no parents.
 165        // Usually, it is the first commit in the repository.
 166        if ($parent_id === 0) {
 167          return null;
 168        }
 169  
 170        // If we didn't find a parent, the parent data isn't available. We fail
 171        // to find an answer in the cache and fall back to querying the VCS.
 172        return false;
 173      }
 174    }
 175  
 176  
 177  /* -(  Cache Internals  )---------------------------------------------------- */
 178  
 179  
 180    /**
 181     * Get the bucket key for a given commit ID.
 182     *
 183     * @param   int   Commit ID.
 184     * @return  int   Bucket key.
 185     * @task cache
 186     */
 187    private function getBucketKey($commit_id) {
 188      return (int)floor($commit_id / $this->getBucketSize());
 189    }
 190  
 191  
 192    /**
 193     * Get the cache key for a given bucket key (from @{method:getBucketKey}).
 194     *
 195     * @param   int     Bucket key.
 196     * @return  string  Cache key.
 197     * @task cache
 198     */
 199    private function getBucketCacheKey($bucket_key) {
 200      static $prefix;
 201  
 202      if ($prefix === null) {
 203        $self = get_class($this);
 204        $size = $this->getBucketSize();
 205        $prefix = "{$self}:{$size}:2:";
 206      }
 207  
 208      return $prefix.$bucket_key;
 209    }
 210  
 211  
 212    /**
 213     * Get the number of items per bucket.
 214     *
 215     * @return  int Number of items to store per bucket.
 216     * @task cache
 217     */
 218    private function getBucketSize() {
 219      return 4096;
 220    }
 221  
 222  
 223    /**
 224     * Retrieve or build a graph cache bucket from the cache.
 225     *
 226     * Normally, this operates as a readthrough cache call. It can also be used
 227     * to force a cache update by passing the existing data to `$rebuild_data`.
 228     *
 229     * @param   int     Bucket key, from @{method:getBucketKey}.
 230     * @param   mixed   Current data, to force a cache rebuild of this bucket.
 231     * @return  array   Data from the cache.
 232     * @task cache
 233     */
 234    private function getBucketData($bucket_key, $rebuild_data = null) {
 235      $cache_key = $this->getBucketCacheKey($bucket_key);
 236  
 237      // TODO: This cache stuff could be handled more gracefully, but the
 238      // database cache currently requires values to be strings and needs
 239      // some tweaking to support this as part of a stack. Our cache semantics
 240      // here are also unusual (not purely readthrough) because this cache is
 241      // appendable.
 242  
 243      $cache_level1 = PhabricatorCaches::getRepositoryGraphL1Cache();
 244      $cache_level2 = PhabricatorCaches::getRepositoryGraphL2Cache();
 245      if ($rebuild_data === null) {
 246        $bucket_data = $cache_level1->getKey($cache_key);
 247        if ($bucket_data) {
 248          return $bucket_data;
 249        }
 250  
 251        $bucket_data = $cache_level2->getKey($cache_key);
 252        if ($bucket_data) {
 253          $unserialized = @unserialize($bucket_data);
 254          if ($unserialized) {
 255            // Fill APC if we got a database hit but missed in APC.
 256            $cache_level1->setKey($cache_key, $unserialized);
 257            return $unserialized;
 258          }
 259        }
 260      }
 261  
 262      if (!is_array($rebuild_data)) {
 263        $rebuild_data = array();
 264      }
 265  
 266      $bucket_data = $this->rebuildBucket($bucket_key, $rebuild_data);
 267  
 268      // Don't bother writing the data if we didn't update anything.
 269      if ($bucket_data !== $rebuild_data) {
 270        $cache_level2->setKey($cache_key, serialize($bucket_data));
 271        $cache_level1->setKey($cache_key, $bucket_data);
 272      }
 273  
 274      return $bucket_data;
 275    }
 276  
 277  
 278    /**
 279     * Rebuild a cache bucket, amending existing data if available.
 280     *
 281     * @param   int     Bucket key, from @{method:getBucketKey}.
 282     * @param   array   Existing bucket data.
 283     * @return  array   Rebuilt bucket data.
 284     * @task cache
 285     */
 286    private function rebuildBucket($bucket_key, array $current_data) {
 287  
 288      // First, check if we've already rebuilt this bucket. In some cases (like
 289      // browsing a repository at some commit) it's common to issue many lookups
 290      // against one commit. If that commit has been discovered but not yet
 291      // fully imported, we'll repeatedly attempt to rebuild the bucket. If the
 292      // first rebuild did not work, subsequent rebuilds are very unlikely to
 293      // have any effect. We can just skip the rebuild in these cases.
 294  
 295      if (isset($this->rebuiltKeys[$bucket_key])) {
 296        return $current_data;
 297      } else {
 298        $this->rebuiltKeys[$bucket_key] = true;
 299      }
 300  
 301      $bucket_min = ($bucket_key * $this->getBucketSize());
 302      $bucket_max = ($bucket_min + $this->getBucketSize()) - 1;
 303  
 304      // We need to reload all of the commits in the bucket because there is
 305      // no guarantee that they'll get parsed in order, so we can fill large
 306      // commit IDs before small ones. Later on, we'll ignore the commits we
 307      // already know about.
 308  
 309      $table_commit = new PhabricatorRepositoryCommit();
 310      $table_repository = new PhabricatorRepository();
 311      $conn_r = $table_commit->establishConnection('r');
 312  
 313      // Find all the Git and Mercurial commits in the block which have completed
 314      // change import. We can't fill the cache accurately for commits which have
 315      // not completed change import, so just pretend we don't know about them.
 316      // In these cases, we will will ultimately fall back to VCS queries.
 317  
 318      $commit_rows = queryfx_all(
 319        $conn_r,
 320        'SELECT c.id FROM %T c
 321          JOIN %T r ON c.repositoryID = r.id AND r.versionControlSystem IN (%Ls)
 322          WHERE c.id BETWEEN %d AND %d
 323            AND (c.importStatus & %d) = %d',
 324        $table_commit->getTableName(),
 325        $table_repository->getTableName(),
 326        array(
 327          PhabricatorRepositoryType::REPOSITORY_TYPE_GIT,
 328          PhabricatorRepositoryType::REPOSITORY_TYPE_MERCURIAL,
 329        ),
 330        $bucket_min,
 331        $bucket_max,
 332        PhabricatorRepositoryCommit::IMPORTED_CHANGE,
 333        PhabricatorRepositoryCommit::IMPORTED_CHANGE);
 334  
 335      // If we don't have any data, just return the existing data.
 336      if (!$commit_rows) {
 337        return $current_data;
 338      }
 339  
 340      // Remove the commits we already have data for. We don't need to rebuild
 341      // these. If there's nothing left, return the existing data.
 342  
 343      $commit_ids = ipull($commit_rows, 'id', 'id');
 344      $commit_ids = array_diff_key($commit_ids, $current_data);
 345  
 346      if (!$commit_ids) {
 347        return $current_data;
 348      }
 349  
 350      // Find all the path changes for the new commits.
 351      $path_changes = queryfx_all(
 352        $conn_r,
 353        'SELECT commitID, pathID FROM %T
 354          WHERE commitID IN (%Ld)
 355          AND (isDirect = 1 OR changeType = %d)',
 356        PhabricatorRepository::TABLE_PATHCHANGE,
 357        $commit_ids,
 358        DifferentialChangeType::TYPE_CHILD);
 359      $path_changes = igroup($path_changes, 'commitID');
 360  
 361      // Find all the parents for the new commits.
 362      $parents = queryfx_all(
 363        $conn_r,
 364        'SELECT childCommitID, parentCommitID FROM %T
 365          WHERE childCommitID IN (%Ld)
 366          ORDER BY id ASC',
 367        PhabricatorRepository::TABLE_PARENTS,
 368        $commit_ids);
 369      $parents = igroup($parents, 'childCommitID');
 370  
 371      // Build the actual data for the cache.
 372      foreach ($commit_ids as $commit_id) {
 373        $parent_ids = array();
 374        if (!empty($parents[$commit_id])) {
 375          foreach ($parents[$commit_id] as $row) {
 376            $parent_ids[] = (int)$row['parentCommitID'];
 377          }
 378        } else {
 379          // We expect all rows to have parents (commits with no parents get
 380          // an explicit "0" placeholder). If we're in an older repository, the
 381          // parent information might not have been populated yet. Decline to fill
 382          // the cache if we don't have the parent information, since the fill
 383          // will be incorrect.
 384          continue;
 385        }
 386  
 387        if (isset($path_changes[$commit_id])) {
 388          $path_ids = $path_changes[$commit_id];
 389          foreach ($path_ids as $key => $path_id) {
 390            $path_ids[$key] = (int)$path_id['pathID'];
 391          }
 392          sort($path_ids);
 393        } else {
 394          $path_ids = array();
 395        }
 396  
 397        $value = $parent_ids;
 398        $value[] = null;
 399        foreach ($path_ids as $path_id) {
 400          $value[] = $path_id;
 401        }
 402  
 403        $current_data[$commit_id] = $value;
 404      }
 405  
 406      return $current_data;
 407    }
 408  
 409  }


Generated: Sun Nov 30 09:20:46 2014 Cross-referenced by PHPXref 0.7.1