Linux Kernel  3.7.1
ialloc.c
/*
 *  linux/fs/ext4/ialloc.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card ([email protected])
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  BSD ufs-inspired inode and directory allocation by
 *  Stephen Tweedie ([email protected]), 1993
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S. Miller ([email protected]), 1995
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include <linux/random.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <asm/byteorder.h>

#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"

#include <trace/events/ext4.h>

/*
 * ialloc.c contains the inode allocation and deallocation routines
 */

/*
 * The free inodes are managed by bitmaps.  A file system contains several
 * block groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
 * block for inodes, N blocks for the inode table and data blocks.
 *
 * The file system contains group descriptors which are located after the
 * super block.  Each descriptor contains the number of the bitmap block and
 * the free blocks count in the block.
 */
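
/*
 * Illustrative layout arithmetic (hypothetical numbers, not read from any
 * superblock): with a 4 KiB block size, 8192 inodes per group and 256-byte
 * on-disk inodes, a group needs one block bitmap block, one inode bitmap
 * block and 8192 * 256 / 4096 = 512 inode table blocks; the remaining
 * blocks of the group hold data.
 */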

/*
 * To avoid calling the atomic setbit hundreds or thousands of times, we only
 * need to use it within a single byte (to ensure we get endianness right).
 * We can use memset for the rest of the bitmap as there are no other users.
 */
void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
{
        int i;

        if (start_bit >= end_bit)
                return;

        ext4_debug("mark end bits +%d through +%d used\n", start_bit, end_bit);
        for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++)
                ext4_set_bit(i, bitmap);
        if (i < end_bit)
                memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
}
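
/*
 * Worked example (illustrative values only): for start_bit = 3 and
 * end_bit = 32, the loop above runs i = 3..7, setting the tail of byte 0
 * with atomic ext4_set_bit(); the remaining (32 - 8) >> 3 = 3 whole bytes
 * (bytes 1..3) are then filled with a single memset(0xff).
 */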

/* Initializes an uninitialized inode bitmap */
static unsigned ext4_init_inode_bitmap(struct super_block *sb,
                                       struct buffer_head *bh,
                                       ext4_group_t block_group,
                                       struct ext4_group_desc *gdp)
{
        J_ASSERT_BH(bh, buffer_locked(bh));

        /* If checksum is bad mark all blocks and inodes in use to prevent
         * allocation, essentially implementing a per-group read-only flag. */
        if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
                ext4_error(sb, "Checksum bad for group %u", block_group);
                ext4_free_group_clusters_set(sb, gdp, 0);
                ext4_free_inodes_set(sb, gdp, 0);
                ext4_itable_unused_set(sb, gdp, 0);
                memset(bh->b_data, 0xff, sb->s_blocksize);
                ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh,
                                           EXT4_INODES_PER_GROUP(sb) / 8);
                return 0;
        }

        memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
        ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
                             bh->b_data);
        ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh,
                                   EXT4_INODES_PER_GROUP(sb) / 8);
        ext4_group_desc_csum_set(sb, block_group, gdp);

        return EXT4_INODES_PER_GROUP(sb);
}

void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate)
{
        if (uptodate) {
                set_buffer_uptodate(bh);
                set_bitmap_uptodate(bh);
        }
        unlock_buffer(bh);
        put_bh(bh);
}

/*
 * Read the inode allocation bitmap for a given block_group, reading
 * into the specified slot in the superblock's bitmap cache.
 *
 * Return buffer_head of bitmap on success or NULL.
 */
static struct buffer_head *
ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
{
        struct ext4_group_desc *desc;
        struct buffer_head *bh = NULL;
        ext4_fsblk_t bitmap_blk;

        desc = ext4_get_group_desc(sb, block_group, NULL);
        if (!desc)
                return NULL;

        bitmap_blk = ext4_inode_bitmap(sb, desc);
        bh = sb_getblk(sb, bitmap_blk);
        if (unlikely(!bh)) {
                ext4_error(sb, "Cannot read inode bitmap - "
                           "block_group = %u, inode_bitmap = %llu",
                           block_group, bitmap_blk);
                return NULL;
        }
        if (bitmap_uptodate(bh))
                goto verify;

        lock_buffer(bh);
        if (bitmap_uptodate(bh)) {
                unlock_buffer(bh);
                goto verify;
        }

        ext4_lock_group(sb, block_group);
        if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
                ext4_init_inode_bitmap(sb, bh, block_group, desc);
                set_bitmap_uptodate(bh);
                set_buffer_uptodate(bh);
                set_buffer_verified(bh);
                ext4_unlock_group(sb, block_group);
                unlock_buffer(bh);
                return bh;
        }
        ext4_unlock_group(sb, block_group);

        if (buffer_uptodate(bh)) {
                /*
                 * if the group is not uninit and bh is uptodate,
                 * the bitmap is also uptodate
                 */
                set_bitmap_uptodate(bh);
                unlock_buffer(bh);
                goto verify;
        }
        /*
         * submit the buffer_head for reading
         */
        trace_ext4_load_inode_bitmap(sb, block_group);
        bh->b_end_io = ext4_end_bitmap_read;
        get_bh(bh);
        submit_bh(READ, bh);
        wait_on_buffer(bh);
        if (!buffer_uptodate(bh)) {
                put_bh(bh);
                ext4_error(sb, "Cannot read inode bitmap - "
                           "block_group = %u, inode_bitmap = %llu",
                           block_group, bitmap_blk);
                return NULL;
        }

verify:
        ext4_lock_group(sb, block_group);
        if (!buffer_verified(bh) &&
            !ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh,
                                           EXT4_INODES_PER_GROUP(sb) / 8)) {
                ext4_unlock_group(sb, block_group);
                put_bh(bh);
                ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
                           "inode_bitmap = %llu", block_group, bitmap_blk);
                return NULL;
        }
        ext4_unlock_group(sb, block_group);
        set_buffer_verified(bh);
        return bh;
}

/*
 * NOTE! When we get the inode, we're the only people
 * that have access to it, and as such there are no
 * race conditions we have to worry about. The inode
 * is not on the hash-lists, and it cannot be reached
 * through the filesystem because the directory entry
 * has been deleted earlier.
 *
 * HOWEVER: we must make sure that we get no aliases,
 * which means that we have to call "clear_inode()"
 * _before_ we mark the inode not in use in the inode
 * bitmaps. Otherwise a newly created file might use
 * the same inode number (not actually the same pointer
 * though), and then we'd have two inodes sharing the
 * same inode number and space on the harddisk.
 */
void ext4_free_inode(handle_t *handle, struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        int is_directory;
        unsigned long ino;
        struct buffer_head *bitmap_bh = NULL;
        struct buffer_head *bh2;
        ext4_group_t block_group;
        unsigned long bit;
        struct ext4_group_desc *gdp;
        struct ext4_super_block *es;
        struct ext4_sb_info *sbi;
        int fatal = 0, err, count, cleared;

        if (!sb) {
                printk(KERN_ERR "EXT4-fs: %s:%d: inode on "
                       "nonexistent device\n", __func__, __LINE__);
                return;
        }
        if (atomic_read(&inode->i_count) > 1) {
                ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d",
                         __func__, __LINE__, inode->i_ino,
                         atomic_read(&inode->i_count));
                return;
        }
        if (inode->i_nlink) {
                ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n",
                         __func__, __LINE__, inode->i_ino, inode->i_nlink);
                return;
        }
        sbi = EXT4_SB(sb);

        ino = inode->i_ino;
        ext4_debug("freeing inode %lu\n", ino);
        trace_ext4_free_inode(inode);

        /*
         * Note: we must free any quota before locking the superblock,
         * as writing the quota to disk may need the lock as well.
         */
        dquot_initialize(inode);
        ext4_xattr_delete_inode(handle, inode);
        dquot_free_inode(inode);
        dquot_drop(inode);

        is_directory = S_ISDIR(inode->i_mode);

        /* Do this BEFORE marking the inode not in use or returning an error */
        ext4_clear_inode(inode);

        es = EXT4_SB(sb)->s_es;
        if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
                ext4_error(sb, "reserved or nonexistent inode %lu", ino);
                goto error_return;
        }
        block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
        bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
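        /*
         * Example mapping (hypothetical numbers): with 8192 inodes per
         * group, freeing ino 16485 gives block_group = (16485 - 1) / 8192
         * = 2 and bit = (16485 - 1) % 8192 = 100, since on-disk inode
         * numbers are 1-based but bitmap bits are 0-based.
         */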
        bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
        if (!bitmap_bh)
                goto error_return;

        BUFFER_TRACE(bitmap_bh, "get_write_access");
        fatal = ext4_journal_get_write_access(handle, bitmap_bh);
        if (fatal)
                goto error_return;

        fatal = -ESRCH;
        gdp = ext4_get_group_desc(sb, block_group, &bh2);
        if (gdp) {
                BUFFER_TRACE(bh2, "get_write_access");
                fatal = ext4_journal_get_write_access(handle, bh2);
        }
        ext4_lock_group(sb, block_group);
        cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data);
        if (fatal || !cleared) {
                ext4_unlock_group(sb, block_group);
                goto out;
        }

        count = ext4_free_inodes_count(sb, gdp) + 1;
        ext4_free_inodes_set(sb, gdp, count);
        if (is_directory) {
                count = ext4_used_dirs_count(sb, gdp) - 1;
                ext4_used_dirs_set(sb, gdp, count);
                percpu_counter_dec(&sbi->s_dirs_counter);
        }
        ext4_inode_bitmap_csum_set(sb, block_group, gdp, bitmap_bh,
                                   EXT4_INODES_PER_GROUP(sb) / 8);
        ext4_group_desc_csum_set(sb, block_group, gdp);
        ext4_unlock_group(sb, block_group);

        percpu_counter_inc(&sbi->s_freeinodes_counter);
        if (sbi->s_log_groups_per_flex) {
                ext4_group_t f = ext4_flex_group(sbi, block_group);

                atomic_inc(&sbi->s_flex_groups[f].free_inodes);
                if (is_directory)
                        atomic_dec(&sbi->s_flex_groups[f].used_dirs);
        }
        BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
        fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
out:
        if (cleared) {
                BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
                err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
                if (!fatal)
                        fatal = err;
        } else
                ext4_error(sb, "bit already cleared for inode %lu", ino);

error_return:
        brelse(bitmap_bh);
        ext4_std_error(sb, fatal);
}

struct orlov_stats {
        __u32 free_inodes;
        __u32 free_clusters;
        __u32 used_dirs;
};

/*
 * Helper function for Orlov's allocator; returns critical information
 * for a particular block group or flex_bg.  If flex_size is 1, then g
 * is a block group number; otherwise it is flex_bg number.
 */
static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
                            int flex_size, struct orlov_stats *stats)
{
        struct ext4_group_desc *desc;
        struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;

        if (flex_size > 1) {
                stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
                stats->free_clusters = atomic_read(&flex_group[g].free_clusters);
                stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
                return;
        }

        desc = ext4_get_group_desc(sb, g, NULL);
        if (desc) {
                stats->free_inodes = ext4_free_inodes_count(sb, desc);
                stats->free_clusters = ext4_free_group_clusters(sb, desc);
                stats->used_dirs = ext4_used_dirs_count(sb, desc);
        } else {
                stats->free_inodes = 0;
                stats->free_clusters = 0;
                stats->used_dirs = 0;
        }
}

/*
 * Orlov's allocator for directories.
 *
 * We always try to spread first-level directories.
 *
 * If there are blockgroups with both free inodes and free blocks counts
 * not worse than average we return one with smallest directory count.
 * Otherwise we simply return a random group.
 *
 * For other directories, the rules are as follows:
 *
 * It's OK to put a directory into a group unless
 * it has too many directories already (max_dirs) or
 * it has too few free inodes left (min_inodes) or
 * it has too few free blocks left (min_blocks).
 * The parent's group is preferred; if it doesn't satisfy these
 * conditions we search cyclically through the rest.  If none
 * of the groups look good we just look for a group with more
 * free inodes than average (starting at parent's group).
 */
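
/*
 * Illustrative thresholds (hypothetical numbers, not taken from a real
 * filesystem): with flex_size = 4, inodes_per_group = 8192, avefreei = 5000,
 * ndirs = 1000 and ngroups = 100, the function below computes
 * max_dirs = 1000 / 100 + 8192 / 16 = 522 and
 * min_inodes = 5000 - 8192 * 4 / 4 = -3192, which is clamped to 1, so in
 * this setup only the directory-count and free-cluster limits bite.
 */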

static int find_group_orlov(struct super_block *sb, struct inode *parent,
                            ext4_group_t *group, umode_t mode,
                            const struct qstr *qstr)
{
        ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_group_t real_ngroups = ext4_get_groups_count(sb);
        int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
        unsigned int freei, avefreei, grp_free;
        ext4_fsblk_t freeb, avefreec;
        unsigned int ndirs;
        int max_dirs, min_inodes;
        ext4_grpblk_t min_clusters;
        ext4_group_t i, grp, g, ngroups;
        struct ext4_group_desc *desc;
        struct orlov_stats stats;
        int flex_size = ext4_flex_bg_size(sbi);
        struct dx_hash_info hinfo;

        ngroups = real_ngroups;
        if (flex_size > 1) {
                ngroups = (real_ngroups + flex_size - 1) >>
                        sbi->s_log_groups_per_flex;
                parent_group >>= sbi->s_log_groups_per_flex;
        }

        freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
        avefreei = freei / ngroups;
        freeb = EXT4_C2B(sbi,
                percpu_counter_read_positive(&sbi->s_freeclusters_counter));
        avefreec = freeb;
        do_div(avefreec, ngroups);
        ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);

        if (S_ISDIR(mode) &&
            ((parent == sb->s_root->d_inode) ||
             (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
                int best_ndir = inodes_per_group;
                int ret = -1;

                if (qstr) {
                        hinfo.hash_version = DX_HASH_HALF_MD4;
                        hinfo.seed = sbi->s_hash_seed;
                        ext4fs_dirhash(qstr->name, qstr->len, &hinfo);
                        grp = hinfo.hash;
                } else
                        get_random_bytes(&grp, sizeof(grp));
                parent_group = (unsigned)grp % ngroups;
                for (i = 0; i < ngroups; i++) {
                        g = (parent_group + i) % ngroups;
                        get_orlov_stats(sb, g, flex_size, &stats);
                        if (!stats.free_inodes)
                                continue;
                        if (stats.used_dirs >= best_ndir)
                                continue;
                        if (stats.free_inodes < avefreei)
                                continue;
                        if (stats.free_clusters < avefreec)
                                continue;
                        grp = g;
                        ret = 0;
                        best_ndir = stats.used_dirs;
                }
                if (ret)
                        goto fallback;
        found_flex_bg:
                if (flex_size == 1) {
                        *group = grp;
                        return 0;
                }

                /*
                 * We pack inodes at the beginning of the flexgroup's
                 * inode tables.  Block allocation decisions will do
                 * something similar, although regular files will
                 * start at 2nd block group of the flexgroup.  See
                 * ext4_ext_find_goal() and ext4_find_near().
                 */
                grp *= flex_size;
                for (i = 0; i < flex_size; i++) {
                        if (grp+i >= real_ngroups)
                                break;
                        desc = ext4_get_group_desc(sb, grp+i, NULL);
                        if (desc && ext4_free_inodes_count(sb, desc)) {
                                *group = grp+i;
                                return 0;
                        }
                }
                goto fallback;
        }

        max_dirs = ndirs / ngroups + inodes_per_group / 16;
        min_inodes = avefreei - inodes_per_group*flex_size / 4;
        if (min_inodes < 1)
                min_inodes = 1;
        min_clusters = avefreec - EXT4_CLUSTERS_PER_GROUP(sb)*flex_size / 4;

        /*
         * Start looking in the flex group where we last allocated an
         * inode for this parent directory
         */
        if (EXT4_I(parent)->i_last_alloc_group != ~0) {
                parent_group = EXT4_I(parent)->i_last_alloc_group;
                if (flex_size > 1)
                        parent_group >>= sbi->s_log_groups_per_flex;
        }

        for (i = 0; i < ngroups; i++) {
                grp = (parent_group + i) % ngroups;
                get_orlov_stats(sb, grp, flex_size, &stats);
                if (stats.used_dirs >= max_dirs)
                        continue;
                if (stats.free_inodes < min_inodes)
                        continue;
                if (stats.free_clusters < min_clusters)
                        continue;
                goto found_flex_bg;
        }

fallback:
        ngroups = real_ngroups;
        avefreei = freei / ngroups;
fallback_retry:
        parent_group = EXT4_I(parent)->i_block_group;
        for (i = 0; i < ngroups; i++) {
                grp = (parent_group + i) % ngroups;
                desc = ext4_get_group_desc(sb, grp, NULL);
                if (desc) {
                        grp_free = ext4_free_inodes_count(sb, desc);
                        if (grp_free && grp_free >= avefreei) {
                                *group = grp;
                                return 0;
                        }
                }
        }

        if (avefreei) {
                /*
                 * The free-inodes counter is approximate, and for really small
                 * filesystems the above test can fail to find any blockgroups
                 */
                avefreei = 0;
                goto fallback_retry;
        }

        return -1;
}

static int find_group_other(struct super_block *sb, struct inode *parent,
                            ext4_group_t *group, umode_t mode)
{
        ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
        ext4_group_t i, last, ngroups = ext4_get_groups_count(sb);
        struct ext4_group_desc *desc;
        int flex_size = ext4_flex_bg_size(EXT4_SB(sb));

        /*
         * Try to place the inode in the same flex group as its
         * parent.  If we can't find space, use the Orlov algorithm to
         * find another flex group, and store that information in the
         * parent directory's inode information so that future
         * allocations use that flex group.
         */
        if (flex_size > 1) {
                int retry = 0;

        try_again:
                parent_group &= ~(flex_size-1);
                last = parent_group + flex_size;
                if (last > ngroups)
                        last = ngroups;
                for (i = parent_group; i < last; i++) {
                        desc = ext4_get_group_desc(sb, i, NULL);
                        if (desc && ext4_free_inodes_count(sb, desc)) {
                                *group = i;
                                return 0;
                        }
                }
                if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
                        retry = 1;
                        parent_group = EXT4_I(parent)->i_last_alloc_group;
                        goto try_again;
                }
                /*
                 * If this didn't work, use the Orlov search algorithm
                 * to find a new flex group; we pass in the mode to
                 * avoid the topdir algorithms.
                 */
                *group = parent_group + flex_size;
                if (*group > ngroups)
                        *group = 0;
                return find_group_orlov(sb, parent, group, mode, NULL);
        }

        /*
         * Try to place the inode in its parent directory
         */
        *group = parent_group;
        desc = ext4_get_group_desc(sb, *group, NULL);
        if (desc && ext4_free_inodes_count(sb, desc) &&
            ext4_free_group_clusters(sb, desc))
                return 0;

        /*
         * We're going to place this inode in a different blockgroup from its
         * parent.  We want to cause files in a common directory to all land in
         * the same blockgroup.  But we want files which are in a different
         * directory which shares a blockgroup with our parent to land in a
         * different blockgroup.
         *
         * So add our directory's i_ino into the starting point for the hash.
         */
        *group = (*group + parent->i_ino) % ngroups;

        /*
         * Use a quadratic hash to find a group with a free inode and some free
         * blocks.
         */
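        /*
         * Probe sequence sketch (illustrative): starting from group g, the
         * doubling step below visits g+1, g+3, g+7, g+15, ... (mod ngroups),
         * i.e. cumulative offsets of 2^k - 1, spreading retries quickly
         * across the disk.
         */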
        for (i = 1; i < ngroups; i <<= 1) {
                *group += i;
                if (*group >= ngroups)
                        *group -= ngroups;
                desc = ext4_get_group_desc(sb, *group, NULL);
                if (desc && ext4_free_inodes_count(sb, desc) &&
                    ext4_free_group_clusters(sb, desc))
                        return 0;
        }

        /*
         * That failed: try linear search for a free inode, even if that group
         * has no free blocks.
         */
        *group = parent_group;
        for (i = 0; i < ngroups; i++) {
                if (++*group >= ngroups)
                        *group = 0;
                desc = ext4_get_group_desc(sb, *group, NULL);
                if (desc && ext4_free_inodes_count(sb, desc))
                        return 0;
        }

        return -1;
}

/*
 * There are two policies for allocating an inode.  If the new inode is
 * a directory, then a forward search is made for a block group with both
 * free space and a low directory-to-inode ratio; if that fails, then of
 * the groups with above-average free space, that group with the fewest
 * directories already is chosen.
 *
 * For other inodes, search forward from the parent directory's block
 * group to find a free inode.
 */
struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, umode_t mode,
                             const struct qstr *qstr, __u32 goal, uid_t *owner)
{
        struct super_block *sb;
        struct buffer_head *inode_bitmap_bh = NULL;
        struct buffer_head *group_desc_bh;
        ext4_group_t ngroups, group = 0;
        unsigned long ino = 0;
        struct inode *inode;
        struct ext4_group_desc *gdp = NULL;
        struct ext4_inode_info *ei;
        struct ext4_sb_info *sbi;
        int ret2, err = 0;
        struct inode *ret;
        ext4_group_t i;
        ext4_group_t flex_group;

        /* Cannot create files in a deleted directory */
        if (!dir || !dir->i_nlink)
                return ERR_PTR(-EPERM);

        sb = dir->i_sb;
        ngroups = ext4_get_groups_count(sb);
        trace_ext4_request_inode(dir, mode);
        inode = new_inode(sb);
        if (!inode)
                return ERR_PTR(-ENOMEM);
        ei = EXT4_I(inode);
        sbi = EXT4_SB(sb);

        if (!goal)
                goal = sbi->s_inode_goal;

        if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) {
                group = (goal - 1) / EXT4_INODES_PER_GROUP(sb);
                ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb);
                ret2 = 0;
                goto got_group;
        }

        if (S_ISDIR(mode))
                ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
        else
                ret2 = find_group_other(sb, dir, &group, mode);

got_group:
        EXT4_I(dir)->i_last_alloc_group = group;
        err = -ENOSPC;
        if (ret2 == -1)
                goto out;

        /*
         * Normally we will only go through one pass of this loop,
         * unless we get unlucky and it turns out the group we selected
         * had its last inode grabbed by someone else.
         */
        for (i = 0; i < ngroups; i++, ino = 0) {
                err = -EIO;

                gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
                if (!gdp)
                        goto fail;

                /*
                 * Check free inodes count before loading bitmap.
                 */
                if (ext4_free_inodes_count(sb, gdp) == 0) {
                        if (++group == ngroups)
                                group = 0;
                        continue;
                }

                brelse(inode_bitmap_bh);
                inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
                if (!inode_bitmap_bh)
                        goto fail;

repeat_in_this_group:
                ino = ext4_find_next_zero_bit((unsigned long *)
                                              inode_bitmap_bh->b_data,
                                              EXT4_INODES_PER_GROUP(sb), ino);
                if (ino >= EXT4_INODES_PER_GROUP(sb)) {
                        if (++group == ngroups)
                                group = 0;
                        continue;
                }
                if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) {
                        ext4_error(sb, "reserved inode found cleared - "
                                   "inode=%lu", ino + 1);
                        continue;
                }
                BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
                err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
                if (err)
                        goto fail;
                ext4_lock_group(sb, group);
                ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
                ext4_unlock_group(sb, group);
                ino++;          /* the inode bitmap is zero-based */
                if (!ret2)
                        goto got; /* we grabbed the inode! */
                if (ino < EXT4_INODES_PER_GROUP(sb))
                        goto repeat_in_this_group;
        }
        err = -ENOSPC;
        goto out;

got:
        BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
        err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
        if (err)
                goto fail;

        /* We may have to initialize the block bitmap if it isn't already */
        if (ext4_has_group_desc_csum(sb) &&
            gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
                struct buffer_head *block_bitmap_bh;

                block_bitmap_bh = ext4_read_block_bitmap(sb, group);
                BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
                err = ext4_journal_get_write_access(handle, block_bitmap_bh);
                if (err) {
                        brelse(block_bitmap_bh);
                        goto fail;
                }

                BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
                err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh);
                brelse(block_bitmap_bh);

                /* recheck and clear flag under lock if we still need to */
                ext4_lock_group(sb, group);
                if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
                        gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
                        ext4_free_group_clusters_set(sb, gdp,
                                ext4_free_clusters_after_init(sb, group, gdp));
                        ext4_block_bitmap_csum_set(sb, group, gdp,
                                                   block_bitmap_bh);
                        ext4_group_desc_csum_set(sb, group, gdp);
                }
                ext4_unlock_group(sb, group);

                if (err)
                        goto fail;
        }

        BUFFER_TRACE(group_desc_bh, "get_write_access");
        err = ext4_journal_get_write_access(handle, group_desc_bh);
        if (err)
                goto fail;

        /* Update the relevant bg descriptor fields */
        if (ext4_has_group_desc_csum(sb)) {
                int free;
                struct ext4_group_info *grp = ext4_get_group_info(sb, group);

                down_read(&grp->alloc_sem); /* protect vs itable lazyinit */
                ext4_lock_group(sb, group); /* while we modify the bg desc */
                free = EXT4_INODES_PER_GROUP(sb) -
                        ext4_itable_unused_count(sb, gdp);
                if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
                        gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
                        free = 0;
                }
                /*
                 * Check the relative inode number against the last used
                 * relative inode number in this group. if it is greater
                 * we need to update the bg_itable_unused count
                 */
                if (ino > free)
                        ext4_itable_unused_set(sb, gdp,
                                        (EXT4_INODES_PER_GROUP(sb) - ino));
                up_read(&grp->alloc_sem);
        } else {
                ext4_lock_group(sb, group);
        }

        ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
        if (S_ISDIR(mode)) {
                ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
                if (sbi->s_log_groups_per_flex) {
                        ext4_group_t f = ext4_flex_group(sbi, group);

                        atomic_inc(&sbi->s_flex_groups[f].used_dirs);
                }
        }
        if (ext4_has_group_desc_csum(sb)) {
                ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh,
                                           EXT4_INODES_PER_GROUP(sb) / 8);
                ext4_group_desc_csum_set(sb, group, gdp);
        }
        ext4_unlock_group(sb, group);

        BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
        err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
        if (err)
                goto fail;

        percpu_counter_dec(&sbi->s_freeinodes_counter);
        if (S_ISDIR(mode))
                percpu_counter_inc(&sbi->s_dirs_counter);

        if (sbi->s_log_groups_per_flex) {
                flex_group = ext4_flex_group(sbi, group);
                atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
        }
        if (owner) {
                inode->i_mode = mode;
                i_uid_write(inode, owner[0]);
                i_gid_write(inode, owner[1]);
        } else if (test_opt(sb, GRPID)) {
                inode->i_mode = mode;
                inode->i_uid = current_fsuid();
                inode->i_gid = dir->i_gid;
        } else
                inode_init_owner(inode, dir, mode);

        inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
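        /*
         * e.g. (illustrative values): bit 100 in group 2 with 8192 inodes
         * per group became ino = 101 above, so i_ino = 101 + 2 * 8192 =
         * 16485, the inverse of the mapping used in ext4_free_inode().
         */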
        /* This is the optimal IO size (for stat), not the fs block size */
        inode->i_blocks = 0;
        inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime =
                                                       ext4_current_time(inode);

        memset(ei->i_data, 0, sizeof(ei->i_data));
        ei->i_dir_start_lookup = 0;
        ei->i_disksize = 0;

        /* Don't inherit extent flag from directory, amongst others. */
        ei->i_flags =
                ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
        ei->i_file_acl = 0;
        ei->i_dtime = 0;
        ei->i_block_group = group;
        ei->i_last_alloc_group = ~0;

        ext4_set_inode_flags(inode);
        if (IS_DIRSYNC(inode))
                ext4_handle_sync(handle);
        if (insert_inode_locked(inode) < 0) {
                /*
                 * Likely a bitmap corruption causing inode to be allocated
                 * twice.
                 */
                err = -EIO;
                goto fail;
        }
        spin_lock(&sbi->s_next_gen_lock);
        inode->i_generation = sbi->s_next_generation++;
        spin_unlock(&sbi->s_next_gen_lock);

        /* Precompute checksum seed for inode metadata */
        if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
                        EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
                __u32 csum;
                struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
                __le32 inum = cpu_to_le32(inode->i_ino);
                __le32 gen = cpu_to_le32(inode->i_generation);
                csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
                                   sizeof(inum));
                ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
                                              sizeof(gen));
        }

        ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
        ext4_set_inode_state(inode, EXT4_STATE_NEW);

        ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;

        ret = inode;
        dquot_initialize(inode);
        err = dquot_alloc_inode(inode);
        if (err)
                goto fail_drop;

        err = ext4_init_acl(handle, inode, dir);
        if (err)
                goto fail_free_drop;

        err = ext4_init_security(handle, inode, dir, qstr);
        if (err)
                goto fail_free_drop;

        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
                /* set extent flag only for directory, file and normal symlink*/
                if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
                        ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
                        ext4_ext_tree_init(handle, inode);
                }
        }

        if (ext4_handle_valid(handle)) {
                ei->i_sync_tid = handle->h_transaction->t_tid;
                ei->i_datasync_tid = handle->h_transaction->t_tid;
        }

        err = ext4_mark_inode_dirty(handle, inode);
        if (err) {
                ext4_std_error(sb, err);
                goto fail_free_drop;
        }

        ext4_debug("allocating inode %lu\n", inode->i_ino);
        trace_ext4_allocate_inode(inode, dir, mode);
        goto really_out;
fail:
        ext4_std_error(sb, err);
out:
        iput(inode);
        ret = ERR_PTR(err);
really_out:
        brelse(inode_bitmap_bh);
        return ret;

fail_free_drop:
        dquot_free_inode(inode);

fail_drop:
        dquot_drop(inode);
        inode->i_flags |= S_NOQUOTA;
        clear_nlink(inode);
        unlock_new_inode(inode);
        iput(inode);
        brelse(inode_bitmap_bh);
        return ERR_PTR(err);
}

/* Verify that we are loading a valid orphan from disk */
struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
{
        unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
        ext4_group_t block_group;
        int bit;
        struct buffer_head *bitmap_bh;
        struct inode *inode = NULL;
        long err = -EIO;

        /* Error cases - e2fsck has already cleaned up for us */
        if (ino > max_ino) {
                ext4_warning(sb, "bad orphan ino %lu!  e2fsck was run?", ino);
                goto error;
        }

        block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
        bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
        bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
        if (!bitmap_bh) {
                ext4_warning(sb, "inode bitmap error for orphan %lu", ino);
                goto error;
        }

        /* Having the inode bit set should be a 100% indicator that this
         * is a valid orphan (no e2fsck run on fs).  Orphans also include
         * inodes that were being truncated, so we can't check i_nlink==0.
         */
        if (!ext4_test_bit(bit, bitmap_bh->b_data))
                goto bad_orphan;

        inode = ext4_iget(sb, ino);
        if (IS_ERR(inode))
                goto iget_failed;

        /*
         * If the orphan has i_nlink > 0 then it should be able to be
         * truncated, otherwise it won't be removed from the orphan list
         * during processing and an infinite loop will result.
         */
        if (inode->i_nlink && !ext4_can_truncate(inode))
                goto bad_orphan;

        if (NEXT_ORPHAN(inode) > max_ino)
                goto bad_orphan;
        brelse(bitmap_bh);
        return inode;

iget_failed:
        err = PTR_ERR(inode);
        inode = NULL;
bad_orphan:
        ext4_warning(sb, "bad orphan inode %lu!  e2fsck was run?", ino);
        printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n",
               bit, (unsigned long long)bitmap_bh->b_blocknr,
               ext4_test_bit(bit, bitmap_bh->b_data));
        printk(KERN_NOTICE "inode=%p\n", inode);
        if (inode) {
                printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
                       is_bad_inode(inode));
                printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
                       NEXT_ORPHAN(inode));
                printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
                printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
                /* Avoid freeing blocks if we got a bad deleted inode */
                if (inode->i_nlink == 0)
                        inode->i_blocks = 0;
                iput(inode);
        }
        brelse(bitmap_bh);
error:
        return ERR_PTR(err);
}

unsigned long ext4_count_free_inodes(struct super_block *sb)
{
        unsigned long desc_count;
        struct ext4_group_desc *gdp;
        ext4_group_t i, ngroups = ext4_get_groups_count(sb);
#ifdef EXT4FS_DEBUG
        struct ext4_super_block *es;
        unsigned long bitmap_count, x;
        struct buffer_head *bitmap_bh = NULL;

        es = EXT4_SB(sb)->s_es;
        desc_count = 0;
        bitmap_count = 0;
        gdp = NULL;
        for (i = 0; i < ngroups; i++) {
                gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
                        continue;
                desc_count += ext4_free_inodes_count(sb, gdp);
                brelse(bitmap_bh);
                bitmap_bh = ext4_read_inode_bitmap(sb, i);
                if (!bitmap_bh)
                        continue;

                x = ext4_count_free(bitmap_bh->b_data,
                                    EXT4_INODES_PER_GROUP(sb) / 8);
                printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
                        (unsigned long) i, ext4_free_inodes_count(sb, gdp), x);
                bitmap_count += x;
        }
        brelse(bitmap_bh);
        printk(KERN_DEBUG "ext4_count_free_inodes: "
               "stored = %u, computed = %lu, %lu\n",
               le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
        return desc_count;
#else
        desc_count = 0;
        for (i = 0; i < ngroups; i++) {
                gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
                        continue;
                desc_count += ext4_free_inodes_count(sb, gdp);
                cond_resched();
        }
        return desc_count;
#endif
}

/* Called at mount-time, super-block is locked */
unsigned long ext4_count_dirs(struct super_block *sb)
{
        unsigned long count = 0;
        ext4_group_t i, ngroups = ext4_get_groups_count(sb);

        for (i = 0; i < ngroups; i++) {
                struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
                        continue;
                count += ext4_used_dirs_count(sb, gdp);
        }
        return count;
}

/*
 * Zeroes the not-yet-zeroed inode table: just writes zeroes through the
 * whole inode table.  Must be called without any spinlock held.  The only
 * place it is called from on an active part of the filesystem is the
 * ext4lazyinit thread, so we do not need any special locks; however, we
 * have to prevent inode allocation from the current group, so we take the
 * alloc_sem lock to block ext4_new_inode() until we are finished.
 */
int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
                          int barrier)
{
        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_group_desc *gdp = NULL;
        struct buffer_head *group_desc_bh;
        handle_t *handle;
        ext4_fsblk_t blk;
        int num, ret = 0, used_blks = 0;

        /* This should not happen, but just to be sure check this */
        if (sb->s_flags & MS_RDONLY) {
                ret = 1;
                goto out;
        }

        gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
        if (!gdp)
                goto out;

        /*
         * We do not need to lock this, because we are the only one
         * handling this flag.
         */
        if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED))
                goto out;

        handle = ext4_journal_start_sb(sb, 1);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                goto out;
        }

        down_write(&grp->alloc_sem);
        /*
         * If inode bitmap was already initialized there may be some
         * used inodes so we need to skip blocks with used inodes in
         * inode table.
         */
        if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
                used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) -
                            ext4_itable_unused_count(sb, gdp)),
                            sbi->s_inodes_per_block);
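        /*
         * e.g. (hypothetical numbers): with 8192 inodes per group, 7000 of
         * them still unused and 16 inodes per block, used_blks =
         * DIV_ROUND_UP(8192 - 7000, 16) = 75 inode table blocks are skipped.
         */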
1152 
1153  if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) {
1154  ext4_error(sb, "Something is wrong with group %u: "
1155  "used itable blocks: %d; "
1156  "itable unused count: %u",
1157  group, used_blks,
1158  ext4_itable_unused_count(sb, gdp));
1159  ret = 1;
1160  goto err_out;
1161  }
1162 
1163  blk = ext4_inode_table(sb, gdp) + used_blks;
1164  num = sbi->s_itb_per_group - used_blks;
1165 
1166  BUFFER_TRACE(group_desc_bh, "get_write_access");
1167  ret = ext4_journal_get_write_access(handle,
1168  group_desc_bh);
1169  if (ret)
1170  goto err_out;
1171 
1172  /*
1173  * Skip zeroout if the inode table is full. But we set the ZEROED
1174  * flag anyway, because obviously, when it is full it does not need
1175  * further zeroing.
1176  */
1177  if (unlikely(num == 0))
1178  goto skip_zeroout;
1179 
1180  ext4_debug("going to zero out inode table in group %d\n",
1181  group);
1182  ret = sb_issue_zeroout(sb, blk, num, GFP_NOFS);
1183  if (ret < 0)
1184  goto err_out;
1185  if (barrier)
1187 
1188 skip_zeroout:
1189  ext4_lock_group(sb, group);
1191  ext4_group_desc_csum_set(sb, group, gdp);
1192  ext4_unlock_group(sb, group);
1193 
1194  BUFFER_TRACE(group_desc_bh,
1195  "call ext4_handle_dirty_metadata");
1196  ret = ext4_handle_dirty_metadata(handle, NULL,
1197  group_desc_bh);
1198 
1199 err_out:
1200  up_write(&grp->alloc_sem);
1201  ext4_journal_stop(handle);
1202 out:
1203  return ret;
1204 }