Linux Kernel  3.7.1
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
namei.c
Go to the documentation of this file.
1 /*
2  * linux/fs/ext4/namei.c
3  *
4  * Copyright (C) 1992, 1993, 1994, 1995
5  * Remy Card ([email protected])
6  * Laboratoire MASI - Institut Blaise Pascal
7  * Universite Pierre et Marie Curie (Paris VI)
8  *
9  * from
10  *
11  * linux/fs/minix/namei.c
12  *
13  * Copyright (C) 1991, 1992 Linus Torvalds
14  *
15  * Big-endian to little-endian byte-swapping/bitmaps by
16  * David S. Miller ([email protected]), 1995
17  * Directory entry file type support and forward compatibility hooks
18  * for B-tree directories by Theodore Ts'o ([email protected]), 1998
19  * Hash Tree Directory indexing (c)
20  * Daniel Phillips, 2001
21  * Hash Tree Directory indexing porting
22  * Christopher Li, 2002
23  * Hash Tree Directory indexing cleanup
24  * Theodore Ts'o, 2002
25  */
26 
27 #include <linux/fs.h>
28 #include <linux/pagemap.h>
29 #include <linux/jbd2.h>
30 #include <linux/time.h>
31 #include <linux/fcntl.h>
32 #include <linux/stat.h>
33 #include <linux/string.h>
34 #include <linux/quotaops.h>
35 #include <linux/buffer_head.h>
36 #include <linux/bio.h>
37 #include "ext4.h"
38 #include "ext4_jbd2.h"
39 
40 #include "xattr.h"
41 #include "acl.h"
42 
43 #include <trace/events/ext4.h>
/*
 * Read-ahead geometry used while searching directories: the window is
 * NAMEI_RA_CHUNKS chunks of NAMEI_RA_BLOCKS blocks each.
 * NAMEI_RA_INDEX(c,b) is the flat index of block b within chunk c.
 */
#define NAMEI_RA_CHUNKS		2
#define NAMEI_RA_BLOCKS		4
#define NAMEI_RA_SIZE		(NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
#define NAMEI_RA_INDEX(c,b)	(((c) * NAMEI_RA_BLOCKS) + (b))
51 
52 static struct buffer_head *ext4_append(handle_t *handle,
53  struct inode *inode,
54  ext4_lblk_t *block, int *err)
55 {
56  struct buffer_head *bh;
57 
58  if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb &&
59  ((inode->i_size >> 10) >=
60  EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) {
61  *err = -ENOSPC;
62  return NULL;
63  }
64 
65  *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
66 
67  bh = ext4_bread(handle, inode, *block, 1, err);
68  if (bh) {
69  inode->i_size += inode->i_sb->s_blocksize;
70  EXT4_I(inode)->i_disksize = inode->i_size;
71  *err = ext4_journal_get_write_access(handle, bh);
72  if (*err) {
73  brelse(bh);
74  bh = NULL;
75  }
76  }
77  if (!bh && !(*err)) {
78  *err = -EIO;
79  ext4_error(inode->i_sb,
80  "Directory hole detected on inode %lu\n",
81  inode->i_ino);
82  }
83  return bh;
84 }
85 
/* Map assert() onto the journal layer's J_ASSERT unless already defined. */
#if !defined(assert)
#define assert(test) J_ASSERT(test)
#endif

/* dxtrace(command) runs command only when DX_DEBUG is defined. */
#if defined(DX_DEBUG)
#define dxtrace(command) command
#else
#define dxtrace(command)
#endif
95 
96 struct fake_dirent
97 {
98  __le32 inode;
100  u8 name_len;
101  u8 file_type;
102 };
103 
104 struct dx_countlimit
105 {
106  __le16 limit;
107  __le16 count;
108 };
109 
110 struct dx_entry
111 {
112  __le32 hash;
113  __le32 block;
114 };
115 
116 /*
117  * dx_root_info is laid out so that if it should somehow get overlaid by a
118  * dirent the two low bits of the hash version will be zero. Therefore, the
119  * hash version mod 4 should never be 0. Sincerely, the paranoia department.
120  */
121 
122 struct dx_root
123 {
124  struct fake_dirent dot;
125  char dot_name[4];
126  struct fake_dirent dotdot;
127  char dotdot_name[4];
128  struct dx_root_info
129  {
132  u8 info_length; /* 8 */
135  }
136  info;
137  struct dx_entry entries[0];
138 };
139 
140 struct dx_node
141 {
142  struct fake_dirent fake;
143  struct dx_entry entries[0];
144 };
145 
146 
/* One level of an index walk, as filled in by dx_probe(). */
struct dx_frame {
	struct buffer_head *bh;		/* buffer holding this index block */
	struct dx_entry *entries;	/* first index entry in that block */
	struct dx_entry *at;		/* entry selected at this level */
};
153 
154 struct dx_map_entry
155 {
156  u32 hash;
157  u16 offs;
158  u16 size;
159 };
160 
161 /*
162  * This goes at the end of each htree block.
163  */
164 struct dx_tail {
166  __le32 dt_checksum; /* crc32c(uuid+inum+dirblock) */
167 };
168 
169 static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
170 static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
171 static inline unsigned dx_get_hash(struct dx_entry *entry);
172 static void dx_set_hash(struct dx_entry *entry, unsigned value);
173 static unsigned dx_get_count(struct dx_entry *entries);
174 static unsigned dx_get_limit(struct dx_entry *entries);
175 static void dx_set_count(struct dx_entry *entries, unsigned value);
176 static void dx_set_limit(struct dx_entry *entries, unsigned value);
177 static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
178 static unsigned dx_node_limit(struct inode *dir);
179 static struct dx_frame *dx_probe(const struct qstr *d_name,
180  struct inode *dir,
181  struct dx_hash_info *hinfo,
182  struct dx_frame *frame,
183  int *err);
184 static void dx_release(struct dx_frame *frames);
185 static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
186  struct dx_hash_info *hinfo, struct dx_map_entry map[]);
187 static void dx_sort_map(struct dx_map_entry *map, unsigned count);
188 static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
189  struct dx_map_entry *offsets, int count, unsigned blocksize);
190 static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize);
191 static void dx_insert_block(struct dx_frame *frame,
193 static int ext4_htree_next_block(struct inode *dir, __u32 hash,
194  struct dx_frame *frame,
195  struct dx_frame *frames,
196  __u32 *start_hash);
197 static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
198  const struct qstr *d_name,
199  struct ext4_dir_entry_2 **res_dir,
200  int *err);
201 static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
202  struct inode *inode);
203 
/* checksumming functions */

/*
 * Address of the checksum tail of a directory block: the last
 * sizeof(struct ext4_dir_entry_tail) bytes of the block.
 */
#define EXT4_DIRENT_TAIL(block, blocksize) \
	((struct ext4_dir_entry_tail *)(((void *)(block)) + \
					((blocksize) - \
					 sizeof(struct ext4_dir_entry_tail))))
209 
210 static void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
211  unsigned int blocksize)
212 {
213  memset(t, 0, sizeof(struct ext4_dir_entry_tail));
214  t->det_rec_len = ext4_rec_len_to_disk(
215  sizeof(struct ext4_dir_entry_tail), blocksize);
217 }
218 
219 /* Walk through a dirent block to find a checksum "dirent" at the tail */
220 static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
221  struct ext4_dir_entry *de)
222 {
223  struct ext4_dir_entry_tail *t;
224 
225 #ifdef PARANOID
226  struct ext4_dir_entry *d, *top;
227 
228  d = de;
229  top = (struct ext4_dir_entry *)(((void *)de) +
230  (EXT4_BLOCK_SIZE(inode->i_sb) -
231  sizeof(struct ext4_dir_entry_tail)));
232  while (d < top && d->rec_len)
233  d = (struct ext4_dir_entry *)(((void *)d) +
234  le16_to_cpu(d->rec_len));
235 
236  if (d != top)
237  return NULL;
238 
239  t = (struct ext4_dir_entry_tail *)d;
240 #else
241  t = EXT4_DIRENT_TAIL(de, EXT4_BLOCK_SIZE(inode->i_sb));
242 #endif
243 
244  if (t->det_reserved_zero1 ||
245  le16_to_cpu(t->det_rec_len) != sizeof(struct ext4_dir_entry_tail) ||
246  t->det_reserved_zero2 ||
248  return NULL;
249 
250  return t;
251 }
252 
253 static __le32 ext4_dirent_csum(struct inode *inode,
254  struct ext4_dir_entry *dirent, int size)
255 {
256  struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
257  struct ext4_inode_info *ei = EXT4_I(inode);
258  __u32 csum;
259 
260  csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
261  return cpu_to_le32(csum);
262 }
263 
264 int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent)
265 {
266  struct ext4_dir_entry_tail *t;
267 
268  if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
270  return 1;
271 
272  t = get_dirent_tail(inode, dirent);
273  if (!t) {
274  EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir "
275  "leaf for checksum. Please run e2fsck -D.");
276  return 0;
277  }
278 
279  if (t->det_checksum != ext4_dirent_csum(inode, dirent,
280  (void *)t - (void *)dirent))
281  return 0;
282 
283  return 1;
284 }
285 
286 static void ext4_dirent_csum_set(struct inode *inode,
287  struct ext4_dir_entry *dirent)
288 {
289  struct ext4_dir_entry_tail *t;
290 
291  if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
293  return;
294 
295  t = get_dirent_tail(inode, dirent);
296  if (!t) {
297  EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir "
298  "leaf for checksum. Please run e2fsck -D.");
299  return;
300  }
301 
302  t->det_checksum = ext4_dirent_csum(inode, dirent,
303  (void *)t - (void *)dirent);
304 }
305 
306 static inline int ext4_handle_dirty_dirent_node(handle_t *handle,
307  struct inode *inode,
308  struct buffer_head *bh)
309 {
310  ext4_dirent_csum_set(inode, (struct ext4_dir_entry *)bh->b_data);
311  return ext4_handle_dirty_metadata(handle, inode, bh);
312 }
313 
314 static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
315  struct ext4_dir_entry *dirent,
316  int *offset)
317 {
318  struct ext4_dir_entry *dp;
319  struct dx_root_info *root;
320  int count_offset;
321 
322  if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb))
323  count_offset = 8;
324  else if (le16_to_cpu(dirent->rec_len) == 12) {
325  dp = (struct ext4_dir_entry *)(((void *)dirent) + 12);
326  if (le16_to_cpu(dp->rec_len) !=
327  EXT4_BLOCK_SIZE(inode->i_sb) - 12)
328  return NULL;
329  root = (struct dx_root_info *)(((void *)dp + 12));
330  if (root->reserved_zero ||
331  root->info_length != sizeof(struct dx_root_info))
332  return NULL;
333  count_offset = 32;
334  } else
335  return NULL;
336 
337  if (offset)
338  *offset = count_offset;
339  return (struct dx_countlimit *)(((void *)dirent) + count_offset);
340 }
341 
342 static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
343  int count_offset, int count, struct dx_tail *t)
344 {
345  struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
346  struct ext4_inode_info *ei = EXT4_I(inode);
347  __u32 csum, old_csum;
348  int size;
349 
350  size = count_offset + (count * sizeof(struct dx_entry));
351  old_csum = t->dt_checksum;
352  t->dt_checksum = 0;
353  csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
354  csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail));
355  t->dt_checksum = old_csum;
356 
357  return cpu_to_le32(csum);
358 }
359 
360 static int ext4_dx_csum_verify(struct inode *inode,
361  struct ext4_dir_entry *dirent)
362 {
363  struct dx_countlimit *c;
364  struct dx_tail *t;
365  int count_offset, limit, count;
366 
367  if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
369  return 1;
370 
371  c = get_dx_countlimit(inode, dirent, &count_offset);
372  if (!c) {
373  EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D.");
374  return 1;
375  }
376  limit = le16_to_cpu(c->limit);
377  count = le16_to_cpu(c->count);
378  if (count_offset + (limit * sizeof(struct dx_entry)) >
379  EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
380  EXT4_ERROR_INODE(inode, "metadata_csum set but no space for "
381  "tree checksum found. Run e2fsck -D.");
382  return 1;
383  }
384  t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
385 
386  if (t->dt_checksum != ext4_dx_csum(inode, dirent, count_offset,
387  count, t))
388  return 0;
389  return 1;
390 }
391 
392 static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
393 {
394  struct dx_countlimit *c;
395  struct dx_tail *t;
396  int count_offset, limit, count;
397 
398  if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
400  return;
401 
402  c = get_dx_countlimit(inode, dirent, &count_offset);
403  if (!c) {
404  EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D.");
405  return;
406  }
407  limit = le16_to_cpu(c->limit);
408  count = le16_to_cpu(c->count);
409  if (count_offset + (limit * sizeof(struct dx_entry)) >
410  EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
411  EXT4_ERROR_INODE(inode, "metadata_csum set but no space for "
412  "tree checksum. Run e2fsck -D.");
413  return;
414  }
415  t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
416 
417  t->dt_checksum = ext4_dx_csum(inode, dirent, count_offset, count, t);
418 }
419 
420 static inline int ext4_handle_dirty_dx_node(handle_t *handle,
421  struct inode *inode,
422  struct buffer_head *bh)
423 {
424  ext4_dx_csum_set(inode, (struct ext4_dir_entry *)bh->b_data);
425  return ext4_handle_dirty_metadata(handle, inode, bh);
426 }
427 
428 /*
429  * p is at least 6 bytes before the end of page
430  */
431 static inline struct ext4_dir_entry_2 *
432 ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
433 {
434  return (struct ext4_dir_entry_2 *)((char *)p +
435  ext4_rec_len_from_disk(p->rec_len, blocksize));
436 }
437 
438 /*
439  * Future: use high four bits of block for coalesce-on-delete flags
440  * Mask them off for now.
441  */
442 
443 static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
444 {
445  return le32_to_cpu(entry->block) & 0x00ffffff;
446 }
447 
448 static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
449 {
450  entry->block = cpu_to_le32(value);
451 }
452 
453 static inline unsigned dx_get_hash(struct dx_entry *entry)
454 {
455  return le32_to_cpu(entry->hash);
456 }
457 
458 static inline void dx_set_hash(struct dx_entry *entry, unsigned value)
459 {
460  entry->hash = cpu_to_le32(value);
461 }
462 
463 static inline unsigned dx_get_count(struct dx_entry *entries)
464 {
465  return le16_to_cpu(((struct dx_countlimit *) entries)->count);
466 }
467 
468 static inline unsigned dx_get_limit(struct dx_entry *entries)
469 {
470  return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
471 }
472 
473 static inline void dx_set_count(struct dx_entry *entries, unsigned value)
474 {
475  ((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
476 }
477 
478 static inline void dx_set_limit(struct dx_entry *entries, unsigned value)
479 {
480  ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
481 }
482 
483 static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
484 {
485  unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
486  EXT4_DIR_REC_LEN(2) - infosize;
487 
490  entry_space -= sizeof(struct dx_tail);
491  return entry_space / sizeof(struct dx_entry);
492 }
493 
494 static inline unsigned dx_node_limit(struct inode *dir)
495 {
496  unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
497 
500  entry_space -= sizeof(struct dx_tail);
501  return entry_space / sizeof(struct dx_entry);
502 }
503 
504 /*
505  * Debug
506  */
507 #ifdef DX_DEBUG
508 static void dx_show_index(char * label, struct dx_entry *entries)
509 {
510  int i, n = dx_get_count (entries);
511  printk(KERN_DEBUG "%s index ", label);
512  for (i = 0; i < n; i++) {
513  printk("%x->%lu ", i ? dx_get_hash(entries + i) :
514  0, (unsigned long)dx_get_block(entries + i));
515  }
516  printk("\n");
517 }
518 
/* Totals gathered by the dx_show_leaf()/dx_show_entries() debug helpers. */
struct stats {
	unsigned names;		/* live directory entries seen */
	unsigned space;		/* sum of EXT4_DIR_REC_LEN() over those entries */
	unsigned bcount;	/* leaf blocks visited */
};
525 
526 static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_entry_2 *de,
527  int size, int show_names)
528 {
529  unsigned names = 0, space = 0;
530  char *base = (char *) de;
531  struct dx_hash_info h = *hinfo;
532 
533  printk("names: ");
534  while ((char *) de < base + size)
535  {
536  if (de->inode)
537  {
538  if (show_names)
539  {
540  int len = de->name_len;
541  char *name = de->name;
542  while (len--) printk("%c", *name++);
543  ext4fs_dirhash(de->name, de->name_len, &h);
544  printk(":%x.%u ", h.hash,
545  (unsigned) ((char *) de - base));
546  }
547  space += EXT4_DIR_REC_LEN(de->name_len);
548  names++;
549  }
550  de = ext4_next_entry(de, size);
551  }
552  printk("(%i)\n", names);
553  return (struct stats) { names, space, 1 };
554 }
555 
556 struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
557  struct dx_entry *entries, int levels)
558 {
559  unsigned blocksize = dir->i_sb->s_blocksize;
560  unsigned count = dx_get_count(entries), names = 0, space = 0, i;
561  unsigned bcount = 0;
562  struct buffer_head *bh;
563  int err;
564  printk("%i indexed blocks...\n", count);
565  for (i = 0; i < count; i++, entries++)
566  {
567  ext4_lblk_t block = dx_get_block(entries);
568  ext4_lblk_t hash = i ? dx_get_hash(entries): 0;
569  u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
570  struct stats stats;
571  printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range);
572  if (!(bh = ext4_bread (NULL,dir, block, 0,&err))) continue;
573  stats = levels?
574  dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
575  dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0);
576  names += stats.names;
577  space += stats.space;
578  bcount += stats.bcount;
579  brelse(bh);
580  }
581  if (bcount)
582  printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
583  levels ? "" : " ", names, space/bcount,
584  (space/bcount)*100/blocksize);
585  return (struct stats) { names, space, bcount};
586 }
587 #endif /* DX_DEBUG */
588 
589 /*
590  * Probe for a directory leaf block to search.
591  *
592  * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
593  * error in the directory index, and the caller should fall back to
594  * searching the directory normally. The callers of dx_probe **MUST**
595  * check for this error code, and make sure it never gets reflected
596  * back to userspace.
597  */
598 static struct dx_frame *
599 dx_probe(const struct qstr *d_name, struct inode *dir,
600  struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
601 {
602  unsigned count, indirect;
603  struct dx_entry *at, *entries, *p, *q, *m;
604  struct dx_root *root;
605  struct buffer_head *bh;
606  struct dx_frame *frame = frame_in;
607  u32 hash;
608 
609  frame->bh = NULL;
610  if (!(bh = ext4_bread(NULL, dir, 0, 0, err))) {
611  if (*err == 0)
612  *err = ERR_BAD_DX_DIR;
613  goto fail;
614  }
615  root = (struct dx_root *) bh->b_data;
616  if (root->info.hash_version != DX_HASH_TEA &&
617  root->info.hash_version != DX_HASH_HALF_MD4 &&
618  root->info.hash_version != DX_HASH_LEGACY) {
619  ext4_warning(dir->i_sb, "Unrecognised inode hash code %d",
620  root->info.hash_version);
621  brelse(bh);
622  *err = ERR_BAD_DX_DIR;
623  goto fail;
624  }
625  hinfo->hash_version = root->info.hash_version;
626  if (hinfo->hash_version <= DX_HASH_TEA)
627  hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
628  hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
629  if (d_name)
630  ext4fs_dirhash(d_name->name, d_name->len, hinfo);
631  hash = hinfo->hash;
632 
633  if (root->info.unused_flags & 1) {
634  ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x",
635  root->info.unused_flags);
636  brelse(bh);
637  *err = ERR_BAD_DX_DIR;
638  goto fail;
639  }
640 
641  if ((indirect = root->info.indirect_levels) > 1) {
642  ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x",
643  root->info.indirect_levels);
644  brelse(bh);
645  *err = ERR_BAD_DX_DIR;
646  goto fail;
647  }
648 
649  if (!buffer_verified(bh) &&
650  !ext4_dx_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) {
651  ext4_warning(dir->i_sb, "Root failed checksum");
652  brelse(bh);
653  *err = ERR_BAD_DX_DIR;
654  goto fail;
655  }
656  set_buffer_verified(bh);
657 
658  entries = (struct dx_entry *) (((char *)&root->info) +
659  root->info.info_length);
660 
661  if (dx_get_limit(entries) != dx_root_limit(dir,
662  root->info.info_length)) {
663  ext4_warning(dir->i_sb, "dx entry: limit != root limit");
664  brelse(bh);
665  *err = ERR_BAD_DX_DIR;
666  goto fail;
667  }
668 
669  dxtrace(printk("Look up %x", hash));
670  while (1)
671  {
672  count = dx_get_count(entries);
673  if (!count || count > dx_get_limit(entries)) {
674  ext4_warning(dir->i_sb,
675  "dx entry: no count or count > limit");
676  brelse(bh);
677  *err = ERR_BAD_DX_DIR;
678  goto fail2;
679  }
680 
681  p = entries + 1;
682  q = entries + count - 1;
683  while (p <= q)
684  {
685  m = p + (q - p)/2;
686  dxtrace(printk("."));
687  if (dx_get_hash(m) > hash)
688  q = m - 1;
689  else
690  p = m + 1;
691  }
692 
693  if (0) // linear search cross check
694  {
695  unsigned n = count - 1;
696  at = entries;
697  while (n--)
698  {
699  dxtrace(printk(","));
700  if (dx_get_hash(++at) > hash)
701  {
702  at--;
703  break;
704  }
705  }
706  assert (at == p - 1);
707  }
708 
709  at = p - 1;
710  dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
711  frame->bh = bh;
712  frame->entries = entries;
713  frame->at = at;
714  if (!indirect--) return frame;
715  if (!(bh = ext4_bread(NULL, dir, dx_get_block(at), 0, err))) {
716  if (!(*err))
717  *err = ERR_BAD_DX_DIR;
718  goto fail2;
719  }
720  at = entries = ((struct dx_node *) bh->b_data)->entries;
721 
722  if (!buffer_verified(bh) &&
723  !ext4_dx_csum_verify(dir,
724  (struct ext4_dir_entry *)bh->b_data)) {
725  ext4_warning(dir->i_sb, "Node failed checksum");
726  brelse(bh);
727  *err = ERR_BAD_DX_DIR;
728  goto fail;
729  }
730  set_buffer_verified(bh);
731 
732  if (dx_get_limit(entries) != dx_node_limit (dir)) {
733  ext4_warning(dir->i_sb,
734  "dx entry: limit != node limit");
735  brelse(bh);
736  *err = ERR_BAD_DX_DIR;
737  goto fail2;
738  }
739  frame++;
740  frame->bh = NULL;
741  }
742 fail2:
743  while (frame >= frame_in) {
744  brelse(frame->bh);
745  frame--;
746  }
747 fail:
748  if (*err == ERR_BAD_DX_DIR)
749  ext4_warning(dir->i_sb,
750  "Corrupt dir inode %lu, running e2fsck is "
751  "recommended.", dir->i_ino);
752  return NULL;
753 }
754 
755 static void dx_release (struct dx_frame *frames)
756 {
757  if (frames[0].bh == NULL)
758  return;
759 
760  if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
761  brelse(frames[1].bh);
762  brelse(frames[0].bh);
763 }
764 
765 /*
766  * This function increments the frame pointer to search the next leaf
767  * block, and reads in the necessary intervening nodes if the search
768  * should be necessary. Whether or not the search is necessary is
769  * controlled by the hash parameter. If the hash value is even, then
770  * the search is only continued if the next block starts with that
771  * hash value. This is used if we are searching for a specific file.
772  *
773  * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
774  *
775  * This function returns 1 if the caller should continue to search,
776  * or 0 if it should not. If there is an error reading one of the
777  * index blocks, it will a negative error code.
778  *
779  * If start_hash is non-null, it will be filled in with the starting
780  * hash of the next page.
781  */
782 static int ext4_htree_next_block(struct inode *dir, __u32 hash,
783  struct dx_frame *frame,
784  struct dx_frame *frames,
785  __u32 *start_hash)
786 {
787  struct dx_frame *p;
788  struct buffer_head *bh;
789  int err, num_frames = 0;
790  __u32 bhash;
791 
792  p = frame;
793  /*
794  * Find the next leaf page by incrementing the frame pointer.
795  * If we run out of entries in the interior node, loop around and
796  * increment pointer in the parent node. When we break out of
797  * this loop, num_frames indicates the number of interior
798  * nodes need to be read.
799  */
800  while (1) {
801  if (++(p->at) < p->entries + dx_get_count(p->entries))
802  break;
803  if (p == frames)
804  return 0;
805  num_frames++;
806  p--;
807  }
808 
809  /*
810  * If the hash is 1, then continue only if the next page has a
811  * continuation hash of any value. This is used for readdir
812  * handling. Otherwise, check to see if the hash matches the
813  * desired contiuation hash. If it doesn't, return since
814  * there's no point to read in the successive index pages.
815  */
816  bhash = dx_get_hash(p->at);
817  if (start_hash)
818  *start_hash = bhash;
819  if ((hash & 1) == 0) {
820  if ((bhash & ~1) != hash)
821  return 0;
822  }
823  /*
824  * If the hash is HASH_NB_ALWAYS, we always go to the next
825  * block so no check is necessary
826  */
827  while (num_frames--) {
828  if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at),
829  0, &err))) {
830  if (!err) {
831  ext4_error(dir->i_sb,
832  "Directory hole detected on inode %lu\n",
833  dir->i_ino);
834  return -EIO;
835  }
836  return err; /* Failure */
837  }
838 
839  if (!buffer_verified(bh) &&
840  !ext4_dx_csum_verify(dir,
841  (struct ext4_dir_entry *)bh->b_data)) {
842  ext4_warning(dir->i_sb, "Node failed checksum");
843  return -EIO;
844  }
845  set_buffer_verified(bh);
846 
847  p++;
848  brelse(p->bh);
849  p->bh = bh;
850  p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
851  }
852  return 1;
853 }
854 
855 
856 /*
857  * This function fills a red-black tree with information from a
858  * directory block. It returns the number directory entries loaded
859  * into the tree. If there is an error it is returned in err.
860  */
861 static int htree_dirblock_to_tree(struct file *dir_file,
862  struct inode *dir, ext4_lblk_t block,
863  struct dx_hash_info *hinfo,
864  __u32 start_hash, __u32 start_minor_hash)
865 {
866  struct buffer_head *bh;
867  struct ext4_dir_entry_2 *de, *top;
868  int err = 0, count = 0;
869 
870  dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
871  (unsigned long)block));
872  if (!(bh = ext4_bread(NULL, dir, block, 0, &err))) {
873  if (!err) {
874  err = -EIO;
875  ext4_error(dir->i_sb,
876  "Directory hole detected on inode %lu\n",
877  dir->i_ino);
878  }
879  return err;
880  }
881 
882  if (!buffer_verified(bh) &&
883  !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
884  return -EIO;
885  set_buffer_verified(bh);
886 
887  de = (struct ext4_dir_entry_2 *) bh->b_data;
888  top = (struct ext4_dir_entry_2 *) ((char *) de +
889  dir->i_sb->s_blocksize -
890  EXT4_DIR_REC_LEN(0));
891  for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
892  if (ext4_check_dir_entry(dir, NULL, de, bh,
893  (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
894  + ((char *)de - bh->b_data))) {
895  /* On error, skip the f_pos to the next block. */
896  dir_file->f_pos = (dir_file->f_pos |
897  (dir->i_sb->s_blocksize - 1)) + 1;
898  brelse(bh);
899  return count;
900  }
901  ext4fs_dirhash(de->name, de->name_len, hinfo);
902  if ((hinfo->hash < start_hash) ||
903  ((hinfo->hash == start_hash) &&
904  (hinfo->minor_hash < start_minor_hash)))
905  continue;
906  if (de->inode == 0)
907  continue;
908  if ((err = ext4_htree_store_dirent(dir_file,
909  hinfo->hash, hinfo->minor_hash, de)) != 0) {
910  brelse(bh);
911  return err;
912  }
913  count++;
914  }
915  brelse(bh);
916  return count;
917 }
918 
919 
920 /*
921  * This function fills a red-black tree with information from a
922  * directory. We start scanning the directory in hash order, starting
923  * at start_hash and start_minor_hash.
924  *
925  * This function returns the number of entries inserted into the tree,
926  * or a negative error code.
927  */
928 int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
929  __u32 start_minor_hash, __u32 *next_hash)
930 {
931  struct dx_hash_info hinfo;
932  struct ext4_dir_entry_2 *de;
933  struct dx_frame frames[2], *frame;
934  struct inode *dir;
936  int count = 0;
937  int ret, err;
938  __u32 hashval;
939 
940  dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
941  start_hash, start_minor_hash));
942  dir = dir_file->f_path.dentry->d_inode;
943  if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
944  hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
945  if (hinfo.hash_version <= DX_HASH_TEA)
946  hinfo.hash_version +=
947  EXT4_SB(dir->i_sb)->s_hash_unsigned;
948  hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
949  count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
950  start_hash, start_minor_hash);
951  *next_hash = ~0;
952  return count;
953  }
954  hinfo.hash = start_hash;
955  hinfo.minor_hash = 0;
956  frame = dx_probe(NULL, dir, &hinfo, frames, &err);
957  if (!frame)
958  return err;
959 
960  /* Add '.' and '..' from the htree header */
961  if (!start_hash && !start_minor_hash) {
962  de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
963  if ((err = ext4_htree_store_dirent(dir_file, 0, 0, de)) != 0)
964  goto errout;
965  count++;
966  }
967  if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
968  de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
969  de = ext4_next_entry(de, dir->i_sb->s_blocksize);
970  if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0)
971  goto errout;
972  count++;
973  }
974 
975  while (1) {
976  block = dx_get_block(frame->at);
977  ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
978  start_hash, start_minor_hash);
979  if (ret < 0) {
980  err = ret;
981  goto errout;
982  }
983  count += ret;
984  hashval = ~0;
985  ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS,
986  frame, frames, &hashval);
987  *next_hash = hashval;
988  if (ret < 0) {
989  err = ret;
990  goto errout;
991  }
992  /*
993  * Stop if: (a) there are no more entries, or
994  * (b) we have inserted at least one entry and the
995  * next hash value is not a continuation
996  */
997  if ((ret == 0) ||
998  (count && ((hashval & 1) == 0)))
999  break;
1000  }
1001  dx_release(frames);
1002  dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, "
1003  "next hash: %x\n", count, *next_hash));
1004  return count;
1005 errout:
1006  dx_release(frames);
1007  return (err);
1008 }
1009 
1010 
1011 /*
1012  * Directory block splitting, compacting
1013  */
1014 
1015 /*
1016  * Create map of hash values, offsets, and sizes, stored at end of block.
1017  * Returns number of entries mapped.
1018  */
1019 static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
1020  struct dx_hash_info *hinfo,
1021  struct dx_map_entry *map_tail)
1022 {
1023  int count = 0;
1024  char *base = (char *) de;
1025  struct dx_hash_info h = *hinfo;
1026 
1027  while ((char *) de < base + blocksize) {
1028  if (de->name_len && de->inode) {
1029  ext4fs_dirhash(de->name, de->name_len, &h);
1030  map_tail--;
1031  map_tail->hash = h.hash;
1032  map_tail->offs = ((char *) de - base)>>2;
1033  map_tail->size = le16_to_cpu(de->rec_len);
1034  count++;
1035  cond_resched();
1036  }
1037  /* XXX: do we need to check rec_len == 0 case? -Chris */
1038  de = ext4_next_entry(de, blocksize);
1039  }
1040  return count;
1041 }
1042 
1043 /* Sort map by hash value */
1044 static void dx_sort_map (struct dx_map_entry *map, unsigned count)
1045 {
1046  struct dx_map_entry *p, *q, *top = map + count - 1;
1047  int more;
1048  /* Combsort until bubble sort doesn't suck */
1049  while (count > 2) {
1050  count = count*10/13;
1051  if (count - 9 < 2) /* 9, 10 -> 11 */
1052  count = 11;
1053  for (p = top, q = p - count; q >= map; p--, q--)
1054  if (p->hash < q->hash)
1055  swap(*p, *q);
1056  }
1057  /* Garden variety bubble sort */
1058  do {
1059  more = 0;
1060  q = top;
1061  while (q-- > map) {
1062  if (q[1].hash >= q[0].hash)
1063  continue;
1064  swap(*(q+1), *q);
1065  more = 1;
1066  }
1067  } while(more);
1068 }
1069 
1070 static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
1071 {
1072  struct dx_entry *entries = frame->entries;
1073  struct dx_entry *old = frame->at, *new = old + 1;
1074  int count = dx_get_count(entries);
1075 
1076  assert(count < dx_get_limit(entries));
1077  assert(old < entries + count);
1078  memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
1079  dx_set_hash(new, hash);
1080  dx_set_block(new, block);
1081  dx_set_count(entries, count + 1);
1082 }
1083 
1084 static void ext4_update_dx_flag(struct inode *inode)
1085 {
1086  if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
1088  ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
1089 }
1090 
1091 /*
1092  * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure.
1093  *
1094  * `len <= EXT4_NAME_LEN' is guaranteed by caller.
1095  * `de != NULL' is guaranteed by caller.
1096  */
1097 static inline int ext4_match (int len, const char * const name,
1098  struct ext4_dir_entry_2 * de)
1099 {
1100  if (len != de->name_len)
1101  return 0;
1102  if (!de->inode)
1103  return 0;
1104  return !memcmp(name, de->name, len);
1105 }
1106 
1107 /*
1108  * Returns 0 if not found, -1 on failure, and 1 on success
1109  */
1110 static inline int search_dirblock(struct buffer_head *bh,
1111  struct inode *dir,
1112  const struct qstr *d_name,
1113  unsigned int offset,
1114  struct ext4_dir_entry_2 ** res_dir)
1115 {
1116  struct ext4_dir_entry_2 * de;
1117  char * dlimit;
1118  int de_len;
1119  const char *name = d_name->name;
1120  int namelen = d_name->len;
1121 
1122  de = (struct ext4_dir_entry_2 *) bh->b_data;
1123  dlimit = bh->b_data + dir->i_sb->s_blocksize;
1124  while ((char *) de < dlimit) {
1125  /* this code is executed quadratically often */
1126  /* do minimal checking `by hand' */
1127 
1128  if ((char *) de + namelen <= dlimit &&
1129  ext4_match (namelen, name, de)) {
1130  /* found a match - just to be sure, do a full check */
1131  if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
1132  return -1;
1133  *res_dir = de;
1134  return 1;
1135  }
1136  /* prevent looping on a bad block */
1137  de_len = ext4_rec_len_from_disk(de->rec_len,
1138  dir->i_sb->s_blocksize);
1139  if (de_len <= 0)
1140  return -1;
1141  offset += de_len;
1142  de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
1143  }
1144  return 0;
1145 }
1146 
1147 
1148 /*
1149  * ext4_find_entry()
1150  *
1151  * finds an entry in the specified directory with the wanted name. It
1152  * returns the cache buffer in which the entry was found, and the entry
1153  * itself (as a parameter - res_dir). It does NOT read the inode of the
1154  * entry - you'll have to do that yourself if you want to.
1155  *
1156  * The returned buffer_head has ->b_count elevated. The caller is expected
1157  * to brelse() it when appropriate.
1158  */
1159 static struct buffer_head * ext4_find_entry (struct inode *dir,
1160  const struct qstr *d_name,
1161  struct ext4_dir_entry_2 ** res_dir)
1162 {
1163  struct super_block *sb;
1164  struct buffer_head *bh_use[NAMEI_RA_SIZE];
1165  struct buffer_head *bh, *ret = NULL;
1167  const u8 *name = d_name->name;
1168  int ra_max = 0; /* Number of bh's in the readahead
1169  buffer, bh_use[] */
1170  int ra_ptr = 0; /* Current index into readahead
1171  buffer */
1172  int num = 0;
1173  ext4_lblk_t nblocks;
1174  int i, err;
1175  int namelen;
1176 
1177  *res_dir = NULL;
1178  sb = dir->i_sb;
1179  namelen = d_name->len;
1180  if (namelen > EXT4_NAME_LEN)
1181  return NULL;
1182  if ((namelen <= 2) && (name[0] == '.') &&
1183  (name[1] == '.' || name[1] == '\0')) {
1184  /*
1185  * "." or ".." will only be in the first block
1186  * NFS may look up ".."; "." should be handled by the VFS
1187  */
1188  block = start = 0;
1189  nblocks = 1;
1190  goto restart;
1191  }
1192  if (is_dx(dir)) {
1193  bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
1194  /*
1195  * On success, or if the error was file not found,
1196  * return. Otherwise, fall back to doing a search the
1197  * old fashioned way.
1198  */
1199  if (bh || (err != ERR_BAD_DX_DIR))
1200  return bh;
1201  dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
1202  "falling back\n"));
1203  }
1204  nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
1205  start = EXT4_I(dir)->i_dir_start_lookup;
1206  if (start >= nblocks)
1207  start = 0;
1208  block = start;
1209 restart:
1210  do {
1211  /*
1212  * We deal with the read-ahead logic here.
1213  */
1214  if (ra_ptr >= ra_max) {
1215  /* Refill the readahead buffer */
1216  ra_ptr = 0;
1217  b = block;
1218  for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
1219  /*
1220  * Terminate if we reach the end of the
1221  * directory and must wrap, or if our
1222  * search has finished at this block.
1223  */
1224  if (b >= nblocks || (num && block == start)) {
1225  bh_use[ra_max] = NULL;
1226  break;
1227  }
1228  num++;
1229  bh = ext4_getblk(NULL, dir, b++, 0, &err);
1230  bh_use[ra_max] = bh;
1231  if (bh)
1233  1, &bh);
1234  }
1235  }
1236  if ((bh = bh_use[ra_ptr++]) == NULL)
1237  goto next;
1238  wait_on_buffer(bh);
1239  if (!buffer_uptodate(bh)) {
1240  /* read error, skip block & hope for the best */
1241  EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
1242  (unsigned long) block);
1243  brelse(bh);
1244  goto next;
1245  }
1246  if (!buffer_verified(bh) &&
1248  (struct ext4_dir_entry *)bh->b_data)) {
1249  EXT4_ERROR_INODE(dir, "checksumming directory "
1250  "block %lu", (unsigned long)block);
1251  brelse(bh);
1252  goto next;
1253  }
1254  set_buffer_verified(bh);
1255  i = search_dirblock(bh, dir, d_name,
1256  block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
1257  if (i == 1) {
1258  EXT4_I(dir)->i_dir_start_lookup = block;
1259  ret = bh;
1260  goto cleanup_and_exit;
1261  } else {
1262  brelse(bh);
1263  if (i < 0)
1264  goto cleanup_and_exit;
1265  }
1266  next:
1267  if (++block >= nblocks)
1268  block = 0;
1269  } while (block != start);
1270 
1271  /*
1272  * If the directory has grown while we were searching, then
1273  * search the last part of the directory before giving up.
1274  */
1275  block = nblocks;
1276  nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
1277  if (block < nblocks) {
1278  start = 0;
1279  goto restart;
1280  }
1281 
1282 cleanup_and_exit:
1283  /* Clean up the read-ahead blocks */
1284  for (; ra_ptr < ra_max; ra_ptr++)
1285  brelse(bh_use[ra_ptr]);
1286  return ret;
1287 }
1288 
1289 static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
1290  struct ext4_dir_entry_2 **res_dir, int *err)
1291 {
1292  struct super_block * sb = dir->i_sb;
1293  struct dx_hash_info hinfo;
1294  struct dx_frame frames[2], *frame;
1295  struct buffer_head *bh;
1297  int retval;
1298 
1299  if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
1300  return NULL;
1301  do {
1302  block = dx_get_block(frame->at);
1303  if (!(bh = ext4_bread(NULL, dir, block, 0, err))) {
1304  if (!(*err)) {
1305  *err = -EIO;
1306  ext4_error(dir->i_sb,
1307  "Directory hole detected on inode %lu\n",
1308  dir->i_ino);
1309  }
1310  goto errout;
1311  }
1312 
1313  if (!buffer_verified(bh) &&
1315  (struct ext4_dir_entry *)bh->b_data)) {
1316  EXT4_ERROR_INODE(dir, "checksumming directory "
1317  "block %lu", (unsigned long)block);
1318  brelse(bh);
1319  *err = -EIO;
1320  goto errout;
1321  }
1322  set_buffer_verified(bh);
1323  retval = search_dirblock(bh, dir, d_name,
1324  block << EXT4_BLOCK_SIZE_BITS(sb),
1325  res_dir);
1326  if (retval == 1) { /* Success! */
1327  dx_release(frames);
1328  return bh;
1329  }
1330  brelse(bh);
1331  if (retval == -1) {
1332  *err = ERR_BAD_DX_DIR;
1333  goto errout;
1334  }
1335 
1336  /* Check to see if we should continue to search */
1337  retval = ext4_htree_next_block(dir, hinfo.hash, frame,
1338  frames, NULL);
1339  if (retval < 0) {
1340  ext4_warning(sb,
1341  "error reading index page in directory #%lu",
1342  dir->i_ino);
1343  *err = retval;
1344  goto errout;
1345  }
1346  } while (retval == 1);
1347 
1348  *err = -ENOENT;
1349 errout:
1350  dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name));
1351  dx_release (frames);
1352  return NULL;
1353 }
1354 
1355 static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
1356 {
1357  struct inode *inode;
1358  struct ext4_dir_entry_2 *de;
1359  struct buffer_head *bh;
1360 
1361  if (dentry->d_name.len > EXT4_NAME_LEN)
1362  return ERR_PTR(-ENAMETOOLONG);
1363 
1364  bh = ext4_find_entry(dir, &dentry->d_name, &de);
1365  inode = NULL;
1366  if (bh) {
1367  __u32 ino = le32_to_cpu(de->inode);
1368  brelse(bh);
1369  if (!ext4_valid_inum(dir->i_sb, ino)) {
1370  EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
1371  return ERR_PTR(-EIO);
1372  }
1373  if (unlikely(ino == dir->i_ino)) {
1374  EXT4_ERROR_INODE(dir, "'%.*s' linked to parent dir",
1375  dentry->d_name.len,
1376  dentry->d_name.name);
1377  return ERR_PTR(-EIO);
1378  }
1379  inode = ext4_iget(dir->i_sb, ino);
1380  if (inode == ERR_PTR(-ESTALE)) {
1381  EXT4_ERROR_INODE(dir,
1382  "deleted inode referenced: %u",
1383  ino);
1384  return ERR_PTR(-EIO);
1385  }
1386  }
1387  return d_splice_alias(inode, dentry);
1388 }
1389 
1390 
1391 struct dentry *ext4_get_parent(struct dentry *child)
1392 {
1393  __u32 ino;
1394  static const struct qstr dotdot = QSTR_INIT("..", 2);
1395  struct ext4_dir_entry_2 * de;
1396  struct buffer_head *bh;
1397 
1398  bh = ext4_find_entry(child->d_inode, &dotdot, &de);
1399  if (!bh)
1400  return ERR_PTR(-ENOENT);
1401  ino = le32_to_cpu(de->inode);
1402  brelse(bh);
1403 
1404  if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
1405  EXT4_ERROR_INODE(child->d_inode,
1406  "bad parent inode number: %u", ino);
1407  return ERR_PTR(-EIO);
1408  }
1409 
1410  return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino));
1411 }
1412 
1413 #define S_SHIFT 12
1414 static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
1416  [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR,
1419  [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO,
1420  [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK,
1422 };
1423 
1424 static inline void ext4_set_de_type(struct super_block *sb,
1425  struct ext4_dir_entry_2 *de,
1426  umode_t mode) {
1428  de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
1429 }
1430 
1431 /*
1432  * Move count entries from end of map between two memory locations.
1433  * Returns pointer to last entry moved.
1434  */
1435 static struct ext4_dir_entry_2 *
1436 dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
1437  unsigned blocksize)
1438 {
1439  unsigned rec_len = 0;
1440 
1441  while (count--) {
1442  struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
1443  (from + (map->offs<<2));
1444  rec_len = EXT4_DIR_REC_LEN(de->name_len);
1445  memcpy (to, de, rec_len);
1446  ((struct ext4_dir_entry_2 *) to)->rec_len =
1447  ext4_rec_len_to_disk(rec_len, blocksize);
1448  de->inode = 0;
1449  map++;
1450  to += rec_len;
1451  }
1452  return (struct ext4_dir_entry_2 *) (to - rec_len);
1453 }
1454 
1455 /*
1456  * Compact each dir entry in the range to the minimal rec_len.
1457  * Returns pointer to last entry in range.
1458  */
1459 static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
1460 {
1461  struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base;
1462  unsigned rec_len = 0;
1463 
1464  prev = to = de;
1465  while ((char*)de < base + blocksize) {
1466  next = ext4_next_entry(de, blocksize);
1467  if (de->inode && de->name_len) {
1468  rec_len = EXT4_DIR_REC_LEN(de->name_len);
1469  if (de > to)
1470  memmove(to, de, rec_len);
1471  to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
1472  prev = to;
1473  to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len);
1474  }
1475  de = next;
1476  }
1477  return prev;
1478 }
1479 
1480 /*
1481  * Split a full leaf block to make room for a new dir entry.
1482  * Allocate a new block, and move entries so that they are approx. equally full.
1483  * Returns pointer to de in block into which the new entry will be inserted.
1484  */
1485 static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1486  struct buffer_head **bh,struct dx_frame *frame,
1487  struct dx_hash_info *hinfo, int *error)
1488 {
1489  unsigned blocksize = dir->i_sb->s_blocksize;
1490  unsigned count, continued;
1491  struct buffer_head *bh2;
1492  ext4_lblk_t newblock;
1493  u32 hash2;
1494  struct dx_map_entry *map;
1495  char *data1 = (*bh)->b_data, *data2;
1496  unsigned split, move, size;
1497  struct ext4_dir_entry_2 *de = NULL, *de2;
1498  struct ext4_dir_entry_tail *t;
1499  int csum_size = 0;
1500  int err = 0, i;
1501 
1504  csum_size = sizeof(struct ext4_dir_entry_tail);
1505 
1506  bh2 = ext4_append (handle, dir, &newblock, &err);
1507  if (!(bh2)) {
1508  brelse(*bh);
1509  *bh = NULL;
1510  goto errout;
1511  }
1512 
1513  BUFFER_TRACE(*bh, "get_write_access");
1514  err = ext4_journal_get_write_access(handle, *bh);
1515  if (err)
1516  goto journal_error;
1517 
1518  BUFFER_TRACE(frame->bh, "get_write_access");
1519  err = ext4_journal_get_write_access(handle, frame->bh);
1520  if (err)
1521  goto journal_error;
1522 
1523  data2 = bh2->b_data;
1524 
1525  /* create map in the end of data2 block */
1526  map = (struct dx_map_entry *) (data2 + blocksize);
1527  count = dx_make_map((struct ext4_dir_entry_2 *) data1,
1528  blocksize, hinfo, map);
1529  map -= count;
1530  dx_sort_map(map, count);
1531  /* Split the existing block in the middle, size-wise */
1532  size = 0;
1533  move = 0;
1534  for (i = count-1; i >= 0; i--) {
1535  /* is more than half of this entry in 2nd half of the block? */
1536  if (size + map[i].size/2 > blocksize/2)
1537  break;
1538  size += map[i].size;
1539  move++;
1540  }
1541  /* map index at which we will split */
1542  split = count - move;
1543  hash2 = map[split].hash;
1544  continued = hash2 == map[split - 1].hash;
1545  dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n",
1546  (unsigned long)dx_get_block(frame->at),
1547  hash2, split, count-split));
1548 
1549  /* Fancy dance to stay within two buffers */
1550  de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize);
1551  de = dx_pack_dirents(data1, blocksize);
1552  de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
1553  (char *) de,
1554  blocksize);
1555  de2->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) -
1556  (char *) de2,
1557  blocksize);
1558  if (csum_size) {
1559  t = EXT4_DIRENT_TAIL(data2, blocksize);
1560  initialize_dirent_tail(t, blocksize);
1561 
1562  t = EXT4_DIRENT_TAIL(data1, blocksize);
1563  initialize_dirent_tail(t, blocksize);
1564  }
1565 
1566  dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
1567  dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
1568 
1569  /* Which block gets the new entry? */
1570  if (hinfo->hash >= hash2)
1571  {
1572  swap(*bh, bh2);
1573  de = de2;
1574  }
1575  dx_insert_block(frame, hash2 + continued, newblock);
1576  err = ext4_handle_dirty_dirent_node(handle, dir, bh2);
1577  if (err)
1578  goto journal_error;
1579  err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
1580  if (err)
1581  goto journal_error;
1582  brelse(bh2);
1583  dxtrace(dx_show_index("frame", frame->entries));
1584  return de;
1585 
1586 journal_error:
1587  brelse(*bh);
1588  brelse(bh2);
1589  *bh = NULL;
1590  ext4_std_error(dir->i_sb, err);
1591 errout:
1592  *error = err;
1593  return NULL;
1594 }
1595 
1596 /*
1597  * Add a new entry into a directory (leaf) block. If de is non-NULL,
1598  * it points to a directory entry which is guaranteed to be large
1599  * enough for new directory entry. If de is NULL, then
1600  * add_dirent_to_buf will attempt search the directory block for
1601  * space. It will return -ENOSPC if no space is available, and -EIO
1602  * and -EEXIST if directory entry already exists.
1603  */
1604 static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1605  struct inode *inode, struct ext4_dir_entry_2 *de,
1606  struct buffer_head *bh)
1607 {
1608  struct inode *dir = dentry->d_parent->d_inode;
1609  const char *name = dentry->d_name.name;
1610  int namelen = dentry->d_name.len;
1611  unsigned int offset = 0;
1612  unsigned int blocksize = dir->i_sb->s_blocksize;
1613  unsigned short reclen;
1614  int nlen, rlen, err;
1615  char *top;
1616  int csum_size = 0;
1617 
1618  if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
1620  csum_size = sizeof(struct ext4_dir_entry_tail);
1621 
1622  reclen = EXT4_DIR_REC_LEN(namelen);
1623  if (!de) {
1624  de = (struct ext4_dir_entry_2 *)bh->b_data;
1625  top = bh->b_data + (blocksize - csum_size) - reclen;
1626  while ((char *) de <= top) {
1627  if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
1628  return -EIO;
1629  if (ext4_match(namelen, name, de))
1630  return -EEXIST;
1631  nlen = EXT4_DIR_REC_LEN(de->name_len);
1632  rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
1633  if ((de->inode? rlen - nlen: rlen) >= reclen)
1634  break;
1635  de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
1636  offset += rlen;
1637  }
1638  if ((char *) de > top)
1639  return -ENOSPC;
1640  }
1641  BUFFER_TRACE(bh, "get_write_access");
1642  err = ext4_journal_get_write_access(handle, bh);
1643  if (err) {
1644  ext4_std_error(dir->i_sb, err);
1645  return err;
1646  }
1647 
1648  /* By now the buffer is marked for journaling */
1649  nlen = EXT4_DIR_REC_LEN(de->name_len);
1650  rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
1651  if (de->inode) {
1652  struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
1653  de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize);
1654  de->rec_len = ext4_rec_len_to_disk(nlen, blocksize);
1655  de = de1;
1656  }
1657  de->file_type = EXT4_FT_UNKNOWN;
1658  de->inode = cpu_to_le32(inode->i_ino);
1659  ext4_set_de_type(dir->i_sb, de, inode->i_mode);
1660  de->name_len = namelen;
1661  memcpy(de->name, name, namelen);
1662  /*
1663  * XXX shouldn't update any times until successful
1664  * completion of syscall, but too many callers depend
1665  * on this.
1666  *
1667  * XXX similarly, too many callers depend on
1668  * ext4_new_inode() setting the times, but error
1669  * recovery deletes the inode, so the worst that can
1670  * happen is that the times are slightly out of date
1671  * and/or different from the directory change time.
1672  */
1673  dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
1674  ext4_update_dx_flag(dir);
1675  dir->i_version++;
1676  ext4_mark_inode_dirty(handle, dir);
1677  BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1678  err = ext4_handle_dirty_dirent_node(handle, dir, bh);
1679  if (err)
1680  ext4_std_error(dir->i_sb, err);
1681  return 0;
1682 }
1683 
1684 /*
1685  * This converts a one block unindexed directory to a 3 block indexed
1686  * directory, and adds the dentry to the indexed directory.
1687  */
1688 static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1689  struct inode *inode, struct buffer_head *bh)
1690 {
1691  struct inode *dir = dentry->d_parent->d_inode;
1692  const char *name = dentry->d_name.name;
1693  int namelen = dentry->d_name.len;
1694  struct buffer_head *bh2;
1695  struct dx_root *root;
1696  struct dx_frame frames[2], *frame;
1697  struct dx_entry *entries;
1698  struct ext4_dir_entry_2 *de, *de2;
1699  struct ext4_dir_entry_tail *t;
1700  char *data1, *top;
1701  unsigned len;
1702  int retval;
1703  unsigned blocksize;
1704  struct dx_hash_info hinfo;
1706  struct fake_dirent *fde;
1707  int csum_size = 0;
1708 
1709  if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
1711  csum_size = sizeof(struct ext4_dir_entry_tail);
1712 
1713  blocksize = dir->i_sb->s_blocksize;
1714  dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
1715  retval = ext4_journal_get_write_access(handle, bh);
1716  if (retval) {
1717  ext4_std_error(dir->i_sb, retval);
1718  brelse(bh);
1719  return retval;
1720  }
1721  root = (struct dx_root *) bh->b_data;
1722 
1723  /* The 0th block becomes the root, move the dirents out */
1724  fde = &root->dotdot;
1725  de = (struct ext4_dir_entry_2 *)((char *)fde +
1726  ext4_rec_len_from_disk(fde->rec_len, blocksize));
1727  if ((char *) de >= (((char *) root) + blocksize)) {
1728  EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
1729  brelse(bh);
1730  return -EIO;
1731  }
1732  len = ((char *) root) + (blocksize - csum_size) - (char *) de;
1733 
1734  /* Allocate new block for the 0th block's dirents */
1735  bh2 = ext4_append(handle, dir, &block, &retval);
1736  if (!(bh2)) {
1737  brelse(bh);
1738  return retval;
1739  }
1740  ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
1741  data1 = bh2->b_data;
1742 
1743  memcpy (data1, de, len);
1744  de = (struct ext4_dir_entry_2 *) data1;
1745  top = data1 + len;
1746  while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
1747  de = de2;
1748  de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
1749  (char *) de,
1750  blocksize);
1751 
1752  if (csum_size) {
1753  t = EXT4_DIRENT_TAIL(data1, blocksize);
1754  initialize_dirent_tail(t, blocksize);
1755  }
1756 
1757  /* Initialize the root; the dot dirents already exist */
1758  de = (struct ext4_dir_entry_2 *) (&root->dotdot);
1759  de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2),
1760  blocksize);
1761  memset (&root->info, 0, sizeof(root->info));
1762  root->info.info_length = sizeof(root->info);
1763  root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
1764  entries = root->entries;
1765  dx_set_block(entries, 1);
1766  dx_set_count(entries, 1);
1767  dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info)));
1768 
1769  /* Initialize as for dx_probe */
1770  hinfo.hash_version = root->info.hash_version;
1771  if (hinfo.hash_version <= DX_HASH_TEA)
1772  hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
1773  hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
1774  ext4fs_dirhash(name, namelen, &hinfo);
1775  frame = frames;
1776  frame->entries = entries;
1777  frame->at = entries;
1778  frame->bh = bh;
1779  bh = bh2;
1780 
1781  ext4_handle_dirty_dx_node(handle, dir, frame->bh);
1782  ext4_handle_dirty_dirent_node(handle, dir, bh);
1783 
1784  de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
1785  if (!de) {
1786  /*
1787  * Even if the block split failed, we have to properly write
1788  * out all the changes we did so far. Otherwise we can end up
1789  * with corrupted filesystem.
1790  */
1791  ext4_mark_inode_dirty(handle, dir);
1792  dx_release(frames);
1793  return retval;
1794  }
1795  dx_release(frames);
1796 
1797  retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
1798  brelse(bh);
1799  return retval;
1800 }
1801 
1802 /*
1803  * ext4_add_entry()
1804  *
1805  * adds a file entry to the specified directory, using the same
1806  * semantics as ext4_find_entry(). It returns NULL if it failed.
1807  *
1808  * NOTE!! The inode part of 'de' is left at 0 - which means you
1809  * may not sleep between calling this and putting something into
1810  * the entry, as someone else might have used it while you slept.
1811  */
1812 static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1813  struct inode *inode)
1814 {
1815  struct inode *dir = dentry->d_parent->d_inode;
1816  struct buffer_head *bh;
1817  struct ext4_dir_entry_2 *de;
1818  struct ext4_dir_entry_tail *t;
1819  struct super_block *sb;
1820  int retval;
1821  int dx_fallback=0;
1822  unsigned blocksize;
1823  ext4_lblk_t block, blocks;
1824  int csum_size = 0;
1825 
1826  if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
1828  csum_size = sizeof(struct ext4_dir_entry_tail);
1829 
1830  sb = dir->i_sb;
1831  blocksize = sb->s_blocksize;
1832  if (!dentry->d_name.len)
1833  return -EINVAL;
1834  if (is_dx(dir)) {
1835  retval = ext4_dx_add_entry(handle, dentry, inode);
1836  if (!retval || (retval != ERR_BAD_DX_DIR))
1837  return retval;
1838  ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
1839  dx_fallback++;
1840  ext4_mark_inode_dirty(handle, dir);
1841  }
1842  blocks = dir->i_size >> sb->s_blocksize_bits;
1843  for (block = 0; block < blocks; block++) {
1844  if (!(bh = ext4_bread(handle, dir, block, 0, &retval))) {
1845  if (!retval) {
1846  retval = -EIO;
1847  ext4_error(inode->i_sb,
1848  "Directory hole detected on inode %lu\n",
1849  inode->i_ino);
1850  }
1851  return retval;
1852  }
1853  if (!buffer_verified(bh) &&
1855  (struct ext4_dir_entry *)bh->b_data))
1856  return -EIO;
1857  set_buffer_verified(bh);
1858  retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1859  if (retval != -ENOSPC) {
1860  brelse(bh);
1861  return retval;
1862  }
1863 
1864  if (blocks == 1 && !dx_fallback &&
1866  return make_indexed_dir(handle, dentry, inode, bh);
1867  brelse(bh);
1868  }
1869  bh = ext4_append(handle, dir, &block, &retval);
1870  if (!bh)
1871  return retval;
1872  de = (struct ext4_dir_entry_2 *) bh->b_data;
1873  de->inode = 0;
1874  de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize);
1875 
1876  if (csum_size) {
1877  t = EXT4_DIRENT_TAIL(bh->b_data, blocksize);
1878  initialize_dirent_tail(t, blocksize);
1879  }
1880 
1881  retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
1882  brelse(bh);
1883  if (retval == 0)
1884  ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
1885  return retval;
1886 }
1887 
1888 /*
1889  * Returns 0 for success, or a negative error value
1890  */
1891 static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1892  struct inode *inode)
1893 {
1894  struct dx_frame frames[2], *frame;
1895  struct dx_entry *entries, *at;
1896  struct dx_hash_info hinfo;
1897  struct buffer_head *bh;
1898  struct inode *dir = dentry->d_parent->d_inode;
1899  struct super_block *sb = dir->i_sb;
1900  struct ext4_dir_entry_2 *de;
1901  int err;
1902 
1903  frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
1904  if (!frame)
1905  return err;
1906  entries = frame->entries;
1907  at = frame->at;
1908 
1909  if (!(bh = ext4_bread(handle, dir, dx_get_block(frame->at), 0, &err))) {
1910  if (!err) {
1911  err = -EIO;
1912  ext4_error(dir->i_sb,
1913  "Directory hole detected on inode %lu\n",
1914  dir->i_ino);
1915  }
1916  goto cleanup;
1917  }
1918 
1919  if (!buffer_verified(bh) &&
1920  !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
1921  goto journal_error;
1922  set_buffer_verified(bh);
1923 
1924  BUFFER_TRACE(bh, "get_write_access");
1925  err = ext4_journal_get_write_access(handle, bh);
1926  if (err)
1927  goto journal_error;
1928 
1929  err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1930  if (err != -ENOSPC)
1931  goto cleanup;
1932 
1933  /* Block full, should compress but for now just split */
1934  dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
1935  dx_get_count(entries), dx_get_limit(entries)));
1936  /* Need to split index? */
1937  if (dx_get_count(entries) == dx_get_limit(entries)) {
1938  ext4_lblk_t newblock;
1939  unsigned icount = dx_get_count(entries);
1940  int levels = frame - frames;
1941  struct dx_entry *entries2;
1942  struct dx_node *node2;
1943  struct buffer_head *bh2;
1944 
1945  if (levels && (dx_get_count(frames->entries) ==
1946  dx_get_limit(frames->entries))) {
1947  ext4_warning(sb, "Directory index full!");
1948  err = -ENOSPC;
1949  goto cleanup;
1950  }
1951  bh2 = ext4_append (handle, dir, &newblock, &err);
1952  if (!(bh2))
1953  goto cleanup;
1954  node2 = (struct dx_node *)(bh2->b_data);
1955  entries2 = node2->entries;
1956  memset(&node2->fake, 0, sizeof(struct fake_dirent));
1957  node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
1958  sb->s_blocksize);
1959  BUFFER_TRACE(frame->bh, "get_write_access");
1960  err = ext4_journal_get_write_access(handle, frame->bh);
1961  if (err)
1962  goto journal_error;
1963  if (levels) {
1964  unsigned icount1 = icount/2, icount2 = icount - icount1;
1965  unsigned hash2 = dx_get_hash(entries + icount1);
1966  dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
1967  icount1, icount2));
1968 
1969  BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
1970  err = ext4_journal_get_write_access(handle,
1971  frames[0].bh);
1972  if (err)
1973  goto journal_error;
1974 
1975  memcpy((char *) entries2, (char *) (entries + icount1),
1976  icount2 * sizeof(struct dx_entry));
1977  dx_set_count(entries, icount1);
1978  dx_set_count(entries2, icount2);
1979  dx_set_limit(entries2, dx_node_limit(dir));
1980 
1981  /* Which index block gets the new entry? */
1982  if (at - entries >= icount1) {
1983  frame->at = at = at - entries - icount1 + entries2;
1984  frame->entries = entries = entries2;
1985  swap(frame->bh, bh2);
1986  }
1987  dx_insert_block(frames + 0, hash2, newblock);
1988  dxtrace(dx_show_index("node", frames[1].entries));
1989  dxtrace(dx_show_index("node",
1990  ((struct dx_node *) bh2->b_data)->entries));
1991  err = ext4_handle_dirty_dx_node(handle, dir, bh2);
1992  if (err)
1993  goto journal_error;
1994  brelse (bh2);
1995  } else {
1997  "Creating second level index...\n"));
1998  memcpy((char *) entries2, (char *) entries,
1999  icount * sizeof(struct dx_entry));
2000  dx_set_limit(entries2, dx_node_limit(dir));
2001 
2002  /* Set up root */
2003  dx_set_count(entries, 1);
2004  dx_set_block(entries + 0, newblock);
2005  ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
2006 
2007  /* Add new access path frame */
2008  frame = frames + 1;
2009  frame->at = at = at - entries + entries2;
2010  frame->entries = entries = entries2;
2011  frame->bh = bh2;
2012  err = ext4_journal_get_write_access(handle,
2013  frame->bh);
2014  if (err)
2015  goto journal_error;
2016  }
2017  err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
2018  if (err) {
2019  ext4_std_error(inode->i_sb, err);
2020  goto cleanup;
2021  }
2022  }
2023  de = do_split(handle, dir, &bh, frame, &hinfo, &err);
2024  if (!de)
2025  goto cleanup;
2026  err = add_dirent_to_buf(handle, dentry, inode, de, bh);
2027  goto cleanup;
2028 
2029 journal_error:
2030  ext4_std_error(dir->i_sb, err);
2031 cleanup:
2032  if (bh)
2033  brelse(bh);
2034  dx_release(frames);
2035  return err;
2036 }
2037 
2038 /*
2039  * ext4_delete_entry deletes a directory entry by merging it with the
2040  * previous entry
2041  */
2042 static int ext4_delete_entry(handle_t *handle,
2043  struct inode *dir,
2044  struct ext4_dir_entry_2 *de_del,
2045  struct buffer_head *bh)
2046 {
2047  struct ext4_dir_entry_2 *de, *pde;
2048  unsigned int blocksize = dir->i_sb->s_blocksize;
2049  int csum_size = 0;
2050  int i, err;
2051 
2054  csum_size = sizeof(struct ext4_dir_entry_tail);
2055 
2056  i = 0;
2057  pde = NULL;
2058  de = (struct ext4_dir_entry_2 *) bh->b_data;
2059  while (i < bh->b_size - csum_size) {
2060  if (ext4_check_dir_entry(dir, NULL, de, bh, i))
2061  return -EIO;
2062  if (de == de_del) {
2063  BUFFER_TRACE(bh, "get_write_access");
2064  err = ext4_journal_get_write_access(handle, bh);
2065  if (unlikely(err)) {
2066  ext4_std_error(dir->i_sb, err);
2067  return err;
2068  }
2069  if (pde)
2070  pde->rec_len = ext4_rec_len_to_disk(
2071  ext4_rec_len_from_disk(pde->rec_len,
2072  blocksize) +
2073  ext4_rec_len_from_disk(de->rec_len,
2074  blocksize),
2075  blocksize);
2076  else
2077  de->inode = 0;
2078  dir->i_version++;
2079  BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
2080  err = ext4_handle_dirty_dirent_node(handle, dir, bh);
2081  if (unlikely(err)) {
2082  ext4_std_error(dir->i_sb, err);
2083  return err;
2084  }
2085  return 0;
2086  }
2087  i += ext4_rec_len_from_disk(de->rec_len, blocksize);
2088  pde = de;
2089  de = ext4_next_entry(de, blocksize);
2090  }
2091  return -ENOENT;
2092 }
2093 
2094 /*
2095  * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2,
2096  * since this indicates that nlinks count was previously 1.
2097  */
2098 static void ext4_inc_count(handle_t *handle, struct inode *inode)
2099 {
2100  inc_nlink(inode);
2101  if (is_dx(inode) && inode->i_nlink > 1) {
2102  /* limit is 16-bit i_links_count */
2103  if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) {
2104  set_nlink(inode, 1);
2107  }
2108  }
2109 }
2110 
2111 /*
2112  * If a directory had nlink == 1, then we should let it be 1. This indicates
2113  * directory has >EXT4_LINK_MAX subdirs.
2114  */
2115 static void ext4_dec_count(handle_t *handle, struct inode *inode)
2116 {
2117  if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
2118  drop_nlink(inode);
2119 }
2120 
2121 
2122 static int ext4_add_nondir(handle_t *handle,
2123  struct dentry *dentry, struct inode *inode)
2124 {
2125  int err = ext4_add_entry(handle, dentry, inode);
2126  if (!err) {
2127  ext4_mark_inode_dirty(handle, inode);
2128  unlock_new_inode(inode);
2129  d_instantiate(dentry, inode);
2130  return 0;
2131  }
2132  drop_nlink(inode);
2133  unlock_new_inode(inode);
2134  iput(inode);
2135  return err;
2136 }
2137 
2138 /*
2139  * By the time this is called, we already have created
2140  * the directory cache entry for the new file, but it
2141  * is so far negative - it has no inode.
2142  *
2143  * If the create succeeds, we fill in the inode information
2144  * with d_instantiate().
2145  */
2146 static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
2147  bool excl)
2148 {
2149  handle_t *handle;
2150  struct inode *inode;
2151  int err, retries = 0;
2152 
2153  dquot_initialize(dir);
2154 
2155 retry:
2156  handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2159  if (IS_ERR(handle))
2160  return PTR_ERR(handle);
2161 
2162  if (IS_DIRSYNC(dir))
2163  ext4_handle_sync(handle);
2164 
2165  inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
2166  err = PTR_ERR(inode);
2167  if (!IS_ERR(inode)) {
2168  inode->i_op = &ext4_file_inode_operations;
2169  inode->i_fop = &ext4_file_operations;
2170  ext4_set_aops(inode);
2171  err = ext4_add_nondir(handle, dentry, inode);
2172  }
2173  ext4_journal_stop(handle);
2174  if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2175  goto retry;
2176  return err;
2177 }
2178 
2179 static int ext4_mknod(struct inode *dir, struct dentry *dentry,
2180  umode_t mode, dev_t rdev)
2181 {
2182  handle_t *handle;
2183  struct inode *inode;
2184  int err, retries = 0;
2185 
2186  if (!new_valid_dev(rdev))
2187  return -EINVAL;
2188 
2189  dquot_initialize(dir);
2190 
2191 retry:
2192  handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2195  if (IS_ERR(handle))
2196  return PTR_ERR(handle);
2197 
2198  if (IS_DIRSYNC(dir))
2199  ext4_handle_sync(handle);
2200 
2201  inode = ext4_new_inode(handle, dir, mode, &dentry->d_name, 0, NULL);
2202  err = PTR_ERR(inode);
2203  if (!IS_ERR(inode)) {
2204  init_special_inode(inode, inode->i_mode, rdev);
2206  err = ext4_add_nondir(handle, dentry, inode);
2207  }
2208  ext4_journal_stop(handle);
2209  if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2210  goto retry;
2211  return err;
2212 }
2213 
2214 static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
2215 {
2216  handle_t *handle;
2217  struct inode *inode;
2218  struct buffer_head *dir_block = NULL;
2219  struct ext4_dir_entry_2 *de;
2220  struct ext4_dir_entry_tail *t;
2221  unsigned int blocksize = dir->i_sb->s_blocksize;
2222  int csum_size = 0;
2223  int err, retries = 0;
2224 
2227  csum_size = sizeof(struct ext4_dir_entry_tail);
2228 
2229  if (EXT4_DIR_LINK_MAX(dir))
2230  return -EMLINK;
2231 
2232  dquot_initialize(dir);
2233 
2234 retry:
2235  handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2238  if (IS_ERR(handle))
2239  return PTR_ERR(handle);
2240 
2241  if (IS_DIRSYNC(dir))
2242  ext4_handle_sync(handle);
2243 
2244  inode = ext4_new_inode(handle, dir, S_IFDIR | mode,
2245  &dentry->d_name, 0, NULL);
2246  err = PTR_ERR(inode);
2247  if (IS_ERR(inode))
2248  goto out_stop;
2249 
2250  inode->i_op = &ext4_dir_inode_operations;
2251  inode->i_fop = &ext4_dir_operations;
2252  inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
2253  if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) {
2254  if (!err) {
2255  err = -EIO;
2256  ext4_error(inode->i_sb,
2257  "Directory hole detected on inode %lu\n",
2258  inode->i_ino);
2259  }
2260  goto out_clear_inode;
2261  }
2262  BUFFER_TRACE(dir_block, "get_write_access");
2263  err = ext4_journal_get_write_access(handle, dir_block);
2264  if (err)
2265  goto out_clear_inode;
2266  de = (struct ext4_dir_entry_2 *) dir_block->b_data;
2267  de->inode = cpu_to_le32(inode->i_ino);
2268  de->name_len = 1;
2269  de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
2270  blocksize);
2271  strcpy(de->name, ".");
2272  ext4_set_de_type(dir->i_sb, de, S_IFDIR);
2273  de = ext4_next_entry(de, blocksize);
2274  de->inode = cpu_to_le32(dir->i_ino);
2275  de->rec_len = ext4_rec_len_to_disk(blocksize -
2276  (csum_size + EXT4_DIR_REC_LEN(1)),
2277  blocksize);
2278  de->name_len = 2;
2279  strcpy(de->name, "..");
2280  ext4_set_de_type(dir->i_sb, de, S_IFDIR);
2281  set_nlink(inode, 2);
2282 
2283  if (csum_size) {
2284  t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize);
2285  initialize_dirent_tail(t, blocksize);
2286  }
2287 
2288  BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
2289  err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
2290  if (err)
2291  goto out_clear_inode;
2292  set_buffer_verified(dir_block);
2293  err = ext4_mark_inode_dirty(handle, inode);
2294  if (!err)
2295  err = ext4_add_entry(handle, dentry, inode);
2296  if (err) {
2297 out_clear_inode:
2298  clear_nlink(inode);
2299  unlock_new_inode(inode);
2300  ext4_mark_inode_dirty(handle, inode);
2301  iput(inode);
2302  goto out_stop;
2303  }
2304  ext4_inc_count(handle, dir);
2305  ext4_update_dx_flag(dir);
2306  err = ext4_mark_inode_dirty(handle, dir);
2307  if (err)
2308  goto out_clear_inode;
2309  unlock_new_inode(inode);
2310  d_instantiate(dentry, inode);
2311 out_stop:
2312  brelse(dir_block);
2313  ext4_journal_stop(handle);
2314  if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2315  goto retry;
2316  return err;
2317 }
2318 
2319 /*
2320  * routine to check that the specified directory is empty (for rmdir)
2321  */
2322 static int empty_dir(struct inode *inode)
2323 {
2324  unsigned int offset;
2325  struct buffer_head *bh;
2326  struct ext4_dir_entry_2 *de, *de1;
2327  struct super_block *sb;
2328  int err = 0;
2329 
2330  sb = inode->i_sb;
2331  if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
2332  !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
2333  if (err)
2334  EXT4_ERROR_INODE(inode,
2335  "error %d reading directory lblock 0", err);
2336  else
2337  ext4_warning(inode->i_sb,
2338  "bad directory (dir #%lu) - no data block",
2339  inode->i_ino);
2340  return 1;
2341  }
2342  if (!buffer_verified(bh) &&
2343  !ext4_dirent_csum_verify(inode,
2344  (struct ext4_dir_entry *)bh->b_data)) {
2345  EXT4_ERROR_INODE(inode, "checksum error reading directory "
2346  "lblock 0");
2347  return -EIO;
2348  }
2349  set_buffer_verified(bh);
2350  de = (struct ext4_dir_entry_2 *) bh->b_data;
2351  de1 = ext4_next_entry(de, sb->s_blocksize);
2352  if (le32_to_cpu(de->inode) != inode->i_ino ||
2353  !le32_to_cpu(de1->inode) ||
2354  strcmp(".", de->name) ||
2355  strcmp("..", de1->name)) {
2356  ext4_warning(inode->i_sb,
2357  "bad directory (dir #%lu) - no `.' or `..'",
2358  inode->i_ino);
2359  brelse(bh);
2360  return 1;
2361  }
2362  offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) +
2363  ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize);
2364  de = ext4_next_entry(de1, sb->s_blocksize);
2365  while (offset < inode->i_size) {
2366  if (!bh ||
2367  (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
2368  unsigned int lblock;
2369  err = 0;
2370  brelse(bh);
2371  lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
2372  bh = ext4_bread(NULL, inode, lblock, 0, &err);
2373  if (!bh) {
2374  if (err)
2375  EXT4_ERROR_INODE(inode,
2376  "error %d reading directory "
2377  "lblock %u", err, lblock);
2378  else
2379  ext4_warning(inode->i_sb,
2380  "bad directory (dir #%lu) - no data block",
2381  inode->i_ino);
2382 
2383  offset += sb->s_blocksize;
2384  continue;
2385  }
2386  if (!buffer_verified(bh) &&
2387  !ext4_dirent_csum_verify(inode,
2388  (struct ext4_dir_entry *)bh->b_data)) {
2389  EXT4_ERROR_INODE(inode, "checksum error "
2390  "reading directory lblock 0");
2391  return -EIO;
2392  }
2393  set_buffer_verified(bh);
2394  de = (struct ext4_dir_entry_2 *) bh->b_data;
2395  }
2396  if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) {
2397  de = (struct ext4_dir_entry_2 *)(bh->b_data +
2398  sb->s_blocksize);
2399  offset = (offset | (sb->s_blocksize - 1)) + 1;
2400  continue;
2401  }
2402  if (le32_to_cpu(de->inode)) {
2403  brelse(bh);
2404  return 0;
2405  }
2406  offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
2407  de = ext4_next_entry(de, sb->s_blocksize);
2408  }
2409  brelse(bh);
2410  return 1;
2411 }
2412 
2413 /* ext4_orphan_add() links an unlinked or truncated inode into a list of
2414  * such inodes, starting at the superblock, in case we crash before the
2415  * file is closed/deleted, or in case the inode truncate spans multiple
2416  * transactions and the last transaction is not recovered after a crash.
2417  *
2418  * At filesystem recovery time, we walk this list deleting unlinked
2419  * inodes and truncating linked inodes in ext4_orphan_cleanup().
2420  */
2421 int ext4_orphan_add(handle_t *handle, struct inode *inode)
2422 {
2423  struct super_block *sb = inode->i_sb;
2424  struct ext4_iloc iloc;
2425  int err = 0, rc;
2426 
2427  if (!EXT4_SB(sb)->s_journal)
2428  return 0;
2429 
2430  mutex_lock(&EXT4_SB(sb)->s_orphan_lock);
2431  if (!list_empty(&EXT4_I(inode)->i_orphan))
2432  goto out_unlock;
2433 
2434  /*
2435  * Orphan handling is only valid for files with data blocks
2436  * being truncated, or files being unlinked. Note that we either
2437  * hold i_mutex, or the inode can not be referenced from outside,
2438  * so i_nlink should not be bumped due to race
2439  */
2440  J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
2441  S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
2442 
2443  BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
2444  err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
2445  if (err)
2446  goto out_unlock;
2447 
2448  err = ext4_reserve_inode_write(handle, inode, &iloc);
2449  if (err)
2450  goto out_unlock;
2451  /*
2452  * Due to previous errors inode may be already a part of on-disk
2453  * orphan list. If so skip on-disk list modification.
2454  */
2455  if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
2456  (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
2457  goto mem_insert;
2458 
2459  /* Insert this inode at the head of the on-disk orphan list... */
2460  NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
2461  EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
2462  err = ext4_handle_dirty_super(handle, sb);
2463  rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
2464  if (!err)
2465  err = rc;
2466 
2467  /* Only add to the head of the in-memory list if all the
2468  * previous operations succeeded. If the orphan_add is going to
2469  * fail (possibly taking the journal offline), we can't risk
2470  * leaving the inode on the orphan list: stray orphan-list
2471  * entries can cause panics at unmount time.
2472  *
2473  * This is safe: on error we're going to ignore the orphan list
2474  * anyway on the next recovery. */
2475 mem_insert:
2476  if (!err)
2477  list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
2478 
2479  jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
2480  jbd_debug(4, "orphan inode %lu will point to %d\n",
2481  inode->i_ino, NEXT_ORPHAN(inode));
2482 out_unlock:
2483  mutex_unlock(&EXT4_SB(sb)->s_orphan_lock);
2484  ext4_std_error(inode->i_sb, err);
2485  return err;
2486 }
2487 
2488 /*
2489  * ext4_orphan_del() removes an unlinked or truncated inode from the list
2490  * of such inodes stored on disk, because it is finally being cleaned up.
2491  */
2492 int ext4_orphan_del(handle_t *handle, struct inode *inode)
2493 {
2494  struct list_head *prev;
2495  struct ext4_inode_info *ei = EXT4_I(inode);
2496  struct ext4_sb_info *sbi;
2497  __u32 ino_next;
2498  struct ext4_iloc iloc;
2499  int err = 0;
2500 
2501  if (!EXT4_SB(inode->i_sb)->s_journal)
2502  return 0;
2503 
2504  mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
2505  if (list_empty(&ei->i_orphan))
2506  goto out;
2507 
2508  ino_next = NEXT_ORPHAN(inode);
2509  prev = ei->i_orphan.prev;
2510  sbi = EXT4_SB(inode->i_sb);
2511 
2512  jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
2513 
2514  list_del_init(&ei->i_orphan);
2515 
2516  /* If we're on an error path, we may not have a valid
2517  * transaction handle with which to update the orphan list on
2518  * disk, but we still need to remove the inode from the linked
2519  * list in memory. */
2520  if (!handle)
2521  goto out;
2522 
2523  err = ext4_reserve_inode_write(handle, inode, &iloc);
2524  if (err)
2525  goto out_err;
2526 
2527  if (prev == &sbi->s_orphan) {
2528  jbd_debug(4, "superblock will point to %u\n", ino_next);
2529  BUFFER_TRACE(sbi->s_sbh, "get_write_access");
2530  err = ext4_journal_get_write_access(handle, sbi->s_sbh);
2531  if (err)
2532  goto out_brelse;
2533  sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
2534  err = ext4_handle_dirty_super(handle, inode->i_sb);
2535  } else {
2536  struct ext4_iloc iloc2;
2537  struct inode *i_prev =
2538  &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
2539 
2540  jbd_debug(4, "orphan inode %lu will point to %u\n",
2541  i_prev->i_ino, ino_next);
2542  err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
2543  if (err)
2544  goto out_brelse;
2545  NEXT_ORPHAN(i_prev) = ino_next;
2546  err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
2547  }
2548  if (err)
2549  goto out_brelse;
2550  NEXT_ORPHAN(inode) = 0;
2551  err = ext4_mark_iloc_dirty(handle, inode, &iloc);
2552 
2553 out_err:
2554  ext4_std_error(inode->i_sb, err);
2555 out:
2556  mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
2557  return err;
2558 
2559 out_brelse:
2560  brelse(iloc.bh);
2561  goto out_err;
2562 }
2563 
2564 static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
2565 {
2566  int retval;
2567  struct inode *inode;
2568  struct buffer_head *bh;
2569  struct ext4_dir_entry_2 *de;
2570  handle_t *handle;
2571 
2572  /* Initialize quotas before so that eventual writes go in
2573  * separate transaction */
2574  dquot_initialize(dir);
2575  dquot_initialize(dentry->d_inode);
2576 
2577  handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
2578  if (IS_ERR(handle))
2579  return PTR_ERR(handle);
2580 
2581  retval = -ENOENT;
2582  bh = ext4_find_entry(dir, &dentry->d_name, &de);
2583  if (!bh)
2584  goto end_rmdir;
2585 
2586  if (IS_DIRSYNC(dir))
2587  ext4_handle_sync(handle);
2588 
2589  inode = dentry->d_inode;
2590 
2591  retval = -EIO;
2592  if (le32_to_cpu(de->inode) != inode->i_ino)
2593  goto end_rmdir;
2594 
2595  retval = -ENOTEMPTY;
2596  if (!empty_dir(inode))
2597  goto end_rmdir;
2598 
2599  retval = ext4_delete_entry(handle, dir, de, bh);
2600  if (retval)
2601  goto end_rmdir;
2602  if (!EXT4_DIR_LINK_EMPTY(inode))
2603  ext4_warning(inode->i_sb,
2604  "empty directory has too many links (%d)",
2605  inode->i_nlink);
2606  inode->i_version++;
2607  clear_nlink(inode);
2608  /* There's no need to set i_disksize: the fact that i_nlink is
2609  * zero will ensure that the right thing happens during any
2610  * recovery. */
2611  inode->i_size = 0;
2612  ext4_orphan_add(handle, inode);
2613  inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode);
2614  ext4_mark_inode_dirty(handle, inode);
2615  ext4_dec_count(handle, dir);
2616  ext4_update_dx_flag(dir);
2617  ext4_mark_inode_dirty(handle, dir);
2618 
2619 end_rmdir:
2620  ext4_journal_stop(handle);
2621  brelse(bh);
2622  return retval;
2623 }
2624 
2625 static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2626 {
2627  int retval;
2628  struct inode *inode;
2629  struct buffer_head *bh;
2630  struct ext4_dir_entry_2 *de;
2631  handle_t *handle;
2632 
2633  trace_ext4_unlink_enter(dir, dentry);
2634  /* Initialize quotas before so that eventual writes go
2635  * in separate transaction */
2636  dquot_initialize(dir);
2637  dquot_initialize(dentry->d_inode);
2638 
2639  handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
2640  if (IS_ERR(handle))
2641  return PTR_ERR(handle);
2642 
2643  if (IS_DIRSYNC(dir))
2644  ext4_handle_sync(handle);
2645 
2646  retval = -ENOENT;
2647  bh = ext4_find_entry(dir, &dentry->d_name, &de);
2648  if (!bh)
2649  goto end_unlink;
2650 
2651  inode = dentry->d_inode;
2652 
2653  retval = -EIO;
2654  if (le32_to_cpu(de->inode) != inode->i_ino)
2655  goto end_unlink;
2656 
2657  if (!inode->i_nlink) {
2658  ext4_warning(inode->i_sb,
2659  "Deleting nonexistent file (%lu), %d",
2660  inode->i_ino, inode->i_nlink);
2661  set_nlink(inode, 1);
2662  }
2663  retval = ext4_delete_entry(handle, dir, de, bh);
2664  if (retval)
2665  goto end_unlink;
2666  dir->i_ctime = dir->i_mtime = ext4_current_time(dir);
2667  ext4_update_dx_flag(dir);
2668  ext4_mark_inode_dirty(handle, dir);
2669  drop_nlink(inode);
2670  if (!inode->i_nlink)
2671  ext4_orphan_add(handle, inode);
2672  inode->i_ctime = ext4_current_time(inode);
2673  ext4_mark_inode_dirty(handle, inode);
2674  retval = 0;
2675 
2676 end_unlink:
2677  ext4_journal_stop(handle);
2678  brelse(bh);
2679  trace_ext4_unlink_exit(dentry, retval);
2680  return retval;
2681 }
2682 
2683 static int ext4_symlink(struct inode *dir,
2684  struct dentry *dentry, const char *symname)
2685 {
2686  handle_t *handle;
2687  struct inode *inode;
2688  int l, err, retries = 0;
2689  int credits;
2690 
2691  l = strlen(symname)+1;
2692  if (l > dir->i_sb->s_blocksize)
2693  return -ENAMETOOLONG;
2694 
2695  dquot_initialize(dir);
2696 
2697  if (l > EXT4_N_BLOCKS * 4) {
2698  /*
2699  * For non-fast symlinks, we just allocate inode and put it on
2700  * orphan list in the first transaction => we need bitmap,
2701  * group descriptor, sb, inode block, quota blocks, and
2702  * possibly selinux xattr blocks.
2703  */
2704  credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
2706  } else {
2707  /*
2708  * Fast symlink. We have to add entry to directory
2709  * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS),
2710  * allocate new inode (bitmap, group descriptor, inode block,
2711  * quota blocks, sb is already counted in previous macros).
2712  */
2713  credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2716  }
2717 retry:
2718  handle = ext4_journal_start(dir, credits);
2719  if (IS_ERR(handle))
2720  return PTR_ERR(handle);
2721 
2722  if (IS_DIRSYNC(dir))
2723  ext4_handle_sync(handle);
2724 
2725  inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO,
2726  &dentry->d_name, 0, NULL);
2727  err = PTR_ERR(inode);
2728  if (IS_ERR(inode))
2729  goto out_stop;
2730 
2731  if (l > EXT4_N_BLOCKS * 4) {
2733  ext4_set_aops(inode);
2734  /*
2735  * We cannot call page_symlink() with transaction started
2736  * because it calls into ext4_write_begin() which can wait
2737  * for transaction commit if we are running out of space
2738  * and thus we deadlock. So we have to stop transaction now
2739  * and restart it when symlink contents is written.
2740  *
2741  * To keep fs consistent in case of crash, we have to put inode
2742  * to orphan list in the mean time.
2743  */
2744  drop_nlink(inode);
2745  err = ext4_orphan_add(handle, inode);
2746  ext4_journal_stop(handle);
2747  if (err)
2748  goto err_drop_inode;
2749  err = __page_symlink(inode, symname, l, 1);
2750  if (err)
2751  goto err_drop_inode;
2752  /*
2753  * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS
2754  * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
2755  */
2756  handle = ext4_journal_start(dir,
2759  if (IS_ERR(handle)) {
2760  err = PTR_ERR(handle);
2761  goto err_drop_inode;
2762  }
2763  set_nlink(inode, 1);
2764  err = ext4_orphan_del(handle, inode);
2765  if (err) {
2766  ext4_journal_stop(handle);
2767  clear_nlink(inode);
2768  goto err_drop_inode;
2769  }
2770  } else {
2771  /* clear the extent format for fast symlink */
2772  ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
2774  memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
2775  inode->i_size = l-1;
2776  }
2777  EXT4_I(inode)->i_disksize = inode->i_size;
2778  err = ext4_add_nondir(handle, dentry, inode);
2779 out_stop:
2780  ext4_journal_stop(handle);
2781  if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2782  goto retry;
2783  return err;
2784 err_drop_inode:
2785  unlock_new_inode(inode);
2786  iput(inode);
2787  return err;
2788 }
2789 
2790 static int ext4_link(struct dentry *old_dentry,
2791  struct inode *dir, struct dentry *dentry)
2792 {
2793  handle_t *handle;
2794  struct inode *inode = old_dentry->d_inode;
2795  int err, retries = 0;
2796 
2797  if (inode->i_nlink >= EXT4_LINK_MAX)
2798  return -EMLINK;
2799 
2800  dquot_initialize(dir);
2801 
2802 retry:
2803  handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2805  if (IS_ERR(handle))
2806  return PTR_ERR(handle);
2807 
2808  if (IS_DIRSYNC(dir))
2809  ext4_handle_sync(handle);
2810 
2811  inode->i_ctime = ext4_current_time(inode);
2812  ext4_inc_count(handle, inode);
2813  ihold(inode);
2814 
2815  err = ext4_add_entry(handle, dentry, inode);
2816  if (!err) {
2817  ext4_mark_inode_dirty(handle, inode);
2818  d_instantiate(dentry, inode);
2819  } else {
2820  drop_nlink(inode);
2821  iput(inode);
2822  }
2823  ext4_journal_stop(handle);
2824  if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2825  goto retry;
2826  return err;
2827 }
2828 
/*
 * Inode field of the second entry in a directory block, i.e. the ".."
 * entry written by ext4_mkdir().  Expands to an lvalue: ext4_rename()
 * both reads it and assigns through it.
 */
#define PARENT_INO(blk, blksz) \
	(ext4_next_entry((struct ext4_dir_entry_2 *)(blk), (blksz))->inode)
2831 
2832 /*
2833  * Anybody can rename anything with this: the permission checks are left to the
2834  * higher-level routines.
2835  */
2836 static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2837  struct inode *new_dir, struct dentry *new_dentry)
2838 {
2839  handle_t *handle;
2840  struct inode *old_inode, *new_inode;
2841  struct buffer_head *old_bh, *new_bh, *dir_bh;
2842  struct ext4_dir_entry_2 *old_de, *new_de;
2843  int retval, force_da_alloc = 0;
2844 
2845  dquot_initialize(old_dir);
2846  dquot_initialize(new_dir);
2847 
2848  old_bh = new_bh = dir_bh = NULL;
2849 
2850  /* Initialize quotas before so that eventual writes go
2851  * in separate transaction */
2852  if (new_dentry->d_inode)
2853  dquot_initialize(new_dentry->d_inode);
2854  handle = ext4_journal_start(old_dir, 2 *
2855  EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
2857  if (IS_ERR(handle))
2858  return PTR_ERR(handle);
2859 
2860  if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
2861  ext4_handle_sync(handle);
2862 
2863  old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
2864  /*
2865  * Check for inode number is _not_ due to possible IO errors.
2866  * We might rmdir the source, keep it as pwd of some process
2867  * and merrily kill the link to whatever was created under the
2868  * same name. Goodbye sticky bit ;-<
2869  */
2870  old_inode = old_dentry->d_inode;
2871  retval = -ENOENT;
2872  if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
2873  goto end_rename;
2874 
2875  new_inode = new_dentry->d_inode;
2876  new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de);
2877  if (new_bh) {
2878  if (!new_inode) {
2879  brelse(new_bh);
2880  new_bh = NULL;
2881  }
2882  }
2883  if (S_ISDIR(old_inode->i_mode)) {
2884  if (new_inode) {
2885  retval = -ENOTEMPTY;
2886  if (!empty_dir(new_inode))
2887  goto end_rename;
2888  }
2889  retval = -EIO;
2890  if (!(dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval))) {
2891  if (!retval) {
2892  retval = -EIO;
2893  ext4_error(old_inode->i_sb,
2894  "Directory hole detected on inode %lu\n",
2895  old_inode->i_ino);
2896  }
2897  goto end_rename;
2898  }
2899  if (!buffer_verified(dir_bh) &&
2900  !ext4_dirent_csum_verify(old_inode,
2901  (struct ext4_dir_entry *)dir_bh->b_data))
2902  goto end_rename;
2903  set_buffer_verified(dir_bh);
2904  if (le32_to_cpu(PARENT_INO(dir_bh->b_data,
2905  old_dir->i_sb->s_blocksize)) != old_dir->i_ino)
2906  goto end_rename;
2907  retval = -EMLINK;
2908  if (!new_inode && new_dir != old_dir &&
2909  EXT4_DIR_LINK_MAX(new_dir))
2910  goto end_rename;
2911  BUFFER_TRACE(dir_bh, "get_write_access");
2912  retval = ext4_journal_get_write_access(handle, dir_bh);
2913  if (retval)
2914  goto end_rename;
2915  }
2916  if (!new_bh) {
2917  retval = ext4_add_entry(handle, new_dentry, old_inode);
2918  if (retval)
2919  goto end_rename;
2920  } else {
2921  BUFFER_TRACE(new_bh, "get write access");
2922  retval = ext4_journal_get_write_access(handle, new_bh);
2923  if (retval)
2924  goto end_rename;
2925  new_de->inode = cpu_to_le32(old_inode->i_ino);
2926  if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
2928  new_de->file_type = old_de->file_type;
2929  new_dir->i_version++;
2930  new_dir->i_ctime = new_dir->i_mtime =
2931  ext4_current_time(new_dir);
2932  ext4_mark_inode_dirty(handle, new_dir);
2933  BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
2934  retval = ext4_handle_dirty_dirent_node(handle, new_dir, new_bh);
2935  if (unlikely(retval)) {
2936  ext4_std_error(new_dir->i_sb, retval);
2937  goto end_rename;
2938  }
2939  brelse(new_bh);
2940  new_bh = NULL;
2941  }
2942 
2943  /*
2944  * Like most other Unix systems, set the ctime for inodes on a
2945  * rename.
2946  */
2947  old_inode->i_ctime = ext4_current_time(old_inode);
2948  ext4_mark_inode_dirty(handle, old_inode);
2949 
2950  /*
2951  * ok, that's it
2952  */
2953  if (le32_to_cpu(old_de->inode) != old_inode->i_ino ||
2954  old_de->name_len != old_dentry->d_name.len ||
2955  strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) ||
2956  (retval = ext4_delete_entry(handle, old_dir,
2957  old_de, old_bh)) == -ENOENT) {
2958  /* old_de could have moved from under us during htree split, so
2959  * make sure that we are deleting the right entry. We might
2960  * also be pointing to a stale entry in the unused part of
2961  * old_bh so just checking inum and the name isn't enough. */
2962  struct buffer_head *old_bh2;
2963  struct ext4_dir_entry_2 *old_de2;
2964 
2965  old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2);
2966  if (old_bh2) {
2967  retval = ext4_delete_entry(handle, old_dir,
2968  old_de2, old_bh2);
2969  brelse(old_bh2);
2970  }
2971  }
2972  if (retval) {
2973  ext4_warning(old_dir->i_sb,
2974  "Deleting old file (%lu), %d, error=%d",
2975  old_dir->i_ino, old_dir->i_nlink, retval);
2976  }
2977 
2978  if (new_inode) {
2979  ext4_dec_count(handle, new_inode);
2980  new_inode->i_ctime = ext4_current_time(new_inode);
2981  }
2982  old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
2983  ext4_update_dx_flag(old_dir);
2984  if (dir_bh) {
2985  PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
2986  cpu_to_le32(new_dir->i_ino);
2987  BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
2988  if (is_dx(old_inode)) {
2989  retval = ext4_handle_dirty_dx_node(handle,
2990  old_inode,
2991  dir_bh);
2992  } else {
2993  retval = ext4_handle_dirty_dirent_node(handle,
2994  old_inode,
2995  dir_bh);
2996  }
2997  if (retval) {
2998  ext4_std_error(old_dir->i_sb, retval);
2999  goto end_rename;
3000  }
3001  ext4_dec_count(handle, old_dir);
3002  if (new_inode) {
3003  /* checked empty_dir above, can't have another parent,
3004  * ext4_dec_count() won't work for many-linked dirs */
3005  clear_nlink(new_inode);
3006  } else {
3007  ext4_inc_count(handle, new_dir);
3008  ext4_update_dx_flag(new_dir);
3009  ext4_mark_inode_dirty(handle, new_dir);
3010  }
3011  }
3012  ext4_mark_inode_dirty(handle, old_dir);
3013  if (new_inode) {
3014  ext4_mark_inode_dirty(handle, new_inode);
3015  if (!new_inode->i_nlink)
3016  ext4_orphan_add(handle, new_inode);
3017  if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC))
3018  force_da_alloc = 1;
3019  }
3020  retval = 0;
3021 
3022 end_rename:
3023  brelse(dir_bh);
3024  brelse(old_bh);
3025  brelse(new_bh);
3026  ext4_journal_stop(handle);
3027  if (retval == 0 && force_da_alloc)
3028  ext4_alloc_da_blocks(old_inode);
3029  return retval;
3030 }
3031 
3032 /*
3033  * directories can handle most operations...
3034  */
3036  .create = ext4_create,
3037  .lookup = ext4_lookup,
3038  .link = ext4_link,
3039  .unlink = ext4_unlink,
3040  .symlink = ext4_symlink,
3041  .mkdir = ext4_mkdir,
3042  .rmdir = ext4_rmdir,
3043  .mknod = ext4_mknod,
3044  .rename = ext4_rename,
3045  .setattr = ext4_setattr,
3046 #ifdef CONFIG_EXT4_FS_XATTR
3047  .setxattr = generic_setxattr,
3048  .getxattr = generic_getxattr,
3049  .listxattr = ext4_listxattr,
3050  .removexattr = generic_removexattr,
3051 #endif
3052  .get_acl = ext4_get_acl,
3053  .fiemap = ext4_fiemap,
3054 };
3055 
3057  .setattr = ext4_setattr,
3058 #ifdef CONFIG_EXT4_FS_XATTR
3059  .setxattr = generic_setxattr,
3060  .getxattr = generic_getxattr,
3061  .listxattr = ext4_listxattr,
3062  .removexattr = generic_removexattr,
3063 #endif
3064  .get_acl = ext4_get_acl,
3065 };