Linux Kernel  3.7.1
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
namei.c
Go to the documentation of this file.
1 /*
2  * linux/fs/ext3/namei.c
3  *
4  * Copyright (C) 1992, 1993, 1994, 1995
5  * Remy Card ([email protected])
6  * Laboratoire MASI - Institut Blaise Pascal
7  * Universite Pierre et Marie Curie (Paris VI)
8  *
9  * from
10  *
11  * linux/fs/minix/namei.c
12  *
13  * Copyright (C) 1991, 1992 Linus Torvalds
14  *
15  * Big-endian to little-endian byte-swapping/bitmaps by
16  * David S. Miller ([email protected]), 1995
17  * Directory entry file type support and forward compatibility hooks
18  * for B-tree directories by Theodore Ts'o ([email protected]), 1998
19  * Hash Tree Directory indexing (c)
20  * Daniel Phillips, 2001
21  * Hash Tree Directory indexing porting
22  * Christopher Li, 2002
23  * Hash Tree Directory indexing cleanup
24  * Theodore Ts'o, 2002
25  */
26 
27 #include <linux/quotaops.h>
28 #include "ext3.h"
29 #include "namei.h"
30 #include "xattr.h"
31 #include "acl.h"
32 
/*
 * Read-ahead sizing for directory searches.  NAMEI_RA_SIZE is the number
 * of buffer heads in the read-ahead array; NAMEI_RA_INDEX() flattens a
 * (chunk, block) pair into an index into such an array.
 */
#define NAMEI_RA_CHUNKS		2
#define NAMEI_RA_BLOCKS		4
#define NAMEI_RA_SIZE		(NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
#define NAMEI_RA_INDEX(c,b)	((b) + (NAMEI_RA_BLOCKS * (c)))
40 
41 static struct buffer_head *ext3_append(handle_t *handle,
42  struct inode *inode,
43  u32 *block, int *err)
44 {
45  struct buffer_head *bh;
46 
47  *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
48 
49  if ((bh = ext3_dir_bread(handle, inode, *block, 1, err))) {
50  inode->i_size += inode->i_sb->s_blocksize;
51  EXT3_I(inode)->i_disksize = inode->i_size;
52  *err = ext3_journal_get_write_access(handle, bh);
53  if (*err) {
54  brelse(bh);
55  bh = NULL;
56  }
57  }
58  return bh;
59 }
60 
/* Route assert() to the journal assertion unless it is already defined. */
#if !defined(assert)
#define assert(test) J_ASSERT(test)
#endif

/*
 * dxtrace(x) expands to x when DX_DEBUG is defined and to nothing
 * otherwise, so tracing statements vanish from normal builds.
 */
#if defined(DX_DEBUG)
#define dxtrace(command) command
#else
#define dxtrace(command)
#endif
70 
72 {
77 };
78 
80 {
83 };
84 
85 struct dx_entry
86 {
89 };
90 
91 /*
92  * dx_root_info is laid out so that if it should somehow get overlaid by a
93  * dirent the two low bits of the hash version will be zero. Therefore, the
94  * hash version mod 4 should never be 0. Sincerely, the paranoia department.
95  */
96 
97 struct dx_root
98 {
99  struct fake_dirent dot;
100  char dot_name[4];
102  char dotdot_name[4];
104  {
107  u8 info_length; /* 8 */
110  }
111  info;
112  struct dx_entry entries[0];
113 };
114 
115 struct dx_node
116 {
118  struct dx_entry entries[0];
119 };
120 
121 
/*
 * In-memory cursor for one level of the index tree while a lookup or a
 * directory walk is in progress.
 */
struct dx_frame
{
	struct buffer_head *bh;		/* buffer holding this level's index block */
	struct dx_entry *entries;	/* first entry slot in that block */
	struct dx_entry *at;		/* entry currently selected at this level */
};
128 
130 {
134 };
135 
136 static inline unsigned dx_get_block (struct dx_entry *entry);
137 static void dx_set_block (struct dx_entry *entry, unsigned value);
138 static inline unsigned dx_get_hash (struct dx_entry *entry);
139 static void dx_set_hash (struct dx_entry *entry, unsigned value);
140 static unsigned dx_get_count (struct dx_entry *entries);
141 static unsigned dx_get_limit (struct dx_entry *entries);
142 static void dx_set_count (struct dx_entry *entries, unsigned value);
143 static void dx_set_limit (struct dx_entry *entries, unsigned value);
144 static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
145 static unsigned dx_node_limit (struct inode *dir);
146 static struct dx_frame *dx_probe(struct qstr *entry,
147  struct inode *dir,
148  struct dx_hash_info *hinfo,
149  struct dx_frame *frame,
150  int *err);
151 static void dx_release (struct dx_frame *frames);
152 static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
153  struct dx_hash_info *hinfo, struct dx_map_entry map[]);
154 static void dx_sort_map(struct dx_map_entry *map, unsigned count);
155 static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
156  struct dx_map_entry *offsets, int count);
157 static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize);
158 static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
159 static int ext3_htree_next_block(struct inode *dir, __u32 hash,
160  struct dx_frame *frame,
161  struct dx_frame *frames,
162  __u32 *start_hash);
163 static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
164  struct qstr *entry, struct ext3_dir_entry_2 **res_dir,
165  int *err);
166 static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
167  struct inode *inode);
168 
169 /*
170  * p is at least 6 bytes before the end of page
171  */
172 static inline struct ext3_dir_entry_2 *
173 ext3_next_entry(struct ext3_dir_entry_2 *p)
174 {
175  return (struct ext3_dir_entry_2 *)((char *)p +
176  ext3_rec_len_from_disk(p->rec_len));
177 }
178 
179 /*
180  * Future: use high four bits of block for coalesce-on-delete flags
181  * Mask them off for now.
182  */
183 
184 static inline unsigned dx_get_block (struct dx_entry *entry)
185 {
186  return le32_to_cpu(entry->block) & 0x00ffffff;
187 }
188 
189 static inline void dx_set_block (struct dx_entry *entry, unsigned value)
190 {
191  entry->block = cpu_to_le32(value);
192 }
193 
194 static inline unsigned dx_get_hash (struct dx_entry *entry)
195 {
196  return le32_to_cpu(entry->hash);
197 }
198 
199 static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
200 {
201  entry->hash = cpu_to_le32(value);
202 }
203 
204 static inline unsigned dx_get_count (struct dx_entry *entries)
205 {
206  return le16_to_cpu(((struct dx_countlimit *) entries)->count);
207 }
208 
209 static inline unsigned dx_get_limit (struct dx_entry *entries)
210 {
211  return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
212 }
213 
214 static inline void dx_set_count (struct dx_entry *entries, unsigned value)
215 {
216  ((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
217 }
218 
219 static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
220 {
221  ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
222 }
223 
224 static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
225 {
226  unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) -
227  EXT3_DIR_REC_LEN(2) - infosize;
228  return entry_space / sizeof(struct dx_entry);
229 }
230 
231 static inline unsigned dx_node_limit (struct inode *dir)
232 {
233  unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0);
234  return entry_space / sizeof(struct dx_entry);
235 }
236 
237 /*
238  * Debug
239  */
240 #ifdef DX_DEBUG
241 static void dx_show_index (char * label, struct dx_entry *entries)
242 {
243  int i, n = dx_get_count (entries);
244  printk("%s index ", label);
245  for (i = 0; i < n; i++)
246  {
247  printk("%x->%u ", i? dx_get_hash(entries + i): 0, dx_get_block(entries + i));
248  }
249  printk("\n");
250 }
251 
/* Totals accumulated by the debug dump routines. */
struct stats
{
	unsigned names;		/* live directory entries seen */
	unsigned space;		/* sum of EXT3_DIR_REC_LEN() over those entries */
	unsigned bcount;	/* leaf blocks visited */
};
258 
259 static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de,
260  int size, int show_names)
261 {
262  unsigned names = 0, space = 0;
263  char *base = (char *) de;
264  struct dx_hash_info h = *hinfo;
265 
266  printk("names: ");
267  while ((char *) de < base + size)
268  {
269  if (de->inode)
270  {
271  if (show_names)
272  {
273  int len = de->name_len;
274  char *name = de->name;
275  while (len--) printk("%c", *name++);
276  ext3fs_dirhash(de->name, de->name_len, &h);
277  printk(":%x.%u ", h.hash,
278  (unsigned) ((char *) de - base));
279  }
280  space += EXT3_DIR_REC_LEN(de->name_len);
281  names++;
282  }
283  de = ext3_next_entry(de);
284  }
285  printk("(%i)\n", names);
286  return (struct stats) { names, space, 1 };
287 }
288 
289 struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
290  struct dx_entry *entries, int levels)
291 {
292  unsigned blocksize = dir->i_sb->s_blocksize;
293  unsigned count = dx_get_count (entries), names = 0, space = 0, i;
294  unsigned bcount = 0;
295  struct buffer_head *bh;
296  int err;
297  printk("%i indexed blocks...\n", count);
298  for (i = 0; i < count; i++, entries++)
299  {
300  u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0;
301  u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
302  struct stats stats;
303  printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range);
304  if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue;
305  stats = levels?
306  dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
307  dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0);
308  names += stats.names;
309  space += stats.space;
310  bcount += stats.bcount;
311  brelse (bh);
312  }
313  if (bcount)
314  printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ",
315  names, space/bcount,(space/bcount)*100/blocksize);
316  return (struct stats) { names, space, bcount};
317 }
318 #endif /* DX_DEBUG */
319 
320 /*
321  * Probe for a directory leaf block to search.
322  *
323  * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
324  * error in the directory index, and the caller should fall back to
325  * searching the directory normally. The callers of dx_probe **MUST**
326  * check for this error code, and make sure it never gets reflected
327  * back to userspace.
328  */
329 static struct dx_frame *
330 dx_probe(struct qstr *entry, struct inode *dir,
331  struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
332 {
333  unsigned count, indirect;
334  struct dx_entry *at, *entries, *p, *q, *m;
335  struct dx_root *root;
336  struct buffer_head *bh;
337  struct dx_frame *frame = frame_in;
338  u32 hash;
339 
340  frame->bh = NULL;
341  if (!(bh = ext3_dir_bread(NULL, dir, 0, 0, err))) {
342  *err = ERR_BAD_DX_DIR;
343  goto fail;
344  }
345  root = (struct dx_root *) bh->b_data;
346  if (root->info.hash_version != DX_HASH_TEA &&
347  root->info.hash_version != DX_HASH_HALF_MD4 &&
348  root->info.hash_version != DX_HASH_LEGACY) {
349  ext3_warning(dir->i_sb, __func__,
350  "Unrecognised inode hash code %d",
351  root->info.hash_version);
352  brelse(bh);
353  *err = ERR_BAD_DX_DIR;
354  goto fail;
355  }
356  hinfo->hash_version = root->info.hash_version;
357  if (hinfo->hash_version <= DX_HASH_TEA)
358  hinfo->hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
359  hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed;
360  if (entry)
361  ext3fs_dirhash(entry->name, entry->len, hinfo);
362  hash = hinfo->hash;
363 
364  if (root->info.unused_flags & 1) {
365  ext3_warning(dir->i_sb, __func__,
366  "Unimplemented inode hash flags: %#06x",
367  root->info.unused_flags);
368  brelse(bh);
369  *err = ERR_BAD_DX_DIR;
370  goto fail;
371  }
372 
373  if ((indirect = root->info.indirect_levels) > 1) {
374  ext3_warning(dir->i_sb, __func__,
375  "Unimplemented inode hash depth: %#06x",
376  root->info.indirect_levels);
377  brelse(bh);
378  *err = ERR_BAD_DX_DIR;
379  goto fail;
380  }
381 
382  entries = (struct dx_entry *) (((char *)&root->info) +
383  root->info.info_length);
384 
385  if (dx_get_limit(entries) != dx_root_limit(dir,
386  root->info.info_length)) {
387  ext3_warning(dir->i_sb, __func__,
388  "dx entry: limit != root limit");
389  brelse(bh);
390  *err = ERR_BAD_DX_DIR;
391  goto fail;
392  }
393 
394  dxtrace (printk("Look up %x", hash));
395  while (1)
396  {
397  count = dx_get_count(entries);
398  if (!count || count > dx_get_limit(entries)) {
399  ext3_warning(dir->i_sb, __func__,
400  "dx entry: no count or count > limit");
401  brelse(bh);
402  *err = ERR_BAD_DX_DIR;
403  goto fail2;
404  }
405 
406  p = entries + 1;
407  q = entries + count - 1;
408  while (p <= q)
409  {
410  m = p + (q - p)/2;
411  dxtrace(printk("."));
412  if (dx_get_hash(m) > hash)
413  q = m - 1;
414  else
415  p = m + 1;
416  }
417 
418  if (0) // linear search cross check
419  {
420  unsigned n = count - 1;
421  at = entries;
422  while (n--)
423  {
424  dxtrace(printk(","));
425  if (dx_get_hash(++at) > hash)
426  {
427  at--;
428  break;
429  }
430  }
431  assert (at == p - 1);
432  }
433 
434  at = p - 1;
435  dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
436  frame->bh = bh;
437  frame->entries = entries;
438  frame->at = at;
439  if (!indirect--) return frame;
440  if (!(bh = ext3_dir_bread(NULL, dir, dx_get_block(at), 0, err))) {
441  *err = ERR_BAD_DX_DIR;
442  goto fail2;
443  }
444  at = entries = ((struct dx_node *) bh->b_data)->entries;
445  if (dx_get_limit(entries) != dx_node_limit (dir)) {
446  ext3_warning(dir->i_sb, __func__,
447  "dx entry: limit != node limit");
448  brelse(bh);
449  *err = ERR_BAD_DX_DIR;
450  goto fail2;
451  }
452  frame++;
453  frame->bh = NULL;
454  }
455 fail2:
456  while (frame >= frame_in) {
457  brelse(frame->bh);
458  frame--;
459  }
460 fail:
461  if (*err == ERR_BAD_DX_DIR)
462  ext3_warning(dir->i_sb, __func__,
463  "Corrupt dir inode %ld, running e2fsck is "
464  "recommended.", dir->i_ino);
465  return NULL;
466 }
467 
468 static void dx_release (struct dx_frame *frames)
469 {
470  if (frames[0].bh == NULL)
471  return;
472 
473  if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
474  brelse(frames[1].bh);
475  brelse(frames[0].bh);
476 }
477 
478 /*
479  * This function increments the frame pointer to search the next leaf
480  * block, and reads in the necessary intervening nodes if the search
481  * should be necessary. Whether or not the search is necessary is
482  * controlled by the hash parameter. If the hash value is even, then
483  * the search is only continued if the next block starts with that
484  * hash value. This is used if we are searching for a specific file.
485  *
486  * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
487  *
488  * This function returns 1 if the caller should continue to search,
489  * or 0 if it should not. If there is an error reading one of the
490  * index blocks, it will a negative error code.
491  *
492  * If start_hash is non-null, it will be filled in with the starting
493  * hash of the next page.
494  */
495 static int ext3_htree_next_block(struct inode *dir, __u32 hash,
496  struct dx_frame *frame,
497  struct dx_frame *frames,
498  __u32 *start_hash)
499 {
500  struct dx_frame *p;
501  struct buffer_head *bh;
502  int err, num_frames = 0;
503  __u32 bhash;
504 
505  p = frame;
506  /*
507  * Find the next leaf page by incrementing the frame pointer.
508  * If we run out of entries in the interior node, loop around and
509  * increment pointer in the parent node. When we break out of
510  * this loop, num_frames indicates the number of interior
511  * nodes need to be read.
512  */
513  while (1) {
514  if (++(p->at) < p->entries + dx_get_count(p->entries))
515  break;
516  if (p == frames)
517  return 0;
518  num_frames++;
519  p--;
520  }
521 
522  /*
523  * If the hash is 1, then continue only if the next page has a
524  * continuation hash of any value. This is used for readdir
525  * handling. Otherwise, check to see if the hash matches the
526  * desired contiuation hash. If it doesn't, return since
527  * there's no point to read in the successive index pages.
528  */
529  bhash = dx_get_hash(p->at);
530  if (start_hash)
531  *start_hash = bhash;
532  if ((hash & 1) == 0) {
533  if ((bhash & ~1) != hash)
534  return 0;
535  }
536  /*
537  * If the hash is HASH_NB_ALWAYS, we always go to the next
538  * block so no check is necessary
539  */
540  while (num_frames--) {
541  if (!(bh = ext3_dir_bread(NULL, dir, dx_get_block(p->at),
542  0, &err)))
543  return err; /* Failure */
544  p++;
545  brelse (p->bh);
546  p->bh = bh;
547  p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
548  }
549  return 1;
550 }
551 
552 
553 /*
554  * This function fills a red-black tree with information from a
555  * directory block. It returns the number directory entries loaded
556  * into the tree. If there is an error it is returned in err.
557  */
558 static int htree_dirblock_to_tree(struct file *dir_file,
559  struct inode *dir, int block,
560  struct dx_hash_info *hinfo,
561  __u32 start_hash, __u32 start_minor_hash)
562 {
563  struct buffer_head *bh;
564  struct ext3_dir_entry_2 *de, *top;
565  int err = 0, count = 0;
566 
567  dxtrace(printk("In htree dirblock_to_tree: block %d\n", block));
568 
569  if (!(bh = ext3_dir_bread(NULL, dir, block, 0, &err)))
570  return err;
571 
572  de = (struct ext3_dir_entry_2 *) bh->b_data;
573  top = (struct ext3_dir_entry_2 *) ((char *) de +
574  dir->i_sb->s_blocksize -
575  EXT3_DIR_REC_LEN(0));
576  for (; de < top; de = ext3_next_entry(de)) {
577  if (!ext3_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
578  (block<<EXT3_BLOCK_SIZE_BITS(dir->i_sb))
579  +((char *)de - bh->b_data))) {
580  /* On error, skip the f_pos to the next block. */
581  dir_file->f_pos = (dir_file->f_pos |
582  (dir->i_sb->s_blocksize - 1)) + 1;
583  brelse (bh);
584  return count;
585  }
586  ext3fs_dirhash(de->name, de->name_len, hinfo);
587  if ((hinfo->hash < start_hash) ||
588  ((hinfo->hash == start_hash) &&
589  (hinfo->minor_hash < start_minor_hash)))
590  continue;
591  if (de->inode == 0)
592  continue;
593  if ((err = ext3_htree_store_dirent(dir_file,
594  hinfo->hash, hinfo->minor_hash, de)) != 0) {
595  brelse(bh);
596  return err;
597  }
598  count++;
599  }
600  brelse(bh);
601  return count;
602 }
603 
604 
605 /*
606  * This function fills a red-black tree with information from a
607  * directory. We start scanning the directory in hash order, starting
608  * at start_hash and start_minor_hash.
609  *
610  * This function returns the number of entries inserted into the tree,
611  * or a negative error code.
612  */
613 int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
614  __u32 start_minor_hash, __u32 *next_hash)
615 {
616  struct dx_hash_info hinfo;
617  struct ext3_dir_entry_2 *de;
618  struct dx_frame frames[2], *frame;
619  struct inode *dir;
620  int block, err;
621  int count = 0;
622  int ret;
623  __u32 hashval;
624 
625  dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
626  start_minor_hash));
627  dir = dir_file->f_path.dentry->d_inode;
628  if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) {
629  hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
630  if (hinfo.hash_version <= DX_HASH_TEA)
631  hinfo.hash_version +=
632  EXT3_SB(dir->i_sb)->s_hash_unsigned;
633  hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
634  count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
635  start_hash, start_minor_hash);
636  *next_hash = ~0;
637  return count;
638  }
639  hinfo.hash = start_hash;
640  hinfo.minor_hash = 0;
641  frame = dx_probe(NULL, dir_file->f_path.dentry->d_inode, &hinfo, frames, &err);
642  if (!frame)
643  return err;
644 
645  /* Add '.' and '..' from the htree header */
646  if (!start_hash && !start_minor_hash) {
647  de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data;
648  if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0)
649  goto errout;
650  count++;
651  }
652  if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
653  de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data;
654  de = ext3_next_entry(de);
655  if ((err = ext3_htree_store_dirent(dir_file, 2, 0, de)) != 0)
656  goto errout;
657  count++;
658  }
659 
660  while (1) {
661  block = dx_get_block(frame->at);
662  ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo,
663  start_hash, start_minor_hash);
664  if (ret < 0) {
665  err = ret;
666  goto errout;
667  }
668  count += ret;
669  hashval = ~0;
670  ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS,
671  frame, frames, &hashval);
672  *next_hash = hashval;
673  if (ret < 0) {
674  err = ret;
675  goto errout;
676  }
677  /*
678  * Stop if: (a) there are no more entries, or
679  * (b) we have inserted at least one entry and the
680  * next hash value is not a continuation
681  */
682  if ((ret == 0) ||
683  (count && ((hashval & 1) == 0)))
684  break;
685  }
686  dx_release(frames);
687  dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
688  count, *next_hash));
689  return count;
690 errout:
691  dx_release(frames);
692  return (err);
693 }
694 
695 
696 /*
697  * Directory block splitting, compacting
698  */
699 
700 /*
701  * Create map of hash values, offsets, and sizes, stored at end of block.
702  * Returns number of entries mapped.
703  */
704 static int dx_make_map(struct ext3_dir_entry_2 *de, unsigned blocksize,
705  struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
706 {
707  int count = 0;
708  char *base = (char *) de;
709  struct dx_hash_info h = *hinfo;
710 
711  while ((char *) de < base + blocksize)
712  {
713  if (de->name_len && de->inode) {
714  ext3fs_dirhash(de->name, de->name_len, &h);
715  map_tail--;
716  map_tail->hash = h.hash;
717  map_tail->offs = (u16) ((char *) de - base);
718  map_tail->size = le16_to_cpu(de->rec_len);
719  count++;
720  cond_resched();
721  }
722  /* XXX: do we need to check rec_len == 0 case? -Chris */
723  de = ext3_next_entry(de);
724  }
725  return count;
726 }
727 
728 /* Sort map by hash value */
729 static void dx_sort_map (struct dx_map_entry *map, unsigned count)
730 {
731  struct dx_map_entry *p, *q, *top = map + count - 1;
732  int more;
733  /* Combsort until bubble sort doesn't suck */
734  while (count > 2)
735  {
736  count = count*10/13;
737  if (count - 9 < 2) /* 9, 10 -> 11 */
738  count = 11;
739  for (p = top, q = p - count; q >= map; p--, q--)
740  if (p->hash < q->hash)
741  swap(*p, *q);
742  }
743  /* Garden variety bubble sort */
744  do {
745  more = 0;
746  q = top;
747  while (q-- > map)
748  {
749  if (q[1].hash >= q[0].hash)
750  continue;
751  swap(*(q+1), *q);
752  more = 1;
753  }
754  } while(more);
755 }
756 
757 static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
758 {
759  struct dx_entry *entries = frame->entries;
760  struct dx_entry *old = frame->at, *new = old + 1;
761  int count = dx_get_count(entries);
762 
763  assert(count < dx_get_limit(entries));
764  assert(old < entries + count);
765  memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
766  dx_set_hash(new, hash);
767  dx_set_block(new, block);
768  dx_set_count(entries, count + 1);
769 }
770 
771 static void ext3_update_dx_flag(struct inode *inode)
772 {
773  if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
775  EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
776 }
777 
778 /*
779  * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure.
780  *
781  * `len <= EXT3_NAME_LEN' is guaranteed by caller.
782  * `de != NULL' is guaranteed by caller.
783  */
784 static inline int ext3_match (int len, const char * const name,
785  struct ext3_dir_entry_2 * de)
786 {
787  if (len != de->name_len)
788  return 0;
789  if (!de->inode)
790  return 0;
791  return !memcmp(name, de->name, len);
792 }
793 
794 /*
795  * Returns 0 if not found, -1 on failure, and 1 on success
796  */
797 static inline int search_dirblock(struct buffer_head * bh,
798  struct inode *dir,
799  struct qstr *child,
800  unsigned long offset,
801  struct ext3_dir_entry_2 ** res_dir)
802 {
803  struct ext3_dir_entry_2 * de;
804  char * dlimit;
805  int de_len;
806  const char *name = child->name;
807  int namelen = child->len;
808 
809  de = (struct ext3_dir_entry_2 *) bh->b_data;
810  dlimit = bh->b_data + dir->i_sb->s_blocksize;
811  while ((char *) de < dlimit) {
812  /* this code is executed quadratically often */
813  /* do minimal checking `by hand' */
814 
815  if ((char *) de + namelen <= dlimit &&
816  ext3_match (namelen, name, de)) {
817  /* found a match - just to be sure, do a full check */
818  if (!ext3_check_dir_entry("ext3_find_entry",
819  dir, de, bh, offset))
820  return -1;
821  *res_dir = de;
822  return 1;
823  }
824  /* prevent looping on a bad block */
825  de_len = ext3_rec_len_from_disk(de->rec_len);
826  if (de_len <= 0)
827  return -1;
828  offset += de_len;
829  de = (struct ext3_dir_entry_2 *) ((char *) de + de_len);
830  }
831  return 0;
832 }
833 
834 
835 /*
836  * ext3_find_entry()
837  *
838  * finds an entry in the specified directory with the wanted name. It
839  * returns the cache buffer in which the entry was found, and the entry
840  * itself (as a parameter - res_dir). It does NOT read the inode of the
841  * entry - you'll have to do that yourself if you want to.
842  *
843  * The returned buffer_head has ->b_count elevated. The caller is expected
844  * to brelse() it when appropriate.
845  */
846 static struct buffer_head *ext3_find_entry(struct inode *dir,
847  struct qstr *entry,
848  struct ext3_dir_entry_2 **res_dir)
849 {
850  struct super_block * sb;
851  struct buffer_head * bh_use[NAMEI_RA_SIZE];
852  struct buffer_head * bh, *ret = NULL;
853  unsigned long start, block, b;
854  const u8 *name = entry->name;
855  int ra_max = 0; /* Number of bh's in the readahead
856  buffer, bh_use[] */
857  int ra_ptr = 0; /* Current index into readahead
858  buffer */
859  int num = 0;
860  int nblocks, i, err;
861  int namelen;
862 
863  *res_dir = NULL;
864  sb = dir->i_sb;
865  namelen = entry->len;
866  if (namelen > EXT3_NAME_LEN)
867  return NULL;
868  if ((namelen <= 2) && (name[0] == '.') &&
869  (name[1] == '.' || name[1] == 0)) {
870  /*
871  * "." or ".." will only be in the first block
872  * NFS may look up ".."; "." should be handled by the VFS
873  */
874  block = start = 0;
875  nblocks = 1;
876  goto restart;
877  }
878  if (is_dx(dir)) {
879  bh = ext3_dx_find_entry(dir, entry, res_dir, &err);
880  /*
881  * On success, or if the error was file not found,
882  * return. Otherwise, fall back to doing a search the
883  * old fashioned way.
884  */
885  if (bh || (err != ERR_BAD_DX_DIR))
886  return bh;
887  dxtrace(printk("ext3_find_entry: dx failed, falling back\n"));
888  }
889  nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
890  start = EXT3_I(dir)->i_dir_start_lookup;
891  if (start >= nblocks)
892  start = 0;
893  block = start;
894 restart:
895  do {
896  /*
897  * We deal with the read-ahead logic here.
898  */
899  if (ra_ptr >= ra_max) {
900  /* Refill the readahead buffer */
901  ra_ptr = 0;
902  b = block;
903  for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
904  /*
905  * Terminate if we reach the end of the
906  * directory and must wrap, or if our
907  * search has finished at this block.
908  */
909  if (b >= nblocks || (num && block == start)) {
910  bh_use[ra_max] = NULL;
911  break;
912  }
913  num++;
914  bh = ext3_getblk(NULL, dir, b++, 0, &err);
915  bh_use[ra_max] = bh;
916  if (bh && !bh_uptodate_or_lock(bh)) {
917  get_bh(bh);
918  bh->b_end_io = end_buffer_read_sync;
920  bh);
921  }
922  }
923  }
924  if ((bh = bh_use[ra_ptr++]) == NULL)
925  goto next;
926  wait_on_buffer(bh);
927  if (!buffer_uptodate(bh)) {
928  /* read error, skip block & hope for the best */
929  ext3_error(sb, __func__, "reading directory #%lu "
930  "offset %lu", dir->i_ino, block);
931  brelse(bh);
932  goto next;
933  }
934  i = search_dirblock(bh, dir, entry,
935  block << EXT3_BLOCK_SIZE_BITS(sb), res_dir);
936  if (i == 1) {
937  EXT3_I(dir)->i_dir_start_lookup = block;
938  ret = bh;
939  goto cleanup_and_exit;
940  } else {
941  brelse(bh);
942  if (i < 0)
943  goto cleanup_and_exit;
944  }
945  next:
946  if (++block >= nblocks)
947  block = 0;
948  } while (block != start);
949 
950  /*
951  * If the directory has grown while we were searching, then
952  * search the last part of the directory before giving up.
953  */
954  block = nblocks;
955  nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
956  if (block < nblocks) {
957  start = 0;
958  goto restart;
959  }
960 
961 cleanup_and_exit:
962  /* Clean up the read-ahead blocks */
963  for (; ra_ptr < ra_max; ra_ptr++)
964  brelse (bh_use[ra_ptr]);
965  return ret;
966 }
967 
968 static struct buffer_head * ext3_dx_find_entry(struct inode *dir,
969  struct qstr *entry, struct ext3_dir_entry_2 **res_dir,
970  int *err)
971 {
972  struct super_block *sb = dir->i_sb;
973  struct dx_hash_info hinfo;
974  struct dx_frame frames[2], *frame;
975  struct buffer_head *bh;
976  unsigned long block;
977  int retval;
978 
979  if (!(frame = dx_probe(entry, dir, &hinfo, frames, err)))
980  return NULL;
981  do {
982  block = dx_get_block(frame->at);
983  if (!(bh = ext3_dir_bread (NULL, dir, block, 0, err)))
984  goto errout;
985 
986  retval = search_dirblock(bh, dir, entry,
987  block << EXT3_BLOCK_SIZE_BITS(sb),
988  res_dir);
989  if (retval == 1) {
990  dx_release(frames);
991  return bh;
992  }
993  brelse(bh);
994  if (retval == -1) {
995  *err = ERR_BAD_DX_DIR;
996  goto errout;
997  }
998 
999  /* Check to see if we should continue to search */
1000  retval = ext3_htree_next_block(dir, hinfo.hash, frame,
1001  frames, NULL);
1002  if (retval < 0) {
1003  ext3_warning(sb, __func__,
1004  "error reading index page in directory #%lu",
1005  dir->i_ino);
1006  *err = retval;
1007  goto errout;
1008  }
1009  } while (retval == 1);
1010 
1011  *err = -ENOENT;
1012 errout:
1013  dxtrace(printk("%s not found\n", entry->name));
1014  dx_release (frames);
1015  return NULL;
1016 }
1017 
1018 static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, unsigned int flags)
1019 {
1020  struct inode * inode;
1021  struct ext3_dir_entry_2 * de;
1022  struct buffer_head * bh;
1023 
1024  if (dentry->d_name.len > EXT3_NAME_LEN)
1025  return ERR_PTR(-ENAMETOOLONG);
1026 
1027  bh = ext3_find_entry(dir, &dentry->d_name, &de);
1028  inode = NULL;
1029  if (bh) {
1030  unsigned long ino = le32_to_cpu(de->inode);
1031  brelse (bh);
1032  if (!ext3_valid_inum(dir->i_sb, ino)) {
1033  ext3_error(dir->i_sb, "ext3_lookup",
1034  "bad inode number: %lu", ino);
1035  return ERR_PTR(-EIO);
1036  }
1037  inode = ext3_iget(dir->i_sb, ino);
1038  if (inode == ERR_PTR(-ESTALE)) {
1039  ext3_error(dir->i_sb, __func__,
1040  "deleted inode referenced: %lu",
1041  ino);
1042  return ERR_PTR(-EIO);
1043  }
1044  }
1045  return d_splice_alias(inode, dentry);
1046 }
1047 
1048 
1049 struct dentry *ext3_get_parent(struct dentry *child)
1050 {
1051  unsigned long ino;
1052  struct qstr dotdot = QSTR_INIT("..", 2);
1053  struct ext3_dir_entry_2 * de;
1054  struct buffer_head *bh;
1055 
1056  bh = ext3_find_entry(child->d_inode, &dotdot, &de);
1057  if (!bh)
1058  return ERR_PTR(-ENOENT);
1059  ino = le32_to_cpu(de->inode);
1060  brelse(bh);
1061 
1062  if (!ext3_valid_inum(child->d_inode->i_sb, ino)) {
1063  ext3_error(child->d_inode->i_sb, "ext3_get_parent",
1064  "bad inode number: %lu", ino);
1065  return ERR_PTR(-EIO);
1066  }
1067 
1068  return d_obtain_alias(ext3_iget(child->d_inode->i_sb, ino));
1069 }
1070 
1071 #define S_SHIFT 12
1072 static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = {
1074  [S_IFDIR >> S_SHIFT] = EXT3_FT_DIR,
1077  [S_IFIFO >> S_SHIFT] = EXT3_FT_FIFO,
1078  [S_IFSOCK >> S_SHIFT] = EXT3_FT_SOCK,
1080 };
1081 
1082 static inline void ext3_set_de_type(struct super_block *sb,
1083  struct ext3_dir_entry_2 *de,
1084  umode_t mode) {
1086  de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
1087 }
1088 
1089 /*
1090  * Move count entries from end of map between two memory locations.
1091  * Returns pointer to last entry moved.
1092  */
1093 static struct ext3_dir_entry_2 *
1094 dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
1095 {
1096  unsigned rec_len = 0;
1097 
1098  while (count--) {
1099  struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs);
1100  rec_len = EXT3_DIR_REC_LEN(de->name_len);
1101  memcpy (to, de, rec_len);
1102  ((struct ext3_dir_entry_2 *) to)->rec_len =
1103  ext3_rec_len_to_disk(rec_len);
1104  de->inode = 0;
1105  map++;
1106  to += rec_len;
1107  }
1108  return (struct ext3_dir_entry_2 *) (to - rec_len);
1109 }
1110 
1111 /*
1112  * Compact each dir entry in the range to the minimal rec_len.
1113  * Returns pointer to last entry in range.
1114  */
1115 static struct ext3_dir_entry_2 *dx_pack_dirents(char *base, unsigned blocksize)
1116 {
1117  struct ext3_dir_entry_2 *next, *to, *prev;
1118  struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *)base;
1119  unsigned rec_len = 0;
1120 
1121  prev = to = de;
1122  while ((char *)de < base + blocksize) {
1123  next = ext3_next_entry(de);
1124  if (de->inode && de->name_len) {
1125  rec_len = EXT3_DIR_REC_LEN(de->name_len);
1126  if (de > to)
1127  memmove(to, de, rec_len);
1128  to->rec_len = ext3_rec_len_to_disk(rec_len);
1129  prev = to;
1130  to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len);
1131  }
1132  de = next;
1133  }
1134  return prev;
1135 }
1136 
1137 /*
1138  * Split a full leaf block to make room for a new dir entry.
1139  * Allocate a new block, and move entries so that they are approx. equally full.
1140  * Returns pointer to de in block into which the new entry will be inserted.
1141  */
1142 static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1143  struct buffer_head **bh,struct dx_frame *frame,
1144  struct dx_hash_info *hinfo, int *error)
1145 {
1146  unsigned blocksize = dir->i_sb->s_blocksize;
1147  unsigned count, continued;
1148  struct buffer_head *bh2;
1149  u32 newblock;
1150  u32 hash2;
1151  struct dx_map_entry *map;
1152  char *data1 = (*bh)->b_data, *data2;
1153  unsigned split, move, size;
1154  struct ext3_dir_entry_2 *de = NULL, *de2;
1155  int err = 0, i;
1156 
1157  bh2 = ext3_append (handle, dir, &newblock, &err);
1158  if (!(bh2)) {
1159  brelse(*bh);
1160  *bh = NULL;
1161  goto errout;
1162  }
1163 
1164  BUFFER_TRACE(*bh, "get_write_access");
1165  err = ext3_journal_get_write_access(handle, *bh);
1166  if (err)
1167  goto journal_error;
1168 
1169  BUFFER_TRACE(frame->bh, "get_write_access");
1170  err = ext3_journal_get_write_access(handle, frame->bh);
1171  if (err)
1172  goto journal_error;
1173 
1174  data2 = bh2->b_data;
1175 
1176  /* create map in the end of data2 block */
1177  map = (struct dx_map_entry *) (data2 + blocksize);
1178  count = dx_make_map ((struct ext3_dir_entry_2 *) data1,
1179  blocksize, hinfo, map);
1180  map -= count;
1181  dx_sort_map (map, count);
1182  /* Split the existing block in the middle, size-wise */
1183  size = 0;
1184  move = 0;
1185  for (i = count-1; i >= 0; i--) {
1186  /* is more than half of this entry in 2nd half of the block? */
1187  if (size + map[i].size/2 > blocksize/2)
1188  break;
1189  size += map[i].size;
1190  move++;
1191  }
1192  /* map index at which we will split */
1193  split = count - move;
1194  hash2 = map[split].hash;
1195  continued = hash2 == map[split - 1].hash;
1196  dxtrace(printk("Split block %i at %x, %i/%i\n",
1197  dx_get_block(frame->at), hash2, split, count-split));
1198 
1199  /* Fancy dance to stay within two buffers */
1200  de2 = dx_move_dirents(data1, data2, map + split, count - split);
1201  de = dx_pack_dirents(data1,blocksize);
1202  de->rec_len = ext3_rec_len_to_disk(data1 + blocksize - (char *) de);
1203  de2->rec_len = ext3_rec_len_to_disk(data2 + blocksize - (char *) de2);
1204  dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1));
1205  dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1));
1206 
1207  /* Which block gets the new entry? */
1208  if (hinfo->hash >= hash2)
1209  {
1210  swap(*bh, bh2);
1211  de = de2;
1212  }
1213  dx_insert_block (frame, hash2 + continued, newblock);
1214  err = ext3_journal_dirty_metadata (handle, bh2);
1215  if (err)
1216  goto journal_error;
1217  err = ext3_journal_dirty_metadata (handle, frame->bh);
1218  if (err)
1219  goto journal_error;
1220  brelse (bh2);
1221  dxtrace(dx_show_index ("frame", frame->entries));
1222  return de;
1223 
1224 journal_error:
1225  brelse(*bh);
1226  brelse(bh2);
1227  *bh = NULL;
1228  ext3_std_error(dir->i_sb, err);
1229 errout:
1230  *error = err;
1231  return NULL;
1232 }
1233 
1234 
1235 /*
1236  * Add a new entry into a directory (leaf) block. If de is non-NULL,
1237  * it points to a directory entry which is guaranteed to be large
1238  * enough for new directory entry. If de is NULL, then
1239  * add_dirent_to_buf will attempt search the directory block for
1240  * space. It will return -ENOSPC if no space is available, and -EIO
1241  * and -EEXIST if directory entry already exists.
1242  *
1243  * NOTE! bh is NOT released in the case where ENOSPC is returned. In
1244  * all other cases bh is released.
1245  */
1246 static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1247  struct inode *inode, struct ext3_dir_entry_2 *de,
1248  struct buffer_head * bh)
1249 {
1250  struct inode *dir = dentry->d_parent->d_inode;
1251  const char *name = dentry->d_name.name;
1252  int namelen = dentry->d_name.len;
1253  unsigned long offset = 0;
1254  unsigned short reclen;
1255  int nlen, rlen, err;
1256  char *top;
1257 
1258  reclen = EXT3_DIR_REC_LEN(namelen);
1259  if (!de) {
1260  de = (struct ext3_dir_entry_2 *)bh->b_data;
1261  top = bh->b_data + dir->i_sb->s_blocksize - reclen;
1262  while ((char *) de <= top) {
1263  if (!ext3_check_dir_entry("ext3_add_entry", dir, de,
1264  bh, offset)) {
1265  brelse (bh);
1266  return -EIO;
1267  }
1268  if (ext3_match (namelen, name, de)) {
1269  brelse (bh);
1270  return -EEXIST;
1271  }
1272  nlen = EXT3_DIR_REC_LEN(de->name_len);
1273  rlen = ext3_rec_len_from_disk(de->rec_len);
1274  if ((de->inode? rlen - nlen: rlen) >= reclen)
1275  break;
1276  de = (struct ext3_dir_entry_2 *)((char *)de + rlen);
1277  offset += rlen;
1278  }
1279  if ((char *) de > top)
1280  return -ENOSPC;
1281  }
1282  BUFFER_TRACE(bh, "get_write_access");
1283  err = ext3_journal_get_write_access(handle, bh);
1284  if (err) {
1285  ext3_std_error(dir->i_sb, err);
1286  brelse(bh);
1287  return err;
1288  }
1289 
1290  /* By now the buffer is marked for journaling */
1291  nlen = EXT3_DIR_REC_LEN(de->name_len);
1292  rlen = ext3_rec_len_from_disk(de->rec_len);
1293  if (de->inode) {
1294  struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
1295  de1->rec_len = ext3_rec_len_to_disk(rlen - nlen);
1296  de->rec_len = ext3_rec_len_to_disk(nlen);
1297  de = de1;
1298  }
1299  de->file_type = EXT3_FT_UNKNOWN;
1300  if (inode) {
1301  de->inode = cpu_to_le32(inode->i_ino);
1302  ext3_set_de_type(dir->i_sb, de, inode->i_mode);
1303  } else
1304  de->inode = 0;
1305  de->name_len = namelen;
1306  memcpy (de->name, name, namelen);
1307  /*
1308  * XXX shouldn't update any times until successful
1309  * completion of syscall, but too many callers depend
1310  * on this.
1311  *
1312  * XXX similarly, too many callers depend on
1313  * ext3_new_inode() setting the times, but error
1314  * recovery deletes the inode, so the worst that can
1315  * happen is that the times are slightly out of date
1316  * and/or different from the directory change time.
1317  */
1318  dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
1319  ext3_update_dx_flag(dir);
1320  dir->i_version++;
1321  ext3_mark_inode_dirty(handle, dir);
1322  BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1323  err = ext3_journal_dirty_metadata(handle, bh);
1324  if (err)
1325  ext3_std_error(dir->i_sb, err);
1326  brelse(bh);
1327  return 0;
1328 }
1329 
1330 /*
1331  * This converts a one block unindexed directory to a 3 block indexed
1332  * directory, and adds the dentry to the indexed directory.
1333  */
1334 static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1335  struct inode *inode, struct buffer_head *bh)
1336 {
1337  struct inode *dir = dentry->d_parent->d_inode;
1338  const char *name = dentry->d_name.name;
1339  int namelen = dentry->d_name.len;
1340  struct buffer_head *bh2;
1341  struct dx_root *root;
1342  struct dx_frame frames[2], *frame;
1343  struct dx_entry *entries;
1344  struct ext3_dir_entry_2 *de, *de2;
1345  char *data1, *top;
1346  unsigned len;
1347  int retval;
1348  unsigned blocksize;
1349  struct dx_hash_info hinfo;
1350  u32 block;
1351  struct fake_dirent *fde;
1352 
1353  blocksize = dir->i_sb->s_blocksize;
1354  dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
1355  retval = ext3_journal_get_write_access(handle, bh);
1356  if (retval) {
1357  ext3_std_error(dir->i_sb, retval);
1358  brelse(bh);
1359  return retval;
1360  }
1361  root = (struct dx_root *) bh->b_data;
1362 
1363  /* The 0th block becomes the root, move the dirents out */
1364  fde = &root->dotdot;
1365  de = (struct ext3_dir_entry_2 *)((char *)fde +
1366  ext3_rec_len_from_disk(fde->rec_len));
1367  if ((char *) de >= (((char *) root) + blocksize)) {
1368  ext3_error(dir->i_sb, __func__,
1369  "invalid rec_len for '..' in inode %lu",
1370  dir->i_ino);
1371  brelse(bh);
1372  return -EIO;
1373  }
1374  len = ((char *) root) + blocksize - (char *) de;
1375 
1376  bh2 = ext3_append (handle, dir, &block, &retval);
1377  if (!(bh2)) {
1378  brelse(bh);
1379  return retval;
1380  }
1381  EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
1382  data1 = bh2->b_data;
1383 
1384  memcpy (data1, de, len);
1385  de = (struct ext3_dir_entry_2 *) data1;
1386  top = data1 + len;
1387  while ((char *)(de2 = ext3_next_entry(de)) < top)
1388  de = de2;
1389  de->rec_len = ext3_rec_len_to_disk(data1 + blocksize - (char *) de);
1390  /* Initialize the root; the dot dirents already exist */
1391  de = (struct ext3_dir_entry_2 *) (&root->dotdot);
1392  de->rec_len = ext3_rec_len_to_disk(blocksize - EXT3_DIR_REC_LEN(2));
1393  memset (&root->info, 0, sizeof(root->info));
1394  root->info.info_length = sizeof(root->info);
1395  root->info.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version;
1396  entries = root->entries;
1397  dx_set_block (entries, 1);
1398  dx_set_count (entries, 1);
1399  dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
1400 
1401  /* Initialize as for dx_probe */
1402  hinfo.hash_version = root->info.hash_version;
1403  if (hinfo.hash_version <= DX_HASH_TEA)
1404  hinfo.hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned;
1405  hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed;
1406  ext3fs_dirhash(name, namelen, &hinfo);
1407  frame = frames;
1408  frame->entries = entries;
1409  frame->at = entries;
1410  frame->bh = bh;
1411  bh = bh2;
1412  /*
1413  * Mark buffers dirty here so that if do_split() fails we write a
1414  * consistent set of buffers to disk.
1415  */
1416  ext3_journal_dirty_metadata(handle, frame->bh);
1417  ext3_journal_dirty_metadata(handle, bh);
1418  de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
1419  if (!de) {
1420  ext3_mark_inode_dirty(handle, dir);
1421  dx_release(frames);
1422  return retval;
1423  }
1424  dx_release(frames);
1425 
1426  return add_dirent_to_buf(handle, dentry, inode, de, bh);
1427 }
1428 
1429 /*
1430  * ext3_add_entry()
1431  *
1432  * adds a file entry to the specified directory, using the same
1433  * semantics as ext3_find_entry(). It returns NULL if it failed.
1434  *
1435  * NOTE!! The inode part of 'de' is left at 0 - which means you
1436  * may not sleep between calling this and putting something into
1437  * the entry, as someone else might have used it while you slept.
1438  */
1439 static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
1440  struct inode *inode)
1441 {
1442  struct inode *dir = dentry->d_parent->d_inode;
1443  struct buffer_head * bh;
1444  struct ext3_dir_entry_2 *de;
1445  struct super_block * sb;
1446  int retval;
1447  int dx_fallback=0;
1448  unsigned blocksize;
1449  u32 block, blocks;
1450 
1451  sb = dir->i_sb;
1452  blocksize = sb->s_blocksize;
1453  if (!dentry->d_name.len)
1454  return -EINVAL;
1455  if (is_dx(dir)) {
1456  retval = ext3_dx_add_entry(handle, dentry, inode);
1457  if (!retval || (retval != ERR_BAD_DX_DIR))
1458  return retval;
1459  EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL;
1460  dx_fallback++;
1461  ext3_mark_inode_dirty(handle, dir);
1462  }
1463  blocks = dir->i_size >> sb->s_blocksize_bits;
1464  for (block = 0; block < blocks; block++) {
1465  if (!(bh = ext3_dir_bread(handle, dir, block, 0, &retval)))
1466  return retval;
1467 
1468  retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1469  if (retval != -ENOSPC)
1470  return retval;
1471 
1472  if (blocks == 1 && !dx_fallback &&
1474  return make_indexed_dir(handle, dentry, inode, bh);
1475  brelse(bh);
1476  }
1477  bh = ext3_append(handle, dir, &block, &retval);
1478  if (!bh)
1479  return retval;
1480  de = (struct ext3_dir_entry_2 *) bh->b_data;
1481  de->inode = 0;
1482  de->rec_len = ext3_rec_len_to_disk(blocksize);
1483  return add_dirent_to_buf(handle, dentry, inode, de, bh);
1484 }
1485 
1486 /*
1487  * Returns 0 for success, or a negative error value
1488  */
1489 static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
1490  struct inode *inode)
1491 {
1492  struct dx_frame frames[2], *frame;
1493  struct dx_entry *entries, *at;
1494  struct dx_hash_info hinfo;
1495  struct buffer_head * bh;
1496  struct inode *dir = dentry->d_parent->d_inode;
1497  struct super_block * sb = dir->i_sb;
1498  struct ext3_dir_entry_2 *de;
1499  int err;
1500 
1501  frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
1502  if (!frame)
1503  return err;
1504  entries = frame->entries;
1505  at = frame->at;
1506 
1507  if (!(bh = ext3_dir_bread(handle, dir, dx_get_block(frame->at), 0, &err)))
1508  goto cleanup;
1509 
1510  BUFFER_TRACE(bh, "get_write_access");
1511  err = ext3_journal_get_write_access(handle, bh);
1512  if (err)
1513  goto journal_error;
1514 
1515  err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1516  if (err != -ENOSPC) {
1517  bh = NULL;
1518  goto cleanup;
1519  }
1520 
1521  /* Block full, should compress but for now just split */
1522  dxtrace(printk("using %u of %u node entries\n",
1523  dx_get_count(entries), dx_get_limit(entries)));
1524  /* Need to split index? */
1525  if (dx_get_count(entries) == dx_get_limit(entries)) {
1526  u32 newblock;
1527  unsigned icount = dx_get_count(entries);
1528  int levels = frame - frames;
1529  struct dx_entry *entries2;
1530  struct dx_node *node2;
1531  struct buffer_head *bh2;
1532 
1533  if (levels && (dx_get_count(frames->entries) ==
1534  dx_get_limit(frames->entries))) {
1535  ext3_warning(sb, __func__,
1536  "Directory index full!");
1537  err = -ENOSPC;
1538  goto cleanup;
1539  }
1540  bh2 = ext3_append (handle, dir, &newblock, &err);
1541  if (!(bh2))
1542  goto cleanup;
1543  node2 = (struct dx_node *)(bh2->b_data);
1544  entries2 = node2->entries;
1545  memset(&node2->fake, 0, sizeof(struct fake_dirent));
1546  node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize);
1547  BUFFER_TRACE(frame->bh, "get_write_access");
1548  err = ext3_journal_get_write_access(handle, frame->bh);
1549  if (err)
1550  goto journal_error;
1551  if (levels) {
1552  unsigned icount1 = icount/2, icount2 = icount - icount1;
1553  unsigned hash2 = dx_get_hash(entries + icount1);
1554  dxtrace(printk("Split index %i/%i\n", icount1, icount2));
1555 
1556  BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
1557  err = ext3_journal_get_write_access(handle,
1558  frames[0].bh);
1559  if (err)
1560  goto journal_error;
1561 
1562  memcpy ((char *) entries2, (char *) (entries + icount1),
1563  icount2 * sizeof(struct dx_entry));
1564  dx_set_count (entries, icount1);
1565  dx_set_count (entries2, icount2);
1566  dx_set_limit (entries2, dx_node_limit(dir));
1567 
1568  /* Which index block gets the new entry? */
1569  if (at - entries >= icount1) {
1570  frame->at = at = at - entries - icount1 + entries2;
1571  frame->entries = entries = entries2;
1572  swap(frame->bh, bh2);
1573  }
1574  dx_insert_block (frames + 0, hash2, newblock);
1575  dxtrace(dx_show_index ("node", frames[1].entries));
1576  dxtrace(dx_show_index ("node",
1577  ((struct dx_node *) bh2->b_data)->entries));
1578  err = ext3_journal_dirty_metadata(handle, bh2);
1579  if (err)
1580  goto journal_error;
1581  brelse (bh2);
1582  } else {
1583  dxtrace(printk("Creating second level index...\n"));
1584  memcpy((char *) entries2, (char *) entries,
1585  icount * sizeof(struct dx_entry));
1586  dx_set_limit(entries2, dx_node_limit(dir));
1587 
1588  /* Set up root */
1589  dx_set_count(entries, 1);
1590  dx_set_block(entries + 0, newblock);
1591  ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
1592 
1593  /* Add new access path frame */
1594  frame = frames + 1;
1595  frame->at = at = at - entries + entries2;
1596  frame->entries = entries = entries2;
1597  frame->bh = bh2;
1598  err = ext3_journal_get_write_access(handle,
1599  frame->bh);
1600  if (err)
1601  goto journal_error;
1602  }
1603  err = ext3_journal_dirty_metadata(handle, frames[0].bh);
1604  if (err)
1605  goto journal_error;
1606  }
1607  de = do_split(handle, dir, &bh, frame, &hinfo, &err);
1608  if (!de)
1609  goto cleanup;
1610  err = add_dirent_to_buf(handle, dentry, inode, de, bh);
1611  bh = NULL;
1612  goto cleanup;
1613 
1614 journal_error:
1615  ext3_std_error(dir->i_sb, err);
1616 cleanup:
1617  if (bh)
1618  brelse(bh);
1619  dx_release(frames);
1620  return err;
1621 }
1622 
1623 /*
1624  * ext3_delete_entry deletes a directory entry by merging it with the
1625  * previous entry
1626  */
1627 static int ext3_delete_entry (handle_t *handle,
1628  struct inode * dir,
1629  struct ext3_dir_entry_2 * de_del,
1630  struct buffer_head * bh)
1631 {
1632  struct ext3_dir_entry_2 * de, * pde;
1633  int i;
1634 
1635  i = 0;
1636  pde = NULL;
1637  de = (struct ext3_dir_entry_2 *) bh->b_data;
1638  while (i < bh->b_size) {
1639  if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i))
1640  return -EIO;
1641  if (de == de_del) {
1642  int err;
1643 
1644  BUFFER_TRACE(bh, "get_write_access");
1645  err = ext3_journal_get_write_access(handle, bh);
1646  if (err)
1647  goto journal_error;
1648 
1649  if (pde)
1650  pde->rec_len = ext3_rec_len_to_disk(
1651  ext3_rec_len_from_disk(pde->rec_len) +
1652  ext3_rec_len_from_disk(de->rec_len));
1653  else
1654  de->inode = 0;
1655  dir->i_version++;
1656  BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1657  err = ext3_journal_dirty_metadata(handle, bh);
1658  if (err) {
1659 journal_error:
1660  ext3_std_error(dir->i_sb, err);
1661  return err;
1662  }
1663  return 0;
1664  }
1665  i += ext3_rec_len_from_disk(de->rec_len);
1666  pde = de;
1667  de = ext3_next_entry(de);
1668  }
1669  return -ENOENT;
1670 }
1671 
1672 static int ext3_add_nondir(handle_t *handle,
1673  struct dentry *dentry, struct inode *inode)
1674 {
1675  int err = ext3_add_entry(handle, dentry, inode);
1676  if (!err) {
1677  ext3_mark_inode_dirty(handle, inode);
1678  unlock_new_inode(inode);
1679  d_instantiate(dentry, inode);
1680  return 0;
1681  }
1682  drop_nlink(inode);
1683  unlock_new_inode(inode);
1684  iput(inode);
1685  return err;
1686 }
1687 
1688 /*
1689  * By the time this is called, we already have created
1690  * the directory cache entry for the new file, but it
1691  * is so far negative - it has no inode.
1692  *
1693  * If the create succeeds, we fill in the inode information
1694  * with d_instantiate().
1695  */
1696 static int ext3_create (struct inode * dir, struct dentry * dentry, umode_t mode,
1697  bool excl)
1698 {
1699  handle_t *handle;
1700  struct inode * inode;
1701  int err, retries = 0;
1702 
1703  dquot_initialize(dir);
1704 
1705 retry:
1706  handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
1709  if (IS_ERR(handle))
1710  return PTR_ERR(handle);
1711 
1712  if (IS_DIRSYNC(dir))
1713  handle->h_sync = 1;
1714 
1715  inode = ext3_new_inode (handle, dir, &dentry->d_name, mode);
1716  err = PTR_ERR(inode);
1717  if (!IS_ERR(inode)) {
1718  inode->i_op = &ext3_file_inode_operations;
1719  inode->i_fop = &ext3_file_operations;
1720  ext3_set_aops(inode);
1721  err = ext3_add_nondir(handle, dentry, inode);
1722  }
1723  ext3_journal_stop(handle);
1724  if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
1725  goto retry;
1726  return err;
1727 }
1728 
1729 static int ext3_mknod (struct inode * dir, struct dentry *dentry,
1730  umode_t mode, dev_t rdev)
1731 {
1732  handle_t *handle;
1733  struct inode *inode;
1734  int err, retries = 0;
1735 
1736  if (!new_valid_dev(rdev))
1737  return -EINVAL;
1738 
1739  dquot_initialize(dir);
1740 
1741 retry:
1742  handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
1745  if (IS_ERR(handle))
1746  return PTR_ERR(handle);
1747 
1748  if (IS_DIRSYNC(dir))
1749  handle->h_sync = 1;
1750 
1751  inode = ext3_new_inode (handle, dir, &dentry->d_name, mode);
1752  err = PTR_ERR(inode);
1753  if (!IS_ERR(inode)) {
1754  init_special_inode(inode, inode->i_mode, rdev);
1755 #ifdef CONFIG_EXT3_FS_XATTR
1757 #endif
1758  err = ext3_add_nondir(handle, dentry, inode);
1759  }
1760  ext3_journal_stop(handle);
1761  if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
1762  goto retry;
1763  return err;
1764 }
1765 
1766 static int ext3_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
1767 {
1768  handle_t *handle;
1769  struct inode * inode;
1770  struct buffer_head * dir_block = NULL;
1771  struct ext3_dir_entry_2 * de;
1772  int err, retries = 0;
1773 
1774  if (dir->i_nlink >= EXT3_LINK_MAX)
1775  return -EMLINK;
1776 
1777  dquot_initialize(dir);
1778 
1779 retry:
1780  handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
1783  if (IS_ERR(handle))
1784  return PTR_ERR(handle);
1785 
1786  if (IS_DIRSYNC(dir))
1787  handle->h_sync = 1;
1788 
1789  inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFDIR | mode);
1790  err = PTR_ERR(inode);
1791  if (IS_ERR(inode))
1792  goto out_stop;
1793 
1794  inode->i_op = &ext3_dir_inode_operations;
1795  inode->i_fop = &ext3_dir_operations;
1796  inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
1797  if (!(dir_block = ext3_dir_bread(handle, inode, 0, 1, &err)))
1798  goto out_clear_inode;
1799 
1800  BUFFER_TRACE(dir_block, "get_write_access");
1801  err = ext3_journal_get_write_access(handle, dir_block);
1802  if (err)
1803  goto out_clear_inode;
1804 
1805  de = (struct ext3_dir_entry_2 *) dir_block->b_data;
1806  de->inode = cpu_to_le32(inode->i_ino);
1807  de->name_len = 1;
1808  de->rec_len = ext3_rec_len_to_disk(EXT3_DIR_REC_LEN(de->name_len));
1809  strcpy (de->name, ".");
1810  ext3_set_de_type(dir->i_sb, de, S_IFDIR);
1811  de = ext3_next_entry(de);
1812  de->inode = cpu_to_le32(dir->i_ino);
1813  de->rec_len = ext3_rec_len_to_disk(inode->i_sb->s_blocksize -
1814  EXT3_DIR_REC_LEN(1));
1815  de->name_len = 2;
1816  strcpy (de->name, "..");
1817  ext3_set_de_type(dir->i_sb, de, S_IFDIR);
1818  set_nlink(inode, 2);
1819  BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
1820  err = ext3_journal_dirty_metadata(handle, dir_block);
1821  if (err)
1822  goto out_clear_inode;
1823 
1824  err = ext3_mark_inode_dirty(handle, inode);
1825  if (!err)
1826  err = ext3_add_entry (handle, dentry, inode);
1827 
1828  if (err) {
1829 out_clear_inode:
1830  clear_nlink(inode);
1831  unlock_new_inode(inode);
1832  ext3_mark_inode_dirty(handle, inode);
1833  iput (inode);
1834  goto out_stop;
1835  }
1836  inc_nlink(dir);
1837  ext3_update_dx_flag(dir);
1838  err = ext3_mark_inode_dirty(handle, dir);
1839  if (err)
1840  goto out_clear_inode;
1841 
1842  unlock_new_inode(inode);
1843  d_instantiate(dentry, inode);
1844 out_stop:
1845  brelse(dir_block);
1846  ext3_journal_stop(handle);
1847  if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
1848  goto retry;
1849  return err;
1850 }
1851 
1852 /*
1853  * routine to check that the specified directory is empty (for rmdir)
1854  */
1855 static int empty_dir (struct inode * inode)
1856 {
1857  unsigned long offset;
1858  struct buffer_head * bh;
1859  struct ext3_dir_entry_2 * de, * de1;
1860  struct super_block * sb;
1861  int err = 0;
1862 
1863  sb = inode->i_sb;
1864  if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) ||
1865  !(bh = ext3_dir_bread(NULL, inode, 0, 0, &err))) {
1866  if (err)
1867  ext3_error(inode->i_sb, __func__,
1868  "error %d reading directory #%lu offset 0",
1869  err, inode->i_ino);
1870  else
1871  ext3_warning(inode->i_sb, __func__,
1872  "bad directory (dir #%lu) - no data block",
1873  inode->i_ino);
1874  return 1;
1875  }
1876  de = (struct ext3_dir_entry_2 *) bh->b_data;
1877  de1 = ext3_next_entry(de);
1878  if (le32_to_cpu(de->inode) != inode->i_ino ||
1879  !le32_to_cpu(de1->inode) ||
1880  strcmp (".", de->name) ||
1881  strcmp ("..", de1->name)) {
1882  ext3_warning (inode->i_sb, "empty_dir",
1883  "bad directory (dir #%lu) - no `.' or `..'",
1884  inode->i_ino);
1885  brelse (bh);
1886  return 1;
1887  }
1888  offset = ext3_rec_len_from_disk(de->rec_len) +
1889  ext3_rec_len_from_disk(de1->rec_len);
1890  de = ext3_next_entry(de1);
1891  while (offset < inode->i_size ) {
1892  if (!bh ||
1893  (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
1894  err = 0;
1895  brelse (bh);
1896  if (!(bh = ext3_dir_bread (NULL, inode,
1897  offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err))) {
1898  if (err)
1899  ext3_error(sb, __func__,
1900  "error %d reading directory"
1901  " #%lu offset %lu",
1902  err, inode->i_ino, offset);
1903  offset += sb->s_blocksize;
1904  continue;
1905  }
1906  de = (struct ext3_dir_entry_2 *) bh->b_data;
1907  }
1908  if (!ext3_check_dir_entry("empty_dir", inode, de, bh, offset)) {
1909  de = (struct ext3_dir_entry_2 *)(bh->b_data +
1910  sb->s_blocksize);
1911  offset = (offset | (sb->s_blocksize - 1)) + 1;
1912  continue;
1913  }
1914  if (le32_to_cpu(de->inode)) {
1915  brelse (bh);
1916  return 0;
1917  }
1918  offset += ext3_rec_len_from_disk(de->rec_len);
1919  de = ext3_next_entry(de);
1920  }
1921  brelse (bh);
1922  return 1;
1923 }
1924 
1925 /* ext3_orphan_add() links an unlinked or truncated inode into a list of
1926  * such inodes, starting at the superblock, in case we crash before the
1927  * file is closed/deleted, or in case the inode truncate spans multiple
1928  * transactions and the last transaction is not recovered after a crash.
1929  *
1930  * At filesystem recovery time, we walk this list deleting unlinked
1931  * inodes and truncating linked inodes in ext3_orphan_cleanup().
1932  */
1933 int ext3_orphan_add(handle_t *handle, struct inode *inode)
1934 {
1935  struct super_block *sb = inode->i_sb;
1936  struct ext3_iloc iloc;
1937  int err = 0, rc;
1938 
1939  mutex_lock(&EXT3_SB(sb)->s_orphan_lock);
1940  if (!list_empty(&EXT3_I(inode)->i_orphan))
1941  goto out_unlock;
1942 
1943  /* Orphan handling is only valid for files with data blocks
1944  * being truncated, or files being unlinked. */
1945 
1946  /* @@@ FIXME: Observation from aviro:
1947  * I think I can trigger J_ASSERT in ext3_orphan_add(). We block
1948  * here (on s_orphan_lock), so race with ext3_link() which might bump
1949  * ->i_nlink. For, say it, character device. Not a regular file,
1950  * not a directory, not a symlink and ->i_nlink > 0.
1951  *
1952  * tytso, 4/25/2009: I'm not sure how that could happen;
1953  * shouldn't the fs core protect us from these sort of
1954  * unlink()/link() races?
1955  */
1956  J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1957  S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
1958 
1959  BUFFER_TRACE(EXT3_SB(sb)->s_sbh, "get_write_access");
1960  err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
1961  if (err)
1962  goto out_unlock;
1963 
1964  err = ext3_reserve_inode_write(handle, inode, &iloc);
1965  if (err)
1966  goto out_unlock;
1967 
1968  /* Insert this inode at the head of the on-disk orphan list... */
1969  NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan);
1970  EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
1971  err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
1972  rc = ext3_mark_iloc_dirty(handle, inode, &iloc);
1973  if (!err)
1974  err = rc;
1975 
1976  /* Only add to the head of the in-memory list if all the
1977  * previous operations succeeded. If the orphan_add is going to
1978  * fail (possibly taking the journal offline), we can't risk
1979  * leaving the inode on the orphan list: stray orphan-list
1980  * entries can cause panics at unmount time.
1981  *
1982  * This is safe: on error we're going to ignore the orphan list
1983  * anyway on the next recovery. */
1984  if (!err)
1985  list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
1986 
1987  jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
1988  jbd_debug(4, "orphan inode %lu will point to %d\n",
1989  inode->i_ino, NEXT_ORPHAN(inode));
1990 out_unlock:
1991  mutex_unlock(&EXT3_SB(sb)->s_orphan_lock);
1992  ext3_std_error(inode->i_sb, err);
1993  return err;
1994 }
1995 
1996 /*
1997  * ext3_orphan_del() removes an unlinked or truncated inode from the list
1998  * of such inodes stored on disk, because it is finally being cleaned up.
1999  */
2000 int ext3_orphan_del(handle_t *handle, struct inode *inode)
2001 {
2002  struct list_head *prev;
2003  struct ext3_inode_info *ei = EXT3_I(inode);
2004  struct ext3_sb_info *sbi;
2005  unsigned long ino_next;
2006  struct ext3_iloc iloc;
2007  int err = 0;
2008 
2009  mutex_lock(&EXT3_SB(inode->i_sb)->s_orphan_lock);
2010  if (list_empty(&ei->i_orphan))
2011  goto out;
2012 
2013  ino_next = NEXT_ORPHAN(inode);
2014  prev = ei->i_orphan.prev;
2015  sbi = EXT3_SB(inode->i_sb);
2016 
2017  jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
2018 
2019  list_del_init(&ei->i_orphan);
2020 
2021  /* If we're on an error path, we may not have a valid
2022  * transaction handle with which to update the orphan list on
2023  * disk, but we still need to remove the inode from the linked
2024  * list in memory. */
2025  if (!handle)
2026  goto out;
2027 
2028  err = ext3_reserve_inode_write(handle, inode, &iloc);
2029  if (err)
2030  goto out_err;
2031 
2032  if (prev == &sbi->s_orphan) {
2033  jbd_debug(4, "superblock will point to %lu\n", ino_next);
2034  BUFFER_TRACE(sbi->s_sbh, "get_write_access");
2035  err = ext3_journal_get_write_access(handle, sbi->s_sbh);
2036  if (err)
2037  goto out_brelse;
2038  sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
2039  err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
2040  } else {
2041  struct ext3_iloc iloc2;
2042  struct inode *i_prev =
2043  &list_entry(prev, struct ext3_inode_info, i_orphan)->vfs_inode;
2044 
2045  jbd_debug(4, "orphan inode %lu will point to %lu\n",
2046  i_prev->i_ino, ino_next);
2047  err = ext3_reserve_inode_write(handle, i_prev, &iloc2);
2048  if (err)
2049  goto out_brelse;
2050  NEXT_ORPHAN(i_prev) = ino_next;
2051  err = ext3_mark_iloc_dirty(handle, i_prev, &iloc2);
2052  }
2053  if (err)
2054  goto out_brelse;
2055  NEXT_ORPHAN(inode) = 0;
2056  err = ext3_mark_iloc_dirty(handle, inode, &iloc);
2057 
2058 out_err:
2059  ext3_std_error(inode->i_sb, err);
2060 out:
2061  mutex_unlock(&EXT3_SB(inode->i_sb)->s_orphan_lock);
2062  return err;
2063 
2064 out_brelse:
2065  brelse(iloc.bh);
2066  goto out_err;
2067 }
2068 
2069 static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
2070 {
2071  int retval;
2072  struct inode * inode;
2073  struct buffer_head * bh;
2074  struct ext3_dir_entry_2 * de;
2075  handle_t *handle;
2076 
2077  /* Initialize quotas before so that eventual writes go in
2078  * separate transaction */
2079  dquot_initialize(dir);
2080  dquot_initialize(dentry->d_inode);
2081 
2082  handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
2083  if (IS_ERR(handle))
2084  return PTR_ERR(handle);
2085 
2086  retval = -ENOENT;
2087  bh = ext3_find_entry(dir, &dentry->d_name, &de);
2088  if (!bh)
2089  goto end_rmdir;
2090 
2091  if (IS_DIRSYNC(dir))
2092  handle->h_sync = 1;
2093 
2094  inode = dentry->d_inode;
2095 
2096  retval = -EIO;
2097  if (le32_to_cpu(de->inode) != inode->i_ino)
2098  goto end_rmdir;
2099 
2100  retval = -ENOTEMPTY;
2101  if (!empty_dir (inode))
2102  goto end_rmdir;
2103 
2104  retval = ext3_delete_entry(handle, dir, de, bh);
2105  if (retval)
2106  goto end_rmdir;
2107  if (inode->i_nlink != 2)
2108  ext3_warning (inode->i_sb, "ext3_rmdir",
2109  "empty directory has nlink!=2 (%d)",
2110  inode->i_nlink);
2111  inode->i_version++;
2112  clear_nlink(inode);
2113  /* There's no need to set i_disksize: the fact that i_nlink is
2114  * zero will ensure that the right thing happens during any
2115  * recovery. */
2116  inode->i_size = 0;
2117  ext3_orphan_add(handle, inode);
2118  inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
2119  ext3_mark_inode_dirty(handle, inode);
2120  drop_nlink(dir);
2121  ext3_update_dx_flag(dir);
2122  ext3_mark_inode_dirty(handle, dir);
2123 
2124 end_rmdir:
2125  ext3_journal_stop(handle);
2126  brelse (bh);
2127  return retval;
2128 }
2129 
2130 static int ext3_unlink(struct inode * dir, struct dentry *dentry)
2131 {
2132  int retval;
2133  struct inode * inode;
2134  struct buffer_head * bh;
2135  struct ext3_dir_entry_2 * de;
2136  handle_t *handle;
2137 
2138  trace_ext3_unlink_enter(dir, dentry);
2139  /* Initialize quotas before so that eventual writes go
2140  * in separate transaction */
2141  dquot_initialize(dir);
2142  dquot_initialize(dentry->d_inode);
2143 
2144  handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
2145  if (IS_ERR(handle))
2146  return PTR_ERR(handle);
2147 
2148  if (IS_DIRSYNC(dir))
2149  handle->h_sync = 1;
2150 
2151  retval = -ENOENT;
2152  bh = ext3_find_entry(dir, &dentry->d_name, &de);
2153  if (!bh)
2154  goto end_unlink;
2155 
2156  inode = dentry->d_inode;
2157 
2158  retval = -EIO;
2159  if (le32_to_cpu(de->inode) != inode->i_ino)
2160  goto end_unlink;
2161 
2162  if (!inode->i_nlink) {
2163  ext3_warning (inode->i_sb, "ext3_unlink",
2164  "Deleting nonexistent file (%lu), %d",
2165  inode->i_ino, inode->i_nlink);
2166  set_nlink(inode, 1);
2167  }
2168  retval = ext3_delete_entry(handle, dir, de, bh);
2169  if (retval)
2170  goto end_unlink;
2171  dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
2172  ext3_update_dx_flag(dir);
2173  ext3_mark_inode_dirty(handle, dir);
2174  drop_nlink(inode);
2175  if (!inode->i_nlink)
2176  ext3_orphan_add(handle, inode);
2177  inode->i_ctime = dir->i_ctime;
2178  ext3_mark_inode_dirty(handle, inode);
2179  retval = 0;
2180 
2181 end_unlink:
2182  ext3_journal_stop(handle);
2183  brelse (bh);
2184  trace_ext3_unlink_exit(dentry, retval);
2185  return retval;
2186 }
2187 
2188 static int ext3_symlink (struct inode * dir,
2189  struct dentry *dentry, const char * symname)
2190 {
2191  handle_t *handle;
2192  struct inode * inode;
2193  int l, err, retries = 0;
2194  int credits;
2195 
2196  l = strlen(symname)+1;
2197  if (l > dir->i_sb->s_blocksize)
2198  return -ENAMETOOLONG;
2199 
2200  dquot_initialize(dir);
2201 
2202  if (l > EXT3_N_BLOCKS * 4) {
2203  /*
2204  * For non-fast symlinks, we just allocate inode and put it on
2205  * orphan list in the first transaction => we need bitmap,
2206  * group descriptor, sb, inode block, quota blocks, and
2207  * possibly selinux xattr blocks.
2208  */
2209  credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
2211  } else {
2212  /*
2213  * Fast symlink. We have to add entry to directory
2214  * (EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS),
2215  * allocate new inode (bitmap, group descriptor, inode block,
2216  * quota blocks, sb is already counted in previous macros).
2217  */
2218  credits = EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
2221  }
2222 retry:
2223  handle = ext3_journal_start(dir, credits);
2224  if (IS_ERR(handle))
2225  return PTR_ERR(handle);
2226 
2227  if (IS_DIRSYNC(dir))
2228  handle->h_sync = 1;
2229 
2230  inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFLNK|S_IRWXUGO);
2231  err = PTR_ERR(inode);
2232  if (IS_ERR(inode))
2233  goto out_stop;
2234 
2235  if (l > EXT3_N_BLOCKS * 4) {
2237  ext3_set_aops(inode);
2238  /*
2239  * We cannot call page_symlink() with transaction started
2240  * because it calls into ext3_write_begin() which acquires page
2241  * lock which ranks below transaction start (and it can also
2242  * wait for journal commit if we are running out of space). So
2243  * we have to stop transaction now and restart it when symlink
2244  * contents is written.
2245  *
2246  * To keep fs consistent in case of crash, we have to put inode
2247  * to orphan list in the mean time.
2248  */
2249  drop_nlink(inode);
2250  err = ext3_orphan_add(handle, inode);
2251  ext3_journal_stop(handle);
2252  if (err)
2253  goto err_drop_inode;
2254  err = __page_symlink(inode, symname, l, 1);
2255  if (err)
2256  goto err_drop_inode;
2257  /*
2258  * Now inode is being linked into dir (EXT3_DATA_TRANS_BLOCKS
2259  * + EXT3_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
2260  */
2261  handle = ext3_journal_start(dir,
2264  if (IS_ERR(handle)) {
2265  err = PTR_ERR(handle);
2266  goto err_drop_inode;
2267  }
2268  set_nlink(inode, 1);
2269  err = ext3_orphan_del(handle, inode);
2270  if (err) {
2271  ext3_journal_stop(handle);
2272  drop_nlink(inode);
2273  goto err_drop_inode;
2274  }
2275  } else {
2277  memcpy((char*)&EXT3_I(inode)->i_data,symname,l);
2278  inode->i_size = l-1;
2279  }
2280  EXT3_I(inode)->i_disksize = inode->i_size;
2281  err = ext3_add_nondir(handle, dentry, inode);
2282 out_stop:
2283  ext3_journal_stop(handle);
2284  if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
2285  goto retry;
2286  return err;
2287 err_drop_inode:
2288  unlock_new_inode(inode);
2289  iput(inode);
2290  return err;
2291 }
2292 
2293 static int ext3_link (struct dentry * old_dentry,
2294  struct inode * dir, struct dentry *dentry)
2295 {
2296  handle_t *handle;
2297  struct inode *inode = old_dentry->d_inode;
2298  int err, retries = 0;
2299 
2300  if (inode->i_nlink >= EXT3_LINK_MAX)
2301  return -EMLINK;
2302 
2303  dquot_initialize(dir);
2304 
2305 retry:
2306  handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
2308  if (IS_ERR(handle))
2309  return PTR_ERR(handle);
2310 
2311  if (IS_DIRSYNC(dir))
2312  handle->h_sync = 1;
2313 
2314  inode->i_ctime = CURRENT_TIME_SEC;
2315  inc_nlink(inode);
2316  ihold(inode);
2317 
2318  err = ext3_add_entry(handle, dentry, inode);
2319  if (!err) {
2320  ext3_mark_inode_dirty(handle, inode);
2321  d_instantiate(dentry, inode);
2322  } else {
2323  drop_nlink(inode);
2324  iput(inode);
2325  }
2326  ext3_journal_stop(handle);
2327  if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
2328  goto retry;
2329  return err;
2330 }
2331 
/*
 * Inode-number field of the entry that follows the first entry in a
 * directory block.  Expands to an lvalue, so it can be both read and
 * assigned; the value is stored in on-disk (little-endian) byte order.
 */
#define PARENT_INO(blk) \
	(ext3_next_entry((struct ext3_dir_entry_2 *)(blk))->inode)
2334 
2335 /*
2336  * Anybody can rename anything with this: the permission checks are left to the
2337  * higher-level routines.
2338  */
2339 static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
2340  struct inode * new_dir,struct dentry *new_dentry)
2341 {
2342  handle_t *handle;
2343  struct inode * old_inode, * new_inode;
2344  struct buffer_head * old_bh, * new_bh, * dir_bh;
2345  struct ext3_dir_entry_2 * old_de, * new_de;
2346  int retval, flush_file = 0;
2347 
2348  dquot_initialize(old_dir);
2349  dquot_initialize(new_dir);
2350 
2351  old_bh = new_bh = dir_bh = NULL;
2352 
2353  /* Initialize quotas before so that eventual writes go
2354  * in separate transaction */
2355  if (new_dentry->d_inode)
2356  dquot_initialize(new_dentry->d_inode);
2357  handle = ext3_journal_start(old_dir, 2 *
2358  EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) +
2360  if (IS_ERR(handle))
2361  return PTR_ERR(handle);
2362 
2363  if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
2364  handle->h_sync = 1;
2365 
2366  old_bh = ext3_find_entry(old_dir, &old_dentry->d_name, &old_de);
2367  /*
2368  * Check for inode number is _not_ due to possible IO errors.
2369  * We might rmdir the source, keep it as pwd of some process
2370  * and merrily kill the link to whatever was created under the
2371  * same name. Goodbye sticky bit ;-<
2372  */
2373  old_inode = old_dentry->d_inode;
2374  retval = -ENOENT;
2375  if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
2376  goto end_rename;
2377 
2378  new_inode = new_dentry->d_inode;
2379  new_bh = ext3_find_entry(new_dir, &new_dentry->d_name, &new_de);
2380  if (new_bh) {
2381  if (!new_inode) {
2382  brelse (new_bh);
2383  new_bh = NULL;
2384  }
2385  }
2386  if (S_ISDIR(old_inode->i_mode)) {
2387  if (new_inode) {
2388  retval = -ENOTEMPTY;
2389  if (!empty_dir (new_inode))
2390  goto end_rename;
2391  }
2392  retval = -EIO;
2393  dir_bh = ext3_dir_bread(handle, old_inode, 0, 0, &retval);
2394  if (!dir_bh)
2395  goto end_rename;
2396  if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
2397  goto end_rename;
2398  retval = -EMLINK;
2399  if (!new_inode && new_dir!=old_dir &&
2400  new_dir->i_nlink >= EXT3_LINK_MAX)
2401  goto end_rename;
2402  }
2403  if (!new_bh) {
2404  retval = ext3_add_entry (handle, new_dentry, old_inode);
2405  if (retval)
2406  goto end_rename;
2407  } else {
2408  BUFFER_TRACE(new_bh, "get write access");
2409  retval = ext3_journal_get_write_access(handle, new_bh);
2410  if (retval)
2411  goto journal_error;
2412  new_de->inode = cpu_to_le32(old_inode->i_ino);
2413  if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
2415  new_de->file_type = old_de->file_type;
2416  new_dir->i_version++;
2417  new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC;
2418  ext3_mark_inode_dirty(handle, new_dir);
2419  BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata");
2420  retval = ext3_journal_dirty_metadata(handle, new_bh);
2421  if (retval)
2422  goto journal_error;
2423  brelse(new_bh);
2424  new_bh = NULL;
2425  }
2426 
2427  /*
2428  * Like most other Unix systems, set the ctime for inodes on a
2429  * rename.
2430  */
2431  old_inode->i_ctime = CURRENT_TIME_SEC;
2432  ext3_mark_inode_dirty(handle, old_inode);
2433 
2434  /*
2435  * ok, that's it
2436  */
2437  if (le32_to_cpu(old_de->inode) != old_inode->i_ino ||
2438  old_de->name_len != old_dentry->d_name.len ||
2439  strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) ||
2440  (retval = ext3_delete_entry(handle, old_dir,
2441  old_de, old_bh)) == -ENOENT) {
2442  /* old_de could have moved from under us during htree split, so
2443  * make sure that we are deleting the right entry. We might
2444  * also be pointing to a stale entry in the unused part of
2445  * old_bh so just checking inum and the name isn't enough. */
2446  struct buffer_head *old_bh2;
2447  struct ext3_dir_entry_2 *old_de2;
2448 
2449  old_bh2 = ext3_find_entry(old_dir, &old_dentry->d_name,
2450  &old_de2);
2451  if (old_bh2) {
2452  retval = ext3_delete_entry(handle, old_dir,
2453  old_de2, old_bh2);
2454  brelse(old_bh2);
2455  }
2456  }
2457  if (retval) {
2458  ext3_warning(old_dir->i_sb, "ext3_rename",
2459  "Deleting old file (%lu), %d, error=%d",
2460  old_dir->i_ino, old_dir->i_nlink, retval);
2461  }
2462 
2463  if (new_inode) {
2464  drop_nlink(new_inode);
2465  new_inode->i_ctime = CURRENT_TIME_SEC;
2466  }
2467  old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
2468  ext3_update_dx_flag(old_dir);
2469  if (dir_bh) {
2470  BUFFER_TRACE(dir_bh, "get_write_access");
2471  retval = ext3_journal_get_write_access(handle, dir_bh);
2472  if (retval)
2473  goto journal_error;
2474  PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
2475  BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
2476  retval = ext3_journal_dirty_metadata(handle, dir_bh);
2477  if (retval) {
2478 journal_error:
2479  ext3_std_error(new_dir->i_sb, retval);
2480  goto end_rename;
2481  }
2482  drop_nlink(old_dir);
2483  if (new_inode) {
2484  drop_nlink(new_inode);
2485  } else {
2486  inc_nlink(new_dir);
2487  ext3_update_dx_flag(new_dir);
2488  ext3_mark_inode_dirty(handle, new_dir);
2489  }
2490  }
2491  ext3_mark_inode_dirty(handle, old_dir);
2492  if (new_inode) {
2493  ext3_mark_inode_dirty(handle, new_inode);
2494  if (!new_inode->i_nlink)
2495  ext3_orphan_add(handle, new_inode);
2496  if (ext3_should_writeback_data(new_inode))
2497  flush_file = 1;
2498  }
2499  retval = 0;
2500 
2501 end_rename:
2502  brelse (dir_bh);
2503  brelse (old_bh);
2504  brelse (new_bh);
2505  ext3_journal_stop(handle);
2506  if (retval == 0 && flush_file)
2507  filemap_flush(old_inode->i_mapping);
2508  return retval;
2509 }
2510 
2511 /*
2512  * directories can handle most operations...
2513  */
2515  .create = ext3_create,
2516  .lookup = ext3_lookup,
2517  .link = ext3_link,
2518  .unlink = ext3_unlink,
2519  .symlink = ext3_symlink,
2520  .mkdir = ext3_mkdir,
2521  .rmdir = ext3_rmdir,
2522  .mknod = ext3_mknod,
2523  .rename = ext3_rename,
2524  .setattr = ext3_setattr,
2525 #ifdef CONFIG_EXT3_FS_XATTR
2526  .setxattr = generic_setxattr,
2527  .getxattr = generic_getxattr,
2528  .listxattr = ext3_listxattr,
2529  .removexattr = generic_removexattr,
2530 #endif
2531  .get_acl = ext3_get_acl,
2532 };
2533 
2535  .setattr = ext3_setattr,
2536 #ifdef CONFIG_EXT3_FS_XATTR
2537  .setxattr = generic_setxattr,
2538  .getxattr = generic_getxattr,
2539  .listxattr = ext3_listxattr,
2540  .removexattr = generic_removexattr,
2541 #endif
2542  .get_acl = ext3_get_acl,
2543 };