Linux Kernel  3.7.1
namei.c
1 /*
2  * linux/fs/namei.c
3  *
4  * Copyright (C) 1991, 1992 Linus Torvalds
5  */
6 
7 /*
8  * Some corrections by tytso.
9  */
10 
11 /* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
12  * lookup logic.
13  */
14 /* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
15  */
16 
17 #include <linux/init.h>
18 #include <linux/export.h>
19 #include <linux/kernel.h>
20 #include <linux/slab.h>
21 #include <linux/fs.h>
22 #include <linux/namei.h>
23 #include <linux/pagemap.h>
24 #include <linux/fsnotify.h>
25 #include <linux/personality.h>
26 #include <linux/security.h>
27 #include <linux/ima.h>
28 #include <linux/syscalls.h>
29 #include <linux/mount.h>
30 #include <linux/audit.h>
31 #include <linux/capability.h>
32 #include <linux/file.h>
33 #include <linux/fcntl.h>
34 #include <linux/device_cgroup.h>
35 #include <linux/fs_struct.h>
36 #include <linux/posix_acl.h>
37 #include <asm/uaccess.h>
38 
39 #include "internal.h"
40 #include "mount.h"
41 
42 /* [Feb-1997 T. Schoebel-Theuer]
43  * Fundamental changes in the pathname lookup mechanisms (namei)
44  * were necessary because of omirr. The reason is that omirr needs
45  * to know the _real_ pathname, not the user-supplied one, in case
46  * of symlinks (and also when transname replacements occur).
47  *
48  * The new code replaces the old recursive symlink resolution with
49  * an iterative one (in case of non-nested symlink chains). It does
50  * this with calls to <fs>_follow_link().
51  * As a side effect, dir_namei(), _namei() and follow_link() are now
52  * replaced with a single function lookup_dentry() that can handle all
53  * the special cases of the former code.
54  *
55  * With the new dcache, the pathname is stored at each inode, at least as
56  * long as the refcount of the inode is positive. As a side effect, the
57  * size of the dcache depends on the inode cache and thus is dynamic.
58  *
59  * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
60  * resolution to correspond with current state of the code.
61  *
62  * Note that the symlink resolution is not *completely* iterative.
63  * There is still a significant amount of tail- and mid- recursion in
64  * the algorithm. Also, note that <fs>_readlink() is not used in
65  * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
66  * may return different results than <fs>_follow_link(). Many virtual
67  * filesystems (including /proc) exhibit this behavior.
68  */
69 
70 /* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
71  * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
72  * and the name already exists in form of a symlink, try to create the new
73  * name indicated by the symlink. The old code always complained that the
74  * name already exists, due to not following the symlink even if its target
75  * is nonexistent. The new semantics affects also mknod() and link() when
76  * the name is a symlink pointing to a non-existent name.
77  *
78  * I don't know which semantics is the right one, since I have no access
79  * to standards. But I found by trial that HP-UX 9.0 has the full "new"
80  * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
81  * "old" one. Personally, I think the new semantics is much more logical.
82  * Note that "ln old new" where "new" is a symlink pointing to a non-existing
83  * file does succeed in both HP-UX and SunOs, but not in Solaris
84  * and in the old Linux semantics.
85  */
86 
87 /* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
88  * semantics. See the comments in "open_namei" and "do_link" below.
89  *
90  * [10-Sep-98 Alan Modra] Another symlink change.
91  */
92 
93 /* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
94  * inside the path - always follow.
95  * in the last component in creation/removal/renaming - never follow.
96  * if LOOKUP_FOLLOW passed - follow.
97  * if the pathname has trailing slashes - follow.
98  * otherwise - don't follow.
99  * (applied in that order).
100  *
101  * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
102  * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
103  * During the 2.4 we need to fix the userland stuff depending on it -
104  * hopefully we will be able to get rid of that wart in 2.5. So far only
105  * XEmacs seems to be relying on it...
106  */
107 /*
108  * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
109  * implemented. Let's see if raised priority of ->s_vfs_rename_mutex gives
110  * any extra contention...
111  */
112 
113 /* In order to reduce some races, while at the same time doing additional
114  * checking and hopefully speeding things up, we copy filenames to the
115  * kernel data space before using them..
116  *
117  * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
118  * PATH_MAX includes the nul terminator --RR.
119  */
120 static void final_putname(struct filename *name)
121 {
122  if (name->separate) {
123  __putname(name->name);
124  kfree(name);
125  } else {
126  __putname(name);
127  }
128 }
129 
130 #define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename))
131 
132 static struct filename *
133 getname_flags(const char __user *filename, int flags, int *empty)
134 {
135  struct filename *result, *err;
136  int len;
137  long max;
138  char *kname;
139 
140  result = audit_reusename(filename);
141  if (result)
142  return result;
143 
144  result = __getname();
145  if (unlikely(!result))
146  return ERR_PTR(-ENOMEM);
147 
148  /*
149  * First, try to embed the struct filename inside the names_cache
150  * allocation
151  */
152  kname = (char *)result + sizeof(*result);
153  result->name = kname;
154  result->separate = false;
155  max = EMBEDDED_NAME_MAX;
156 
157 recopy:
158  len = strncpy_from_user(kname, filename, max);
159  if (unlikely(len < 0)) {
160  err = ERR_PTR(len);
161  goto error;
162  }
163 
164  /*
165  * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
166  * separate struct filename so we can dedicate the entire
167  * names_cache allocation for the pathname, and re-do the copy from
168  * userland.
169  */
170  if (len == EMBEDDED_NAME_MAX && max == EMBEDDED_NAME_MAX) {
171  kname = (char *)result;
172 
173  result = kzalloc(sizeof(*result), GFP_KERNEL);
174  if (!result) {
175  err = ERR_PTR(-ENOMEM);
176  result = (struct filename *)kname;
177  goto error;
178  }
179  result->name = kname;
180  result->separate = true;
181  max = PATH_MAX;
182  goto recopy;
183  }
184 
185  /* The empty path is special. */
186  if (unlikely(!len)) {
187  if (empty)
188  *empty = 1;
189  err = ERR_PTR(-ENOENT);
190  if (!(flags & LOOKUP_EMPTY))
191  goto error;
192  }
193 
194  err = ERR_PTR(-ENAMETOOLONG);
195  if (unlikely(len >= PATH_MAX))
196  goto error;
197 
198  result->uptr = filename;
199  audit_getname(result);
200  return result;
201 
202 error:
203  final_putname(result);
204  return err;
205 }
206 
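/*
 * A layout sketch of the fast path above: __getname() hands back a single
 * names_cache allocation of PATH_MAX bytes; the struct filename is embedded
 * at its start and the user's path is copied right behind it:
 *
 *	+-----------------+------------------------------------------+
 *	| struct filename | pathname bytes (up to EMBEDDED_NAME_MAX) |
 *	+-----------------+------------------------------------------+
 *
 * Only when the copy hits EMBEDDED_NAME_MAX is a separate struct filename
 * kzalloc()ed, the whole allocation handed over to the name, and the copy
 * redone from userland with max = PATH_MAX (the ->separate case).
 */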
207 struct filename *
208 getname(const char __user * filename)
209 {
210  return getname_flags(filename, 0, NULL);
211 }
212 EXPORT_SYMBOL(getname);
213 
214 #ifdef CONFIG_AUDITSYSCALL
215 void putname(struct filename *name)
216 {
217  if (unlikely(!audit_dummy_context()))
218  return audit_putname(name);
219  final_putname(name);
220 }
221 #endif
222 
223 static int check_acl(struct inode *inode, int mask)
224 {
225 #ifdef CONFIG_FS_POSIX_ACL
226  struct posix_acl *acl;
227 
228  if (mask & MAY_NOT_BLOCK) {
229  acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
230  if (!acl)
231  return -EAGAIN;
232  /* no ->get_acl() calls in RCU mode... */
233  if (acl == ACL_NOT_CACHED)
234  return -ECHILD;
235  return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
236  }
237 
238  acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
239 
240  /*
241  * A filesystem can force an ACL callback by just never filling the
242  * ACL cache. But normally you'd fill the cache either at inode
243  * instantiation time, or on the first ->get_acl call.
244  *
245  * If the filesystem doesn't have a get_acl() function at all, we'll
246  * just create the negative cache entry.
247  */
248  if (acl == ACL_NOT_CACHED) {
249  if (inode->i_op->get_acl) {
250  acl = inode->i_op->get_acl(inode, ACL_TYPE_ACCESS);
251  if (IS_ERR(acl))
252  return PTR_ERR(acl);
253  } else {
254  set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
255  return -EAGAIN;
256  }
257  }
258 
259  if (acl) {
260  int error = posix_acl_permission(inode, acl, mask);
261  posix_acl_release(acl);
262  return error;
263  }
264 #endif
265 
266  return -EAGAIN;
267 }
268 
269 /*
270  * This does the basic permission checking
271  */
272 static int acl_permission_check(struct inode *inode, int mask)
273 {
274  unsigned int mode = inode->i_mode;
275 
276  if (likely(uid_eq(current_fsuid(), inode->i_uid)))
277  mode >>= 6;
278  else {
279  if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
280  int error = check_acl(inode, mask);
281  if (error != -EAGAIN)
282  return error;
283  }
284 
285  if (in_group_p(inode->i_gid))
286  mode >>= 3;
287  }
288 
289  /*
290  * If the DACs are ok we don't need any capability check.
291  */
292  if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
293  return 0;
294  return -EACCES;
295 }
296 
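/*
 * A minimal sketch of the mode-bit arithmetic used by acl_permission_check()
 * above (the helper below is illustrative only, not kernel API): MAY_READ,
 * MAY_WRITE and MAY_EXEC line up with the octal rwx bits, so picking the
 * owner, group or "other" triplet is a shift, and the access test is a mask
 * of the requested bits against that triplet.
 */
static inline int rwx_check_sketch(umode_t mode, int mask, int shift)
{
	/* shift: 6 = owner, 3 = group, 0 = other */
	unsigned int rwx = (mode >> shift) & (MAY_READ | MAY_WRITE | MAY_EXEC);

	/* e.g. mode 0754, mask = MAY_WRITE, shift = 3 ("group") -> -EACCES */
	return (mask & ~rwx & (MAY_READ | MAY_WRITE | MAY_EXEC)) ? -EACCES : 0;
}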
311 int generic_permission(struct inode *inode, int mask)
312 {
313  int ret;
314 
315  /*
316  * Do the basic permission checks.
317  */
318  ret = acl_permission_check(inode, mask);
319  if (ret != -EACCES)
320  return ret;
321 
322  if (S_ISDIR(inode->i_mode)) {
323  /* DACs are overridable for directories */
324  if (inode_capable(inode, CAP_DAC_OVERRIDE))
325  return 0;
326  if (!(mask & MAY_WRITE))
327  if (inode_capable(inode, CAP_DAC_READ_SEARCH))
328  return 0;
329  return -EACCES;
330  }
331  /*
332  * Read/write DACs are always overridable.
333  * Executable DACs are overridable when there is
334  * at least one exec bit set.
335  */
336  if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
337  if (inode_capable(inode, CAP_DAC_OVERRIDE))
338  return 0;
339 
340  /*
341  * Searching includes executable on directories, else just read.
342  */
343  mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
344  if (mask == MAY_READ)
345  if (inode_capable(inode, CAP_DAC_READ_SEARCH))
346  return 0;
347 
348  return -EACCES;
349 }
350 
351 /*
352  * We _really_ want to just do "generic_permission()" without
353  * even looking at the inode->i_op values. So we keep a cache
354  * flag in inode->i_opflags, that says "this has no special
355  * permission function, use the fast case".
356  */
357 static inline int do_inode_permission(struct inode *inode, int mask)
358 {
359  if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
360  if (likely(inode->i_op->permission))
361  return inode->i_op->permission(inode, mask);
362 
363  /* This gets set once for the inode lifetime */
364  spin_lock(&inode->i_lock);
365  inode->i_opflags |= IOP_FASTPERM;
366  spin_unlock(&inode->i_lock);
367  }
368  return generic_permission(inode, mask);
369 }
370 
383 int __inode_permission(struct inode *inode, int mask)
384 {
385  int retval;
386 
387  if (unlikely(mask & MAY_WRITE)) {
388  /*
389  * Nobody gets write access to an immutable file.
390  */
391  if (IS_IMMUTABLE(inode))
392  return -EACCES;
393  }
394 
395  retval = do_inode_permission(inode, mask);
396  if (retval)
397  return retval;
398 
399  retval = devcgroup_inode_permission(inode, mask);
400  if (retval)
401  return retval;
402 
403  return security_inode_permission(inode, mask);
404 }
405 
414 static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
415 {
416  if (unlikely(mask & MAY_WRITE)) {
417  umode_t mode = inode->i_mode;
418 
419  /* Nobody gets write access to a read-only fs. */
420  if ((sb->s_flags & MS_RDONLY) &&
421  (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
422  return -EROFS;
423  }
424  return 0;
425 }
426 
438 int inode_permission(struct inode *inode, int mask)
439 {
440  int retval;
441 
442  retval = sb_permission(inode->i_sb, inode, mask);
443  if (retval)
444  return retval;
445  return __inode_permission(inode, mask);
446 }
447 
454 void path_get(struct path *path)
455 {
456  mntget(path->mnt);
457  dget(path->dentry);
458 }
459 EXPORT_SYMBOL(path_get);
460 
467 void path_put(struct path *path)
468 {
469  dput(path->dentry);
470  mntput(path->mnt);
471 }
472 EXPORT_SYMBOL(path_put);
473 
474 /*
475  * Path walking has 2 modes, rcu-walk and ref-walk (see
476  * Documentation/filesystems/path-lookup.txt). In situations when we can't
477  * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
478  * normal reference counts on dentries and vfsmounts to transition to ref-walk
479  * mode. Refcounts are grabbed at the last known good point before rcu-walk
480  * got stuck, so ref-walk may continue from there. If this is not successful
481  * (eg. a seqcount has changed), then failure is returned and it's up to caller
482  * to restart the path walk from the beginning in ref-walk mode.
483  */
484 
485 static inline void lock_rcu_walk(void)
486 {
487  br_read_lock(&vfsmount_lock);
488  rcu_read_lock();
489 }
490 
491 static inline void unlock_rcu_walk(void)
492 {
493  rcu_read_unlock();
494  br_read_unlock(&vfsmount_lock);
495 }
496 
507 static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
508 {
509  struct fs_struct *fs = current->fs;
510  struct dentry *parent = nd->path.dentry;
511  int want_root = 0;
512 
513  BUG_ON(!(nd->flags & LOOKUP_RCU));
514  if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
515  want_root = 1;
516  spin_lock(&fs->lock);
517  if (nd->root.mnt != fs->root.mnt ||
518  nd->root.dentry != fs->root.dentry)
519  goto err_root;
520  }
521  spin_lock(&parent->d_lock);
522  if (!dentry) {
523  if (!__d_rcu_to_refcount(parent, nd->seq))
524  goto err_parent;
525  BUG_ON(nd->inode != parent->d_inode);
526  } else {
527  if (dentry->d_parent != parent)
528  goto err_parent;
529  spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
530  if (!__d_rcu_to_refcount(dentry, nd->seq))
531  goto err_child;
532  /*
533  * If the sequence check on the child dentry passed, then
534  * the child has not been removed from its parent. This
535  * means the parent dentry must be valid and able to take
536  * a reference at this point.
537  */
538  BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
539  BUG_ON(!parent->d_count);
540  parent->d_count++;
541  spin_unlock(&dentry->d_lock);
542  }
543  spin_unlock(&parent->d_lock);
544  if (want_root) {
545  path_get(&nd->root);
546  spin_unlock(&fs->lock);
547  }
548  mntget(nd->path.mnt);
549 
550  unlock_rcu_walk();
551  nd->flags &= ~LOOKUP_RCU;
552  return 0;
553 
554 err_child:
555  spin_unlock(&dentry->d_lock);
556 err_parent:
557  spin_unlock(&parent->d_lock);
558 err_root:
559  if (want_root)
560  spin_unlock(&fs->lock);
561  return -ECHILD;
562 }
563 
564 static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
565 {
566  return dentry->d_op->d_revalidate(dentry, flags);
567 }
568 
579 static int complete_walk(struct nameidata *nd)
580 {
581  struct dentry *dentry = nd->path.dentry;
582  int status;
583 
584  if (nd->flags & LOOKUP_RCU) {
585  nd->flags &= ~LOOKUP_RCU;
586  if (!(nd->flags & LOOKUP_ROOT))
587  nd->root.mnt = NULL;
588  spin_lock(&dentry->d_lock);
589  if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
590  spin_unlock(&dentry->d_lock);
591  unlock_rcu_walk();
592  return -ECHILD;
593  }
594  BUG_ON(nd->inode != dentry->d_inode);
595  spin_unlock(&dentry->d_lock);
596  mntget(nd->path.mnt);
597  unlock_rcu_walk();
598  }
599 
600  if (likely(!(nd->flags & LOOKUP_JUMPED)))
601  return 0;
602 
603  if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
604  return 0;
605 
606  if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
607  return 0;
608 
609  /* Note: we do not d_invalidate() */
610  status = d_revalidate(dentry, nd->flags);
611  if (status > 0)
612  return 0;
613 
614  if (!status)
615  status = -ESTALE;
616 
617  path_put(&nd->path);
618  return status;
619 }
620 
621 static __always_inline void set_root(struct nameidata *nd)
622 {
623  if (!nd->root.mnt)
624  get_fs_root(current->fs, &nd->root);
625 }
626 
627 static int link_path_walk(const char *, struct nameidata *);
628 
629 static __always_inline void set_root_rcu(struct nameidata *nd)
630 {
631  if (!nd->root.mnt) {
632  struct fs_struct *fs = current->fs;
633  unsigned seq;
634 
635  do {
636  seq = read_seqcount_begin(&fs->seq);
637  nd->root = fs->root;
638  nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
639  } while (read_seqcount_retry(&fs->seq, seq));
640  }
641 }
642 
643 static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
644 {
645  int ret;
646 
647  if (IS_ERR(link))
648  goto fail;
649 
650  if (*link == '/') {
651  set_root(nd);
652  path_put(&nd->path);
653  nd->path = nd->root;
654  path_get(&nd->root);
655  nd->flags |= LOOKUP_JUMPED;
656  }
657  nd->inode = nd->path.dentry->d_inode;
658 
659  ret = link_path_walk(link, nd);
660  return ret;
661 fail:
662  path_put(&nd->path);
663  return PTR_ERR(link);
664 }
665 
666 static void path_put_conditional(struct path *path, struct nameidata *nd)
667 {
668  dput(path->dentry);
669  if (path->mnt != nd->path.mnt)
670  mntput(path->mnt);
671 }
672 
673 static inline void path_to_nameidata(const struct path *path,
674  struct nameidata *nd)
675 {
676  if (!(nd->flags & LOOKUP_RCU)) {
677  dput(nd->path.dentry);
678  if (nd->path.mnt != path->mnt)
679  mntput(nd->path.mnt);
680  }
681  nd->path.mnt = path->mnt;
682  nd->path.dentry = path->dentry;
683 }
684 
685 /*
686  * Helper to directly jump to a known parsed path from ->follow_link,
687  * caller must have taken a reference to path beforehand.
688  */
689 void nd_jump_link(struct nameidata *nd, struct path *path)
690 {
691  path_put(&nd->path);
692 
693  nd->path = *path;
694  nd->inode = nd->path.dentry->d_inode;
695  nd->flags |= LOOKUP_JUMPED;
696 
697  BUG_ON(nd->inode->i_op->follow_link);
698 }
699 
700 static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
701 {
702  struct inode *inode = link->dentry->d_inode;
703  if (inode->i_op->put_link)
704  inode->i_op->put_link(link->dentry, nd, cookie);
705  path_put(link);
706 }
707 
708 int sysctl_protected_symlinks __read_mostly = 0;
709 int sysctl_protected_hardlinks __read_mostly = 0;
710 
727 static inline int may_follow_link(struct path *link, struct nameidata *nd)
728 {
729  const struct inode *inode;
730  const struct inode *parent;
731 
732  if (!sysctl_protected_symlinks)
733  return 0;
734 
735  /* Allowed if owner and follower match. */
736  inode = link->dentry->d_inode;
737  if (uid_eq(current_cred()->fsuid, inode->i_uid))
738  return 0;
739 
740  /* Allowed if parent directory not sticky and world-writable. */
741  parent = nd->path.dentry->d_inode;
742  if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
743  return 0;
744 
745  /* Allowed if parent directory and link owner match. */
746  if (uid_eq(parent->i_uid, inode->i_uid))
747  return 0;
748 
749  audit_log_link_denied("follow_link", link);
750  path_put_conditional(link, nd);
751  path_put(&nd->path);
752  return -EACCES;
753 }
754 
767 static bool safe_hardlink_source(struct inode *inode)
768 {
769  umode_t mode = inode->i_mode;
770 
771  /* Special files should not get pinned to the filesystem. */
772  if (!S_ISREG(mode))
773  return false;
774 
775  /* Setuid files should not get pinned to the filesystem. */
776  if (mode & S_ISUID)
777  return false;
778 
779  /* Executable setgid files should not get pinned to the filesystem. */
780  if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
781  return false;
782 
783  /* Hardlinking to unreadable or unwritable sources is dangerous. */
784  if (inode_permission(inode, MAY_READ | MAY_WRITE))
785  return false;
786 
787  return true;
788 }
789 
802 static int may_linkat(struct path *link)
803 {
804  const struct cred *cred;
805  struct inode *inode;
806 
807  if (!sysctl_protected_hardlinks)
808  return 0;
809 
810  cred = current_cred();
811  inode = link->dentry->d_inode;
812 
813  /* Source inode owner (or CAP_FOWNER) can hardlink all they like,
814  * otherwise, it must be a safe source.
815  */
816  if (uid_eq(cred->fsuid, inode->i_uid) || safe_hardlink_source(inode) ||
817  capable(CAP_FOWNER))
818  return 0;
819 
820  audit_log_link_denied("linkat", link);
821  return -EPERM;
822 }
823 
824 static __always_inline int
825 follow_link(struct path *link, struct nameidata *nd, void **p)
826 {
827  struct dentry *dentry = link->dentry;
828  int error;
829  char *s;
830 
831  BUG_ON(nd->flags & LOOKUP_RCU);
832 
833  if (link->mnt == nd->path.mnt)
834  mntget(link->mnt);
835 
836  error = -ELOOP;
837  if (unlikely(current->total_link_count >= 40))
838  goto out_put_nd_path;
839 
840  cond_resched();
841  current->total_link_count++;
842 
843  touch_atime(link);
844  nd_set_link(nd, NULL);
845 
846  error = security_inode_follow_link(link->dentry, nd);
847  if (error)
848  goto out_put_nd_path;
849 
850  nd->last_type = LAST_BIND;
851  *p = dentry->d_inode->i_op->follow_link(dentry, nd);
852  error = PTR_ERR(*p);
853  if (IS_ERR(*p))
854  goto out_put_nd_path;
855 
856  error = 0;
857  s = nd_get_link(nd);
858  if (s) {
859  error = __vfs_follow_link(nd, s);
860  if (unlikely(error))
861  put_link(nd, link, *p);
862  }
863 
864  return error;
865 
866 out_put_nd_path:
867  *p = NULL;
868  path_put(&nd->path);
869  path_put(link);
870  return error;
871 }
872 
873 static int follow_up_rcu(struct path *path)
874 {
875  struct mount *mnt = real_mount(path->mnt);
876  struct mount *parent;
877  struct dentry *mountpoint;
878 
879  parent = mnt->mnt_parent;
880  if (&parent->mnt == path->mnt)
881  return 0;
882  mountpoint = mnt->mnt_mountpoint;
883  path->dentry = mountpoint;
884  path->mnt = &parent->mnt;
885  return 1;
886 }
887 
888 /*
889  * follow_up - Find the mountpoint of path's vfsmount
890  *
891  * Given a path, find the mountpoint of its source file system.
892  * Replace @path with the path of the mountpoint in the parent mount.
893  * Up is towards /.
894  *
895  * Return 1 if we went up a level and 0 if we were already at the
896  * root.
897  */
898 int follow_up(struct path *path)
899 {
900  struct mount *mnt = real_mount(path->mnt);
901  struct mount *parent;
902  struct dentry *mountpoint;
903 
904  br_read_lock(&vfsmount_lock);
905  parent = mnt->mnt_parent;
906  if (parent == mnt) {
907  br_read_unlock(&vfsmount_lock);
908  return 0;
909  }
910  mntget(&parent->mnt);
911  mountpoint = dget(mnt->mnt_mountpoint);
912  br_read_unlock(&vfsmount_lock);
913  dput(path->dentry);
914  path->dentry = mountpoint;
915  mntput(path->mnt);
916  path->mnt = &parent->mnt;
917  return 1;
918 }
919 
920 /*
921  * Perform an automount
922  * - return -EISDIR to tell follow_managed() to stop and return the path we
923  * were called with.
924  */
925 static int follow_automount(struct path *path, unsigned flags,
926  bool *need_mntput)
927 {
928  struct vfsmount *mnt;
929  int err;
930 
931  if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
932  return -EREMOTE;
933 
934  /* We don't want to mount if someone's just doing a stat -
935  * unless they're stat'ing a directory and appended a '/' to
936  * the name.
937  *
938  * We do, however, want to mount if someone wants to open or
939  * create a file of any type under the mountpoint, wants to
940  * traverse through the mountpoint or wants to open the
941  * mounted directory. Also, autofs may mark negative dentries
942  * as being automount points. These will need the attentions
943  * of the daemon to instantiate them before they can be used.
944  */
945  if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
946  LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
947  path->dentry->d_inode)
948  return -EISDIR;
949 
950  current->total_link_count++;
951  if (current->total_link_count >= 40)
952  return -ELOOP;
953 
954  mnt = path->dentry->d_op->d_automount(path);
955  if (IS_ERR(mnt)) {
956  /*
957  * The filesystem is allowed to return -EISDIR here to indicate
958  * it doesn't want to automount. For instance, autofs would do
959  * this so that its userspace daemon can mount on this dentry.
960  *
961  * However, we can only permit this if it's a terminal point in
962  * the path being looked up; if it wasn't then the remainder of
963  * the path is inaccessible and we should say so.
964  */
965  if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_PARENT))
966  return -EREMOTE;
967  return PTR_ERR(mnt);
968  }
969 
970  if (!mnt) /* mount collision */
971  return 0;
972 
973  if (!*need_mntput) {
974  /* lock_mount() may release path->mnt on error */
975  mntget(path->mnt);
976  *need_mntput = true;
977  }
978  err = finish_automount(mnt, path);
979 
980  switch (err) {
981  case -EBUSY:
982  /* Someone else made a mount here whilst we were busy */
983  return 0;
984  case 0:
985  path_put(path);
986  path->mnt = mnt;
987  path->dentry = dget(mnt->mnt_root);
988  return 0;
989  default:
990  return err;
991  }
992 
993 }
994 
995 /*
996  * Handle a dentry that is managed in some way.
997  * - Flagged for transit management (autofs)
998  * - Flagged as mountpoint
999  * - Flagged as automount point
1000  *
1001  * This may only be called in refwalk mode.
1002  *
1003  * Serialization is taken care of in namespace.c
1004  */
1005 static int follow_managed(struct path *path, unsigned flags)
1006 {
1007  struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
1008  unsigned managed;
1009  bool need_mntput = false;
1010  int ret = 0;
1011 
1012  /* Given that we're not holding a lock here, we retain the value in a
1013  * local variable for each dentry as we look at it so that we don't see
1014  * the components of that value change under us */
1015  while (managed = ACCESS_ONCE(path->dentry->d_flags),
1016  managed &= DCACHE_MANAGED_DENTRY,
1017  unlikely(managed != 0)) {
1018  /* Allow the filesystem to manage the transit without i_mutex
1019  * being held. */
1020  if (managed & DCACHE_MANAGE_TRANSIT) {
1021  BUG_ON(!path->dentry->d_op);
1022  BUG_ON(!path->dentry->d_op->d_manage);
1023  ret = path->dentry->d_op->d_manage(path->dentry, false);
1024  if (ret < 0)
1025  break;
1026  }
1027 
1028  /* Transit to a mounted filesystem. */
1029  if (managed & DCACHE_MOUNTED) {
1030  struct vfsmount *mounted = lookup_mnt(path);
1031  if (mounted) {
1032  dput(path->dentry);
1033  if (need_mntput)
1034  mntput(path->mnt);
1035  path->mnt = mounted;
1036  path->dentry = dget(mounted->mnt_root);
1037  need_mntput = true;
1038  continue;
1039  }
1040 
1041  /* Something is mounted on this dentry in another
1042  * namespace and/or whatever was mounted there in this
1043  * namespace got unmounted before we managed to get the
1044  * vfsmount_lock */
1045  }
1046 
1047  /* Handle an automount point */
1048  if (managed & DCACHE_NEED_AUTOMOUNT) {
1049  ret = follow_automount(path, flags, &need_mntput);
1050  if (ret < 0)
1051  break;
1052  continue;
1053  }
1054 
1055  /* We didn't change the current path point */
1056  break;
1057  }
1058 
1059  if (need_mntput && path->mnt == mnt)
1060  mntput(path->mnt);
1061  if (ret == -EISDIR)
1062  ret = 0;
1063  return ret < 0 ? ret : need_mntput;
1064 }
1065 
1066 int follow_down_one(struct path *path)
1067 {
1068  struct vfsmount *mounted;
1069 
1070  mounted = lookup_mnt(path);
1071  if (mounted) {
1072  dput(path->dentry);
1073  mntput(path->mnt);
1074  path->mnt = mounted;
1075  path->dentry = dget(mounted->mnt_root);
1076  return 1;
1077  }
1078  return 0;
1079 }
1080 
1081 static inline bool managed_dentry_might_block(struct dentry *dentry)
1082 {
1083  return (dentry->d_flags & DCACHE_MANAGE_TRANSIT &&
1084  dentry->d_op->d_manage(dentry, true) < 0);
1085 }
1086 
1087 /*
1088  * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if
1089  * we meet a managed dentry that would need blocking.
1090  */
1091 static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1092  struct inode **inode)
1093 {
1094  for (;;) {
1095  struct mount *mounted;
1096  /*
1097  * Don't forget we might have a non-mountpoint managed dentry
1098  * that wants to block transit.
1099  */
1100  if (unlikely(managed_dentry_might_block(path->dentry)))
1101  return false;
1102 
1103  if (!d_mountpoint(path->dentry))
1104  break;
1105 
1106  mounted = __lookup_mnt(path->mnt, path->dentry, 1);
1107  if (!mounted)
1108  break;
1109  path->mnt = &mounted->mnt;
1110  path->dentry = mounted->mnt.mnt_root;
1111  nd->flags |= LOOKUP_JUMPED;
1112  nd->seq = read_seqcount_begin(&path->dentry->d_seq);
1113  /*
1114  * Update the inode too. We don't need to re-check the
1115  * dentry sequence number here after this d_inode read,
1116  * because a mount-point is always pinned.
1117  */
1118  *inode = path->dentry->d_inode;
1119  }
1120  return true;
1121 }
1122 
1123 static void follow_mount_rcu(struct nameidata *nd)
1124 {
1125  while (d_mountpoint(nd->path.dentry)) {
1126  struct mount *mounted;
1127  mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1);
1128  if (!mounted)
1129  break;
1130  nd->path.mnt = &mounted->mnt;
1131  nd->path.dentry = mounted->mnt.mnt_root;
1132  nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
1133  }
1134 }
1135 
1136 static int follow_dotdot_rcu(struct nameidata *nd)
1137 {
1138  set_root_rcu(nd);
1139 
1140  while (1) {
1141  if (nd->path.dentry == nd->root.dentry &&
1142  nd->path.mnt == nd->root.mnt) {
1143  break;
1144  }
1145  if (nd->path.dentry != nd->path.mnt->mnt_root) {
1146  struct dentry *old = nd->path.dentry;
1147  struct dentry *parent = old->d_parent;
1148  unsigned seq;
1149 
1150  seq = read_seqcount_begin(&parent->d_seq);
1151  if (read_seqcount_retry(&old->d_seq, nd->seq))
1152  goto failed;
1153  nd->path.dentry = parent;
1154  nd->seq = seq;
1155  break;
1156  }
1157  if (!follow_up_rcu(&nd->path))
1158  break;
1159  nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
1160  }
1161  follow_mount_rcu(nd);
1162  nd->inode = nd->path.dentry->d_inode;
1163  return 0;
1164 
1165 failed:
1166  nd->flags &= ~LOOKUP_RCU;
1167  if (!(nd->flags & LOOKUP_ROOT))
1168  nd->root.mnt = NULL;
1169  unlock_rcu_walk();
1170  return -ECHILD;
1171 }
1172 
1173 /*
1174  * Follow down to the covering mount currently visible to userspace. At each
1175  * point, the filesystem owning that dentry may be queried as to whether the
1176  * caller is permitted to proceed or not.
1177  */
1178 int follow_down(struct path *path)
1179 {
1180  unsigned managed;
1181  int ret;
1182 
1183  while (managed = ACCESS_ONCE(path->dentry->d_flags),
1184  unlikely(managed & DCACHE_MANAGED_DENTRY)) {
1185  /* Allow the filesystem to manage the transit without i_mutex
1186  * being held.
1187  *
1188  * We indicate to the filesystem if someone is trying to mount
1189  * something here. This gives autofs the chance to deny anyone
1190  * other than its daemon the right to mount on its
1191  * superstructure.
1192  *
1193  * The filesystem may sleep at this point.
1194  */
1195  if (managed & DCACHE_MANAGE_TRANSIT) {
1196  BUG_ON(!path->dentry->d_op);
1197  BUG_ON(!path->dentry->d_op->d_manage);
1198  ret = path->dentry->d_op->d_manage(
1199  path->dentry, false);
1200  if (ret < 0)
1201  return ret == -EISDIR ? 0 : ret;
1202  }
1203 
1204  /* Transit to a mounted filesystem. */
1205  if (managed & DCACHE_MOUNTED) {
1206  struct vfsmount *mounted = lookup_mnt(path);
1207  if (!mounted)
1208  break;
1209  dput(path->dentry);
1210  mntput(path->mnt);
1211  path->mnt = mounted;
1212  path->dentry = dget(mounted->mnt_root);
1213  continue;
1214  }
1215 
1216  /* Don't handle automount points here */
1217  break;
1218  }
1219  return 0;
1220 }
1221 
1222 /*
1223  * Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
1224  */
1225 static void follow_mount(struct path *path)
1226 {
1227  while (d_mountpoint(path->dentry)) {
1228  struct vfsmount *mounted = lookup_mnt(path);
1229  if (!mounted)
1230  break;
1231  dput(path->dentry);
1232  mntput(path->mnt);
1233  path->mnt = mounted;
1234  path->dentry = dget(mounted->mnt_root);
1235  }
1236 }
1237 
1238 static void follow_dotdot(struct nameidata *nd)
1239 {
1240  set_root(nd);
1241 
1242  while(1) {
1243  struct dentry *old = nd->path.dentry;
1244 
1245  if (nd->path.dentry == nd->root.dentry &&
1246  nd->path.mnt == nd->root.mnt) {
1247  break;
1248  }
1249  if (nd->path.dentry != nd->path.mnt->mnt_root) {
1250  /* rare case of legitimate dget_parent()... */
1251  nd->path.dentry = dget_parent(nd->path.dentry);
1252  dput(old);
1253  break;
1254  }
1255  if (!follow_up(&nd->path))
1256  break;
1257  }
1258  follow_mount(&nd->path);
1259  nd->inode = nd->path.dentry->d_inode;
1260 }
1261 
1262 /*
1263  * This looks up the name in dcache, possibly revalidates the old dentry and
1264  * allocates a new one if not found or not valid. The need_lookup argument
1265  * returns whether i_op->lookup is necessary.
1266  *
1267  * dir->d_inode->i_mutex must be held
1268  */
1269 static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir,
1270  unsigned int flags, bool *need_lookup)
1271 {
1272  struct dentry *dentry;
1273  int error;
1274 
1275  *need_lookup = false;
1276  dentry = d_lookup(dir, name);
1277  if (dentry) {
1278  if (d_need_lookup(dentry)) {
1279  *need_lookup = true;
1280  } else if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
1281  error = d_revalidate(dentry, flags);
1282  if (unlikely(error <= 0)) {
1283  if (error < 0) {
1284  dput(dentry);
1285  return ERR_PTR(error);
1286  } else if (!d_invalidate(dentry)) {
1287  dput(dentry);
1288  dentry = NULL;
1289  }
1290  }
1291  }
1292  }
1293 
1294  if (!dentry) {
1295  dentry = d_alloc(dir, name);
1296  if (unlikely(!dentry))
1297  return ERR_PTR(-ENOMEM);
1298 
1299  *need_lookup = true;
1300  }
1301  return dentry;
1302 }
1303 
1304 /*
1305  * Call i_op->lookup on the dentry. The dentry must be negative but may be
1306  * hashed if it was populated with DCACHE_NEED_LOOKUP.
1307  *
1308  * dir->d_inode->i_mutex must be held
1309  */
1310 static struct dentry *lookup_real(struct inode *dir, struct dentry *dentry,
1311  unsigned int flags)
1312 {
1313  struct dentry *old;
1314 
1315  /* Don't create child dentry for a dead directory. */
1316  if (unlikely(IS_DEADDIR(dir))) {
1317  dput(dentry);
1318  return ERR_PTR(-ENOENT);
1319  }
1320 
1321  old = dir->i_op->lookup(dir, dentry, flags);
1322  if (unlikely(old)) {
1323  dput(dentry);
1324  dentry = old;
1325  }
1326  return dentry;
1327 }
1328 
1329 static struct dentry *__lookup_hash(struct qstr *name,
1330  struct dentry *base, unsigned int flags)
1331 {
1332  bool need_lookup;
1333  struct dentry *dentry;
1334 
1335  dentry = lookup_dcache(name, base, flags, &need_lookup);
1336  if (!need_lookup)
1337  return dentry;
1338 
1339  return lookup_real(base->d_inode, dentry, flags);
1340 }
1341 
1342 /*
1343  * It's more convoluted than I'd like it to be, but... it's still fairly
1344  * small and for now I'd prefer to have fast path as straight as possible.
1345  * It _is_ time-critical.
1346  */
1347 static int lookup_fast(struct nameidata *nd, struct qstr *name,
1348  struct path *path, struct inode **inode)
1349 {
1350  struct vfsmount *mnt = nd->path.mnt;
1351  struct dentry *dentry, *parent = nd->path.dentry;
1352  int need_reval = 1;
1353  int status = 1;
1354  int err;
1355 
1356  /*
1357  * Rename seqlock is not required here because in the off chance
1358  * of a false negative due to a concurrent rename, we're going to
1359  * do the non-racy lookup, below.
1360  */
1361  if (nd->flags & LOOKUP_RCU) {
1362  unsigned seq;
1363  dentry = __d_lookup_rcu(parent, name, &seq, nd->inode);
1364  if (!dentry)
1365  goto unlazy;
1366 
1367  /*
1368  * This sequence count validates that the inode matches
1369  * the dentry name information from lookup.
1370  */
1371  *inode = dentry->d_inode;
1372  if (read_seqcount_retry(&dentry->d_seq, seq))
1373  return -ECHILD;
1374 
1375  /*
1376  * This sequence count validates that the parent had no
1377  * changes while we did the lookup of the dentry above.
1378  *
1379  * The memory barrier in read_seqcount_begin of child is
1380  * enough, we can use __read_seqcount_retry here.
1381  */
1382  if (__read_seqcount_retry(&parent->d_seq, nd->seq))
1383  return -ECHILD;
1384  nd->seq = seq;
1385 
1386  if (unlikely(d_need_lookup(dentry)))
1387  goto unlazy;
1388  if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
1389  status = d_revalidate(dentry, nd->flags);
1390  if (unlikely(status <= 0)) {
1391  if (status != -ECHILD)
1392  need_reval = 0;
1393  goto unlazy;
1394  }
1395  }
1396  path->mnt = mnt;
1397  path->dentry = dentry;
1398  if (unlikely(!__follow_mount_rcu(nd, path, inode)))
1399  goto unlazy;
1400  if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
1401  goto unlazy;
1402  return 0;
1403 unlazy:
1404  if (unlazy_walk(nd, dentry))
1405  return -ECHILD;
1406  } else {
1407  dentry = __d_lookup(parent, name);
1408  }
1409 
1410  if (unlikely(!dentry))
1411  goto need_lookup;
1412 
1413  if (unlikely(d_need_lookup(dentry))) {
1414  dput(dentry);
1415  goto need_lookup;
1416  }
1417 
1418  if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
1419  status = d_revalidate(dentry, nd->flags);
1420  if (unlikely(status <= 0)) {
1421  if (status < 0) {
1422  dput(dentry);
1423  return status;
1424  }
1425  if (!d_invalidate(dentry)) {
1426  dput(dentry);
1427  goto need_lookup;
1428  }
1429  }
1430 
1431  path->mnt = mnt;
1432  path->dentry = dentry;
1433  err = follow_managed(path, nd->flags);
1434  if (unlikely(err < 0)) {
1435  path_put_conditional(path, nd);
1436  return err;
1437  }
1438  if (err)
1439  nd->flags |= LOOKUP_JUMPED;
1440  *inode = path->dentry->d_inode;
1441  return 0;
1442 
1443 need_lookup:
1444  return 1;
1445 }
1446 
1447 /* Fast lookup failed, do it the slow way */
1448 static int lookup_slow(struct nameidata *nd, struct qstr *name,
1449  struct path *path)
1450 {
1451  struct dentry *dentry, *parent;
1452  int err;
1453 
1454  parent = nd->path.dentry;
1455  BUG_ON(nd->inode != parent->d_inode);
1456 
1457  mutex_lock(&parent->d_inode->i_mutex);
1458  dentry = __lookup_hash(name, parent, nd->flags);
1459  mutex_unlock(&parent->d_inode->i_mutex);
1460  if (IS_ERR(dentry))
1461  return PTR_ERR(dentry);
1462  path->mnt = nd->path.mnt;
1463  path->dentry = dentry;
1464  err = follow_managed(path, nd->flags);
1465  if (unlikely(err < 0)) {
1466  path_put_conditional(path, nd);
1467  return err;
1468  }
1469  if (err)
1470  nd->flags |= LOOKUP_JUMPED;
1471  return 0;
1472 }
1473 
1474 static inline int may_lookup(struct nameidata *nd)
1475 {
1476  if (nd->flags & LOOKUP_RCU) {
1477  int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
1478  if (err != -ECHILD)
1479  return err;
1480  if (unlazy_walk(nd, NULL))
1481  return -ECHILD;
1482  }
1483  return inode_permission(nd->inode, MAY_EXEC);
1484 }
1485 
1486 static inline int handle_dots(struct nameidata *nd, int type)
1487 {
1488  if (type == LAST_DOTDOT) {
1489  if (nd->flags & LOOKUP_RCU) {
1490  if (follow_dotdot_rcu(nd))
1491  return -ECHILD;
1492  } else
1493  follow_dotdot(nd);
1494  }
1495  return 0;
1496 }
1497 
1498 static void terminate_walk(struct nameidata *nd)
1499 {
1500  if (!(nd->flags & LOOKUP_RCU)) {
1501  path_put(&nd->path);
1502  } else {
1503  nd->flags &= ~LOOKUP_RCU;
1504  if (!(nd->flags & LOOKUP_ROOT))
1505  nd->root.mnt = NULL;
1506  unlock_rcu_walk();
1507  }
1508 }
1509 
1510 /*
1511  * Do we need to follow links? We _really_ want to be able
1512  * to do this check without having to look at inode->i_op,
1513  * so we keep a cache of "no, this doesn't need follow_link"
1514  * for the common case.
1515  */
1516 static inline int should_follow_link(struct inode *inode, int follow)
1517 {
1518  if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
1519  if (likely(inode->i_op->follow_link))
1520  return follow;
1521 
1522  /* This gets set once for the inode lifetime */
1523  spin_lock(&inode->i_lock);
1524  inode->i_opflags |= IOP_NOFOLLOW;
1525  spin_unlock(&inode->i_lock);
1526  }
1527  return 0;
1528 }
1529 
1530 static inline int walk_component(struct nameidata *nd, struct path *path,
1531  struct qstr *name, int type, int follow)
1532 {
1533  struct inode *inode;
1534  int err;
1535  /*
1536  * "." and ".." are special - ".." especially so because it has
1537  * to be able to know about the current root directory and
1538  * parent relationships.
1539  */
1540  if (unlikely(type != LAST_NORM))
1541  return handle_dots(nd, type);
1542  err = lookup_fast(nd, name, path, &inode);
1543  if (unlikely(err)) {
1544  if (err < 0)
1545  goto out_err;
1546 
1547  err = lookup_slow(nd, name, path);
1548  if (err < 0)
1549  goto out_err;
1550 
1551  inode = path->dentry->d_inode;
1552  }
1553  err = -ENOENT;
1554  if (!inode)
1555  goto out_path_put;
1556 
1557  if (should_follow_link(inode, follow)) {
1558  if (nd->flags & LOOKUP_RCU) {
1559  if (unlikely(unlazy_walk(nd, path->dentry))) {
1560  err = -ECHILD;
1561  goto out_err;
1562  }
1563  }
1564  BUG_ON(inode != path->dentry->d_inode);
1565  return 1;
1566  }
1567  path_to_nameidata(path, nd);
1568  nd->inode = inode;
1569  return 0;
1570 
1571 out_path_put:
1572  path_to_nameidata(path, nd);
1573 out_err:
1574  terminate_walk(nd);
1575  return err;
1576 }
1577 
1578 /*
1579  * This limits recursive symlink follows to 8, while
1580  * limiting consecutive symlinks to 40.
1581  *
1582  * Without that kind of total limit, nasty chains of consecutive
1583  * symlinks can cause almost arbitrarily long lookups.
1584  */
1585 static inline int nested_symlink(struct path *path, struct nameidata *nd)
1586 {
1587  int res;
1588 
1589  if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
1590  path_put_conditional(path, nd);
1591  path_put(&nd->path);
1592  return -ELOOP;
1593  }
1594  BUG_ON(nd->depth >= MAX_NESTED_LINKS);
1595 
1596  nd->depth++;
1597  current->link_count++;
1598 
1599  do {
1600  struct path link = *path;
1601  void *cookie;
1602 
1603  res = follow_link(&link, nd, &cookie);
1604  if (res)
1605  break;
1606  res = walk_component(nd, path, &nd->last,
1607  nd->last_type, LOOKUP_FOLLOW);
1608  put_link(nd, &link, cookie);
1609  } while (res > 0);
1610 
1611  current->link_count--;
1612  nd->depth--;
1613  return res;
1614 }
1615 
1616 /*
1617  * We really don't want to look at inode->i_op->lookup
1618  * when we don't have to. So we keep a cache bit in
1619  * the inode ->i_opflags field that says "yes, we can
1620  * do lookup on this inode".
1621  */
1622 static inline int can_lookup(struct inode *inode)
1623 {
1624  if (likely(inode->i_opflags & IOP_LOOKUP))
1625  return 1;
1626  if (likely(!inode->i_op->lookup))
1627  return 0;
1628 
1629  /* We do this once for the lifetime of the inode */
1630  spin_lock(&inode->i_lock);
1631  inode->i_opflags |= IOP_LOOKUP;
1632  spin_unlock(&inode->i_lock);
1633  return 1;
1634 }
1635 
1636 /*
1637  * We can do the critical dentry name comparison and hashing
1638  * operations one word at a time, but we are limited to:
1639  *
1640  * - Architectures with fast unaligned word accesses. We could
1641  * do a "get_unaligned()" if this helps and is sufficiently
1642  * fast.
1643  *
1644  * - Little-endian machines (so that we can generate the mask
1645  * of low bytes efficiently). Again, we *could* do a byte
1646  * swapping load on big-endian architectures if that is not
1647  * expensive enough to make the optimization worthless.
1648  *
1649  * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
1650  * do not trap on the (extremely unlikely) case of a page
1651  * crossing operation.
1652  *
1653  * - Furthermore, we need an efficient 64-bit compile for the
1654  * 64-bit case in order to generate the "number of bytes in
1655  * the final mask". Again, that could be replaced with a
1656  * efficient population count instruction or similar.
1657  */
1658 #ifdef CONFIG_DCACHE_WORD_ACCESS
1659 
1660 #include <asm/word-at-a-time.h>
1661 
1662 #ifdef CONFIG_64BIT
1663 
1664 static inline unsigned int fold_hash(unsigned long hash)
1665 {
1666  hash += hash >> (8*sizeof(int));
1667  return hash;
1668 }
1669 
1670 #else /* 32-bit case */
1671 
1672 #define fold_hash(x) (x)
1673 
1674 #endif
1675 
1676 unsigned int full_name_hash(const unsigned char *name, unsigned int len)
1677 {
1678  unsigned long a, mask;
1679  unsigned long hash = 0;
1680 
1681  for (;;) {
1682  a = load_unaligned_zeropad(name);
1683  if (len < sizeof(unsigned long))
1684  break;
1685  hash += a;
1686  hash *= 9;
1687  name += sizeof(unsigned long);
1688  len -= sizeof(unsigned long);
1689  if (!len)
1690  goto done;
1691  }
1692  mask = ~(~0ul << len*8);
1693  hash += mask & a;
1694 done:
1695  return fold_hash(hash);
1696 }
1697 EXPORT_SYMBOL(full_name_hash);
1698 
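/*
 * A worked example of the tail masking above (illustrative values): with
 * three bytes of the name left over on a 64-bit machine,
 *
 *	  ~0ul << (3 * 8)   == 0xffffffffff000000
 *	~(~0ul << (3 * 8))  == 0x0000000000ffffff
 *
 * so "hash += mask & a" folds in only the three remaining name bytes and
 * discards whatever load_unaligned_zeropad() read past the end of the
 * string.
 */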
1699 /*
1700  * Calculate the length and hash of the path component, and
1701  * return the length of the component;
1702  */
1703 static inline unsigned long hash_name(const char *name, unsigned int *hashp)
1704 {
1705  unsigned long a, b, adata, bdata, mask, hash, len;
1706  const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
1707 
1708  hash = a = 0;
1709  len = -sizeof(unsigned long);
1710  do {
1711  hash = (hash + a) * 9;
1712  len += sizeof(unsigned long);
1713  a = load_unaligned_zeropad(name+len);
1714  b = a ^ REPEAT_BYTE('/');
1715  } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));
1716 
1717  adata = prep_zero_mask(a, adata, &constants);
1718  bdata = prep_zero_mask(b, bdata, &constants);
1719 
1720  mask = create_zero_mask(adata | bdata);
1721 
1722  hash += a & zero_bytemask(mask);
1723  *hashp = fold_hash(hash);
1724 
1725  return len + find_zero(mask);
1726 }
1727 
1728 #else
1729 
1730 unsigned int full_name_hash(const unsigned char *name, unsigned int len)
1731 {
1732  unsigned long hash = init_name_hash();
1733  while (len--)
1734  hash = partial_name_hash(*name++, hash);
1735  return end_name_hash(hash);
1736 }
1737 EXPORT_SYMBOL(full_name_hash);
1738 
1739 /*
1740  * We know there's a real path component here of at least
1741  * one character.
1742  */
1743 static inline unsigned long hash_name(const char *name, unsigned int *hashp)
1744 {
1745  unsigned long hash = init_name_hash();
1746  unsigned long len = 0, c;
1747 
1748  c = (unsigned char)*name;
1749  do {
1750  len++;
1751  hash = partial_name_hash(c, hash);
1752  c = (unsigned char)name[len];
1753  } while (c && c != '/');
1754  *hashp = end_name_hash(hash);
1755  return len;
1756 }
1757 
1758 #endif
1759 
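/*
 * A sketch of how the helpers above carve up a path (illustrative values):
 * for name = "usr/bin/ls", hash_name(name, &hash) hashes the bytes 'u',
 * 's', 'r' and returns 3, the length of the first component; the caller
 * (link_path_walk) skips the '/' and repeats for "bin" and then "ls".
 * full_name_hash() computes the same hash when the component length is
 * already known, as lookup_one_len() does below:
 *
 *	this.hash = full_name_hash("ls", 2);
 */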
1760 /*
1761  * Name resolution.
1762  * This is the basic name resolution function, turning a pathname into
1763  * the final dentry. We expect 'base' to be positive and a directory.
1764  *
1765  * Returns 0 and nd will have valid dentry and mnt on success.
1766  * Returns error and drops reference to input namei data on failure.
1767  */
1768 static int link_path_walk(const char *name, struct nameidata *nd)
1769 {
1770  struct path next;
1771  int err;
1772 
1773  while (*name=='/')
1774  name++;
1775  if (!*name)
1776  return 0;
1777 
1778  /* At this point we know we have a real path component. */
1779  for(;;) {
1780  struct qstr this;
1781  long len;
1782  int type;
1783 
1784  err = may_lookup(nd);
1785  if (err)
1786  break;
1787 
1788  len = hash_name(name, &this.hash);
1789  this.name = name;
1790  this.len = len;
1791 
1792  type = LAST_NORM;
1793  if (name[0] == '.') switch (len) {
1794  case 2:
1795  if (name[1] == '.') {
1796  type = LAST_DOTDOT;
1797  nd->flags |= LOOKUP_JUMPED;
1798  }
1799  break;
1800  case 1:
1801  type = LAST_DOT;
1802  }
1803  if (likely(type == LAST_NORM)) {
1804  struct dentry *parent = nd->path.dentry;
1805  nd->flags &= ~LOOKUP_JUMPED;
1806  if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
1807  err = parent->d_op->d_hash(parent, nd->inode,
1808  &this);
1809  if (err < 0)
1810  break;
1811  }
1812  }
1813 
1814  if (!name[len])
1815  goto last_component;
1816  /*
1817  * If it wasn't NUL, we know it was '/'. Skip that
1818  * slash, and continue until no more slashes.
1819  */
1820  do {
1821  len++;
1822  } while (unlikely(name[len] == '/'));
1823  if (!name[len])
1824  goto last_component;
1825  name += len;
1826 
1827  err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
1828  if (err < 0)
1829  return err;
1830 
1831  if (err) {
1832  err = nested_symlink(&next, nd);
1833  if (err)
1834  return err;
1835  }
1836  if (can_lookup(nd->inode))
1837  continue;
1838  err = -ENOTDIR;
1839  break;
1840  /* here ends the main loop */
1841 
1842 last_component:
1843  nd->last = this;
1844  nd->last_type = type;
1845  return 0;
1846  }
1847  terminate_walk(nd);
1848  return err;
1849 }
1850 
1851 static int path_init(int dfd, const char *name, unsigned int flags,
1852  struct nameidata *nd, struct file **fp)
1853 {
1854  int retval = 0;
1855 
1856  nd->last_type = LAST_ROOT; /* if there are only slashes... */
1857  nd->flags = flags | LOOKUP_JUMPED;
1858  nd->depth = 0;
1859  if (flags & LOOKUP_ROOT) {
1860  struct inode *inode = nd->root.dentry->d_inode;
1861  if (*name) {
1862  if (!inode->i_op->lookup)
1863  return -ENOTDIR;
1864  retval = inode_permission(inode, MAY_EXEC);
1865  if (retval)
1866  return retval;
1867  }
1868  nd->path = nd->root;
1869  nd->inode = inode;
1870  if (flags & LOOKUP_RCU) {
1871  lock_rcu_walk();
1872  nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1873  } else {
1874  path_get(&nd->path);
1875  }
1876  return 0;
1877  }
1878 
1879  nd->root.mnt = NULL;
1880 
1881  if (*name=='/') {
1882  if (flags & LOOKUP_RCU) {
1883  lock_rcu_walk();
1884  set_root_rcu(nd);
1885  } else {
1886  set_root(nd);
1887  path_get(&nd->root);
1888  }
1889  nd->path = nd->root;
1890  } else if (dfd == AT_FDCWD) {
1891  if (flags & LOOKUP_RCU) {
1892  struct fs_struct *fs = current->fs;
1893  unsigned seq;
1894 
1895  lock_rcu_walk();
1896 
1897  do {
1898  seq = read_seqcount_begin(&fs->seq);
1899  nd->path = fs->pwd;
1900  nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1901  } while (read_seqcount_retry(&fs->seq, seq));
1902  } else {
1903  get_fs_pwd(current->fs, &nd->path);
1904  }
1905  } else {
1906  struct fd f = fdget_raw(dfd);
1907  struct dentry *dentry;
1908 
1909  if (!f.file)
1910  return -EBADF;
1911 
1912  dentry = f.file->f_path.dentry;
1913 
1914  if (*name) {
1915  if (!S_ISDIR(dentry->d_inode->i_mode)) {
1916  fdput(f);
1917  return -ENOTDIR;
1918  }
1919 
1920  retval = inode_permission(dentry->d_inode, MAY_EXEC);
1921  if (retval) {
1922  fdput(f);
1923  return retval;
1924  }
1925  }
1926 
1927  nd->path = f.file->f_path;
1928  if (flags & LOOKUP_RCU) {
1929  if (f.need_put)
1930  *fp = f.file;
1931  nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1932  lock_rcu_walk();
1933  } else {
1934  path_get(&nd->path);
1935  fdput(f);
1936  }
1937  }
1938 
1939  nd->inode = nd->path.dentry->d_inode;
1940  return 0;
1941 }
1942 
1943 static inline int lookup_last(struct nameidata *nd, struct path *path)
1944 {
1945  if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
1946  nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
1947 
1948  nd->flags &= ~LOOKUP_PARENT;
1949  return walk_component(nd, path, &nd->last, nd->last_type,
1950  nd->flags & LOOKUP_FOLLOW);
1951 }
1952 
1953 /* Returns 0 and nd will be valid on success; returns error otherwise. */
1954 static int path_lookupat(int dfd, const char *name,
1955  unsigned int flags, struct nameidata *nd)
1956 {
1957  struct file *base = NULL;
1958  struct path path;
1959  int err;
1960 
1961  /*
1962  * Path walking is largely split up into 2 different synchronisation
1963  * schemes, rcu-walk and ref-walk (explained in
1964  * Documentation/filesystems/path-lookup.txt). These share much of the
1965  * path walk code, but some things particularly setup, cleanup, and
1966  * following mounts are sufficiently divergent that functions are
1967  * duplicated. Typically there is a function foo(), and its RCU
1968  * analogue, foo_rcu().
1969  *
1970  * -ECHILD is the error number of choice (just to avoid clashes) that
1971  * is returned if some aspect of an rcu-walk fails. Such an error must
1972  * be handled by restarting a traditional ref-walk (which will always
1973  * be able to complete).
1974  */
1975  err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base);
1976 
1977  if (unlikely(err))
1978  return err;
1979 
1980  current->total_link_count = 0;
1981  err = link_path_walk(name, nd);
1982 
1983  if (!err && !(flags & LOOKUP_PARENT)) {
1984  err = lookup_last(nd, &path);
1985  while (err > 0) {
1986  void *cookie;
1987  struct path link = path;
1988  err = may_follow_link(&link, nd);
1989  if (unlikely(err))
1990  break;
1991  nd->flags |= LOOKUP_PARENT;
1992  err = follow_link(&link, nd, &cookie);
1993  if (err)
1994  break;
1995  err = lookup_last(nd, &path);
1996  put_link(nd, &link, cookie);
1997  }
1998  }
1999 
2000  if (!err)
2001  err = complete_walk(nd);
2002 
2003  if (!err && nd->flags & LOOKUP_DIRECTORY) {
2004  if (!nd->inode->i_op->lookup) {
2005  path_put(&nd->path);
2006  err = -ENOTDIR;
2007  }
2008  }
2009 
2010  if (base)
2011  fput(base);
2012 
2013  if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
2014  path_put(&nd->root);
2015  nd->root.mnt = NULL;
2016  }
2017  return err;
2018 }
2019 
2020 static int filename_lookup(int dfd, struct filename *name,
2021  unsigned int flags, struct nameidata *nd)
2022 {
2023  int retval = path_lookupat(dfd, name->name, flags | LOOKUP_RCU, nd);
2024  if (unlikely(retval == -ECHILD))
2025  retval = path_lookupat(dfd, name->name, flags, nd);
2026  if (unlikely(retval == -ESTALE))
2027  retval = path_lookupat(dfd, name->name,
2028  flags | LOOKUP_REVAL, nd);
2029 
2030  if (likely(!retval))
2031  audit_inode(name, nd->path.dentry, flags & LOOKUP_PARENT);
2032  return retval;
2033 }
2034 
2035 static int do_path_lookup(int dfd, const char *name,
2036  unsigned int flags, struct nameidata *nd)
2037 {
2038  struct filename filename = { .name = name };
2039 
2040  return filename_lookup(dfd, &filename, flags, nd);
2041 }
2042 
2043 /* does lookup, returns the object with parent locked */
2044 struct dentry *kern_path_locked(const char *name, struct path *path)
2045 {
2046  struct nameidata nd;
2047  struct dentry *d;
2048  int err = do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, &nd);
2049  if (err)
2050  return ERR_PTR(err);
2051  if (nd.last_type != LAST_NORM) {
2052  path_put(&nd.path);
2053  return ERR_PTR(-EINVAL);
2054  }
2055  mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2056  d = __lookup_hash(&nd.last, nd.path.dentry, 0);
2057  if (IS_ERR(d)) {
2058  mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2059  path_put(&nd.path);
2060  return d;
2061  }
2062  *path = nd.path;
2063  return d;
2064 }
2065 
2066 int kern_path(const char *name, unsigned int flags, struct path *path)
2067 {
2068  struct nameidata nd;
2069  int res = do_path_lookup(AT_FDCWD, name, flags, &nd);
2070  if (!res)
2071  *path = nd.path;
2072  return res;
2073 }
2074 
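/*
 * A hedged sketch of a typical kern_path() call site (the function name
 * below is illustrative, not part of this file): resolve a pathname to a
 * struct path, use the <mnt, dentry> pair, then drop both references.
 */
static int resolve_example(const char *pathname)
{
	struct path p;
	int err = kern_path(pathname, LOOKUP_FOLLOW, &p);

	if (err)
		return err;
	/* ... use p.dentry->d_inode, p.mnt, ... */
	path_put(&p);	/* releases the references taken by the lookup */
	return 0;
}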
2083 int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
2084  const char *name, unsigned int flags,
2085  struct path *path)
2086 {
2087  struct nameidata nd;
2088  int err;
2089  nd.root.dentry = dentry;
2090  nd.root.mnt = mnt;
2091  BUG_ON(flags & LOOKUP_PARENT);
2092  /* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */
2093  err = do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, &nd);
2094  if (!err)
2095  *path = nd.path;
2096  return err;
2097 }
2098 
2099 /*
2100  * Restricted form of lookup. Doesn't follow links, single-component only,
2101  * needs parent already locked. Doesn't follow mounts.
2102  * SMP-safe.
2103  */
2104 static struct dentry *lookup_hash(struct nameidata *nd)
2105 {
2106  return __lookup_hash(&nd->last, nd->path.dentry, nd->flags);
2107 }
2108 
2120 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
2121 {
2122  struct qstr this;
2123  unsigned int c;
2124  int err;
2125 
2126  WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
2127 
2128  this.name = name;
2129  this.len = len;
2130  this.hash = full_name_hash(name, len);
2131  if (!len)
2132  return ERR_PTR(-EACCES);
2133 
2134  if (unlikely(name[0] == '.')) {
2135  if (len < 2 || (len == 2 && name[1] == '.'))
2136  return ERR_PTR(-EACCES);
2137  }
2138 
2139  while (len--) {
2140  c = *(const unsigned char *)name++;
2141  if (c == '/' || c == '\0')
2142  return ERR_PTR(-EACCES);
2143  }
2144  /*
2145  * See if the low-level filesystem might want
2146  * to use its own hash..
2147  */
2148  if (base->d_flags & DCACHE_OP_HASH) {
2149  int err = base->d_op->d_hash(base, base->d_inode, &this);
2150  if (err < 0)
2151  return ERR_PTR(err);
2152  }
2153 
2154  err = inode_permission(base->d_inode, MAY_EXEC);
2155  if (err)
2156  return ERR_PTR(err);
2157 
2158  return __lookup_hash(&this, base, 0);
2159 }
2160 
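/*
 * A sketch of a lookup_one_len() call site (illustrative; as the
 * WARN_ON_ONCE above notes, the parent's i_mutex must be held by the
 * caller):
 */
static struct dentry *lookup_child_example(struct dentry *dir, const char *name)
{
	struct dentry *child;

	mutex_lock(&dir->d_inode->i_mutex);
	child = lookup_one_len(name, dir, strlen(name));
	mutex_unlock(&dir->d_inode->i_mutex);
	return child;	/* ERR_PTR() on failure; dput() when done */
}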
2161 int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
2162  struct path *path, int *empty)
2163 {
2164  struct nameidata nd;
2165  struct filename *tmp = getname_flags(name, flags, empty);
2166  int err = PTR_ERR(tmp);
2167  if (!IS_ERR(tmp)) {
2168 
2169  BUG_ON(flags & LOOKUP_PARENT);
2170 
2171  err = filename_lookup(dfd, tmp, flags, &nd);
2172  putname(tmp);
2173  if (!err)
2174  *path = nd.path;
2175  }
2176  return err;
2177 }
2178 
2179 int user_path_at(int dfd, const char __user *name, unsigned flags,
2180  struct path *path)
2181 {
2182  return user_path_at_empty(dfd, name, flags, path, NULL);
2183 }
2184 
2185 /*
2186  * NB: most callers don't do anything directly with the reference to the
2187  * struct filename, but the nd->last pointer points into the name string
2188  * allocated by getname. So we must hold the reference to it until all
2189  * path-walking is complete.
2190  */
2191 static struct filename *
2192 user_path_parent(int dfd, const char __user *path, struct nameidata *nd)
2193 {
2194  struct filename *s = getname(path);
2195  int error;
2196 
2197  if (IS_ERR(s))
2198  return s;
2199 
2200  error = filename_lookup(dfd, s, LOOKUP_PARENT, nd);
2201  if (error) {
2202  putname(s);
2203  return ERR_PTR(error);
2204  }
2205 
2206  return s;
2207 }
2208 
2209 /*
2210  * It's inline, so the penalty for filesystems that don't use the sticky
2211  * bit is minimal.
2212  */
2213 static inline int check_sticky(struct inode *dir, struct inode *inode)
2214 {
2215  kuid_t fsuid = current_fsuid();
2216 
2217  if (!(dir->i_mode & S_ISVTX))
2218  return 0;
2219  if (uid_eq(inode->i_uid, fsuid))
2220  return 0;
2221  if (uid_eq(dir->i_uid, fsuid))
2222  return 0;
2223  return !inode_capable(inode, CAP_FOWNER);
2224 }
2225 
2226 /*
2227  * Check whether we can remove a link victim from directory dir, check
2228  * whether the type of victim is right.
2229  * 1. We can't do it if dir is read-only (done in permission())
2230  * 2. We should have write and exec permissions on dir
2231  * 3. We can't remove anything from append-only dir
2232  * 4. We can't do anything with immutable dir (done in permission())
2233  * 5. If the sticky bit on dir is set we should either
2234  * a. be owner of dir, or
2235  * b. be owner of victim, or
2236  * c. have CAP_FOWNER capability
2237  * 6. If the victim is append-only or immutable we can't do anything with
2238  * links pointing to it.
2239  * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
2240  * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
2241  * 9. We can't remove a root or mountpoint.
2242  * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
2243  * nfs_async_unlink().
2244  */
2245 static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
2246 {
2247  int error;
2248 
2249  if (!victim->d_inode)
2250  return -ENOENT;
2251 
2252  BUG_ON(victim->d_parent->d_inode != dir);
2253  audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
2254 
2255  error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
2256  if (error)
2257  return error;
2258  if (IS_APPEND(dir))
2259  return -EPERM;
2260  if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)||
2261  IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
2262  return -EPERM;
2263  if (isdir) {
2264  if (!S_ISDIR(victim->d_inode->i_mode))
2265  return -ENOTDIR;
2266  if (IS_ROOT(victim))
2267  return -EBUSY;
2268  } else if (S_ISDIR(victim->d_inode->i_mode))
2269  return -EISDIR;
2270  if (IS_DEADDIR(dir))
2271  return -ENOENT;
2272  if (victim->d_flags & DCACHE_NFSFS_RENAMED)
2273  return -EBUSY;
2274  return 0;
2275 }
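Rule 5 above (the sticky bit) is what protects /tmp-style directories: an unprivileged user may only remove entries they own. A small userspace illustration, assuming a sticky world-writable directory containing a file owned by another user:

/* Userspace illustration of the sticky-bit rule checked by may_delete(). */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>

int main(void)
{
	/* /tmp is normally mode 1777 (S_ISVTX set).  Unlinking a file owned
	 * by someone else fails even though the directory is writable. */
	if (unlink("/tmp/somebody-elses-file") == -1)
		printf("unlink: %s\n", strerror(errno));	/* typically EPERM */
	return 0;
}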
2276 
2277 /* Check whether we can create an object with dentry child in directory
2278  * dir.
2279  * 1. We can't do it if child already exists (open has special treatment for
2280  * this case, but since we are inlined it's OK)
2281  * 2. We can't do it if dir is read-only (done in permission())
2282  * 3. We should have write and exec permissions on dir
2283  * 4. We can't do it if dir is immutable (done in permission())
2284  */
2285 static inline int may_create(struct inode *dir, struct dentry *child)
2286 {
2287  if (child->d_inode)
2288  return -EEXIST;
2289  if (IS_DEADDIR(dir))
2290  return -ENOENT;
2291  return inode_permission(dir, MAY_WRITE | MAY_EXEC);
2292 }
2293 
2294 /*
2295  * p1 and p2 should be directories on the same fs.
2296  */
2297 struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
2298 {
2299  struct dentry *p;
2300 
2301  if (p1 == p2) {
2302  mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
2303  return NULL;
2304  }
2305 
2306  mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
2307 
2308  p = d_ancestor(p2, p1);
2309  if (p) {
2310  mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT);
2311  mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD);
2312  return p;
2313  }
2314 
2315  p = d_ancestor(p1, p2);
2316  if (p) {
2317  mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
2318  mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
2319  return p;
2320  }
2321 
2322  mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
2323  mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
2324  return NULL;
2325 }
2326 
2327 void unlock_rename(struct dentry *p1, struct dentry *p2)
2328 {
2329  mutex_unlock(&p1->d_inode->i_mutex);
2330  if (p1 != p2) {
2331  mutex_unlock(&p2->d_inode->i_mutex);
2332  mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
2333  }
2334 }
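The intended caller pattern for the two helpers above is the one sys_renameat() uses further down: take the locks, look up both last components, refuse to proceed if either dentry is the "trap" ancestor returned by lock_rename(), and always unlock on the way out. A condensed, hypothetical sketch (write access and trailing-slash checks omitted):

/* Condensed sketch of the lock_rename()/unlock_rename() caller pattern;
 * oldnd/newnd are assumed to already hold LOOKUP_PARENT results. */
static int example_locked_rename(struct nameidata *oldnd, struct nameidata *newnd)
{
	struct dentry *old_dir = oldnd->path.dentry;
	struct dentry *new_dir = newnd->path.dentry;
	struct dentry *trap, *old_dentry, *new_dentry;
	int error;

	trap = lock_rename(new_dir, old_dir);	/* also takes s_vfs_rename_mutex */

	old_dentry = lookup_hash(oldnd);
	error = PTR_ERR(old_dentry);
	if (IS_ERR(old_dentry))
		goto unlock;
	error = -ENOENT;
	if (!old_dentry->d_inode)		/* source must exist */
		goto put_old;
	error = -EINVAL;
	if (old_dentry == trap)			/* source is an ancestor of target */
		goto put_old;

	new_dentry = lookup_hash(newnd);
	error = PTR_ERR(new_dentry);
	if (IS_ERR(new_dentry))
		goto put_old;
	error = -ENOTEMPTY;
	if (new_dentry == trap)			/* target is an ancestor of source */
		goto put_new;

	error = vfs_rename(old_dir->d_inode, old_dentry,
			   new_dir->d_inode, new_dentry);
put_new:
	dput(new_dentry);
put_old:
	dput(old_dentry);
unlock:
	unlock_rename(new_dir, old_dir);
	return error;
}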
2335 
2336 int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
2337  bool want_excl)
2338 {
2339  int error = may_create(dir, dentry);
2340  if (error)
2341  return error;
2342 
2343  if (!dir->i_op->create)
2344  return -EACCES; /* shouldn't it be ENOSYS? */
2345  mode &= S_IALLUGO;
2346  mode |= S_IFREG;
2347  error = security_inode_create(dir, dentry, mode);
2348  if (error)
2349  return error;
2350  error = dir->i_op->create(dir, dentry, mode, want_excl);
2351  if (!error)
2352  fsnotify_create(dir, dentry);
2353  return error;
2354 }
2355 
2356 static int may_open(struct path *path, int acc_mode, int flag)
2357 {
2358  struct dentry *dentry = path->dentry;
2359  struct inode *inode = dentry->d_inode;
2360  int error;
2361 
2362  /* O_PATH? */
2363  if (!acc_mode)
2364  return 0;
2365 
2366  if (!inode)
2367  return -ENOENT;
2368 
2369  switch (inode->i_mode & S_IFMT) {
2370  case S_IFLNK:
2371  return -ELOOP;
2372  case S_IFDIR:
2373  if (acc_mode & MAY_WRITE)
2374  return -EISDIR;
2375  break;
2376  case S_IFBLK:
2377  case S_IFCHR:
2378  if (path->mnt->mnt_flags & MNT_NODEV)
2379  return -EACCES;
2380  /*FALLTHRU*/
2381  case S_IFIFO:
2382  case S_IFSOCK:
2383  flag &= ~O_TRUNC;
2384  break;
2385  }
2386 
2387  error = inode_permission(inode, acc_mode);
2388  if (error)
2389  return error;
2390 
2391  /*
2392  * An append-only file must be opened in append mode for writing.
2393  */
2394  if (IS_APPEND(inode)) {
2395  if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
2396  return -EPERM;
2397  if (flag & O_TRUNC)
2398  return -EPERM;
2399  }
2400 
2401  /* O_NOATIME can only be set by the owner or superuser */
2402  if (flag & O_NOATIME && !inode_owner_or_capable(inode))
2403  return -EPERM;
2404 
2405  return 0;
2406 }
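Two of the checks above are easy to observe from userspace; the paths below are hypothetical, and the append-only case assumes a file previously marked with chattr +a:

/* Userspace illustration of two may_open() refusals (hypothetical paths). */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>

int main(void)
{
	/* An append-only file may only be opened for writing with O_APPEND. */
	if (open("/var/log/append-only.log", O_WRONLY) == -1)
		printf("append-only without O_APPEND: %s\n", strerror(errno)); /* EPERM */

	/* O_NOATIME is rejected unless you own the file or have CAP_FOWNER. */
	if (open("/etc/passwd", O_RDONLY | O_NOATIME) == -1)
		printf("O_NOATIME on another user's file: %s\n", strerror(errno)); /* EPERM */
	return 0;
}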
2407 
2408 static int handle_truncate(struct file *filp)
2409 {
2410  struct path *path = &filp->f_path;
2411  struct inode *inode = path->dentry->d_inode;
2412  int error = get_write_access(inode);
2413  if (error)
2414  return error;
2415  /*
2416  * Refuse to truncate files with mandatory locks held on them.
2417  */
2418  error = locks_verify_locked(inode);
2419  if (!error)
2420  error = security_path_truncate(path);
2421  if (!error) {
2422  error = do_truncate(path->dentry, 0,
2423  ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
2424  filp);
2425  }
2426  put_write_access(inode);
2427  return error;
2428 }
2429 
2430 static inline int open_to_namei_flags(int flag)
2431 {
2432  if ((flag & O_ACCMODE) == 3)
2433  flag--;
2434  return flag;
2435 }
2436 
2437 static int may_o_create(struct path *dir, struct dentry *dentry, umode_t mode)
2438 {
2439  int error = security_path_mknod(dir, dentry, mode, 0);
2440  if (error)
2441  return error;
2442 
2443  error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC);
2444  if (error)
2445  return error;
2446 
2447  return security_inode_create(dir->dentry->d_inode, dentry, mode);
2448 }
2449 
2450 /*
2451  * Attempt to atomically look up, create and open a file from a negative
2452  * dentry.
2453  *
2454  * Returns 0 if successful. The file will have been created and attached to
2455  * @file by the filesystem calling finish_open().
2456  *
2457  * Returns 1 if the file was looked up only or didn't need creating. The
2458  * caller will need to perform the open themselves. @path will have been
2459  * updated to point to the new dentry. This may be negative.
2460  *
2461  * Returns an error code otherwise.
2462  */
2463 static int atomic_open(struct nameidata *nd, struct dentry *dentry,
2464  struct path *path, struct file *file,
2465  const struct open_flags *op,
2466  bool got_write, bool need_lookup,
2467  int *opened)
2468 {
2469  struct inode *dir = nd->path.dentry->d_inode;
2470  unsigned open_flag = open_to_namei_flags(op->open_flag);
2471  umode_t mode;
2472  int error;
2473  int acc_mode;
2474  int create_error = 0;
2475  struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
2476 
2477  BUG_ON(dentry->d_inode);
2478 
2479  /* Don't create child dentry for a dead directory. */
2480  if (unlikely(IS_DEADDIR(dir))) {
2481  error = -ENOENT;
2482  goto out;
2483  }
2484 
2485  mode = op->mode;
2486  if ((open_flag & O_CREAT) && !IS_POSIXACL(dir))
2487  mode &= ~current_umask();
2488 
2489  if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) {
2490  open_flag &= ~O_TRUNC;
2491  *opened |= FILE_CREATED;
2492  }
2493 
2494  /*
2495  * Checking write permission is tricky, because we don't know if we are
2496  * going to actually need it: O_CREAT opens should work as long as the
2497  * file exists. But checking existence breaks atomicity. The trick is
2498  * to check access and if not granted clear O_CREAT from the flags.
2499  *
2500  * Another problem is returning the "right" error value (e.g. for an
2501  * O_EXCL open we want to return EEXIST not EROFS).
2502  */
2503  if (((open_flag & (O_CREAT | O_TRUNC)) ||
2504  (open_flag & O_ACCMODE) != O_RDONLY) && unlikely(!got_write)) {
2505  if (!(open_flag & O_CREAT)) {
2506  /*
2507  * No O_CREAT -> atomicity not a requirement -> fall
2508  * back to lookup + open
2509  */
2510  goto no_open;
2511  } else if (open_flag & (O_EXCL | O_TRUNC)) {
2512  /* Fall back and fail with the right error */
2513  create_error = -EROFS;
2514  goto no_open;
2515  } else {
2516  /* No side effects, safe to clear O_CREAT */
2517  create_error = -EROFS;
2518  open_flag &= ~O_CREAT;
2519  }
2520  }
2521 
2522  if (open_flag & O_CREAT) {
2523  error = may_o_create(&nd->path, dentry, mode);
2524  if (error) {
2525  create_error = error;
2526  if (open_flag & O_EXCL)
2527  goto no_open;
2528  open_flag &= ~O_CREAT;
2529  }
2530  }
2531 
2532  if (nd->flags & LOOKUP_DIRECTORY)
2533  open_flag |= O_DIRECTORY;
2534 
2535  file->f_path.dentry = DENTRY_NOT_SET;
2536  file->f_path.mnt = nd->path.mnt;
2537  error = dir->i_op->atomic_open(dir, dentry, file, open_flag, mode,
2538  opened);
2539  if (error < 0) {
2540  if (create_error && error == -ENOENT)
2541  error = create_error;
2542  goto out;
2543  }
2544 
2545  acc_mode = op->acc_mode;
2546  if (*opened & FILE_CREATED) {
2547  fsnotify_create(dir, dentry);
2548  acc_mode = MAY_OPEN;
2549  }
2550 
2551  if (error) { /* returned 1, that is */
2552  if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
2553  error = -EIO;
2554  goto out;
2555  }
2556  if (file->f_path.dentry) {
2557  dput(dentry);
2558  dentry = file->f_path.dentry;
2559  }
2560  if (create_error && dentry->d_inode == NULL) {
2561  error = create_error;
2562  goto out;
2563  }
2564  goto looked_up;
2565  }
2566 
2567  /*
2568  * We didn't have the inode before the open, so check open permission
2569  * here.
2570  */
2571  error = may_open(&file->f_path, acc_mode, open_flag);
2572  if (error)
2573  fput(file);
2574 
2575 out:
2576  dput(dentry);
2577  return error;
2578 
2579 no_open:
2580  if (need_lookup) {
2581  dentry = lookup_real(dir, dentry, nd->flags);
2582  if (IS_ERR(dentry))
2583  return PTR_ERR(dentry);
2584 
2585  if (create_error) {
2586  int open_flag = op->open_flag;
2587 
2588  error = create_error;
2589  if ((open_flag & O_EXCL)) {
2590  if (!dentry->d_inode)
2591  goto out;
2592  } else if (!dentry->d_inode) {
2593  goto out;
2594  } else if ((open_flag & O_TRUNC) &&
2595  S_ISREG(dentry->d_inode->i_mode)) {
2596  goto out;
2597  }
2598  /* will fail later, go on to get the right error */
2599  }
2600  }
2601 looked_up:
2602  path->dentry = dentry;
2603  path->mnt = nd->path.mnt;
2604  return 1;
2605 }
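For orientation, this is roughly what the filesystem side of the contract looks like: ->atomic_open() either completes the open itself via finish_open() (return 0) or hands a dentry back with finish_no_open() (return 1), matching the outcomes documented above. The sketch is hypothetical; every "examplefs_" helper is made up, and real implementations also handle pre-existing files, O_EXCL and error unwinding:

/* Hedged sketch of a filesystem ->atomic_open() for this kernel version;
 * the signature matches the dir->i_op->atomic_open() call above. */
static int examplefs_atomic_open(struct inode *dir, struct dentry *dentry,
				 struct file *file, unsigned open_flag,
				 umode_t mode, int *opened)
{
	int err;

	if (!(open_flag & O_CREAT)) {
		/* Plain lookup: give the dentry back, let the VFS open it. */
		struct dentry *res = examplefs_lookup(dir, dentry, NULL); /* hypothetical */
		if (IS_ERR(res))
			return PTR_ERR(res);
		return finish_no_open(file, res);	/* returns 1 */
	}

	/* Create and open in one step (hypothetical helper that also
	 * instantiates the dentry with the new inode). */
	err = examplefs_create_inode(dir, dentry, mode);
	if (err)
		return err;
	*opened |= FILE_CREATED;
	return finish_open(file, dentry, NULL, opened);	/* 0 on success */
}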
2606 
2607 /*
2608  * Look up and maybe create and open the last component.
2609  *
2610  * Must be called with i_mutex held on parent.
2611  *
2612  * Returns 0 if the file was successfully atomically created (if necessary) and
2613  * opened. In this case the file will be returned attached to @file.
2614  *
2615  * Returns 1 if the file was not completely opened at this time, though lookups
2616  * and creations will have been performed and the dentry returned in @path will
2617  * be positive upon return if O_CREAT was specified. If O_CREAT wasn't
2618  * specified then a negative dentry may be returned.
2619  *
2620  * An error code is returned otherwise.
2621  *
2622  * FILE_CREATED will be set in @*opened if the dentry was created and will be
2623  * cleared otherwise prior to returning.
2624  */
2625 static int lookup_open(struct nameidata *nd, struct path *path,
2626  struct file *file,
2627  const struct open_flags *op,
2628  bool got_write, int *opened)
2629 {
2630  struct dentry *dir = nd->path.dentry;
2631  struct inode *dir_inode = dir->d_inode;
2632  struct dentry *dentry;
2633  int error;
2634  bool need_lookup;
2635 
2636  *opened &= ~FILE_CREATED;
2637  dentry = lookup_dcache(&nd->last, dir, nd->flags, &need_lookup);
2638  if (IS_ERR(dentry))
2639  return PTR_ERR(dentry);
2640 
2641  /* Cached positive dentry: will open in f_op->open */
2642  if (!need_lookup && dentry->d_inode)
2643  goto out_no_open;
2644 
2645  if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) {
2646  return atomic_open(nd, dentry, path, file, op, got_write,
2647  need_lookup, opened);
2648  }
2649 
2650  if (need_lookup) {
2651  BUG_ON(dentry->d_inode);
2652 
2653  dentry = lookup_real(dir_inode, dentry, nd->flags);
2654  if (IS_ERR(dentry))
2655  return PTR_ERR(dentry);
2656  }
2657 
2658  /* Negative dentry, just create the file */
2659  if (!dentry->d_inode && (op->open_flag & O_CREAT)) {
2660  umode_t mode = op->mode;
2661  if (!IS_POSIXACL(dir->d_inode))
2662  mode &= ~current_umask();
2663  /*
2664  * This write is needed to ensure that a
2665  * rw->ro transition does not occur between
2666  * the time when the file is created and when
2667  * a permanent write count is taken through
2668  * the 'struct file' in finish_open().
2669  */
2670  if (!got_write) {
2671  error = -EROFS;
2672  goto out_dput;
2673  }
2674  *opened |= FILE_CREATED;
2675  error = security_path_mknod(&nd->path, dentry, mode, 0);
2676  if (error)
2677  goto out_dput;
2678  error = vfs_create(dir->d_inode, dentry, mode,
2679  nd->flags & LOOKUP_EXCL);
2680  if (error)
2681  goto out_dput;
2682  }
2683 out_no_open:
2684  path->dentry = dentry;
2685  path->mnt = nd->path.mnt;
2686  return 1;
2687 
2688 out_dput:
2689  dput(dentry);
2690  return error;
2691 }
2692 
2693 /*
2694  * Handle the last step of open()
2695  */
2696 static int do_last(struct nameidata *nd, struct path *path,
2697  struct file *file, const struct open_flags *op,
2698  int *opened, struct filename *name)
2699 {
2700  struct dentry *dir = nd->path.dentry;
2701  int open_flag = op->open_flag;
2702  bool will_truncate = (open_flag & O_TRUNC) != 0;
2703  bool got_write = false;
2704  int acc_mode = op->acc_mode;
2705  struct inode *inode;
2706  bool symlink_ok = false;
2707  struct path save_parent = { .dentry = NULL, .mnt = NULL };
2708  bool retried = false;
2709  int error;
2710 
2711  nd->flags &= ~LOOKUP_PARENT;
2712  nd->flags |= op->intent;
2713 
2714  switch (nd->last_type) {
2715  case LAST_DOTDOT:
2716  case LAST_DOT:
2717  error = handle_dots(nd, nd->last_type);
2718  if (error)
2719  return error;
2720  /* fallthrough */
2721  case LAST_ROOT:
2722  error = complete_walk(nd);
2723  if (error)
2724  return error;
2725  audit_inode(name, nd->path.dentry, 0);
2726  if (open_flag & O_CREAT) {
2727  error = -EISDIR;
2728  goto out;
2729  }
2730  goto finish_open;
2731  case LAST_BIND:
2732  error = complete_walk(nd);
2733  if (error)
2734  return error;
2735  audit_inode(name, dir, 0);
2736  goto finish_open;
2737  }
2738 
2739  if (!(open_flag & O_CREAT)) {
2740  if (nd->last.name[nd->last.len])
2741  nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
2742  if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
2743  symlink_ok = true;
2744  /* we _can_ be in RCU mode here */
2745  error = lookup_fast(nd, &nd->last, path, &inode);
2746  if (likely(!error))
2747  goto finish_lookup;
2748 
2749  if (error < 0)
2750  goto out;
2751 
2752  BUG_ON(nd->inode != dir->d_inode);
2753  } else {
2754  /* create side of things */
2755  /*
2756  * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED
2757  * has been cleared when we got to the last component we are
2758  * about to look up
2759  */
2760  error = complete_walk(nd);
2761  if (error)
2762  return error;
2763 
2764  audit_inode(name, dir, 0);
2765  error = -EISDIR;
2766  /* trailing slashes? */
2767  if (nd->last.name[nd->last.len])
2768  goto out;
2769  }
2770 
2771 retry_lookup:
2772  if (op->open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
2773  error = mnt_want_write(nd->path.mnt);
2774  if (!error)
2775  got_write = true;
2776  /*
2777  * do _not_ fail yet - we might not need that or fail with
2778  * a different error; let lookup_open() decide; we'll be
2779  * dropping this one anyway.
2780  */
2781  }
2782  mutex_lock(&dir->d_inode->i_mutex);
2783  error = lookup_open(nd, path, file, op, got_write, opened);
2784  mutex_unlock(&dir->d_inode->i_mutex);
2785 
2786  if (error <= 0) {
2787  if (error)
2788  goto out;
2789 
2790  if ((*opened & FILE_CREATED) ||
2791  !S_ISREG(file->f_path.dentry->d_inode->i_mode))
2792  will_truncate = false;
2793 
2794  audit_inode(name, file->f_path.dentry, 0);
2795  goto opened;
2796  }
2797 
2798  if (*opened & FILE_CREATED) {
2799  /* Don't check for write permission, don't truncate */
2800  open_flag &= ~O_TRUNC;
2801  will_truncate = false;
2802  acc_mode = MAY_OPEN;
2803  path_to_nameidata(path, nd);
2804  goto finish_open_created;
2805  }
2806 
2807  /*
2808  * Create/update the audit record if the file already exists.
2809  */
2810  if (path->dentry->d_inode)
2811  audit_inode(name, path->dentry, 0);
2812 
2813  /*
2814  * If atomic_open() acquired write access it is dropped now due to
2815  * possible mount and symlink following (this might be optimized away if
2816  * necessary...)
2817  */
2818  if (got_write) {
2819  mnt_drop_write(nd->path.mnt);
2820  got_write = false;
2821  }
2822 
2823  error = -EEXIST;
2824  if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))
2825  goto exit_dput;
2826 
2827  error = follow_managed(path, nd->flags);
2828  if (error < 0)
2829  goto exit_dput;
2830 
2831  if (error)
2832  nd->flags |= LOOKUP_JUMPED;
2833 
2834  BUG_ON(nd->flags & LOOKUP_RCU);
2835  inode = path->dentry->d_inode;
2836 finish_lookup:
2837  /* we _can_ be in RCU mode here */
2838  error = -ENOENT;
2839  if (!inode) {
2840  path_to_nameidata(path, nd);
2841  goto out;
2842  }
2843 
2844  if (should_follow_link(inode, !symlink_ok)) {
2845  if (nd->flags & LOOKUP_RCU) {
2846  if (unlikely(unlazy_walk(nd, path->dentry))) {
2847  error = -ECHILD;
2848  goto out;
2849  }
2850  }
2851  BUG_ON(inode != path->dentry->d_inode);
2852  return 1;
2853  }
2854 
2855  if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path->mnt) {
2856  path_to_nameidata(path, nd);
2857  } else {
2858  save_parent.dentry = nd->path.dentry;
2859  save_parent.mnt = mntget(path->mnt);
2860  nd->path.dentry = path->dentry;
2861 
2862  }
2863  nd->inode = inode;
2864  /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */
2865  error = complete_walk(nd);
2866  if (error) {
2867  path_put(&save_parent);
2868  return error;
2869  }
2870  error = -EISDIR;
2871  if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode))
2872  goto out;
2873  error = -ENOTDIR;
2874  if ((nd->flags & LOOKUP_DIRECTORY) && !nd->inode->i_op->lookup)
2875  goto out;
2876  audit_inode(name, nd->path.dentry, 0);
2877 finish_open:
2878  if (!S_ISREG(nd->inode->i_mode))
2879  will_truncate = false;
2880 
2881  if (will_truncate) {
2882  error = mnt_want_write(nd->path.mnt);
2883  if (error)
2884  goto out;
2885  got_write = true;
2886  }
2887 finish_open_created:
2888  error = may_open(&nd->path, acc_mode, open_flag);
2889  if (error)
2890  goto out;
2891  file->f_path.mnt = nd->path.mnt;
2892  error = finish_open(file, nd->path.dentry, NULL, opened);
2893  if (error) {
2894  if (error == -EOPENSTALE)
2895  goto stale_open;
2896  goto out;
2897  }
2898 opened:
2899  error = open_check_o_direct(file);
2900  if (error)
2901  goto exit_fput;
2902  error = ima_file_check(file, op->acc_mode);
2903  if (error)
2904  goto exit_fput;
2905 
2906  if (will_truncate) {
2907  error = handle_truncate(file);
2908  if (error)
2909  goto exit_fput;
2910  }
2911 out:
2912  if (got_write)
2913  mnt_drop_write(nd->path.mnt);
2914  path_put(&save_parent);
2915  terminate_walk(nd);
2916  return error;
2917 
2918 exit_dput:
2919  path_put_conditional(path, nd);
2920  goto out;
2921 exit_fput:
2922  fput(file);
2923  goto out;
2924 
2925 stale_open:
2926  /* If no saved parent or already retried then can't retry */
2927  if (!save_parent.dentry || retried)
2928  goto out;
2929 
2930  BUG_ON(save_parent.dentry != dir);
2931  path_put(&nd->path);
2932  nd->path = save_parent;
2933  nd->inode = dir->d_inode;
2934  save_parent.mnt = NULL;
2935  save_parent.dentry = NULL;
2936  if (got_write) {
2937  mnt_drop_write(nd->path.mnt);
2938  got_write = false;
2939  }
2940  retried = true;
2941  goto retry_lookup;
2942 }
2943 
2944 static struct file *path_openat(int dfd, struct filename *pathname,
2945  struct nameidata *nd, const struct open_flags *op, int flags)
2946 {
2947  struct file *base = NULL;
2948  struct file *file;
2949  struct path path;
2950  int opened = 0;
2951  int error;
2952 
2953  file = get_empty_filp();
2954  if (!file)
2955  return ERR_PTR(-ENFILE);
2956 
2957  file->f_flags = op->open_flag;
2958 
2959  error = path_init(dfd, pathname->name, flags | LOOKUP_PARENT, nd, &base);
2960  if (unlikely(error))
2961  goto out;
2962 
2963  current->total_link_count = 0;
2964  error = link_path_walk(pathname->name, nd);
2965  if (unlikely(error))
2966  goto out;
2967 
2968  error = do_last(nd, &path, file, op, &opened, pathname);
2969  while (unlikely(error > 0)) { /* trailing symlink */
2970  struct path link = path;
2971  void *cookie;
2972  if (!(nd->flags & LOOKUP_FOLLOW)) {
2973  path_put_conditional(&path, nd);
2974  path_put(&nd->path);
2975  error = -ELOOP;
2976  break;
2977  }
2978  error = may_follow_link(&link, nd);
2979  if (unlikely(error))
2980  break;
2981  nd->flags |= LOOKUP_PARENT;
2982  nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
2983  error = follow_link(&link, nd, &cookie);
2984  if (unlikely(error))
2985  break;
2986  error = do_last(nd, &path, file, op, &opened, pathname);
2987  put_link(nd, &link, cookie);
2988  }
2989 out:
2990  if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT))
2991  path_put(&nd->root);
2992  if (base)
2993  fput(base);
2994  if (!(opened & FILE_OPENED)) {
2995  BUG_ON(!error);
2996  put_filp(file);
2997  }
2998  if (unlikely(error)) {
2999  if (error == -EOPENSTALE) {
3000  if (flags & LOOKUP_RCU)
3001  error = -ECHILD;
3002  else
3003  error = -ESTALE;
3004  }
3005  file = ERR_PTR(error);
3006  }
3007  return file;
3008 }
3009 
3010 struct file *do_filp_open(int dfd, struct filename *pathname,
3011  const struct open_flags *op, int flags)
3012 {
3013  struct nameidata nd;
3014  struct file *filp;
3015 
3016  filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
3017  if (unlikely(filp == ERR_PTR(-ECHILD)))
3018  filp = path_openat(dfd, pathname, &nd, op, flags);
3019  if (unlikely(filp == ERR_PTR(-ESTALE)))
3020  filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL);
3021  return filp;
3022 }
3023 
3024 struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
3025  const char *name, const struct open_flags *op, int flags)
3026 {
3027  struct nameidata nd;
3028  struct file *file;
3029  struct filename filename = { .name = name };
3030 
3031  nd.root.mnt = mnt;
3032  nd.root.dentry = dentry;
3033 
3034  flags |= LOOKUP_ROOT;
3035 
3036  if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
3037  return ERR_PTR(-ELOOP);
3038 
3039  file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_RCU);
3040  if (unlikely(file == ERR_PTR(-ECHILD)))
3041  file = path_openat(-1, &filename, &nd, op, flags);
3042  if (unlikely(file == ERR_PTR(-ESTALE)))
3043  file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_REVAL);
3044  return file;
3045 }
3046 
3047 struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path, int is_dir)
3048 {
3049  struct dentry *dentry = ERR_PTR(-EEXIST);
3050  struct nameidata nd;
3051  int err2;
3052  int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd);
3053  if (error)
3054  return ERR_PTR(error);
3055 
3056  /*
3057  * Yucky last component or no last component at all?
3058  * (foo/., foo/.., /////)
3059  */
3060  if (nd.last_type != LAST_NORM)
3061  goto out;
3062  nd.flags &= ~LOOKUP_PARENT;
3063  nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL;
3064 
3065  /* don't fail immediately if it's r/o, at least try to report other errors */
3066  err2 = mnt_want_write(nd.path.mnt);
3067  /*
3068  * Do the final lookup.
3069  */
3070  mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
3071  dentry = lookup_hash(&nd);
3072  if (IS_ERR(dentry))
3073  goto unlock;
3074 
3075  error = -EEXIST;
3076  if (dentry->d_inode)
3077  goto fail;
3078  /*
3079  * Special case - lookup gave negative, but... we had foo/bar/
3080  * From the vfs_mknod() POV we just have a negative dentry -
3081  * all is fine. Let's be bastards - you had / on the end, you've
3082  * been asking for a (non-existent) directory. -ENOENT for you.
3083  */
3084  if (unlikely(!is_dir && nd.last.name[nd.last.len])) {
3085  error = -ENOENT;
3086  goto fail;
3087  }
3088  if (unlikely(err2)) {
3089  error = err2;
3090  goto fail;
3091  }
3092  *path = nd.path;
3093  return dentry;
3094 fail:
3095  dput(dentry);
3096  dentry = ERR_PTR(error);
3097 unlock:
3098  mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
3099  if (!err2)
3100  mnt_drop_write(nd.path.mnt);
3101 out:
3102  path_put(&nd.path);
3103  return dentry;
3104 }
3105 EXPORT_SYMBOL(kern_path_create);
3106 
3107 void done_path_create(struct path *path, struct dentry *dentry)
3108 {
3109  dput(dentry);
3110  mutex_unlock(&path->dentry->d_inode->i_mutex);
3111  mnt_drop_write(path->mnt);
3112  path_put(path);
3113 }
3114 EXPORT_SYMBOL(done_path_create);
3115 
3116 struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir)
3117 {
3118  struct filename *tmp = getname(pathname);
3119  struct dentry *res;
3120  if (IS_ERR(tmp))
3121  return ERR_CAST(tmp);
3122  res = kern_path_create(dfd, tmp->name, path, is_dir);
3123  putname(tmp);
3124  return res;
3125 }
3126 EXPORT_SYMBOL(user_path_create);
3127 
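The pair kern_path_create()/done_path_create() above gives in-kernel callers the same "parent locked, write access held" window that the syscalls below obtain via user_path_create(). A hypothetical sketch (pathname and mode made up; it relies on the headers already included at the top of this file):

/* Hypothetical in-kernel creation using kern_path_create()/done_path_create(). */
static int example_make_fifo(const char *pathname)
{
	struct path path;
	struct dentry *dentry;
	int err;

	dentry = kern_path_create(AT_FDCWD, pathname, &path, 0 /* not a dir */);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);		/* e.g. -EEXIST, -ENOENT, -EROFS */

	err = vfs_mknod(path.dentry->d_inode, dentry, S_IFIFO | 0600, 0);

	/* Drops the dentry, the parent's i_mutex, write access and the path. */
	done_path_create(&path, dentry);
	return err;
}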
3128 int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
3129 {
3130  int error = may_create(dir, dentry);
3131 
3132  if (error)
3133  return error;
3134 
3135  if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
3136  return -EPERM;
3137 
3138  if (!dir->i_op->mknod)
3139  return -EPERM;
3140 
3141  error = devcgroup_inode_mknod(mode, dev);
3142  if (error)
3143  return error;
3144 
3145  error = security_inode_mknod(dir, dentry, mode, dev);
3146  if (error)
3147  return error;
3148 
3149  error = dir->i_op->mknod(dir, dentry, mode, dev);
3150  if (!error)
3151  fsnotify_create(dir, dentry);
3152  return error;
3153 }
3154 
3155 static int may_mknod(umode_t mode)
3156 {
3157  switch (mode & S_IFMT) {
3158  case S_IFREG:
3159  case S_IFCHR:
3160  case S_IFBLK:
3161  case S_IFIFO:
3162  case S_IFSOCK:
3163  case 0: /* zero mode translates to S_IFREG */
3164  return 0;
3165  case S_IFDIR:
3166  return -EPERM;
3167  default:
3168  return -EINVAL;
3169  }
3170 }
3171 
3172 SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
3173  unsigned, dev)
3174 {
3175  struct dentry *dentry;
3176  struct path path;
3177  int error;
3178 
3179  error = may_mknod(mode);
3180  if (error)
3181  return error;
3182 
3183  dentry = user_path_create(dfd, filename, &path, 0);
3184  if (IS_ERR(dentry))
3185  return PTR_ERR(dentry);
3186 
3187  if (!IS_POSIXACL(path.dentry->d_inode))
3188  mode &= ~current_umask();
3189  error = security_path_mknod(&path, dentry, mode, dev);
3190  if (error)
3191  goto out;
3192  switch (mode & S_IFMT) {
3193  case 0: case S_IFREG:
3194  error = vfs_create(path.dentry->d_inode,dentry,mode,true);
3195  break;
3196  case S_IFCHR: case S_IFBLK:
3197  error = vfs_mknod(path.dentry->d_inode,dentry,mode,
3198  new_decode_dev(dev));
3199  break;
3200  case S_IFIFO: case S_IFSOCK:
3201  error = vfs_mknod(path.dentry->d_inode,dentry,mode,0);
3202  break;
3203  }
3204 out:
3205  done_path_create(&path, dentry);
3206  return error;
3207 }
3208 
3209 SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
3210 {
3211  return sys_mknodat(AT_FDCWD, filename, mode, dev);
3212 }
3213 
3214 int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
3215 {
3216  int error = may_create(dir, dentry);
3217  unsigned max_links = dir->i_sb->s_max_links;
3218 
3219  if (error)
3220  return error;
3221 
3222  if (!dir->i_op->mkdir)
3223  return -EPERM;
3224 
3225  mode &= (S_IRWXUGO|S_ISVTX);
3226  error = security_inode_mkdir(dir, dentry, mode);
3227  if (error)
3228  return error;
3229 
3230  if (max_links && dir->i_nlink >= max_links)
3231  return -EMLINK;
3232 
3233  error = dir->i_op->mkdir(dir, dentry, mode);
3234  if (!error)
3235  fsnotify_mkdir(dir, dentry);
3236  return error;
3237 }
3238 
3239 SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
3240 {
3241  struct dentry *dentry;
3242  struct path path;
3243  int error;
3244 
3245  dentry = user_path_create(dfd, pathname, &path, 1);
3246  if (IS_ERR(dentry))
3247  return PTR_ERR(dentry);
3248 
3249  if (!IS_POSIXACL(path.dentry->d_inode))
3250  mode &= ~current_umask();
3251  error = security_path_mkdir(&path, dentry, mode);
3252  if (!error)
3253  error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
3254  done_path_create(&path, dentry);
3255  return error;
3256 }
3257 
3258 SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
3259 {
3260  return sys_mkdirat(AT_FDCWD, pathname, mode);
3261 }
3262 
3263 /*
3264  * The dentry_unhash() helper will try to drop the dentry early: we
3265  * should have a usage count of 1 if we're the only user of this
3266  * dentry, and if that is true (possibly after pruning the dcache),
3267  * then we drop the dentry now.
3268  *
3269  * A low-level filesystem can, if it chooses, legally
3270  * do a
3271  *
3272  * if (!d_unhashed(dentry))
3273  * return -EBUSY;
3274  *
3275  * if it cannot handle the case of removing a directory
3276  * that is still in use by something else..
3277  */
3278 void dentry_unhash(struct dentry *dentry)
3279 {
3280  shrink_dcache_parent(dentry);
3281  spin_lock(&dentry->d_lock);
3282  if (dentry->d_count == 1)
3283  __d_drop(dentry);
3284  spin_unlock(&dentry->d_lock);
3285 }
3286 
3287 int vfs_rmdir(struct inode *dir, struct dentry *dentry)
3288 {
3289  int error = may_delete(dir, dentry, 1);
3290 
3291  if (error)
3292  return error;
3293 
3294  if (!dir->i_op->rmdir)
3295  return -EPERM;
3296 
3297  dget(dentry);
3298  mutex_lock(&dentry->d_inode->i_mutex);
3299 
3300  error = -EBUSY;
3301  if (d_mountpoint(dentry))
3302  goto out;
3303 
3304  error = security_inode_rmdir(dir, dentry);
3305  if (error)
3306  goto out;
3307 
3308  shrink_dcache_parent(dentry);
3309  error = dir->i_op->rmdir(dir, dentry);
3310  if (error)
3311  goto out;
3312 
3313  dentry->d_inode->i_flags |= S_DEAD;
3314  dont_mount(dentry);
3315 
3316 out:
3317  mutex_unlock(&dentry->d_inode->i_mutex);
3318  dput(dentry);
3319  if (!error)
3320  d_delete(dentry);
3321  return error;
3322 }
3323 
3324 static long do_rmdir(int dfd, const char __user *pathname)
3325 {
3326  int error = 0;
3327  struct filename *name;
3328  struct dentry *dentry;
3329  struct nameidata nd;
3330 
3331  name = user_path_parent(dfd, pathname, &nd);
3332  if (IS_ERR(name))
3333  return PTR_ERR(name);
3334 
3335  switch(nd.last_type) {
3336  case LAST_DOTDOT:
3337  error = -ENOTEMPTY;
3338  goto exit1;
3339  case LAST_DOT:
3340  error = -EINVAL;
3341  goto exit1;
3342  case LAST_ROOT:
3343  error = -EBUSY;
3344  goto exit1;
3345  }
3346 
3347  nd.flags &= ~LOOKUP_PARENT;
3348  error = mnt_want_write(nd.path.mnt);
3349  if (error)
3350  goto exit1;
3351 
3352  mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
3353  dentry = lookup_hash(&nd);
3354  error = PTR_ERR(dentry);
3355  if (IS_ERR(dentry))
3356  goto exit2;
3357  if (!dentry->d_inode) {
3358  error = -ENOENT;
3359  goto exit3;
3360  }
3361  error = security_path_rmdir(&nd.path, dentry);
3362  if (error)
3363  goto exit3;
3364  error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
3365 exit3:
3366  dput(dentry);
3367 exit2:
3368  mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
3369  mnt_drop_write(nd.path.mnt);
3370 exit1:
3371  path_put(&nd.path);
3372  putname(name);
3373  return error;
3374 }
3375 
3376 SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
3377 {
3378  return do_rmdir(AT_FDCWD, pathname);
3379 }
3380 
3381 int vfs_unlink(struct inode *dir, struct dentry *dentry)
3382 {
3383  int error = may_delete(dir, dentry, 0);
3384 
3385  if (error)
3386  return error;
3387 
3388  if (!dir->i_op->unlink)
3389  return -EPERM;
3390 
3391  mutex_lock(&dentry->d_inode->i_mutex);
3392  if (d_mountpoint(dentry))
3393  error = -EBUSY;
3394  else {
3395  error = security_inode_unlink(dir, dentry);
3396  if (!error) {
3397  error = dir->i_op->unlink(dir, dentry);
3398  if (!error)
3399  dont_mount(dentry);
3400  }
3401  }
3402  mutex_unlock(&dentry->d_inode->i_mutex);
3403 
3404  /* We don't d_delete() NFS sillyrenamed files--they still exist. */
3405  if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
3406  fsnotify_link_count(dentry->d_inode);
3407  d_delete(dentry);
3408  }
3409 
3410  return error;
3411 }
3412 
3413 /*
3414  * Make sure that the actual truncation of the file will occur outside its
3415  * directory's i_mutex. Truncate can take a long time if there is a lot of
3416  * writeout happening, and we don't want to prevent access to the directory
3417  * while waiting on the I/O.
3418  */
3419 static long do_unlinkat(int dfd, const char __user *pathname)
3420 {
3421  int error;
3422  struct filename *name;
3423  struct dentry *dentry;
3424  struct nameidata nd;
3425  struct inode *inode = NULL;
3426 
3427  name = user_path_parent(dfd, pathname, &nd);
3428  if (IS_ERR(name))
3429  return PTR_ERR(name);
3430 
3431  error = -EISDIR;
3432  if (nd.last_type != LAST_NORM)
3433  goto exit1;
3434 
3435  nd.flags &= ~LOOKUP_PARENT;
3436  error = mnt_want_write(nd.path.mnt);
3437  if (error)
3438  goto exit1;
3439 
3440  mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
3441  dentry = lookup_hash(&nd);
3442  error = PTR_ERR(dentry);
3443  if (!IS_ERR(dentry)) {
3444  /* Why not before? Because we want correct error value */
3445  if (nd.last.name[nd.last.len])
3446  goto slashes;
3447  inode = dentry->d_inode;
3448  if (!inode)
3449  goto slashes;
3450  ihold(inode);
3451  error = security_path_unlink(&nd.path, dentry);
3452  if (error)
3453  goto exit2;
3454  error = vfs_unlink(nd.path.dentry->d_inode, dentry);
3455 exit2:
3456  dput(dentry);
3457  }
3458  mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
3459  if (inode)
3460  iput(inode); /* truncate the inode here */
3461  mnt_drop_write(nd.path.mnt);
3462 exit1:
3463  path_put(&nd.path);
3464  putname(name);
3465  return error;
3466 
3467 slashes:
3468  error = !dentry->d_inode ? -ENOENT :
3469  S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR;
3470  goto exit2;
3471 }
3472 
3473 SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
3474 {
3475  if ((flag & ~AT_REMOVEDIR) != 0)
3476  return -EINVAL;
3477 
3478  if (flag & AT_REMOVEDIR)
3479  return do_rmdir(dfd, pathname);
3480 
3481  return do_unlinkat(dfd, pathname);
3482 }
3483 
3484 SYSCALL_DEFINE1(unlink, const char __user *, pathname)
3485 {
3486  return do_unlinkat(AT_FDCWD, pathname);
3487 }
3488 
3489 int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
3490 {
3491  int error = may_create(dir, dentry);
3492 
3493  if (error)
3494  return error;
3495 
3496  if (!dir->i_op->symlink)
3497  return -EPERM;
3498 
3499  error = security_inode_symlink(dir, dentry, oldname);
3500  if (error)
3501  return error;
3502 
3503  error = dir->i_op->symlink(dir, dentry, oldname);
3504  if (!error)
3505  fsnotify_create(dir, dentry);
3506  return error;
3507 }
3508 
3509 SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
3510  int, newdfd, const char __user *, newname)
3511 {
3512  int error;
3513  struct filename *from;
3514  struct dentry *dentry;
3515  struct path path;
3516 
3517  from = getname(oldname);
3518  if (IS_ERR(from))
3519  return PTR_ERR(from);
3520 
3521  dentry = user_path_create(newdfd, newname, &path, 0);
3522  error = PTR_ERR(dentry);
3523  if (IS_ERR(dentry))
3524  goto out_putname;
3525 
3526  error = security_path_symlink(&path, dentry, from->name);
3527  if (!error)
3528  error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
3529  done_path_create(&path, dentry);
3530 out_putname:
3531  putname(from);
3532  return error;
3533 }
3534 
3535 SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
3536 {
3537  return sys_symlinkat(oldname, AT_FDCWD, newname);
3538 }
3539 
3540 int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry)
3541 {
3542  struct inode *inode = old_dentry->d_inode;
3543  unsigned max_links = dir->i_sb->s_max_links;
3544  int error;
3545 
3546  if (!inode)
3547  return -ENOENT;
3548 
3549  error = may_create(dir, new_dentry);
3550  if (error)
3551  return error;
3552 
3553  if (dir->i_sb != inode->i_sb)
3554  return -EXDEV;
3555 
3556  /*
3557  * A link to an append-only or immutable file cannot be created.
3558  */
3559  if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
3560  return -EPERM;
3561  if (!dir->i_op->link)
3562  return -EPERM;
3563  if (S_ISDIR(inode->i_mode))
3564  return -EPERM;
3565 
3566  error = security_inode_link(old_dentry, dir, new_dentry);
3567  if (error)
3568  return error;
3569 
3570  mutex_lock(&inode->i_mutex);
3571  /* Make sure we don't allow creating hardlink to an unlinked file */
3572  if (inode->i_nlink == 0)
3573  error = -ENOENT;
3574  else if (max_links && inode->i_nlink >= max_links)
3575  error = -EMLINK;
3576  else
3577  error = dir->i_op->link(old_dentry, dir, new_dentry);
3578  mutex_unlock(&inode->i_mutex);
3579  if (!error)
3580  fsnotify_link(dir, inode, new_dentry);
3581  return error;
3582 }
3583 
3584 /*
3585  * Hardlinks are often used in delicate situations. We avoid
3586  * security-related surprises by not following symlinks on the
3587  * newname. --KAB
3588  *
3589  * We don't follow them on the oldname either to be compatible
3590  * with linux 2.0, and to avoid hard-linking to directories
3591  * and other special files. --ADM
3592  */
3593 SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
3594  int, newdfd, const char __user *, newname, int, flags)
3595 {
3596  struct dentry *new_dentry;
3597  struct path old_path, new_path;
3598  int how = 0;
3599  int error;
3600 
3601  if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
3602  return -EINVAL;
3603  /*
3604  * To use null names we require CAP_DAC_READ_SEARCH
3605  * This ensures that not everyone will be able to create
3606  * a hard link using the passed file descriptor.
3607  */
3608  if (flags & AT_EMPTY_PATH) {
3609  if (!capable(CAP_DAC_READ_SEARCH))
3610  return -ENOENT;
3611  how = LOOKUP_EMPTY;
3612  }
3613 
3614  if (flags & AT_SYMLINK_FOLLOW)
3615  how |= LOOKUP_FOLLOW;
3616 
3617  error = user_path_at(olddfd, oldname, how, &old_path);
3618  if (error)
3619  return error;
3620 
3621  new_dentry = user_path_create(newdfd, newname, &new_path, 0);
3622  error = PTR_ERR(new_dentry);
3623  if (IS_ERR(new_dentry))
3624  goto out;
3625 
3626  error = -EXDEV;
3627  if (old_path.mnt != new_path.mnt)
3628  goto out_dput;
3629  error = may_linkat(&old_path);
3630  if (unlikely(error))
3631  goto out_dput;
3632  error = security_path_link(old_path.dentry, &new_path, new_dentry);
3633  if (error)
3634  goto out_dput;
3635  error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry);
3636 out_dput:
3637  done_path_create(&new_path, new_dentry);
3638 out:
3639  path_put(&old_path);
3640 
3641  return error;
3642 }
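From userspace, the AT_EMPTY_PATH branch above looks like this; as the comment notes, without CAP_DAC_READ_SEARCH the kernel returns ENOENT. File names are hypothetical:

/* Userspace illustration of linkat(2) with AT_EMPTY_PATH. */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/source-file", O_RDONLY);
	if (fd == -1)
		return 1;

	/* Link the object behind fd under a new name. */
	if (linkat(fd, "", AT_FDCWD, "/tmp/new-name", AT_EMPTY_PATH) == -1)
		printf("linkat: %s\n", strerror(errno));	/* ENOENT without the cap */

	close(fd);
	return 0;
}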
3643 
3644 SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
3645 {
3646  return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
3647 }
3648 
3649 /*
3650  * The worst of all namespace operations - renaming a directory. "Perverted"
3651  * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
3652  * Problems:
3653  * a) we can get into loop creation. Check is done in is_subdir().
3654  * b) race potential - two innocent renames can create a loop together.
3655  * That's where 4.4 screws up. Current fix: serialization on
3656  * sb->s_vfs_rename_mutex. We might be more accurate, but that's another
3657  * story.
3658  * c) we have to lock _three_ objects - parents and victim (if it exists).
3659  * And that - after we got ->i_mutex on parents (until then we don't know
3660  * whether the target exists). Solution: try to be smart with locking
3661  * order for inodes. We rely on the fact that tree topology may change
3662  * only under ->s_vfs_rename_mutex _and_ that parent of the object we
3663  * move will be locked. Thus we can rank directories by the tree
3664  * (ancestors first) and rank all non-directories after them.
3665  * That works since everybody except rename does "lock parent, lookup,
3666  * lock child" and rename is under ->s_vfs_rename_mutex.
3667  * HOWEVER, it relies on the assumption that any object with ->lookup()
3668  * has no more than 1 dentry. If "hybrid" objects will ever appear,
3669  * we'd better make sure that there's no link(2) for them.
3670  * d) conversion from fhandle to dentry may come in the wrong moment - when
3671  * we are removing the target. Solution: we will have to grab ->i_mutex
3672  * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
3673  * ->i_mutex on parents, which works but leads to some truly excessive
3674  * locking].
3675  */
3676 static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
3677  struct inode *new_dir, struct dentry *new_dentry)
3678 {
3679  int error = 0;
3680  struct inode *target = new_dentry->d_inode;
3681  unsigned max_links = new_dir->i_sb->s_max_links;
3682 
3683  /*
3684  * If we are going to change the parent - check write permissions,
3685  * we'll need to flip '..'.
3686  */
3687  if (new_dir != old_dir) {
3688  error = inode_permission(old_dentry->d_inode, MAY_WRITE);
3689  if (error)
3690  return error;
3691  }
3692 
3693  error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
3694  if (error)
3695  return error;
3696 
3697  dget(new_dentry);
3698  if (target)
3699  mutex_lock(&target->i_mutex);
3700 
3701  error = -EBUSY;
3702  if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
3703  goto out;
3704 
3705  error = -EMLINK;
3706  if (max_links && !target && new_dir != old_dir &&
3707  new_dir->i_nlink >= max_links)
3708  goto out;
3709 
3710  if (target)
3711  shrink_dcache_parent(new_dentry);
3712  error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
3713  if (error)
3714  goto out;
3715 
3716  if (target) {
3717  target->i_flags |= S_DEAD;
3718  dont_mount(new_dentry);
3719  }
3720 out:
3721  if (target)
3722  mutex_unlock(&target->i_mutex);
3723  dput(new_dentry);
3724  if (!error)
3725  if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
3726  d_move(old_dentry,new_dentry);
3727  return error;
3728 }
3729 
3730 static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
3731  struct inode *new_dir, struct dentry *new_dentry)
3732 {
3733  struct inode *target = new_dentry->d_inode;
3734  int error;
3735 
3736  error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
3737  if (error)
3738  return error;
3739 
3740  dget(new_dentry);
3741  if (target)
3742  mutex_lock(&target->i_mutex);
3743 
3744  error = -EBUSY;
3745  if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
3746  goto out;
3747 
3748  error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
3749  if (error)
3750  goto out;
3751 
3752  if (target)
3753  dont_mount(new_dentry);
3754  if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
3755  d_move(old_dentry, new_dentry);
3756 out:
3757  if (target)
3758  mutex_unlock(&target->i_mutex);
3759  dput(new_dentry);
3760  return error;
3761 }
3762 
3763 int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
3764  struct inode *new_dir, struct dentry *new_dentry)
3765 {
3766  int error;
3767  int is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
3768  const unsigned char *old_name;
3769 
3770  if (old_dentry->d_inode == new_dentry->d_inode)
3771  return 0;
3772 
3773  error = may_delete(old_dir, old_dentry, is_dir);
3774  if (error)
3775  return error;
3776 
3777  if (!new_dentry->d_inode)
3778  error = may_create(new_dir, new_dentry);
3779  else
3780  error = may_delete(new_dir, new_dentry, is_dir);
3781  if (error)
3782  return error;
3783 
3784  if (!old_dir->i_op->rename)
3785  return -EPERM;
3786 
3787  old_name = fsnotify_oldname_init(old_dentry->d_name.name);
3788 
3789  if (is_dir)
3790  error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
3791  else
3792  error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry);
3793  if (!error)
3794  fsnotify_move(old_dir, new_dir, old_name, is_dir,
3795  new_dentry->d_inode, old_dentry);
3796  fsnotify_oldname_free(old_name);
3797 
3798  return error;
3799 }
3800 
3801 SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
3802  int, newdfd, const char __user *, newname)
3803 {
3804  struct dentry *old_dir, *new_dir;
3805  struct dentry *old_dentry, *new_dentry;
3806  struct dentry *trap;
3807  struct nameidata oldnd, newnd;
3808  struct filename *from;
3809  struct filename *to;
3810  int error;
3811 
3812  from = user_path_parent(olddfd, oldname, &oldnd);
3813  if (IS_ERR(from)) {
3814  error = PTR_ERR(from);
3815  goto exit;
3816  }
3817 
3818  to = user_path_parent(newdfd, newname, &newnd);
3819  if (IS_ERR(to)) {
3820  error = PTR_ERR(to);
3821  goto exit1;
3822  }
3823 
3824  error = -EXDEV;
3825  if (oldnd.path.mnt != newnd.path.mnt)
3826  goto exit2;
3827 
3828  old_dir = oldnd.path.dentry;
3829  error = -EBUSY;
3830  if (oldnd.last_type != LAST_NORM)
3831  goto exit2;
3832 
3833  new_dir = newnd.path.dentry;
3834  if (newnd.last_type != LAST_NORM)
3835  goto exit2;
3836 
3837  error = mnt_want_write(oldnd.path.mnt);
3838  if (error)
3839  goto exit2;
3840 
3841  oldnd.flags &= ~LOOKUP_PARENT;
3842  newnd.flags &= ~LOOKUP_PARENT;
3843  newnd.flags |= LOOKUP_RENAME_TARGET;
3844 
3845  trap = lock_rename(new_dir, old_dir);
3846 
3847  old_dentry = lookup_hash(&oldnd);
3848  error = PTR_ERR(old_dentry);
3849  if (IS_ERR(old_dentry))
3850  goto exit3;
3851  /* source must exist */
3852  error = -ENOENT;
3853  if (!old_dentry->d_inode)
3854  goto exit4;
3855  /* unless the source is a directory trailing slashes give -ENOTDIR */
3856  if (!S_ISDIR(old_dentry->d_inode->i_mode)) {
3857  error = -ENOTDIR;
3858  if (oldnd.last.name[oldnd.last.len])
3859  goto exit4;
3860  if (newnd.last.name[newnd.last.len])
3861  goto exit4;
3862  }
3863  /* source should not be ancestor of target */
3864  error = -EINVAL;
3865  if (old_dentry == trap)
3866  goto exit4;
3867  new_dentry = lookup_hash(&newnd);
3868  error = PTR_ERR(new_dentry);
3869  if (IS_ERR(new_dentry))
3870  goto exit4;
3871  /* target should not be an ancestor of source */
3872  error = -ENOTEMPTY;
3873  if (new_dentry == trap)
3874  goto exit5;
3875 
3876  error = security_path_rename(&oldnd.path, old_dentry,
3877  &newnd.path, new_dentry);
3878  if (error)
3879  goto exit5;
3880  error = vfs_rename(old_dir->d_inode, old_dentry,
3881  new_dir->d_inode, new_dentry);
3882 exit5:
3883  dput(new_dentry);
3884 exit4:
3885  dput(old_dentry);
3886 exit3:
3887  unlock_rename(new_dir, old_dir);
3888  mnt_drop_write(oldnd.path.mnt);
3889 exit2:
3890  path_put(&newnd.path);
3891  putname(to);
3892 exit1:
3893  path_put(&oldnd.path);
3894  putname(from);
3895 exit:
3896  return error;
3897 }
3898 
3899 SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
3900 {
3901  return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname);
3902 }
3903 
3904 int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link)
3905 {
3906  int len;
3907 
3908  len = PTR_ERR(link);
3909  if (IS_ERR(link))
3910  goto out;
3911 
3912  len = strlen(link);
3913  if (len > (unsigned) buflen)
3914  len = buflen;
3915  if (copy_to_user(buffer, link, len))
3916  len = -EFAULT;
3917 out:
3918  return len;
3919 }
3920 
3921 /*
3922  * A helper for ->readlink(). This should be used *ONLY* for symlinks that
3923  * have ->follow_link() touching nd only in nd_set_link(). Using (or not
3924  * using) it for any given inode is up to the filesystem.
3925  */
3926 int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
3927 {
3928  struct nameidata nd;
3929  void *cookie;
3930  int res;
3931 
3932  nd.depth = 0;
3933  cookie = dentry->d_inode->i_op->follow_link(dentry, &nd);
3934  if (IS_ERR(cookie))
3935  return PTR_ERR(cookie);
3936 
3937  res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd));
3938  if (dentry->d_inode->i_op->put_link)
3939  dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
3940  return res;
3941 }
3942 
3943 int vfs_follow_link(struct nameidata *nd, const char *link)
3944 {
3945  return __vfs_follow_link(nd, link);
3946 }
3947 
3948 /* get the link contents into pagecache */
3949 static char *page_getlink(struct dentry * dentry, struct page **ppage)
3950 {
3951  char *kaddr;
3952  struct page *page;
3953  struct address_space *mapping = dentry->d_inode->i_mapping;
3954  page = read_mapping_page(mapping, 0, NULL);
3955  if (IS_ERR(page))
3956  return (char*)page;
3957  *ppage = page;
3958  kaddr = kmap(page);
3959  nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1);
3960  return kaddr;
3961 }
3962 
3963 int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
3964 {
3965  struct page *page = NULL;
3966  char *s = page_getlink(dentry, &page);
3967  int res = vfs_readlink(dentry,buffer,buflen,s);
3968  if (page) {
3969  kunmap(page);
3970  page_cache_release(page);
3971  }
3972  return res;
3973 }
3974 
3975 void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd)
3976 {
3977  struct page *page = NULL;
3978  nd_set_link(nd, page_getlink(dentry, &page));
3979  return page;
3980 }
3981 
3982 void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
3983 {
3984  struct page *page = cookie;
3985 
3986  if (page) {
3987  kunmap(page);
3988  page_cache_release(page);
3989  }
3990 }
3991 
3992 /*
3993  * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
3994  */
3995 int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
3996 {
3997  struct address_space *mapping = inode->i_mapping;
3998  struct page *page;
3999  void *fsdata;
4000  int err;
4001  char *kaddr;
4002  unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
4003  if (nofs)
4004  flags |= AOP_FLAG_NOFS;
4005 
4006 retry:
4007  err = pagecache_write_begin(NULL, mapping, 0, len-1,
4008  flags, &page, &fsdata);
4009  if (err)
4010  goto fail;
4011 
4012  kaddr = kmap_atomic(page);
4013  memcpy(kaddr, symname, len-1);
4014  kunmap_atomic(kaddr);
4015 
4016  err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
4017  page, fsdata);
4018  if (err < 0)
4019  goto fail;
4020  if (err < len-1)
4021  goto retry;
4022 
4023  mark_inode_dirty(inode);
4024  return 0;
4025 fail:
4026  return err;
4027 }
4028 
4029 int page_symlink(struct inode *inode, const char *symname, int len)
4030 {
4031  return __page_symlink(inode, symname, len,
4032  !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
4033 }
4034 
4035 const struct inode_operations page_symlink_inode_operations = {
4036  .readlink = generic_readlink,
4037  .follow_link = page_follow_link_light,
4038  .put_link = page_put_link,
4039 };
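A filesystem that keeps symlink bodies in the pagecache wires these pieces together in its own ->symlink() method, roughly as sketched below; the "examplefs_" helpers are hypothetical, and ext2 follows essentially this pattern:

/* Hedged sketch of a pagecache-backed ->symlink() implementation. */
static int examplefs_symlink(struct inode *dir, struct dentry *dentry,
			     const char *symname)
{
	struct inode *inode;
	int err, len = strlen(symname) + 1;

	inode = examplefs_new_inode(dir, S_IFLNK | S_IRWXUGO);	/* hypothetical */
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	inode->i_op = &page_symlink_inode_operations;
	inode->i_mapping->a_ops = &examplefs_aops;		/* hypothetical */

	/* Store the target in the pagecache; readlink()/follow_link()
	 * read it back through page_getlink() above. */
	err = page_symlink(inode, symname, len);
	if (err) {
		iput(inode);
		return err;
	}

	d_instantiate(dentry, inode);
	return 0;
}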
4040 
4045 EXPORT_SYMBOL(get_write_access); /* nfsd */
4053 EXPORT_SYMBOL(page_symlink_inode_operations);