Linux Kernel 3.7.1
file.c
/*
 *  linux/fs/file.c
 *
 *  Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
 *
 *  Manage the dynamic fd arrays in the process files_struct.
 */

#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/time.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/bitops.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/workqueue.h>

struct fdtable_defer {
        spinlock_t lock;
        struct work_struct wq;
        struct fdtable *next;
};

int sysctl_nr_open __read_mostly = 1024*1024;
int sysctl_nr_open_min = BITS_PER_LONG;
int sysctl_nr_open_max = 1024 * 1024; /* raised later */

/*
 * We use this list to defer free fdtables that have vmalloced
 * sets/arrays. By keeping a per-cpu list, we avoid having to embed
 * the work_struct in fdtable itself which avoids a 64 byte (i386) increase in
 * this per-task structure.
 */
static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);

static void *alloc_fdmem(size_t size)
{
        /*
         * Very large allocations can stress page reclaim, so fall back to
         * vmalloc() if the allocation size will be considered "large" by the VM.
         */
        if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
                void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN);
                if (data != NULL)
                        return data;
        }
        return vmalloc(size);
}
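/*
 * For scale: with 4 KiB pages and PAGE_ALLOC_COSTLY_ORDER == 3 (typical
 * values), the kmalloc() path above handles requests up to
 * 4096 << 3 = 32 KiB; larger fd arrays fall back to vmalloc().
 */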

static void free_fdmem(void *ptr)
{
        is_vmalloc_addr(ptr) ? vfree(ptr) : kfree(ptr);
}

static void __free_fdtable(struct fdtable *fdt)
{
        free_fdmem(fdt->fd);
        free_fdmem(fdt->open_fds);
        kfree(fdt);
}

static void free_fdtable_work(struct work_struct *work)
{
        struct fdtable_defer *f =
                container_of(work, struct fdtable_defer, wq);
        struct fdtable *fdt;

        spin_lock_bh(&f->lock);
        fdt = f->next;
        f->next = NULL;
        spin_unlock_bh(&f->lock);
        while(fdt) {
                struct fdtable *next = fdt->next;

                __free_fdtable(fdt);
                fdt = next;
        }
}

static void free_fdtable_rcu(struct rcu_head *rcu)
{
        struct fdtable *fdt = container_of(rcu, struct fdtable, rcu);
        struct fdtable_defer *fddef;

        BUG_ON(!fdt);
        BUG_ON(fdt->max_fds <= NR_OPEN_DEFAULT);

        if (!is_vmalloc_addr(fdt->fd) && !is_vmalloc_addr(fdt->open_fds)) {
                kfree(fdt->fd);
                kfree(fdt->open_fds);
                kfree(fdt);
        } else {
                fddef = &get_cpu_var(fdtable_defer_list);
                spin_lock(&fddef->lock);
                fdt->next = fddef->next;
                fddef->next = fdt;
                /* vmallocs are handled from the workqueue context */
                schedule_work(&fddef->wq);
                spin_unlock(&fddef->lock);
                put_cpu_var(fdtable_defer_list);
        }
}

/*
 * Expand the fdset in the files_struct. Called with the files spinlock
 * held for write.
 */
static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
{
        unsigned int cpy, set;

        BUG_ON(nfdt->max_fds < ofdt->max_fds);

        cpy = ofdt->max_fds * sizeof(struct file *);
        set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
        memcpy(nfdt->fd, ofdt->fd, cpy);
        memset((char *)(nfdt->fd) + cpy, 0, set);

        cpy = ofdt->max_fds / BITS_PER_BYTE;
        set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE;
        memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
        memset((char *)(nfdt->open_fds) + cpy, 0, set);
        memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
        memset((char *)(nfdt->close_on_exec) + cpy, 0, set);
}
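/*
 * Worked example of the sizes above (assuming 8-byte pointers): growing
 * from max_fds = 256 to max_fds = 1024 copies 256 * 8 = 2048 bytes of
 * struct file pointers and zeroes the remaining 6144 bytes, then copies
 * 256 / 8 = 32 bytes of each bitmap and zeroes the following 96 bytes.
 */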

static struct fdtable * alloc_fdtable(unsigned int nr)
{
        struct fdtable *fdt;
        void *data;

        /*
         * Figure out how many fds we actually want to support in this fdtable.
         * Allocation steps are keyed to the size of the fdarray, since it
         * grows far faster than any of the other dynamic data. We try to fit
         * the fdarray into comfortable page-tuned chunks: starting at 1024B
         * and growing in powers of two from there on.
         */
        nr /= (1024 / sizeof(struct file *));
        nr = roundup_pow_of_two(nr + 1);
        nr *= (1024 / sizeof(struct file *));
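        /*
         * Worked example (64-bit, so 1024 / sizeof(struct file *) == 128):
         * nr = 300 becomes 300 / 128 = 2, is rounded up to the next power
         * of two (2 + 1 -> 4), and ends up as 4 * 128 = 512 slots, i.e. a
         * 4 KiB fdarray.
         */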
        /*
         * Note that this can drive nr *below* what we had passed if sysctl_nr_open
         * had been set lower between the check in expand_files() and here. Deal
         * with that in caller, it's cheaper that way.
         *
         * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
         * bitmaps handling below becomes unpleasant, to put it mildly...
         */
        if (unlikely(nr > sysctl_nr_open))
                nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;

        fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
        if (!fdt)
                goto out;
        fdt->max_fds = nr;
        data = alloc_fdmem(nr * sizeof(struct file *));
        if (!data)
                goto out_fdt;
        fdt->fd = data;

        data = alloc_fdmem(max_t(size_t,
                                 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES));
        if (!data)
                goto out_arr;
        fdt->open_fds = data;
        data += nr / BITS_PER_BYTE;
        fdt->close_on_exec = data;
        fdt->next = NULL;

        return fdt;

out_arr:
        free_fdmem(fdt->fd);
out_fdt:
        kfree(fdt);
out:
        return NULL;
}

/*
 * Expand the file descriptor table.
 * This function will allocate a new fdtable and both fd array and fdset, of
 * the given size.
 * Return <0 error code on error; 1 on successful completion.
 * The files->file_lock should be held on entry, and will be held on exit.
 */
static int expand_fdtable(struct files_struct *files, int nr)
        __releases(files->file_lock)
        __acquires(files->file_lock)
{
        struct fdtable *new_fdt, *cur_fdt;

        spin_unlock(&files->file_lock);
        new_fdt = alloc_fdtable(nr);
        spin_lock(&files->file_lock);
        if (!new_fdt)
                return -ENOMEM;
        /*
         * extremely unlikely race - sysctl_nr_open decreased between the check in
         * caller and alloc_fdtable(). Cheaper to catch it here...
         */
        if (unlikely(new_fdt->max_fds <= nr)) {
                __free_fdtable(new_fdt);
                return -EMFILE;
        }
        /*
         * Check again since another task may have expanded the fd table while
         * we dropped the lock
         */
        cur_fdt = files_fdtable(files);
        if (nr >= cur_fdt->max_fds) {
                /* Continue as planned */
                copy_fdtable(new_fdt, cur_fdt);
                rcu_assign_pointer(files->fdt, new_fdt);
                if (cur_fdt->max_fds > NR_OPEN_DEFAULT)
                        call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
        } else {
                /* Somebody else expanded, so undo our attempt */
                __free_fdtable(new_fdt);
        }
        return 1;
}

/*
 * Expand files.
 * This function will expand the file structures, if the requested size exceeds
 * the current capacity and there is room for expansion.
 * Return <0 error code on error; 0 when nothing done; 1 when files were
 * expanded and execution may have blocked.
 * The files->file_lock should be held on entry, and will be held on exit.
 */
static int expand_files(struct files_struct *files, int nr)
{
        struct fdtable *fdt;

        fdt = files_fdtable(files);

        /* Do we need to expand? */
        if (nr < fdt->max_fds)
                return 0;

        /* Can we expand? */
        if (nr >= sysctl_nr_open)
                return -EMFILE;

        /* All good, so we try */
        return expand_fdtable(files, nr);
}

static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
{
        __set_bit(fd, fdt->close_on_exec);
}

static inline void __clear_close_on_exec(int fd, struct fdtable *fdt)
{
        __clear_bit(fd, fdt->close_on_exec);
}

static inline void __set_open_fd(int fd, struct fdtable *fdt)
{
        __set_bit(fd, fdt->open_fds);
}

static inline void __clear_open_fd(int fd, struct fdtable *fdt)
{
        __clear_bit(fd, fdt->open_fds);
}

static int count_open_files(struct fdtable *fdt)
{
        int size = fdt->max_fds;
        int i;

        /* Find the last open fd */
        for (i = size / BITS_PER_LONG; i > 0; ) {
                if (fdt->open_fds[--i])
                        break;
        }
        i = (i + 1) * BITS_PER_LONG;
        return i;
}
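/*
 * Note that count_open_files() rounds up to a whole word: e.g. with
 * BITS_PER_LONG == 64 and fd 70 as the highest open descriptor, the last
 * non-zero bitmap word is word 1, so 2 * 64 = 128 slots are reported.
 */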

/*
 * Allocate a new files structure and copy contents from the
 * passed in files structure.
 * errorp will be valid only when the returned files_struct is NULL.
 */
struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
{
        struct files_struct *newf;
        struct file **old_fds, **new_fds;
        int open_files, size, i;
        struct fdtable *old_fdt, *new_fdt;

        *errorp = -ENOMEM;
        newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
        if (!newf)
                goto out;

        atomic_set(&newf->count, 1);

        spin_lock_init(&newf->file_lock);
        newf->next_fd = 0;
        new_fdt = &newf->fdtab;
        new_fdt->max_fds = NR_OPEN_DEFAULT;
        new_fdt->close_on_exec = newf->close_on_exec_init;
        new_fdt->open_fds = newf->open_fds_init;
        new_fdt->fd = &newf->fd_array[0];
        new_fdt->next = NULL;

        spin_lock(&oldf->file_lock);
        old_fdt = files_fdtable(oldf);
        open_files = count_open_files(old_fdt);

        /*
         * Check whether we need to allocate a larger fd array and fd set.
         */
        while (unlikely(open_files > new_fdt->max_fds)) {
                spin_unlock(&oldf->file_lock);

                if (new_fdt != &newf->fdtab)
                        __free_fdtable(new_fdt);

                new_fdt = alloc_fdtable(open_files - 1);
                if (!new_fdt) {
                        *errorp = -ENOMEM;
                        goto out_release;
                }

                /* beyond sysctl_nr_open; nothing to do */
                if (unlikely(new_fdt->max_fds < open_files)) {
                        __free_fdtable(new_fdt);
                        *errorp = -EMFILE;
                        goto out_release;
                }

                /*
                 * Reacquire the oldf lock and a pointer to its fd table
                 * who knows it may have a new bigger fd table. We need
                 * the latest pointer.
                 */
                spin_lock(&oldf->file_lock);
                old_fdt = files_fdtable(oldf);
                open_files = count_open_files(old_fdt);
        }

        old_fds = old_fdt->fd;
        new_fds = new_fdt->fd;

        memcpy(new_fdt->open_fds, old_fdt->open_fds, open_files / 8);
        memcpy(new_fdt->close_on_exec, old_fdt->close_on_exec, open_files / 8);

        for (i = open_files; i != 0; i--) {
                struct file *f = *old_fds++;
                if (f) {
                        get_file(f);
                } else {
                        /*
                         * The fd may be claimed in the fd bitmap but not yet
                         * instantiated in the files array if a sibling thread
                         * is partway through open(). So make sure that this
                         * fd is available to the new process.
                         */
                        __clear_open_fd(open_files - i, new_fdt);
                }
                rcu_assign_pointer(*new_fds++, f);
        }
        spin_unlock(&oldf->file_lock);

        /* compute the remainder to be cleared */
        size = (new_fdt->max_fds - open_files) * sizeof(struct file *);

        /* This is long word aligned thus could use a optimized version */
        memset(new_fds, 0, size);

        if (new_fdt->max_fds > open_files) {
                int left = (new_fdt->max_fds - open_files) / 8;
                int start = open_files / BITS_PER_LONG;

                memset(&new_fdt->open_fds[start], 0, left);
                memset(&new_fdt->close_on_exec[start], 0, left);
        }

        rcu_assign_pointer(newf->fdt, new_fdt);

        return newf;

out_release:
        kmem_cache_free(files_cachep, newf);
out:
        return NULL;
}

static void close_files(struct files_struct * files)
{
        int i, j;
        struct fdtable *fdt;

        j = 0;

        /*
         * It is safe to dereference the fd table without RCU or
         * ->file_lock because this is the last reference to the
         * files structure. But use RCU to shut RCU-lockdep up.
         */
        rcu_read_lock();
        fdt = files_fdtable(files);
        rcu_read_unlock();
        for (;;) {
                unsigned long set;
                i = j * BITS_PER_LONG;
                if (i >= fdt->max_fds)
                        break;
                set = fdt->open_fds[j++];
                while (set) {
                        if (set & 1) {
                                struct file * file = xchg(&fdt->fd[i], NULL);
                                if (file) {
                                        filp_close(file, files);
                                        cond_resched();
                                }
                        }
                        i++;
                        set >>= 1;
                }
        }
}

struct files_struct *get_files_struct(struct task_struct *task)
{
        struct files_struct *files;

        task_lock(task);
        files = task->files;
        if (files)
                atomic_inc(&files->count);
        task_unlock(task);

        return files;
}

void put_files_struct(struct files_struct *files)
{
        struct fdtable *fdt;

        if (atomic_dec_and_test(&files->count)) {
                close_files(files);
                /* not really needed, since nobody can see us */
                rcu_read_lock();
                fdt = files_fdtable(files);
                rcu_read_unlock();
                /* free the arrays if they are not embedded */
                if (fdt != &files->fdtab)
                        __free_fdtable(fdt);
                kmem_cache_free(files_cachep, files);
        }
}

void reset_files_struct(struct files_struct *files)
{
        struct task_struct *tsk = current;
        struct files_struct *old;

        old = tsk->files;
        task_lock(tsk);
        tsk->files = files;
        task_unlock(tsk);
        put_files_struct(old);
}

void exit_files(struct task_struct *tsk)
{
        struct files_struct * files = tsk->files;

        if (files) {
                task_lock(tsk);
                tsk->files = NULL;
                task_unlock(tsk);
                put_files_struct(files);
        }
}

static void __devinit fdtable_defer_list_init(int cpu)
{
        struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu);
        spin_lock_init(&fddef->lock);
        INIT_WORK(&fddef->wq, free_fdtable_work);
        fddef->next = NULL;
}

void __init files_defer_init(void)
{
        int i;
        for_each_possible_cpu(i)
                fdtable_defer_list_init(i);
        sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) &
                             -BITS_PER_LONG;
}

struct files_struct init_files = {
        .count = ATOMIC_INIT(1),
        .fdt = &init_files.fdtab,
        .fdtab = {
                .max_fds = NR_OPEN_DEFAULT,
                .fd = &init_files.fd_array[0],
                .close_on_exec = init_files.close_on_exec_init,
                .open_fds = init_files.open_fds_init,
        },
        .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock),
};

void daemonize_descriptors(void)
{
        atomic_inc(&init_files.count);
        reset_files_struct(&init_files);
}

/*
 * allocate a file descriptor, mark it busy.
 */
int __alloc_fd(struct files_struct *files,
               unsigned start, unsigned end, unsigned flags)
{
        unsigned int fd;
        int error;
        struct fdtable *fdt;

        spin_lock(&files->file_lock);
repeat:
        fdt = files_fdtable(files);
        fd = start;
        if (fd < files->next_fd)
                fd = files->next_fd;

        if (fd < fdt->max_fds)
                fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd);

        /*
         * N.B. For clone tasks sharing a files structure, this test
         * will limit the total number of files that can be opened.
         */
        error = -EMFILE;
        if (fd >= end)
                goto out;

        error = expand_files(files, fd);
        if (error < 0)
                goto out;

        /*
         * If we needed to expand the fs array we
         * might have blocked - try again.
         */
        if (error)
                goto repeat;

        if (start <= files->next_fd)
                files->next_fd = fd + 1;

        __set_open_fd(fd, fdt);
        if (flags & O_CLOEXEC)
                __set_close_on_exec(fd, fdt);
        else
                __clear_close_on_exec(fd, fdt);
        error = fd;
#if 1
        /* Sanity check */
        if (rcu_dereference_raw(fdt->fd[fd]) != NULL) {
                printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
                rcu_assign_pointer(fdt->fd[fd], NULL);
        }
#endif

out:
        spin_unlock(&files->file_lock);
        return error;
}

static int alloc_fd(unsigned start, unsigned flags)
{
        return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags);
}

int get_unused_fd_flags(unsigned flags)
{
        return __alloc_fd(current->files, 0, rlimit(RLIMIT_NOFILE), flags);
}
EXPORT_SYMBOL(get_unused_fd_flags);

static void __put_unused_fd(struct files_struct *files, unsigned int fd)
{
        struct fdtable *fdt = files_fdtable(files);
        __clear_open_fd(fd, fdt);
        if (fd < files->next_fd)
                files->next_fd = fd;
}

void put_unused_fd(unsigned int fd)
{
        struct files_struct *files = current->files;
        spin_lock(&files->file_lock);
        __put_unused_fd(files, fd);
        spin_unlock(&files->file_lock);
}

EXPORT_SYMBOL(put_unused_fd);

/*
 * Install a file pointer in the fd array.
 *
 * The VFS is full of places where we drop the files lock between
 * setting the open_fds bitmap and installing the file in the file
 * array. At any such point, we are vulnerable to a dup2() race
 * installing a file in the array before us. We need to detect this and
 * fput() the struct file we are about to overwrite in this case.
 *
 * It should never happen - if we allow dup2() do it, _really_ bad things
 * will follow.
 *
 * NOTE: __fd_install() variant is really, really low-level; don't
 * use it unless you are forced to by truly lousy API shoved down
 * your throat. 'files' *MUST* be either current->files or obtained
 * by get_files_struct(current) done by whoever had given it to you,
 * or really bad things will happen. Normally you want to use
 * fd_install() instead.
 */

void __fd_install(struct files_struct *files, unsigned int fd,
                struct file *file)
{
        struct fdtable *fdt;
        spin_lock(&files->file_lock);
        fdt = files_fdtable(files);
        BUG_ON(fdt->fd[fd] != NULL);
        rcu_assign_pointer(fdt->fd[fd], file);
        spin_unlock(&files->file_lock);
}

void fd_install(unsigned int fd, struct file *file)
{
        __fd_install(current->files, fd, file);
}

EXPORT_SYMBOL(fd_install);

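/*
 * Typical pairing of these helpers on an open()-style path (sketch only;
 * error handling abbreviated, and get_file_somehow() is a placeholder for
 * whatever produces the struct file *):
 *
 *	fd = get_unused_fd_flags(O_CLOEXEC);
 *	if (fd < 0)
 *		return fd;
 *	file = get_file_somehow();
 *	if (IS_ERR(file)) {
 *		put_unused_fd(fd);	- release the reserved slot
 *		return PTR_ERR(file);
 *	}
 *	fd_install(fd, file);		- publish; the descriptor is now live
 *	return fd;
 */
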
/*
 * The same warnings as for __alloc_fd()/__fd_install() apply here...
 */
int __close_fd(struct files_struct *files, unsigned fd)
{
        struct file *file;
        struct fdtable *fdt;

        spin_lock(&files->file_lock);
        fdt = files_fdtable(files);
        if (fd >= fdt->max_fds)
                goto out_unlock;
        file = fdt->fd[fd];
        if (!file)
                goto out_unlock;
        rcu_assign_pointer(fdt->fd[fd], NULL);
        __clear_close_on_exec(fd, fdt);
        __put_unused_fd(files, fd);
        spin_unlock(&files->file_lock);
        return filp_close(file, files);

out_unlock:
        spin_unlock(&files->file_lock);
        return -EBADF;
}

void do_close_on_exec(struct files_struct *files)
{
        unsigned i;
        struct fdtable *fdt;

        /* exec unshares first */
        spin_lock(&files->file_lock);
        for (i = 0; ; i++) {
                unsigned long set;
                unsigned fd = i * BITS_PER_LONG;
                fdt = files_fdtable(files);
                if (fd >= fdt->max_fds)
                        break;
                set = fdt->close_on_exec[i];
                if (!set)
                        continue;
                fdt->close_on_exec[i] = 0;
                for ( ; set ; fd++, set >>= 1) {
                        struct file *file;
                        if (!(set & 1))
                                continue;
                        file = fdt->fd[fd];
                        if (!file)
                                continue;
                        rcu_assign_pointer(fdt->fd[fd], NULL);
                        __put_unused_fd(files, fd);
                        spin_unlock(&files->file_lock);
                        filp_close(file, files);
                        cond_resched();
                        spin_lock(&files->file_lock);
                }

        }
        spin_unlock(&files->file_lock);
}

struct file *fget(unsigned int fd)
{
        struct file *file;
        struct files_struct *files = current->files;

        rcu_read_lock();
        file = fcheck_files(files, fd);
        if (file) {
                /* File object ref couldn't be taken */
                if (file->f_mode & FMODE_PATH ||
                    !atomic_long_inc_not_zero(&file->f_count))
                        file = NULL;
        }
        rcu_read_unlock();

        return file;
}

EXPORT_SYMBOL(fget);

struct file *fget_raw(unsigned int fd)
{
        struct file *file;
        struct files_struct *files = current->files;

        rcu_read_lock();
        file = fcheck_files(files, fd);
        if (file) {
                /* File object ref couldn't be taken */
                if (!atomic_long_inc_not_zero(&file->f_count))
                        file = NULL;
        }
        rcu_read_unlock();

        return file;
}

EXPORT_SYMBOL(fget_raw);

/*
 * Lightweight file lookup - no refcnt increment if fd table isn't shared.
 *
 * You can use this instead of fget if you satisfy all of the following
 * conditions:
 * 1) You must call fput_light before exiting the syscall and returning control
 *    to userspace (i.e. you cannot remember the returned struct file * after
 *    returning to userspace).
 * 2) You must not call filp_close on the returned struct file * in between
 *    calls to fget_light and fput_light.
 * 3) You must not clone the current task in between the calls to fget_light
 *    and fput_light.
 *
 * The fput_needed flag returned by fget_light should be passed to the
 * corresponding fput_light.
 */
struct file *fget_light(unsigned int fd, int *fput_needed)
{
        struct file *file;
        struct files_struct *files = current->files;

        *fput_needed = 0;
        if (atomic_read(&files->count) == 1) {
                file = fcheck_files(files, fd);
                if (file && (file->f_mode & FMODE_PATH))
                        file = NULL;
        } else {
                rcu_read_lock();
                file = fcheck_files(files, fd);
                if (file) {
                        if (!(file->f_mode & FMODE_PATH) &&
                            atomic_long_inc_not_zero(&file->f_count))
                                *fput_needed = 1;
                        else
                                /* Didn't get the reference, someone's freed */
                                file = NULL;
                }
                rcu_read_unlock();
        }

        return file;
}
EXPORT_SYMBOL(fget_light);
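
/*
 * Sketch of the intended fget_light()/fput_light() pairing inside a
 * syscall (do_something() stands in for the actual work on the file):
 *
 *	int fput_needed;
 *	struct file *file = fget_light(fd, &fput_needed);
 *
 *	if (!file)
 *		return -EBADF;
 *	err = do_something(file);
 *	fput_light(file, fput_needed);	- drops the ref only if one was taken
 *	return err;
 */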

struct file *fget_raw_light(unsigned int fd, int *fput_needed)
{
        struct file *file;
        struct files_struct *files = current->files;

        *fput_needed = 0;
        if (atomic_read(&files->count) == 1) {
                file = fcheck_files(files, fd);
        } else {
                rcu_read_lock();
                file = fcheck_files(files, fd);
                if (file) {
                        if (atomic_long_inc_not_zero(&file->f_count))
                                *fput_needed = 1;
                        else
                                /* Didn't get the reference, someone's freed */
                                file = NULL;
                }
                rcu_read_unlock();
        }

        return file;
}

void set_close_on_exec(unsigned int fd, int flag)
{
        struct files_struct *files = current->files;
        struct fdtable *fdt;
        spin_lock(&files->file_lock);
        fdt = files_fdtable(files);
        if (flag)
                __set_close_on_exec(fd, fdt);
        else
                __clear_close_on_exec(fd, fdt);
        spin_unlock(&files->file_lock);
}

bool get_close_on_exec(unsigned int fd)
{
        struct files_struct *files = current->files;
        struct fdtable *fdt;
        bool res;
        rcu_read_lock();
        fdt = files_fdtable(files);
        res = close_on_exec(fd, fdt);
        rcu_read_unlock();
        return res;
}

static int do_dup2(struct files_struct *files,
        struct file *file, unsigned fd, unsigned flags)
{
        struct file *tofree;
        struct fdtable *fdt;

        /*
         * We need to detect attempts to do dup2() over allocated but still
         * not finished descriptor. NB: OpenBSD avoids that at the price of
         * extra work in their equivalent of fget() - they insert struct
         * file immediately after grabbing descriptor, mark it larval if
         * more work (e.g. actual opening) is needed and make sure that
         * fget() treats larval files as absent. Potentially interesting,
         * but while extra work in fget() is trivial, locking implications
         * and amount of surgery on open()-related paths in VFS are not.
         * FreeBSD fails with -EBADF in the same situation, NetBSD "solution"
         * deadlocks in rather amusing ways, AFAICS. All of that is out of
         * scope of POSIX or SUS, since neither considers shared descriptor
         * tables and this condition does not arise without those.
         */
        fdt = files_fdtable(files);
        tofree = fdt->fd[fd];
        if (!tofree && fd_is_open(fd, fdt))
                goto Ebusy;
        get_file(file);
        rcu_assign_pointer(fdt->fd[fd], file);
        __set_open_fd(fd, fdt);
        if (flags & O_CLOEXEC)
                __set_close_on_exec(fd, fdt);
        else
                __clear_close_on_exec(fd, fdt);
        spin_unlock(&files->file_lock);

        if (tofree)
                filp_close(tofree, files);

        return fd;

Ebusy:
        spin_unlock(&files->file_lock);
        return -EBUSY;
}

int replace_fd(unsigned fd, struct file *file, unsigned flags)
{
        int err;
        struct files_struct *files = current->files;

        if (!file)
                return __close_fd(files, fd);

        if (fd >= rlimit(RLIMIT_NOFILE))
                return -EBADF;

        spin_lock(&files->file_lock);
        err = expand_files(files, fd);
        if (unlikely(err < 0))
                goto out_unlock;
        return do_dup2(files, file, fd, flags);

out_unlock:
        spin_unlock(&files->file_lock);
        return err;
}

SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
{
        int err = -EBADF;
        struct file *file;
        struct files_struct *files = current->files;

        if ((flags & ~O_CLOEXEC) != 0)
                return -EINVAL;

        if (unlikely(oldfd == newfd))
                return -EINVAL;

        if (newfd >= rlimit(RLIMIT_NOFILE))
                return -EBADF;

        spin_lock(&files->file_lock);
        err = expand_files(files, newfd);
        file = fcheck(oldfd);
        if (unlikely(!file))
                goto Ebadf;
        if (unlikely(err < 0)) {
                if (err == -EMFILE)
                        goto Ebadf;
                goto out_unlock;
        }
        return do_dup2(files, file, newfd, flags);

Ebadf:
        err = -EBADF;
out_unlock:
        spin_unlock(&files->file_lock);
        return err;
}

SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
{
        if (unlikely(newfd == oldfd)) { /* corner case */
                struct files_struct *files = current->files;
                int retval = oldfd;

                rcu_read_lock();
                if (!fcheck_files(files, oldfd))
                        retval = -EBADF;
                rcu_read_unlock();
                return retval;
        }
        return sys_dup3(oldfd, newfd, 0);
}

SYSCALL_DEFINE1(dup, unsigned int, fildes)
{
        int ret = -EBADF;
        struct file *file = fget_raw(fildes);

        if (file) {
                ret = get_unused_fd();
                if (ret >= 0)
                        fd_install(ret, file);
                else
                        fput(file);
        }
        return ret;
}

int f_dupfd(unsigned int from, struct file *file, unsigned flags)
{
        int err;
        if (from >= rlimit(RLIMIT_NOFILE))
                return -EINVAL;
        err = alloc_fd(from, flags);
        if (err >= 0) {
                get_file(file);
                fd_install(err, file);
        }
        return err;
}

int iterate_fd(struct files_struct *files, unsigned n,
                int (*f)(const void *, struct file *, unsigned),
                const void *p)
{
        struct fdtable *fdt;
        int res = 0;
        if (!files)
                return 0;
        spin_lock(&files->file_lock);
        for (fdt = files_fdtable(files); n < fdt->max_fds; n++) {
                struct file *file;
                file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
                if (!file)
                        continue;
                res = f(p, file, n);
                if (res)
                        break;
        }
        spin_unlock(&files->file_lock);
        return res;
}
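
/*
 * Sketch of an iterate_fd() caller: the callback's first non-zero return
 * value stops the walk and becomes iterate_fd()'s result. The names below
 * (match_file, wanted) are illustrative only.
 *
 *	static int match_file(const void *wanted, struct file *file, unsigned fd)
 *	{
 *		return file == wanted ? fd + 1 : 0;	- +1 so fd 0 is reportable
 *	}
 *
 *	found = iterate_fd(files, 0, match_file, my_file);
 */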