Linux Kernel 3.7.1
splice.c
1 /*
2  * "splice": joining two ropes together by interweaving their strands.
3  *
4  * This is the "extended pipe" functionality, where a pipe is used as
5  * an arbitrary in-memory buffer. Think of a pipe as a small kernel
6  * buffer that you can use to transfer data from one end to the other.
7  *
8  * The traditional unix read/write is extended with a "splice()" operation
9  * that transfers data buffers to or from a pipe buffer.
10  *
11  * Named by Larry McVoy, original implementation from Linus, extended by
12  * Jens to support splicing to files, network, direct splicing, etc and
13  * fixing lots of bugs.
14  *
15  * Copyright (C) 2005-2006 Jens Axboe <[email protected]>
16  * Copyright (C) 2005-2006 Linus Torvalds <[email protected]>
17  * Copyright (C) 2006 Ingo Molnar <[email protected]>
18  *
19  */
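A quick userspace illustration of the "extended pipe" model described above (a hedged sketch, not part of this file; fd_in and fd_out are assumed to be open descriptors, and the 64 KiB chunk size is arbitrary): data is spliced from the source into an anonymous pipe, then from the pipe to the destination.

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <unistd.h>

	/* Copy fd_in to fd_out through a pipe using only splice(2). */
	static int splice_copy(int fd_in, int fd_out)
	{
		int pfd[2];
		ssize_t n, m;

		if (pipe(pfd) < 0)
			return -1;

		for (;;) {
			n = splice(fd_in, NULL, pfd[1], NULL, 65536,
				   SPLICE_F_MOVE | SPLICE_F_MORE);
			if (n < 0)
				goto fail;
			if (n == 0)
				break;		/* EOF on the input */
			while (n > 0) {
				/* Drain what the first splice queued in the pipe. */
				m = splice(pfd[0], NULL, fd_out, NULL, n,
					   SPLICE_F_MOVE | SPLICE_F_MORE);
				if (m <= 0)
					goto fail;
				n -= m;
			}
		}
		close(pfd[0]);
		close(pfd[1]);
		return 0;
	fail:
		close(pfd[0]);
		close(pfd[1]);
		return -1;
	}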
20 #include <linux/fs.h>
21 #include <linux/file.h>
22 #include <linux/pagemap.h>
23 #include <linux/splice.h>
24 #include <linux/memcontrol.h>
25 #include <linux/mm_inline.h>
26 #include <linux/swap.h>
27 #include <linux/writeback.h>
28 #include <linux/export.h>
29 #include <linux/syscalls.h>
30 #include <linux/uio.h>
31 #include <linux/security.h>
32 #include <linux/gfp.h>
33 #include <linux/socket.h>
34 
35 /*
36  * Attempt to steal a page from a pipe buffer. This should perhaps go into
37  * a vm helper function; it's already simplified quite a bit by the
38  * addition of remove_mapping(). If success is returned, the caller may
39  * attempt to reuse this page for another destination.
40  */
41 static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
42  struct pipe_buffer *buf)
43 {
44  struct page *page = buf->page;
45  struct address_space *mapping;
46 
47  lock_page(page);
48 
49  mapping = page_mapping(page);
50  if (mapping) {
51  WARN_ON(!PageUptodate(page));
52 
53  /*
54  * At least for ext2 with nobh option, we need to wait on
55  * writeback completing on this page, since we'll remove it
56  * from the pagecache. Otherwise truncate won't wait on the
57  * page, allowing the disk blocks to be reused by someone else
58  * before we actually wrote our data to them. fs corruption
59  * ensues.
60  */
61  wait_on_page_writeback(page);
62 
63  if (page_has_private(page) &&
64      !try_to_release_page(page, GFP_KERNEL))
65  goto out_unlock;
66 
67  /*
68  * If we succeeded in removing the mapping, set LRU flag
69  * and return good.
70  */
71  if (remove_mapping(mapping, page)) {
72  buf->flags |= PIPE_BUF_FLAG_LRU;
73  return 0;
74  }
75  }
76 
77  /*
78  * Raced with truncate or failed to remove page from current
79  * address space, unlock and return failure.
80  */
81 out_unlock:
82  unlock_page(page);
83  return 1;
84 }
85 
86 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
87  struct pipe_buffer *buf)
88 {
89  page_cache_release(buf->page);
90  buf->flags &= ~PIPE_BUF_FLAG_LRU;
91 }
92 
93 /*
94  * Check whether the contents of buf are OK to access. Since the content
95  * is a page cache page, IO may be in flight.
96  */
97 static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
98  struct pipe_buffer *buf)
99 {
100  struct page *page = buf->page;
101  int err;
102 
103  if (!PageUptodate(page)) {
104  lock_page(page);
105 
106  /*
107  * Page got truncated/unhashed. This will cause a 0-byte
108  * splice, if this is the first page.
109  */
110  if (!page->mapping) {
111  err = -ENODATA;
112  goto error;
113  }
114 
115  /*
116  * Uh oh, read-error from disk.
117  */
118  if (!PageUptodate(page)) {
119  err = -EIO;
120  goto error;
121  }
122 
123  /*
124  * Page is ok after all, we are done.
125  */
126  unlock_page(page);
127  }
128 
129  return 0;
130 error:
131  unlock_page(page);
132  return err;
133 }
134 
135 const struct pipe_buf_operations page_cache_pipe_buf_ops = {
136  .can_merge = 0,
137  .map = generic_pipe_buf_map,
138  .unmap = generic_pipe_buf_unmap,
139  .confirm = page_cache_pipe_buf_confirm,
140  .release = page_cache_pipe_buf_release,
141  .steal = page_cache_pipe_buf_steal,
142  .get = generic_pipe_buf_get,
143 };
144 
145 static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
146  struct pipe_buffer *buf)
147 {
148  if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
149  return 1;
150 
151  buf->flags |= PIPE_BUF_FLAG_LRU;
152  return generic_pipe_buf_steal(pipe, buf);
153 }
154 
155 static const struct pipe_buf_operations user_page_pipe_buf_ops = {
156  .can_merge = 0,
157  .map = generic_pipe_buf_map,
158  .unmap = generic_pipe_buf_unmap,
159  .confirm = generic_pipe_buf_confirm,
160  .release = page_cache_pipe_buf_release,
161  .steal = user_page_pipe_buf_steal,
162  .get = generic_pipe_buf_get,
163 };
164 
165 static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
166 {
167  smp_mb();
168  if (waitqueue_active(&pipe->wait))
169  wake_up_interruptible(&pipe->wait);
170  kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
171 }
172 
173 /**
174  * splice_to_pipe - fill passed data into a pipe
175  * @pipe: pipe to fill
176  * @spd: data to fill
177  *
178  * Description:
179  *    @spd contains a map of pages and len/offset tuples, along with
180  *    the struct pipe_buffer_operations associated with these pages. This
181  *    function will link that data to the pipe.
182  *
183  */
184 ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
185  struct splice_pipe_desc *spd)
186 {
187  unsigned int spd_pages = spd->nr_pages;
188  int ret, do_wakeup, page_nr;
189 
190  ret = 0;
191  do_wakeup = 0;
192  page_nr = 0;
193 
194  pipe_lock(pipe);
195 
196  for (;;) {
197  if (!pipe->readers) {
198  send_sig(SIGPIPE, current, 0);
199  if (!ret)
200  ret = -EPIPE;
201  break;
202  }
203 
204  if (pipe->nrbufs < pipe->buffers) {
205  int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
206  struct pipe_buffer *buf = pipe->bufs + newbuf;
207 
208  buf->page = spd->pages[page_nr];
209  buf->offset = spd->partial[page_nr].offset;
210  buf->len = spd->partial[page_nr].len;
211  buf->private = spd->partial[page_nr].private;
212  buf->ops = spd->ops;
213  if (spd->flags & SPLICE_F_GIFT)
214  buf->flags |= PIPE_BUF_FLAG_GIFT;
215 
216  pipe->nrbufs++;
217  page_nr++;
218  ret += buf->len;
219 
220  if (pipe->inode)
221  do_wakeup = 1;
222 
223  if (!--spd->nr_pages)
224  break;
225  if (pipe->nrbufs < pipe->buffers)
226  continue;
227 
228  break;
229  }
230 
231  if (spd->flags & SPLICE_F_NONBLOCK) {
232  if (!ret)
233  ret = -EAGAIN;
234  break;
235  }
236 
237  if (signal_pending(current)) {
238  if (!ret)
239  ret = -ERESTARTSYS;
240  break;
241  }
242 
243  if (do_wakeup) {
244  smp_mb();
245  if (waitqueue_active(&pipe->wait))
246  wake_up_interruptible_sync(&pipe->wait);
247  kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
248  do_wakeup = 0;
249  }
250 
251  pipe->waiting_writers++;
252  pipe_wait(pipe);
253  pipe->waiting_writers--;
254  }
255 
256  pipe_unlock(pipe);
257 
258  if (do_wakeup)
259  wakeup_pipe_readers(pipe);
260 
261  while (page_nr < spd_pages)
262  spd->spd_release(spd, page_nr++);
263 
264  return ret;
265 }
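A note on the index arithmetic used above: pipe->buffers is always a power of two, so the next free slot in the circular pipe->bufs[] array can be computed by masking instead of a modulo. A minimal sketch of that computation:

	/* With buffers = 16, curbuf = 14, nrbufs = 3, the next free slot is
	 * (14 + 3) & 15 = 1: the ring wraps past the end of bufs[]. */
	static unsigned int next_free_slot(unsigned int curbuf, unsigned int nrbufs,
					   unsigned int buffers)
	{
		return (curbuf + nrbufs) & (buffers - 1);
	}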
266 
267 void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
268 {
269  page_cache_release(spd->pages[i]);
270 }
271 
272 /*
273  * Check if we need to grow the arrays holding pages and partial page
274  * descriptions.
275  */
276 int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
277 {
278  unsigned int buffers = ACCESS_ONCE(pipe->buffers);
279 
280  spd->nr_pages_max = buffers;
281  if (buffers <= PIPE_DEF_BUFFERS)
282  return 0;
283 
284  spd->pages = kmalloc(buffers * sizeof(struct page *), GFP_KERNEL);
285  spd->partial = kmalloc(buffers * sizeof(struct partial_page), GFP_KERNEL);
286 
287  if (spd->pages && spd->partial)
288  return 0;
289 
290  kfree(spd->pages);
291  kfree(spd->partial);
292  return -ENOMEM;
293 }
294 
295 void splice_shrink_spd(struct splice_pipe_desc *spd)
296 {
297  if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
298  return;
299 
300  kfree(spd->pages);
301  kfree(spd->partial);
302 }
303 
304 static int
305 __generic_file_splice_read(struct file *in, loff_t *ppos,
306  struct pipe_inode_info *pipe, size_t len,
307  unsigned int flags)
308 {
309  struct address_space *mapping = in->f_mapping;
310  unsigned int loff, nr_pages, req_pages;
311  struct page *pages[PIPE_DEF_BUFFERS];
312  struct partial_page partial[PIPE_DEF_BUFFERS];
313  struct page *page;
314  pgoff_t index, end_index;
315  loff_t isize;
316  int error, page_nr;
317  struct splice_pipe_desc spd = {
318  .pages = pages,
319  .partial = partial,
320  .nr_pages_max = PIPE_DEF_BUFFERS,
321  .flags = flags,
322  .ops = &page_cache_pipe_buf_ops,
323  .spd_release = spd_release_page,
324  };
325 
326  if (splice_grow_spd(pipe, &spd))
327  return -ENOMEM;
328 
329  index = *ppos >> PAGE_CACHE_SHIFT;
330  loff = *ppos & ~PAGE_CACHE_MASK;
331  req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
332  nr_pages = min(req_pages, spd.nr_pages_max);
333 
334  /*
335  * Lookup the (hopefully) full range of pages we need.
336  */
337  spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages);
338  index += spd.nr_pages;
339 
340  /*
341  * If find_get_pages_contig() returned fewer pages than we needed,
342  * readahead/allocate the rest and fill in the holes.
343  */
344  if (spd.nr_pages < nr_pages)
345  page_cache_sync_readahead(mapping, &in->f_ra, in,
346  index, req_pages - spd.nr_pages);
347 
348  error = 0;
349  while (spd.nr_pages < nr_pages) {
350  /*
351  * Page could be there, find_get_pages_contig() breaks on
352  * the first hole.
353  */
354  page = find_get_page(mapping, index);
355  if (!page) {
356  /*
357  * page didn't exist, allocate one.
358  */
359  page = page_cache_alloc_cold(mapping);
360  if (!page)
361  break;
362 
363  error = add_to_page_cache_lru(page, mapping, index,
364  GFP_KERNEL);
365  if (unlikely(error)) {
366  page_cache_release(page);
367  if (error == -EEXIST)
368  continue;
369  break;
370  }
371  /*
372  * add_to_page_cache() locks the page, unlock it
373  * to avoid convoluting the logic below even more.
374  */
375  unlock_page(page);
376  }
377 
378  spd.pages[spd.nr_pages++] = page;
379  index++;
380  }
381 
382  /*
383  * Now loop over the map and see if we need to start IO on any
384  * pages, fill in the partial map, etc.
385  */
386  index = *ppos >> PAGE_CACHE_SHIFT;
387  nr_pages = spd.nr_pages;
388  spd.nr_pages = 0;
389  for (page_nr = 0; page_nr < nr_pages; page_nr++) {
390  unsigned int this_len;
391 
392  if (!len)
393  break;
394 
395  /*
396  * this_len is the max we'll use from this page
397  */
398  this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
399  page = spd.pages[page_nr];
400 
401  if (PageReadahead(page))
402  page_cache_async_readahead(mapping, &in->f_ra, in,
403  page, index, req_pages - page_nr);
404 
405  /*
406  * If the page isn't uptodate, we may need to start io on it
407  */
408  if (!PageUptodate(page)) {
409  lock_page(page);
410 
411  /*
412  * Page was truncated, or invalidated by the
413  * filesystem. Redo the find/create, but this time the
414  * page is kept locked, so there's no chance of another
415  * race with truncate/invalidate.
416  */
417  if (!page->mapping) {
418  unlock_page(page);
419  page = find_or_create_page(mapping, index,
420  mapping_gfp_mask(mapping));
421 
422  if (!page) {
423  error = -ENOMEM;
424  break;
425  }
426  page_cache_release(spd.pages[page_nr]);
427  spd.pages[page_nr] = page;
428  }
429  /*
430  * page was already under io and is now done, great
431  */
432  if (PageUptodate(page)) {
433  unlock_page(page);
434  goto fill_it;
435  }
436 
437  /*
438  * need to read in the page
439  */
440  error = mapping->a_ops->readpage(in, page);
441  if (unlikely(error)) {
442  /*
443  * We really should re-lookup the page here,
444  * but it complicates things a lot. Instead
445  * lets just do what we already stored, and
446  * we'll get it the next time we are called.
447  */
448  if (error == AOP_TRUNCATED_PAGE)
449  error = 0;
450 
451  break;
452  }
453  }
454 fill_it:
455  /*
456  * i_size must be checked after PageUptodate.
457  */
458  isize = i_size_read(mapping->host);
459  end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
460  if (unlikely(!isize || index > end_index))
461  break;
462 
463  /*
464  * if this is the last page, see if we need to shrink
465  * the length and stop
466  */
467  if (end_index == index) {
468  unsigned int plen;
469 
470  /*
471  * max good bytes in this page
472  */
473  plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
474  if (plen <= loff)
475  break;
476 
477  /*
478  * force quit after adding this page
479  */
480  this_len = min(this_len, plen - loff);
481  len = this_len;
482  }
483 
484  spd.partial[page_nr].offset = loff;
485  spd.partial[page_nr].len = this_len;
486  len -= this_len;
487  loff = 0;
488  spd.nr_pages++;
489  index++;
490  }
491 
492  /*
493  * Release any pages at the end, if we quit early. 'page_nr' is how far
494  * we got, 'nr_pages' is how many pages are in the map.
495  */
496  while (page_nr < nr_pages)
497  page_cache_release(spd.pages[page_nr++]);
498  in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
499 
500  if (spd.nr_pages)
501  error = splice_to_pipe(pipe, &spd);
502 
503  splice_shrink_spd(&spd);
504  return error;
505 }
506 
507 /**
508  * generic_file_splice_read - splice data from file to a pipe
509  * @in: file to splice from
510  * @ppos: position in @in
511  * @pipe: pipe to splice to
512  * @len: number of bytes to splice
513  * @flags: splice modifier flags
514  *
515  * Description:
516  *    Will read pages from given file and fill them into a pipe. Can be
517  *    used as long as the address_space operations for the source implement
518  *    a readpage() hook.
519  *
520  */
521 ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
522  struct pipe_inode_info *pipe, size_t len,
523  unsigned int flags)
524 {
525  loff_t isize, left;
526  int ret;
527 
528  isize = i_size_read(in->f_mapping->host);
529  if (unlikely(*ppos >= isize))
530  return 0;
531 
532  left = isize - *ppos;
533  if (unlikely(left < len))
534  len = left;
535 
536  ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
537  if (ret > 0) {
538  *ppos += ret;
539  file_accessed(in);
540  }
541 
542  return ret;
543 }
544 EXPORT_SYMBOL(generic_file_splice_read);
545 
546 static const struct pipe_buf_operations default_pipe_buf_ops = {
547  .can_merge = 0,
548  .map = generic_pipe_buf_map,
549  .unmap = generic_pipe_buf_unmap,
550  .confirm = generic_pipe_buf_confirm,
551  .release = generic_pipe_buf_release,
552  .steal = generic_pipe_buf_steal,
553  .get = generic_pipe_buf_get,
554 };
555 
556 static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
557  unsigned long vlen, loff_t offset)
558 {
559  mm_segment_t old_fs;
560  loff_t pos = offset;
561  ssize_t res;
562 
563  old_fs = get_fs();
564  set_fs(get_ds());
565  /* The cast to a user pointer is valid due to the set_fs() */
566  res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos);
567  set_fs(old_fs);
568 
569  return res;
570 }
571 
572 static ssize_t kernel_write(struct file *file, const char *buf, size_t count,
573  loff_t pos)
574 {
575  mm_segment_t old_fs;
576  ssize_t res;
577 
578  old_fs = get_fs();
579  set_fs(get_ds());
580  /* The cast to a user pointer is valid due to the set_fs() */
581  res = vfs_write(file, (const char __user *)buf, count, &pos);
582  set_fs(old_fs);
583 
584  return res;
585 }
586 
587 ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
588  struct pipe_inode_info *pipe, size_t len,
589  unsigned int flags)
590 {
591  unsigned int nr_pages;
592  unsigned int nr_freed;
593  size_t offset;
594  struct page *pages[PIPE_DEF_BUFFERS];
595  struct partial_page partial[PIPE_DEF_BUFFERS];
596  struct iovec *vec, __vec[PIPE_DEF_BUFFERS];
597  ssize_t res;
598  size_t this_len;
599  int error;
600  int i;
601  struct splice_pipe_desc spd = {
602  .pages = pages,
603  .partial = partial,
604  .nr_pages_max = PIPE_DEF_BUFFERS,
605  .flags = flags,
606  .ops = &default_pipe_buf_ops,
607  .spd_release = spd_release_page,
608  };
609 
610  if (splice_grow_spd(pipe, &spd))
611  return -ENOMEM;
612 
613  res = -ENOMEM;
614  vec = __vec;
615  if (spd.nr_pages_max > PIPE_DEF_BUFFERS) {
616  vec = kmalloc(spd.nr_pages_max * sizeof(struct iovec), GFP_KERNEL);
617  if (!vec)
618  goto shrink_ret;
619  }
620 
621  offset = *ppos & ~PAGE_CACHE_MASK;
622  nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
623 
624  for (i = 0; i < nr_pages && i < spd.nr_pages_max && len; i++) {
625  struct page *page;
626 
627  page = alloc_page(GFP_USER);
628  error = -ENOMEM;
629  if (!page)
630  goto err;
631 
632  this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
633  vec[i].iov_base = (void __user *) page_address(page);
634  vec[i].iov_len = this_len;
635  spd.pages[i] = page;
636  spd.nr_pages++;
637  len -= this_len;
638  offset = 0;
639  }
640 
641  res = kernel_readv(in, vec, spd.nr_pages, *ppos);
642  if (res < 0) {
643  error = res;
644  goto err;
645  }
646 
647  error = 0;
648  if (!res)
649  goto err;
650 
651  nr_freed = 0;
652  for (i = 0; i < spd.nr_pages; i++) {
653  this_len = min_t(size_t, vec[i].iov_len, res);
654  spd.partial[i].offset = 0;
655  spd.partial[i].len = this_len;
656  if (!this_len) {
657  __free_page(spd.pages[i]);
658  spd.pages[i] = NULL;
659  nr_freed++;
660  }
661  res -= this_len;
662  }
663  spd.nr_pages -= nr_freed;
664 
665  res = splice_to_pipe(pipe, &spd);
666  if (res > 0)
667  *ppos += res;
668 
669 shrink_ret:
670  if (vec != __vec)
671  kfree(vec);
672  splice_shrink_spd(&spd);
673  return res;
674 
675 err:
676  for (i = 0; i < spd.nr_pages; i++)
677  __free_page(spd.pages[i]);
678 
679  res = error;
680  goto shrink_ret;
681 }
682 EXPORT_SYMBOL(default_file_splice_read);
683 
684 /*
685  * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
686  * using sendpage(). Return the number of bytes sent.
687  */
688 static int pipe_to_sendpage(struct pipe_inode_info *pipe,
689  struct pipe_buffer *buf, struct splice_desc *sd)
690 {
691  struct file *file = sd->u.file;
692  loff_t pos = sd->pos;
693  int more;
694 
695  if (!likely(file->f_op && file->f_op->sendpage))
696  return -EINVAL;
697 
698  more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
699  if (sd->len < sd->total_len)
700  more |= MSG_SENDPAGE_NOTLAST;
701  return file->f_op->sendpage(file, buf->page, buf->offset,
702  sd->len, &pos, more);
703 }
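From userspace, the path above is reached by splicing a pipe into a connected socket; SPLICE_F_MORE becomes MSG_MORE as shown. A hedged sketch (sock_fd is assumed to be a connected socket, pipe_rd the read end of a pipe):

	#define _GNU_SOURCE
	#include <fcntl.h>

	/* Send len bytes queued in pipe_rd to a socket. Pass more != 0 while
	 * further data will follow so the network stack can batch frames. */
	static ssize_t pipe_to_socket(int pipe_rd, int sock_fd, size_t len, int more)
	{
		return splice(pipe_rd, NULL, sock_fd, NULL, len,
			      more ? SPLICE_F_MORE : 0);
	}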
704 
705 /*
706  * This is a little more tricky than the file -> pipe splicing. There are
707  * basically three cases:
708  *
709  * - Destination page already exists in the address space and there
710  * are users of it. For that case we have no other option than
711  * copying the data. Tough luck.
712  * - Destination page already exists in the address space, but there
713  * are no users of it. Make sure it's uptodate, then drop it. Fall
714  * through to last case.
715  * - Destination page does not exist, we can add the pipe page to
716  * the page cache and avoid the copy.
717  *
718  * If asked to move pages to the output file (SPLICE_F_MOVE is set in
719  * sd->flags), we attempt to migrate pages from the pipe to the output
720  * file address space page cache. This is possible if no one else has
721  * the pipe page referenced outside of the pipe and page cache. If
722  * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
723  * a new page in the output file page cache and fill/dirty that.
724  */
725 int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
726  struct splice_desc *sd)
727 {
728  struct file *file = sd->u.file;
729  struct address_space *mapping = file->f_mapping;
730  unsigned int offset, this_len;
731  struct page *page;
732  void *fsdata;
733  int ret;
734 
735  offset = sd->pos & ~PAGE_CACHE_MASK;
736 
737  this_len = sd->len;
738  if (this_len + offset > PAGE_CACHE_SIZE)
739  this_len = PAGE_CACHE_SIZE - offset;
740 
741  ret = pagecache_write_begin(file, mapping, sd->pos, this_len,
742  AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
743  if (unlikely(ret))
744  goto out;
745 
746  if (buf->page != page) {
747  char *src = buf->ops->map(pipe, buf, 1);
748  char *dst = kmap_atomic(page);
749 
750  memcpy(dst + offset, src + buf->offset, this_len);
751  flush_dcache_page(page);
752  kunmap_atomic(dst);
753  buf->ops->unmap(pipe, buf, src);
754  }
755  ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len,
756  page, fsdata);
757 out:
758  return ret;
759 }
760 EXPORT_SYMBOL(pipe_to_file);
761 
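The move/steal path that pipe_to_file() implements is only reachable when the pipe page was gifted. A hedged userspace sketch of that combination (descriptor and function names here are assumptions): vmsplice() a page-aligned buffer into the pipe with SPLICE_F_GIFT, then splice it out with SPLICE_F_MOVE so the kernel may move the page into the page cache instead of copying it.

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <sys/uio.h>

	/* buf must be page-aligned and len a multiple of the page size, or
	 * the gift is useless and the copy path is taken anyway. */
	static ssize_t gift_to_file(int pipe_wr, int pipe_rd, int file_fd,
				    void *buf, size_t len)
	{
		struct iovec iov = { .iov_base = buf, .iov_len = len };
		ssize_t n;

		n = vmsplice(pipe_wr, &iov, 1, SPLICE_F_GIFT);
		if (n < 0)
			return n;
		/* SPLICE_F_MOVE asks for page migration rather than a memcpy(). */
		return splice(pipe_rd, NULL, file_fd, NULL, n, SPLICE_F_MOVE);
	}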
762 static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
763 {
764  smp_mb();
765  if (waitqueue_active(&pipe->wait))
766  wake_up_interruptible(&pipe->wait);
767  kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
768 }
769 
770 /**
771  * splice_from_pipe_feed - feed available data from a pipe to a sink
772  * @pipe: pipe to splice from
773  * @sd: information to @actor
774  * @actor: handler that splices the data
775  *
776  * Description:
777  *    This function loops over the pipe and calls @actor to do the
778  *    actual moving of a single struct pipe_buffer to the desired
779  *    destination. It returns when there are no more buffers left in
780  *    the pipe or if the requested number of bytes (@sd->total_len)
781  *    have been copied. It returns a positive number (one) if the
782  *    pipe needs to be filled with more data, zero if the required
783  *    number of bytes have been copied and -errno on error.
784  *
785  *    This, together with splice_from_pipe_{begin,end,next}, may be
786  *    used to implement the functionality of __splice_from_pipe() when
787  *    locking is required around copying the pipe buffers to the
788  *    destination.
789  */
790 int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
791  splice_actor *actor)
792 {
793  int ret;
794 
795  while (pipe->nrbufs) {
796  struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
797  const struct pipe_buf_operations *ops = buf->ops;
798 
799  sd->len = buf->len;
800  if (sd->len > sd->total_len)
801  sd->len = sd->total_len;
802 
803  ret = buf->ops->confirm(pipe, buf);
804  if (unlikely(ret)) {
805  if (ret == -ENODATA)
806  ret = 0;
807  return ret;
808  }
809 
810  ret = actor(pipe, buf, sd);
811  if (ret <= 0)
812  return ret;
813 
814  buf->offset += ret;
815  buf->len -= ret;
816 
817  sd->num_spliced += ret;
818  sd->len -= ret;
819  sd->pos += ret;
820  sd->total_len -= ret;
821 
822  if (!buf->len) {
823  buf->ops = NULL;
824  ops->release(pipe, buf);
825  pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
826  pipe->nrbufs--;
827  if (pipe->inode)
828  sd->need_wakeup = true;
829  }
830 
831  if (!sd->total_len)
832  return 0;
833  }
834 
835  return 1;
836 }
837 EXPORT_SYMBOL(splice_from_pipe_feed);
838 
839 /**
840  * splice_from_pipe_next - wait for some data to splice from
841  * @pipe: pipe to splice from
842  * @sd: information about the splice operation
843  *
844  * Description:
845  *    This function will wait for some data and return a positive
846  *    value (one) if pipe buffers are available. It will return zero
847  *    or -errno if no more data needs to be spliced.
848  */
849 int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
850 {
851  while (!pipe->nrbufs) {
852  if (!pipe->writers)
853  return 0;
854 
855  if (!pipe->waiting_writers && sd->num_spliced)
856  return 0;
857 
858  if (sd->flags & SPLICE_F_NONBLOCK)
859  return -EAGAIN;
860 
861  if (signal_pending(current))
862  return -ERESTARTSYS;
863 
864  if (sd->need_wakeup) {
865  wakeup_pipe_writers(pipe);
866  sd->need_wakeup = false;
867  }
868 
869  pipe_wait(pipe);
870  }
871 
872  return 1;
873 }
874 EXPORT_SYMBOL(splice_from_pipe_next);
875 
876 /**
877  * splice_from_pipe_begin - start splicing from pipe
878  * @sd: information about the splice operation
879  *
880  * Description:
881  *    This function should be called before a loop containing
882  *    splice_from_pipe_next() and splice_from_pipe_feed() to
883  *    initialize the necessary fields of @sd.
884  */
885 void splice_from_pipe_begin(struct splice_desc *sd)
886 {
887  sd->num_spliced = 0;
888  sd->need_wakeup = false;
889 }
890 EXPORT_SYMBOL(splice_from_pipe_begin);
891 
892 /**
893  * splice_from_pipe_end - finish splicing from pipe
894  * @pipe: pipe to splice from
895  * @sd: information about the splice operation
896  *
897  * Description:
898  *    This function will wake up pipe writers if necessary. It should
899  *    be called after a loop containing splice_from_pipe_next() and
900  *    splice_from_pipe_feed().
901  */
902 void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
903 {
904  if (sd->need_wakeup)
905  wakeup_pipe_writers(pipe);
906 }
907 EXPORT_SYMBOL(splice_from_pipe_end);
908 
909 /**
910  * __splice_from_pipe - splice data from a pipe to given actor
911  * @pipe: pipe to splice from
912  * @sd: information to @actor
913  * @actor: handler that splices the data
914  *
915  * Description:
916  *    This function does little more than loop over the pipe and call
917  *    @actor to do the actual moving of a single struct pipe_buffer to
918  *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
919  *    pipe_to_user.
920  *
921  */
922 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
923  splice_actor *actor)
924 {
925  int ret;
926 
927  splice_from_pipe_begin(sd);
928  do {
929  ret = splice_from_pipe_next(pipe, sd);
930  if (ret > 0)
931  ret = splice_from_pipe_feed(pipe, sd, actor);
932  } while (ret > 0);
933  splice_from_pipe_end(pipe, sd);
934 
935  return sd->num_spliced ? sd->num_spliced : ret;
936 }
937 EXPORT_SYMBOL(__splice_from_pipe);
938 
939 /**
940  * splice_from_pipe - splice data from a pipe to a file
941  * @pipe: pipe to splice from
942  * @out: file to splice to
943  * @ppos: position in @out
944  * @len: how many bytes to splice
945  * @flags: splice modifier flags
946  * @actor: handler that splices the data
947  *
948  * Description:
949  *    See __splice_from_pipe. This function locks the pipe inode,
950  *    otherwise it's identical to __splice_from_pipe().
951  *
952  */
953 ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
954  loff_t *ppos, size_t len, unsigned int flags,
955  splice_actor *actor)
956 {
957  ssize_t ret;
958  struct splice_desc sd = {
959  .total_len = len,
960  .flags = flags,
961  .pos = *ppos,
962  .u.file = out,
963  };
964 
965  pipe_lock(pipe);
966  ret = __splice_from_pipe(pipe, &sd, actor);
967  pipe_unlock(pipe);
968 
969  return ret;
970 }
971 
972 /**
973  * generic_file_splice_write - splice data from a pipe to a file
974  * @pipe: pipe info
975  * @out: file to write to
976  * @ppos: position in @out
977  * @len: number of bytes to splice
978  * @flags: splice modifier flags
979  *
980  * Description:
981  *    Will either move or copy pages (determined by @flags options) from
982  *    the given pipe inode to the given file.
983  *
984  */
985 ssize_t
986 generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
987  loff_t *ppos, size_t len, unsigned int flags)
988 {
989  struct address_space *mapping = out->f_mapping;
990  struct inode *inode = mapping->host;
991  struct splice_desc sd = {
992  .total_len = len,
993  .flags = flags,
994  .pos = *ppos,
995  .u.file = out,
996  };
997  ssize_t ret;
998 
999  sb_start_write(inode->i_sb);
1000 
1001  pipe_lock(pipe);
1002 
1003  splice_from_pipe_begin(&sd);
1004  do {
1005  ret = splice_from_pipe_next(pipe, &sd);
1006  if (ret <= 0)
1007  break;
1008 
1009  mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
1010  ret = file_remove_suid(out);
1011  if (!ret) {
1012  ret = file_update_time(out);
1013  if (!ret)
1014  ret = splice_from_pipe_feed(pipe, &sd,
1015  pipe_to_file);
1016  }
1017  mutex_unlock(&inode->i_mutex);
1018  } while (ret > 0);
1019  splice_from_pipe_end(pipe, &sd);
1020 
1021  pipe_unlock(pipe);
1022 
1023  if (sd.num_spliced)
1024  ret = sd.num_spliced;
1025 
1026  if (ret > 0) {
1027  unsigned long nr_pages;
1028  int err;
1029 
1030  nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1031 
1032  err = generic_write_sync(out, *ppos, ret);
1033  if (err)
1034  ret = err;
1035  else
1036  *ppos += ret;
1037  balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
1038  }
1039  sb_end_write(inode->i_sb);
1040 
1041  return ret;
1042 }
1043 
1044 EXPORT_SYMBOL(generic_file_splice_write);
1045 
1046 static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1047  struct splice_desc *sd)
1048 {
1049  int ret;
1050  void *data;
1051 
1052  data = buf->ops->map(pipe, buf, 0);
1053  ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos);
1054  buf->ops->unmap(pipe, buf, data);
1055 
1056  return ret;
1057 }
1058 
1059 static ssize_t default_file_splice_write(struct pipe_inode_info *pipe,
1060  struct file *out, loff_t *ppos,
1061  size_t len, unsigned int flags)
1062 {
1063  ssize_t ret;
1064 
1065  ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf);
1066  if (ret > 0)
1067  *ppos += ret;
1068 
1069  return ret;
1070 }
1071 
1072 /**
1073  * generic_splice_sendpage - splice data from a pipe to a socket
1074  * @pipe: pipe to splice from
1075  * @out: socket to write to
1076  * @ppos: position in @out
1077  * @len: number of bytes to splice
1078  * @flags: splice modifier flags
1079  *
1080  * Description:
1081  *    Will send @len bytes from the pipe to a network socket. No data copying
1082  *    is involved.
1083  *
1084  */
1085 ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
1086  loff_t *ppos, size_t len, unsigned int flags)
1087 {
1088  return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
1089 }
1090 
1091 EXPORT_SYMBOL(generic_splice_sendpage);
1092 
1093 /*
1094  * Attempt to initiate a splice from pipe to file.
1095  */
1096 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
1097  loff_t *ppos, size_t len, unsigned int flags)
1098 {
1099  ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
1100  loff_t *, size_t, unsigned int);
1101  int ret;
1102 
1103  if (unlikely(!(out->f_mode & FMODE_WRITE)))
1104  return -EBADF;
1105 
1106  if (unlikely(out->f_flags & O_APPEND))
1107  return -EINVAL;
1108 
1109  ret = rw_verify_area(WRITE, out, ppos, len);
1110  if (unlikely(ret < 0))
1111  return ret;
1112 
1113  if (out->f_op && out->f_op->splice_write)
1114  splice_write = out->f_op->splice_write;
1115  else
1116  splice_write = default_file_splice_write;
1117 
1118  return splice_write(pipe, out, ppos, len, flags);
1119 }
1120 
1121 /*
1122  * Attempt to initiate a splice from a file to a pipe.
1123  */
1124 static long do_splice_to(struct file *in, loff_t *ppos,
1125  struct pipe_inode_info *pipe, size_t len,
1126  unsigned int flags)
1127 {
1128  ssize_t (*splice_read)(struct file *, loff_t *,
1129  struct pipe_inode_info *, size_t, unsigned int);
1130  int ret;
1131 
1132  if (unlikely(!(in->f_mode & FMODE_READ)))
1133  return -EBADF;
1134 
1135  ret = rw_verify_area(READ, in, ppos, len);
1136  if (unlikely(ret < 0))
1137  return ret;
1138 
1139  if (in->f_op && in->f_op->splice_read)
1140  splice_read = in->f_op->splice_read;
1141  else
1142  splice_read = default_file_splice_read;
1143 
1144  return splice_read(in, ppos, pipe, len, flags);
1145 }
1146 
1147 /**
1148  * splice_direct_to_actor - splices data directly between two non-pipes
1149  * @in: file to splice from
1150  * @sd: actor information on where to splice to
1151  * @actor: handles the data splicing
1152  *
1153  * Description:
1154  *    This is a special case helper to splice directly between two
1155  *    points, without requiring an explicit pipe. Internally an allocated
1156  *    pipe is cached in the process, and reused during the lifetime of
1157  *    that process.
1158  *
1159  */
1160 ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
1161  splice_direct_actor *actor)
1162 {
1163  struct pipe_inode_info *pipe;
1164  long ret, bytes;
1165  umode_t i_mode;
1166  size_t len;
1167  int i, flags;
1168 
1169  /*
1170  * We require the input to be a regular file, as we don't want to
1171  * randomly drop data for eg socket -> socket splicing. Use the
1172  * piped splicing for that!
1173  */
1174  i_mode = in->f_path.dentry->d_inode->i_mode;
1175  if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
1176  return -EINVAL;
1177 
1178  /*
1179  * neither in nor out is a pipe, setup an internal pipe attached to
1180  * 'out' and transfer the wanted data from 'in' to 'out' through that
1181  */
1182  pipe = current->splice_pipe;
1183  if (unlikely(!pipe)) {
1184  pipe = alloc_pipe_info(NULL);
1185  if (!pipe)
1186  return -ENOMEM;
1187 
1188  /*
1189  * We don't have an immediate reader, but we'll read the stuff
1190  * out of the pipe right after the splice_to_pipe(). So set
1191  * PIPE_READERS appropriately.
1192  */
1193  pipe->readers = 1;
1194 
1195  current->splice_pipe = pipe;
1196  }
1197 
1198  /*
1199  * Do the splice.
1200  */
1201  ret = 0;
1202  bytes = 0;
1203  len = sd->total_len;
1204  flags = sd->flags;
1205 
1206  /*
1207  * Don't block on output, we have to drain the direct pipe.
1208  */
1209  sd->flags &= ~SPLICE_F_NONBLOCK;
1210 
1211  while (len) {
1212  size_t read_len;
1213  loff_t pos = sd->pos, prev_pos = pos;
1214 
1215  ret = do_splice_to(in, &pos, pipe, len, flags);
1216  if (unlikely(ret <= 0))
1217  goto out_release;
1218 
1219  read_len = ret;
1220  sd->total_len = read_len;
1221 
1222  /*
1223  * NOTE: nonblocking mode only applies to the input. We
1224  * must not do the output in nonblocking mode as then we
1225  * could get stuck data in the internal pipe:
1226  */
1227  ret = actor(pipe, sd);
1228  if (unlikely(ret <= 0)) {
1229  sd->pos = prev_pos;
1230  goto out_release;
1231  }
1232 
1233  bytes += ret;
1234  len -= ret;
1235  sd->pos = pos;
1236 
1237  if (ret < read_len) {
1238  sd->pos = prev_pos + ret;
1239  goto out_release;
1240  }
1241  }
1242 
1243 done:
1244  pipe->nrbufs = pipe->curbuf = 0;
1245  file_accessed(in);
1246  return bytes;
1247 
1248 out_release:
1249  /*
1250  * If we did an incomplete transfer we must release
1251  * the pipe buffers in question:
1252  */
1253  for (i = 0; i < pipe->buffers; i++) {
1254  struct pipe_buffer *buf = pipe->bufs + i;
1255 
1256  if (buf->ops) {
1257  buf->ops->release(pipe, buf);
1258  buf->ops = NULL;
1259  }
1260  }
1261 
1262  if (!bytes)
1263  bytes = ret;
1264 
1265  goto done;
1266 }
1267 EXPORT_SYMBOL(splice_direct_to_actor);
1268 
1269 static int direct_splice_actor(struct pipe_inode_info *pipe,
1270  struct splice_desc *sd)
1271 {
1272  struct file *file = sd->u.file;
1273 
1274  return do_splice_from(pipe, file, &file->f_pos, sd->total_len,
1275  sd->flags);
1276 }
1277 
1278 /**
1279  * do_splice_direct - splices data directly between two files
1280  * @in: file to splice from
1281  * @ppos: input file offset
1282  * @out: file to splice to
1283  * @len: number of bytes to splice
1284  * @flags: splice modifier flags
1285  *
1286  * Description:
1287  *    For use by do_sendfile(). splice can easily emulate sendfile, but
1288  *    doing it in the application would incur an extra system call
1289  *    (splice in + splice out, as compared to just sendfile()). So this helper
1290  *    can splice directly through a process-private pipe.
1291  *
1292  */
1293 long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1294  size_t len, unsigned int flags)
1295 {
1296  struct splice_desc sd = {
1297  .len = len,
1298  .total_len = len,
1299  .flags = flags,
1300  .pos = *ppos,
1301  .u.file = out,
1302  };
1303  long ret;
1304 
1305  ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1306  if (ret > 0)
1307  *ppos = sd.pos;
1308 
1309  return ret;
1310 }
1311 
1312 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1313  struct pipe_inode_info *opipe,
1314  size_t len, unsigned int flags);
1315 
1316 /*
1317  * Determine where to splice to/from.
1318  */
1319 static long do_splice(struct file *in, loff_t __user *off_in,
1320  struct file *out, loff_t __user *off_out,
1321  size_t len, unsigned int flags)
1322 {
1323  struct pipe_inode_info *ipipe;
1324  struct pipe_inode_info *opipe;
1325  loff_t offset, *off;
1326  long ret;
1327 
1328  ipipe = get_pipe_info(in);
1329  opipe = get_pipe_info(out);
1330 
1331  if (ipipe && opipe) {
1332  if (off_in || off_out)
1333  return -ESPIPE;
1334 
1335  if (!(in->f_mode & FMODE_READ))
1336  return -EBADF;
1337 
1338  if (!(out->f_mode & FMODE_WRITE))
1339  return -EBADF;
1340 
1341  /* Splicing to self would be fun, but... */
1342  if (ipipe == opipe)
1343  return -EINVAL;
1344 
1345  return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1346  }
1347 
1348  if (ipipe) {
1349  if (off_in)
1350  return -ESPIPE;
1351  if (off_out) {
1352  if (!(out->f_mode & FMODE_PWRITE))
1353  return -EINVAL;
1354  if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1355  return -EFAULT;
1356  off = &offset;
1357  } else
1358  off = &out->f_pos;
1359 
1360  ret = do_splice_from(ipipe, out, off, len, flags);
1361 
1362  if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
1363  ret = -EFAULT;
1364 
1365  return ret;
1366  }
1367 
1368  if (opipe) {
1369  if (off_out)
1370  return -ESPIPE;
1371  if (off_in) {
1372  if (!(in->f_mode & FMODE_PREAD))
1373  return -EINVAL;
1374  if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1375  return -EFAULT;
1376  off = &offset;
1377  } else
1378  off = &in->f_pos;
1379 
1380  ret = do_splice_to(in, off, opipe, len, flags);
1381 
1382  if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
1383  ret = -EFAULT;
1384 
1385  return ret;
1386  }
1387 
1388  return -EINVAL;
1389 }
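Those rules are the userspace contract of splice(2): an offset pointer is only legal for the non-pipe side, and it is copied in before and back out after the transfer. A hedged sketch (the helper name is an assumption) of reading from an explicit file offset without disturbing the file's position:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <sys/types.h>

	/* Splice up to len bytes from *off in file_fd into pipe_wr. *off is
	 * advanced by the copy-back in do_splice(); file_fd's f_pos is left
	 * alone. Giving the pipe side an offset would fail with -ESPIPE. */
	static ssize_t splice_at(int file_fd, loff_t *off, int pipe_wr, size_t len)
	{
		return splice(file_fd, off, pipe_wr, NULL, len, 0);
	}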
1390 
1391 /*
1392  * Map an iov into an array of pages and offset/length tuples. With the
1393  * partial_page structure, we can map several non-contiguous ranges into
1394  * our one pages[] map instead of splitting that operation into pieces.
1395  * Could easily be exported as a generic helper for other users, in which
1396  * case one would probably want to add a 'max_nr_pages' parameter as well.
1397  */
1398 static int get_iovec_page_array(const struct iovec __user *iov,
1399  unsigned int nr_vecs, struct page **pages,
1400  struct partial_page *partial, bool aligned,
1401  unsigned int pipe_buffers)
1402 {
1403  int buffers = 0, error = 0;
1404 
1405  while (nr_vecs) {
1406  unsigned long off, npages;
1407  struct iovec entry;
1408  void __user *base;
1409  size_t len;
1410  int i;
1411 
1412  error = -EFAULT;
1413  if (copy_from_user(&entry, iov, sizeof(entry)))
1414  break;
1415 
1416  base = entry.iov_base;
1417  len = entry.iov_len;
1418 
1419  /*
1420  * Sanity check this iovec. 0 read succeeds.
1421  */
1422  error = 0;
1423  if (unlikely(!len))
1424  break;
1425  error = -EFAULT;
1426  if (!access_ok(VERIFY_READ, base, len))
1427  break;
1428 
1429  /*
1430  * Get this base offset and number of pages, then map
1431  * in the user pages.
1432  */
1433  off = (unsigned long) base & ~PAGE_MASK;
1434 
1435  /*
1436  * If asked for alignment, the offset must be zero and the
1437  * length a multiple of the PAGE_SIZE.
1438  */
1439  error = -EINVAL;
1440  if (aligned && (off || len & ~PAGE_MASK))
1441  break;
1442 
1443  npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1444  if (npages > pipe_buffers - buffers)
1445  npages = pipe_buffers - buffers;
1446 
1447  error = get_user_pages_fast((unsigned long)base, npages,
1448  0, &pages[buffers]);
1449 
1450  if (unlikely(error <= 0))
1451  break;
1452 
1453  /*
1454  * Fill this contiguous range into the partial page map.
1455  */
1456  for (i = 0; i < error; i++) {
1457  const int plen = min_t(size_t, len, PAGE_SIZE - off);
1458 
1459  partial[buffers].offset = off;
1460  partial[buffers].len = plen;
1461 
1462  off = 0;
1463  len -= plen;
1464  buffers++;
1465  }
1466 
1467  /*
1468  * We didn't complete this iov, stop here since it probably
1469  * means we have to move some of this into a pipe to
1470  * be able to continue.
1471  */
1472  if (len)
1473  break;
1474 
1475  /*
1476  * Don't continue if we mapped fewer pages than we asked for,
1477  * or if we mapped the max number of pages that we have
1478  * room for.
1479  */
1480  if (error < npages || buffers == pipe_buffers)
1481  break;
1482 
1483  nr_vecs--;
1484  iov++;
1485  }
1486 
1487  if (buffers)
1488  return buffers;
1489 
1490  return error;
1491 }
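The offset and page-count math above is the usual user-range-to-pages conversion. A worked instance (a sketch assuming the common 4 KiB page size; in the kernel the PAGE_SIZE/PAGE_SHIFT macros are already defined):

	#ifndef PAGE_SHIFT
	#define PAGE_SHIFT 12			/* assume 4 KiB pages */
	#define PAGE_SIZE  (1UL << PAGE_SHIFT)
	#endif

	/* base = 0x1234, len = 0x3000:
	 *   off    = 0x1234 & (PAGE_SIZE - 1)       = 0x234
	 *   npages = (0x234 + 0x3000 + 0xfff) >> 12 = 4
	 * The 12 KiB range starts mid-page, so it straddles four pages. */
	static unsigned long iov_npages(unsigned long base, unsigned long len)
	{
		unsigned long off = base & (PAGE_SIZE - 1);

		return (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	}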
1492 
1493 static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1494  struct splice_desc *sd)
1495 {
1496  char *src;
1497  int ret;
1498 
1499  /*
1500  * See if we can use the atomic maps, by prefaulting in the
1501  * pages and doing an atomic copy
1502  */
1503  if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) {
1504  src = buf->ops->map(pipe, buf, 1);
1505  ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset,
1506  sd->len);
1507  buf->ops->unmap(pipe, buf, src);
1508  if (!ret) {
1509  ret = sd->len;
1510  goto out;
1511  }
1512  }
1513 
1514  /*
1515  * No dice, use slow non-atomic map and copy
1516  */
1517  src = buf->ops->map(pipe, buf, 0);
1518 
1519  ret = sd->len;
1520  if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len))
1521  ret = -EFAULT;
1522 
1523  buf->ops->unmap(pipe, buf, src);
1524 out:
1525  if (ret > 0)
1526  sd->u.userptr += ret;
1527  return ret;
1528 }
1529 
1530 /*
1531  * For lack of a better implementation, implement vmsplice() to userspace
1532  * as a simple copy of the pipe's pages to the user iov.
1533  */
1534 static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
1535  unsigned long nr_segs, unsigned int flags)
1536 {
1537  struct pipe_inode_info *pipe;
1538  struct splice_desc sd;
1539  ssize_t size;
1540  int error;
1541  long ret;
1542 
1543  pipe = get_pipe_info(file);
1544  if (!pipe)
1545  return -EBADF;
1546 
1547  pipe_lock(pipe);
1548 
1549  error = ret = 0;
1550  while (nr_segs) {
1551  void __user *base;
1552  size_t len;
1553 
1554  /*
1555  * Get user address base and length for this iovec.
1556  */
1557  error = get_user(base, &iov->iov_base);
1558  if (unlikely(error))
1559  break;
1560  error = get_user(len, &iov->iov_len);
1561  if (unlikely(error))
1562  break;
1563 
1564  /*
1565  * Sanity check this iovec. 0 read succeeds.
1566  */
1567  if (unlikely(!len))
1568  break;
1569  if (unlikely(!base)) {
1570  error = -EFAULT;
1571  break;
1572  }
1573 
1574  if (unlikely(!access_ok(VERIFY_WRITE, base, len))) {
1575  error = -EFAULT;
1576  break;
1577  }
1578 
1579  sd.len = 0;
1580  sd.total_len = len;
1581  sd.flags = flags;
1582  sd.u.userptr = base;
1583  sd.pos = 0;
1584 
1585  size = __splice_from_pipe(pipe, &sd, pipe_to_user);
1586  if (size < 0) {
1587  if (!ret)
1588  ret = size;
1589 
1590  break;
1591  }
1592 
1593  ret += size;
1594 
1595  if (size < len)
1596  break;
1597 
1598  nr_segs--;
1599  iov++;
1600  }
1601 
1602  pipe_unlock(pipe);
1603 
1604  if (!ret)
1605  ret = error;
1606 
1607  return ret;
1608 }
1609 
1610 /*
1611  * vmsplice splices a user address range into a pipe. It can be thought of
1612  * as splice-from-memory, where the regular splice is splice-from-file (or
1613  * to file). In both cases the output is a pipe, naturally.
1614  */
1615 static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1616  unsigned long nr_segs, unsigned int flags)
1617 {
1618  struct pipe_inode_info *pipe;
1619  struct page *pages[PIPE_DEF_BUFFERS];
1620  struct partial_page partial[PIPE_DEF_BUFFERS];
1621  struct splice_pipe_desc spd = {
1622  .pages = pages,
1623  .partial = partial,
1624  .nr_pages_max = PIPE_DEF_BUFFERS,
1625  .flags = flags,
1626  .ops = &user_page_pipe_buf_ops,
1627  .spd_release = spd_release_page,
1628  };
1629  long ret;
1630 
1631  pipe = get_pipe_info(file);
1632  if (!pipe)
1633  return -EBADF;
1634 
1635  if (splice_grow_spd(pipe, &spd))
1636  return -ENOMEM;
1637 
1638  spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages,
1639  spd.partial, false,
1640  spd.nr_pages_max);
1641  if (spd.nr_pages <= 0)
1642  ret = spd.nr_pages;
1643  else
1644  ret = splice_to_pipe(pipe, &spd);
1645 
1646  splice_shrink_spd(&spd);
1647  return ret;
1648 }
1649 
1650 /*
1651  * Note that vmsplice only really supports true splicing _from_ user memory
1652  * to a pipe, not the other way around. Splicing from user memory is a simple
1653  * operation that can be supported without any funky alignment restrictions
1654  * or nasty vm tricks. We simply map in the user pages and fill them into
1655  * a pipe. The reverse isn't quite as easy, though. There are two possible
1656  * solutions for that:
1657  *
1658  * - memcpy() the data internally, at which point we might as well just
1659  * do a regular read() on the buffer anyway.
1660  * - Lots of nasty vm tricks that are neither fast nor flexible (it
1661  * has restrictions on both ends of the pipe).
1662  *
1663  * Currently we punt and implement it as a normal copy, see pipe_to_user().
1664  *
1665  */
1666 SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
1667  unsigned long, nr_segs, unsigned int, flags)
1668 {
1669  struct fd f;
1670  long error;
1671 
1672  if (unlikely(nr_segs > UIO_MAXIOV))
1673  return -EINVAL;
1674  else if (unlikely(!nr_segs))
1675  return 0;
1676 
1677  error = -EBADF;
1678  f = fdget(fd);
1679  if (f.file) {
1680  if (f.file->f_mode & FMODE_WRITE)
1681  error = vmsplice_to_pipe(f.file, iov, nr_segs, flags);
1682  else if (f.file->f_mode & FMODE_READ)
1683  error = vmsplice_to_user(f.file, iov, nr_segs, flags);
1684 
1685  fdput(f);
1686  }
1687 
1688  return error;
1689 }
1690 
1691 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1692  int, fd_out, loff_t __user *, off_out,
1693  size_t, len, unsigned int, flags)
1694 {
1695  struct fd in, out;
1696  long error;
1697 
1698  if (unlikely(!len))
1699  return 0;
1700 
1701  error = -EBADF;
1702  in = fdget(fd_in);
1703  if (in.file) {
1704  if (in.file->f_mode & FMODE_READ) {
1705  out = fdget(fd_out);
1706  if (out.file) {
1707  if (out.file->f_mode & FMODE_WRITE)
1708  error = do_splice(in.file, off_in,
1709  out.file, off_out,
1710  len, flags);
1711  fdput(out);
1712  }
1713  }
1714  fdput(in);
1715  }
1716  return error;
1717 }
1718 
1719 /*
1720  * Make sure there's data to read. Wait for input if we can, otherwise
1721  * return an appropriate error.
1722  */
1723 static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1724 {
1725  int ret;
1726 
1727  /*
1728  * Check ->nrbufs without the inode lock first. This function
1729  * is speculative anyway, so missing one is ok.
1730  */
1731  if (pipe->nrbufs)
1732  return 0;
1733 
1734  ret = 0;
1735  pipe_lock(pipe);
1736 
1737  while (!pipe->nrbufs) {
1738  if (signal_pending(current)) {
1739  ret = -ERESTARTSYS;
1740  break;
1741  }
1742  if (!pipe->writers)
1743  break;
1744  if (!pipe->waiting_writers) {
1745  if (flags & SPLICE_F_NONBLOCK) {
1746  ret = -EAGAIN;
1747  break;
1748  }
1749  }
1750  pipe_wait(pipe);
1751  }
1752 
1753  pipe_unlock(pipe);
1754  return ret;
1755 }
1756 
1757 /*
1758  * Make sure there's writeable room. Wait for room if we can, otherwise
1759  * return an appropriate error.
1760  */
1761 static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1762 {
1763  int ret;
1764 
1765  /*
1766  * Check ->nrbufs without the inode lock first. This function
1767  * is speculative anyway, so missing one is ok.
1768  */
1769  if (pipe->nrbufs < pipe->buffers)
1770  return 0;
1771 
1772  ret = 0;
1773  pipe_lock(pipe);
1774 
1775  while (pipe->nrbufs >= pipe->buffers) {
1776  if (!pipe->readers) {
1777  send_sig(SIGPIPE, current, 0);
1778  ret = -EPIPE;
1779  break;
1780  }
1781  if (flags & SPLICE_F_NONBLOCK) {
1782  ret = -EAGAIN;
1783  break;
1784  }
1785  if (signal_pending(current)) {
1786  ret = -ERESTARTSYS;
1787  break;
1788  }
1789  pipe->waiting_writers++;
1790  pipe_wait(pipe);
1791  pipe->waiting_writers--;
1792  }
1793 
1794  pipe_unlock(pipe);
1795  return ret;
1796 }
1797 
1798 /*
1799  * Splice contents of ipipe to opipe.
1800  */
1801 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1802  struct pipe_inode_info *opipe,
1803  size_t len, unsigned int flags)
1804 {
1805  struct pipe_buffer *ibuf, *obuf;
1806  int ret = 0, nbuf;
1807  bool input_wakeup = false;
1808 
1809 
1810 retry:
1811  ret = ipipe_prep(ipipe, flags);
1812  if (ret)
1813  return ret;
1814 
1815  ret = opipe_prep(opipe, flags);
1816  if (ret)
1817  return ret;
1818 
1819  /*
1820  * Potential ABBA deadlock, work around it by ordering lock
1821  * grabbing by pipe info address. Otherwise two different processes
1822  * could deadlock (one doing tee from A -> B, the other from B -> A).
1823  */
1824  pipe_double_lock(ipipe, opipe);
1825 
1826  do {
1827  if (!opipe->readers) {
1828  send_sig(SIGPIPE, current, 0);
1829  if (!ret)
1830  ret = -EPIPE;
1831  break;
1832  }
1833 
1834  if (!ipipe->nrbufs && !ipipe->writers)
1835  break;
1836 
1837  /*
1838  * Cannot make any progress, because either the input
1839  * pipe is empty or the output pipe is full.
1840  */
1841  if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
1842  /* Already processed some buffers, break */
1843  if (ret)
1844  break;
1845 
1846  if (flags & SPLICE_F_NONBLOCK) {
1847  ret = -EAGAIN;
1848  break;
1849  }
1850 
1851  /*
1852  * We raced with another reader/writer and haven't
1853  * managed to process any buffers. A zero return
1854  * value means EOF, so retry instead.
1855  */
1856  pipe_unlock(ipipe);
1857  pipe_unlock(opipe);
1858  goto retry;
1859  }
1860 
1861  ibuf = ipipe->bufs + ipipe->curbuf;
1862  nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1863  obuf = opipe->bufs + nbuf;
1864 
1865  if (len >= ibuf->len) {
1866  /*
1867  * Simply move the whole buffer from ipipe to opipe
1868  */
1869  *obuf = *ibuf;
1870  ibuf->ops = NULL;
1871  opipe->nrbufs++;
1872  ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
1873  ipipe->nrbufs--;
1874  input_wakeup = true;
1875  } else {
1876  /*
1877  * Get a reference to this pipe buffer,
1878  * so we can copy the contents over.
1879  */
1880  ibuf->ops->get(ipipe, ibuf);
1881  *obuf = *ibuf;
1882 
1883  /*
1884  * Don't inherit the gift flag, we need to
1885  * prevent multiple steals of this page.
1886  */
1887  obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1888 
1889  obuf->len = len;
1890  opipe->nrbufs++;
1891  ibuf->offset += obuf->len;
1892  ibuf->len -= obuf->len;
1893  }
1894  ret += obuf->len;
1895  len -= obuf->len;
1896  } while (len);
1897 
1898  pipe_unlock(ipipe);
1899  pipe_unlock(opipe);
1900 
1901  /*
1902  * If we put data in the output pipe, wakeup any potential readers.
1903  */
1904  if (ret > 0)
1905  wakeup_pipe_readers(opipe);
1906 
1907  if (input_wakeup)
1908  wakeup_pipe_writers(ipipe);
1909 
1910  return ret;
1911 }
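The ABBA workaround noted above depends on pipe_double_lock() (defined in fs/pipe.c) acquiring the two pipe mutexes in a globally consistent order. A hedged sketch of the idea, ignoring the lockdep subclass annotations the real helper uses:

	/* Order by address: two tasks splicing A->B and B->A then agree on
	 * which pipe to lock first, so neither can hold one lock while
	 * waiting forever for the other. */
	static void double_lock_by_address(struct pipe_inode_info *a,
					   struct pipe_inode_info *b)
	{
		if (a < b) {
			pipe_lock(a);
			pipe_lock(b);
		} else {
			pipe_lock(b);
			pipe_lock(a);
		}
	}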
1912 
1913 /*
1914  * Link contents of ipipe to opipe.
1915  */
1916 static int link_pipe(struct pipe_inode_info *ipipe,
1917  struct pipe_inode_info *opipe,
1918  size_t len, unsigned int flags)
1919 {
1920  struct pipe_buffer *ibuf, *obuf;
1921  int ret = 0, i = 0, nbuf;
1922 
1923  /*
1924  * Potential ABBA deadlock, work around it by ordering lock
1925  * grabbing by pipe info address. Otherwise two different processes
1926  * could deadlock (one doing tee from A -> B, the other from B -> A).
1927  */
1928  pipe_double_lock(ipipe, opipe);
1929 
1930  do {
1931  if (!opipe->readers) {
1932  send_sig(SIGPIPE, current, 0);
1933  if (!ret)
1934  ret = -EPIPE;
1935  break;
1936  }
1937 
1938  /*
1939  * If we have iterated over all input buffers or run out of
1940  * output room, break.
1941  */
1942  if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
1943  break;
1944 
1945  ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
1946  nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1947 
1948  /*
1949  * Get a reference to this pipe buffer,
1950  * so we can copy the contents over.
1951  */
1952  ibuf->ops->get(ipipe, ibuf);
1953 
1954  obuf = opipe->bufs + nbuf;
1955  *obuf = *ibuf;
1956 
1957  /*
1958  * Don't inherit the gift flag, we need to
1959  * prevent multiple steals of this page.
1960  */
1961  obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1962 
1963  if (obuf->len > len)
1964  obuf->len = len;
1965 
1966  opipe->nrbufs++;
1967  ret += obuf->len;
1968  len -= obuf->len;
1969  i++;
1970  } while (len);
1971 
1972  /*
1973  * Return -EAGAIN if there is the potential of more data arriving in
1974  * the future, otherwise just return 0.
1975  */
1976  if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
1977  ret = -EAGAIN;
1978 
1979  pipe_unlock(ipipe);
1980  pipe_unlock(opipe);
1981 
1982  /*
1983  * If we put data in the output pipe, wakeup any potential readers.
1984  */
1985  if (ret > 0)
1986  wakeup_pipe_readers(opipe);
1987 
1988  return ret;
1989 }
1990 
1991 /*
1992  * This is a tee(1) implementation that works on pipes. It doesn't copy
1993  * any data, it simply references the 'in' pages on the 'out' pipe.
1994  * The 'flags' used are the SPLICE_F_* variants, currently the only
1995  * applicable one is SPLICE_F_NONBLOCK.
1996  */
1997 static long do_tee(struct file *in, struct file *out, size_t len,
1998  unsigned int flags)
1999 {
2000  struct pipe_inode_info *ipipe = get_pipe_info(in);
2001  struct pipe_inode_info *opipe = get_pipe_info(out);
2002  int ret = -EINVAL;
2003 
2004  /*
2005  * Duplicate the contents of ipipe to opipe without actually
2006  * copying the data.
2007  */
2008  if (ipipe && opipe && ipipe != opipe) {
2009  /*
2010  * Keep going, unless we encounter an error. The ipipe/opipe
2011  * ordering doesn't really matter.
2012  */
2013  ret = ipipe_prep(ipipe, flags);
2014  if (!ret) {
2015  ret = opipe_prep(opipe, flags);
2016  if (!ret)
2017  ret = link_pipe(ipipe, opipe, len, flags);
2018  }
2019  }
2020 
2021  return ret;
2022 }
2023 
2024 SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
2025 {
2026  struct fd in;
2027  int error;
2028 
2029  if (unlikely(!len))
2030  return 0;
2031 
2032  error = -EBADF;
2033  in = fdget(fdin);
2034  if (in.file) {
2035  if (in.file->f_mode & FMODE_READ) {
2036  struct fd out = fdget(fdout);
2037  if (out.file) {
2038  if (out.file->f_mode & FMODE_WRITE)
2039  error = do_tee(in.file, out.file,
2040  len, flags);
2041  fdput(out);
2042  }
2043  }
2044  fdput(in);
2045  }
2046 
2047  return error;
2048 }
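Finally, a hedged userspace sketch in the spirit of tee(1), tying the last two syscalls together (stdin and stdout are assumed to be pipes, as in `cmd1 | prog | cmd2`, and out_fd an open regular file): tee() duplicates the buffer references to stdout without consuming them, then splice() drains the same data into the file.

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <unistd.h>

	static int tee_through(int out_fd)
	{
		ssize_t n, m;

		for (;;) {
			/* Duplicate whatever is in stdin's pipe to stdout. */
			n = tee(STDIN_FILENO, STDOUT_FILENO, 65536, 0);
			if (n < 0)
				return -1;
			if (n == 0)
				break;		/* writer closed, pipe drained */
			/* Now consume those bytes, spilling them to the file. */
			while (n > 0) {
				m = splice(STDIN_FILENO, NULL, out_fd, NULL, n,
					   SPLICE_F_MOVE);
				if (m <= 0)
					return -1;
				n -= m;
			}
		}
		return 0;
	}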