Linux Kernel 3.7.1
sock.c
1 /*
2  * INET An implementation of the TCP/IP protocol suite for the LINUX
3  * operating system. INET is implemented using the BSD Socket
4  * interface as the means of communication with the user level.
5  *
6  * Generic socket support routines. Memory allocators, socket lock/release
7  * handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors: Ross Biro
11  * Fred N. van Kempen, <[email protected]>
12  * Florian La Roche, <[email protected]>
13  * Alan Cox, <[email protected]>
14  *
15  * Fixes:
16  * Alan Cox : Numerous verify_area() problems
17  * Alan Cox : Connecting on a connecting socket
18  * now returns an error for tcp.
19  * Alan Cox : sock->protocol is set correctly.
20  * and is not sometimes left as 0.
21  * Alan Cox : connect handles icmp errors on a
22  * connect properly. Unfortunately there
23  * is a restart syscall nasty there. I
24  * can't match BSD without hacking the C
25  * library. Ideas urgently sought!
26  * Alan Cox : Disallow bind() to addresses that are
27  * not ours - especially broadcast ones!!
28  * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29  * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30  * instead they leave that for the DESTROY timer.
31  * Alan Cox : Clean up error flag in accept
32  * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33  * was buggy. Put a remove_sock() in the handler
34  * for memory when we hit 0. Also altered the timer
35  * code. The ACK stuff can wait and needs major
36  * TCP layer surgery.
37  * Alan Cox : Fixed TCP ack bug, removed remove sock
38  * and fixed timer/inet_bh race.
39  * Alan Cox : Added zapped flag for TCP
40  * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41  * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43  * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  * Rick Sladkey : Relaxed UDP rules for matching packets.
46  * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47  * Pauline Middelink : identd support
48  * Alan Cox : Fixed connect() taking signals I think.
49  * Alan Cox : SO_LINGER supported
50  * Alan Cox : Error reporting fixes
51  * Anonymous : inet_create tidied up (sk->reuse setting)
52  * Alan Cox : inet sockets don't set sk->type!
53  * Alan Cox : Split socket option code
54  * Alan Cox : Callbacks
55  * Alan Cox : Nagle flag for Charles & Johannes stuff
56  * Alex : Removed restriction on inet fioctl
57  * Alan Cox : Splitting INET from NET core
58  * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59  * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60  * Alan Cox : Split IP from generic code
61  * Alan Cox : New kfree_skbmem()
62  * Alan Cox : Make SO_DEBUG superuser only.
63  * Alan Cox : Allow anyone to clear SO_DEBUG
64  * (compatibility fix)
65  * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66  * Alan Cox : Allocator for a socket is settable.
67  * Alan Cox : SO_ERROR includes soft errors.
68  * Alan Cox : Allow NULL arguments on some SO_ opts
69  * Alan Cox : Generic socket allocation to make hooks
70  * easier (suggested by Craig Metz).
71  * Michael Pall : SO_ERROR returns positive errno again
72  * Steve Whitehouse: Added default destructor to free
73  * protocol private data.
74  * Steve Whitehouse: Added various other default routines
75  * common to several socket families.
76  * Chris Evans : Call suser() check last on F_SETOWN
77  * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79  * Andi Kleen : Fix write_space callback
80  * Chris Evans : Security fixes - signedness again
81  * Arnaldo C. Melo : cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  * This program is free software; you can redistribute it and/or
87  * modify it under the terms of the GNU General Public License
88  * as published by the Free Software Foundation; either version
89  * 2 of the License, or (at your option) any later version.
90  */
91 
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93 
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/types.h>
97 #include <linux/socket.h>
98 #include <linux/in.h>
99 #include <linux/kernel.h>
100 #include <linux/module.h>
101 #include <linux/proc_fs.h>
102 #include <linux/seq_file.h>
103 #include <linux/sched.h>
104 #include <linux/timer.h>
105 #include <linux/string.h>
106 #include <linux/sockios.h>
107 #include <linux/net.h>
108 #include <linux/mm.h>
109 #include <linux/slab.h>
110 #include <linux/interrupt.h>
111 #include <linux/poll.h>
112 #include <linux/tcp.h>
113 #include <linux/init.h>
114 #include <linux/highmem.h>
115 #include <linux/user_namespace.h>
116 #include <linux/static_key.h>
117 #include <linux/memcontrol.h>
118 #include <linux/prefetch.h>
119 
120 #include <asm/uaccess.h>
121 
122 #include <linux/netdevice.h>
123 #include <net/protocol.h>
124 #include <linux/skbuff.h>
125 #include <net/net_namespace.h>
126 #include <net/request_sock.h>
127 #include <net/sock.h>
128 #include <linux/net_tstamp.h>
129 #include <net/xfrm.h>
130 #include <linux/ipsec.h>
131 #include <net/cls_cgroup.h>
132 #include <net/netprio_cgroup.h>
133 
134 #include <linux/filter.h>
135 
136 #include <trace/events/sock.h>
137 
138 #ifdef CONFIG_INET
139 #include <net/tcp.h>
140 #endif
141 
142 static DEFINE_MUTEX(proto_list_mutex);
143 static LIST_HEAD(proto_list);
144 
145 #ifdef CONFIG_MEMCG_KMEM
146 int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
147 {
148  struct proto *proto;
149  int ret = 0;
150 
151  mutex_lock(&proto_list_mutex);
152  list_for_each_entry(proto, &proto_list, node) {
153  if (proto->init_cgroup) {
154  ret = proto->init_cgroup(memcg, ss);
155  if (ret)
156  goto out;
157  }
158  }
159 
160  mutex_unlock(&proto_list_mutex);
161  return ret;
162 out:
163  list_for_each_entry_continue_reverse(proto, &proto_list, node)
164  if (proto->destroy_cgroup)
165  proto->destroy_cgroup(memcg);
166  mutex_unlock(&proto_list_mutex);
167  return ret;
168 }
169 
170 void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
171 {
172  struct proto *proto;
173 
174  mutex_lock(&proto_list_mutex);
175  list_for_each_entry_reverse(proto, &proto_list, node)
176  if (proto->destroy_cgroup)
177  proto->destroy_cgroup(memcg);
178  mutex_unlock(&proto_list_mutex);
179 }
180 #endif
181 
182 /*
183  * Each address family might have different locking rules, so we have
184  * one slock key per address family:
185  */
186 static struct lock_class_key af_family_keys[AF_MAX];
187 static struct lock_class_key af_family_slock_keys[AF_MAX];
188 
191 
192 /*
193  * Make lock validator output more readable. (we pre-construct these
194  * strings build-time, so that runtime initialization of socket
195  * locks is fast):
196  */
197 static const char *const af_family_key_strings[AF_MAX+1] = {
198  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX" , "sk_lock-AF_INET" ,
199  "sk_lock-AF_AX25" , "sk_lock-AF_IPX" , "sk_lock-AF_APPLETALK",
200  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE" , "sk_lock-AF_ATMPVC" ,
201  "sk_lock-AF_X25" , "sk_lock-AF_INET6" , "sk_lock-AF_ROSE" ,
202  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" ,
203  "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" ,
204  "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" ,
205  "sk_lock-AF_RDS" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" ,
206  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" ,
207  "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" ,
208  "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" ,
209  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" ,
210  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" ,
211  "sk_lock-AF_NFC" , "sk_lock-AF_MAX"
212 };
213 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
214  "slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" ,
215  "slock-AF_AX25" , "slock-AF_IPX" , "slock-AF_APPLETALK",
216  "slock-AF_NETROM", "slock-AF_BRIDGE" , "slock-AF_ATMPVC" ,
217  "slock-AF_X25" , "slock-AF_INET6" , "slock-AF_ROSE" ,
218  "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" ,
219  "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" ,
220  "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" ,
221  "slock-AF_RDS" , "slock-AF_SNA" , "slock-AF_IRDA" ,
222  "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" ,
223  "slock-27" , "slock-28" , "slock-AF_CAN" ,
224  "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" ,
225  "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" ,
226  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" ,
227  "slock-AF_NFC" , "slock-AF_MAX"
228 };
229 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
230  "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
231  "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK",
232  "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" ,
233  "clock-AF_X25" , "clock-AF_INET6" , "clock-AF_ROSE" ,
234  "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" ,
235  "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" ,
236  "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" ,
237  "clock-AF_RDS" , "clock-AF_SNA" , "clock-AF_IRDA" ,
238  "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" ,
239  "clock-27" , "clock-28" , "clock-AF_CAN" ,
240  "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" ,
241  "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" ,
242  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" ,
243  "clock-AF_NFC" , "clock-AF_MAX"
244 };
245 
246 /*
247  * sk_callback_lock locking rules are per-address-family,
248  * so split the lock classes by using a per-AF key:
249  */
250 static struct lock_class_key af_callback_keys[AF_MAX];
251 
252 /* Take into consideration the size of the struct sk_buff overhead in the
253  * determination of these values, since that is non-constant across
254  * platforms. This makes socket queueing behavior and performance
255  * not depend upon such differences.
256  */
257 #define _SK_MEM_PACKETS 256
258 #define _SK_MEM_OVERHEAD SKB_TRUESIZE(256)
259 #define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
260 #define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
261 
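/* Editor's note (illustrative, not part of the original source): on a
 * typical 64-bit build SKB_TRUESIZE(256) is 256 bytes of payload plus
 * the cache-line-aligned sizes of struct sk_buff and struct
 * skb_shared_info, i.e. roughly 800-850 bytes per queued packet, so
 *
 *	SK_WMEM_MAX = SK_RMEM_MAX ~= 256 * ~832 ~= 208 KB
 *
 * which is roughly where the familiar ~212992 byte defaults for
 * net.core.{r,w}mem_default and net.core.{r,w}mem_max come from.
 */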
262 /* Run time adjustable parameters. */
263 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
264 EXPORT_SYMBOL(sysctl_wmem_max);
265 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
266 EXPORT_SYMBOL(sysctl_rmem_max);
267 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
268 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
269 
270 /* Maximal space eaten by iovec or ancillary data plus some space */
271 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
272 EXPORT_SYMBOL(sysctl_optmem_max);
273 
274 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
275 EXPORT_SYMBOL_GPL(memalloc_socks);
276 
285 void sk_set_memalloc(struct sock *sk)
286 {
287  sock_set_flag(sk, SOCK_MEMALLOC);
288  sk->sk_allocation |= __GFP_MEMALLOC;
289  static_key_slow_inc(&memalloc_socks);
290 }
291 EXPORT_SYMBOL_GPL(sk_set_memalloc);
292 
293 void sk_clear_memalloc(struct sock *sk)
294 {
295  sock_reset_flag(sk, SOCK_MEMALLOC);
296  sk->sk_allocation &= ~__GFP_MEMALLOC;
297  static_key_slow_dec(&memalloc_socks);
298 
299  /*
300  * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
301  * progress of swapping. However, if SOCK_MEMALLOC is cleared while
302  * it has rmem allocations there is a risk that the user of the
303  * socket cannot make forward progress due to exceeding the rmem
304  * limits. By rights, sk_clear_memalloc() should only be called
305  * on sockets being torn down but warn and reset the accounting if
306  * that assumption breaks.
307  */
308  if (WARN_ON(sk->sk_forward_alloc))
309  sk_mem_reclaim(sk);
310 }
311 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
312 
313 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
314 {
315  int ret;
316  unsigned long pflags = current->flags;
317 
318  /* these should have been dropped before queueing */
319  BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
320 
321  current->flags |= PF_MEMALLOC;
322  ret = sk->sk_backlog_rcv(sk, skb);
323  tsk_restore_flags(current, pflags, PF_MEMALLOC);
324 
325  return ret;
326 }
327 EXPORT_SYMBOL(__sk_backlog_rcv);
328 
329 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
330 {
331  struct timeval tv;
332 
333  if (optlen < sizeof(tv))
334  return -EINVAL;
335  if (copy_from_user(&tv, optval, sizeof(tv)))
336  return -EFAULT;
337  if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
338  return -EDOM;
339 
340  if (tv.tv_sec < 0) {
341  static int warned __read_mostly;
342 
343  *timeo_p = 0;
344  if (warned < 10 && net_ratelimit()) {
345  warned++;
346  pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
347  __func__, current->comm, task_pid_nr(current));
348  }
349  return 0;
350  }
351  *timeo_p = MAX_SCHEDULE_TIMEOUT;
352  if (tv.tv_sec == 0 && tv.tv_usec == 0)
353  return 0;
354  if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
355  *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
356  return 0;
357 }
358 
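/* Editor's note (illustrative userspace usage, not part of the original
 * source): sock_set_timeout() above is what parses the struct timeval
 * passed with SO_RCVTIMEO/SO_SNDTIMEO and converts it to jiffies, e.g.:
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
 *		perror("SO_RCVTIMEO");
 *	// a blocking recv() on fd now fails with EAGAIN after ~2.5 s
 *
 * A zero timeval means "wait forever" (MAX_SCHEDULE_TIMEOUT), and a
 * negative tv_sec is clamped to a zero timeout with the rate-limited
 * warning printed above.
 */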
359 static void sock_warn_obsolete_bsdism(const char *name)
360 {
361  static int warned;
362  static char warncomm[TASK_COMM_LEN];
363  if (strcmp(warncomm, current->comm) && warned < 5) {
364  strcpy(warncomm, current->comm);
365  pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
366  warncomm, name);
367  warned++;
368  }
369 }
370 
371 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
372 
373 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
374 {
375  if (sk->sk_flags & flags) {
376  sk->sk_flags &= ~flags;
377  if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
378  net_disable_timestamp();
379  }
380 }
381 
382 
383 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
384 {
385  int err;
386  int skb_len;
387  unsigned long flags;
388  struct sk_buff_head *list = &sk->sk_receive_queue;
389 
390  if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
391  atomic_inc(&sk->sk_drops);
392  trace_sock_rcvqueue_full(sk, skb);
393  return -ENOMEM;
394  }
395 
396  err = sk_filter(sk, skb);
397  if (err)
398  return err;
399 
400  if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
401  atomic_inc(&sk->sk_drops);
402  return -ENOBUFS;
403  }
404 
405  skb->dev = NULL;
406  skb_set_owner_r(skb, sk);
407 
408  /* Cache the SKB length before we tack it onto the receive
409  * queue. Once it is added it no longer belongs to us and
410  * may be freed by other threads of control pulling packets
411  * from the queue.
412  */
413  skb_len = skb->len;
414 
415  /* we escape from the RCU-protected region, make sure we don't leak
416  * a non-refcounted dst
417  */
418  skb_dst_force(skb);
419 
420  spin_lock_irqsave(&list->lock, flags);
421  skb->dropcount = atomic_read(&sk->sk_drops);
422  __skb_queue_tail(list, skb);
423  spin_unlock_irqrestore(&list->lock, flags);
424 
425  if (!sock_flag(sk, SOCK_DEAD))
426  sk->sk_data_ready(sk, skb_len);
427  return 0;
428 }
429 EXPORT_SYMBOL(sock_queue_rcv_skb);
430 
431 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
432 {
433  int rc = NET_RX_SUCCESS;
434 
435  if (sk_filter(sk, skb))
436  goto discard_and_relse;
437 
438  skb->dev = NULL;
439 
440  if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
441  atomic_inc(&sk->sk_drops);
442  goto discard_and_relse;
443  }
444  if (nested)
445  bh_lock_sock_nested(sk);
446  else
447  bh_lock_sock(sk);
448  if (!sock_owned_by_user(sk)) {
449  /*
450  * trylock + unlock semantics:
451  */
452  mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
453 
454  rc = sk_backlog_rcv(sk, skb);
455 
456  mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
457  } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
458  bh_unlock_sock(sk);
459  atomic_inc(&sk->sk_drops);
460  goto discard_and_relse;
461  }
462 
463  bh_unlock_sock(sk);
464 out:
465  sock_put(sk);
466  return rc;
467 discard_and_relse:
468  kfree_skb(skb);
469  goto out;
470 }
471 EXPORT_SYMBOL(sk_receive_skb);
472 
473 void sk_reset_txq(struct sock *sk)
474 {
475  sk_tx_queue_clear(sk);
476 }
477 EXPORT_SYMBOL(sk_reset_txq);
478 
479 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
480 {
481  struct dst_entry *dst = __sk_dst_get(sk);
482 
483  if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
484  sk_tx_queue_clear(sk);
485  RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
486  dst_release(dst);
487  return NULL;
488  }
489 
490  return dst;
491 }
492 EXPORT_SYMBOL(__sk_dst_check);
493 
494 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
495 {
496  struct dst_entry *dst = sk_dst_get(sk);
497 
498  if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
499  sk_dst_reset(sk);
500  dst_release(dst);
501  return NULL;
502  }
503 
504  return dst;
505 }
506 EXPORT_SYMBOL(sk_dst_check);
507 
508 static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
509 {
510  int ret = -ENOPROTOOPT;
511 #ifdef CONFIG_NETDEVICES
512  struct net *net = sock_net(sk);
513  char devname[IFNAMSIZ];
514  int index;
515 
516  /* Sorry... */
517  ret = -EPERM;
518  if (!capable(CAP_NET_RAW))
519  goto out;
520 
521  ret = -EINVAL;
522  if (optlen < 0)
523  goto out;
524 
525  /* Bind this socket to a particular device like "eth0",
526  * as specified in the passed interface name. If the
527  * name is "" or the option length is zero the socket
528  * is not bound.
529  */
530  if (optlen > IFNAMSIZ - 1)
531  optlen = IFNAMSIZ - 1;
532  memset(devname, 0, sizeof(devname));
533 
534  ret = -EFAULT;
535  if (copy_from_user(devname, optval, optlen))
536  goto out;
537 
538  index = 0;
539  if (devname[0] != '\0') {
540  struct net_device *dev;
541 
542  rcu_read_lock();
543  dev = dev_get_by_name_rcu(net, devname);
544  if (dev)
545  index = dev->ifindex;
546  rcu_read_unlock();
547  ret = -ENODEV;
548  if (!dev)
549  goto out;
550  }
551 
552  lock_sock(sk);
553  sk->sk_bound_dev_if = index;
554  sk_dst_reset(sk);
555  release_sock(sk);
556 
557  ret = 0;
558 
559 out:
560 #endif
561 
562  return ret;
563 }
564 
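/* Editor's note (illustrative userspace usage, not part of the original
 * source): sock_bindtodevice() is reached via SO_BINDTODEVICE and
 * requires CAP_NET_RAW, e.g.:
 *
 *	const char ifname[] = "eth0";
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
 *		       ifname, sizeof(ifname)) < 0)
 *		perror("SO_BINDTODEVICE");	// EPERM without CAP_NET_RAW
 *
 * Passing an empty name (or a zero option length) clears
 * sk_bound_dev_if and unbinds the socket again, as described in the
 * comment above.
 */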
565 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
566 {
567  if (valbool)
568  sock_set_flag(sk, bit);
569  else
570  sock_reset_flag(sk, bit);
571 }
572 
573 /*
574  * This is meant for all protocols to use and covers goings on
575  * at the socket level. Everything here is generic.
576  */
577 
578 int sock_setsockopt(struct socket *sock, int level, int optname,
579  char __user *optval, unsigned int optlen)
580 {
581  struct sock *sk = sock->sk;
582  int val;
583  int valbool;
584  struct linger ling;
585  int ret = 0;
586 
587  /*
588  * Options without arguments
589  */
590 
591  if (optname == SO_BINDTODEVICE)
592  return sock_bindtodevice(sk, optval, optlen);
593 
594  if (optlen < sizeof(int))
595  return -EINVAL;
596 
597  if (get_user(val, (int __user *)optval))
598  return -EFAULT;
599 
600  valbool = val ? 1 : 0;
601 
602  lock_sock(sk);
603 
604  switch (optname) {
605  case SO_DEBUG:
606  if (val && !capable(CAP_NET_ADMIN))
607  ret = -EACCES;
608  else
609  sock_valbool_flag(sk, SOCK_DBG, valbool);
610  break;
611  case SO_REUSEADDR:
612  sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
613  break;
614  case SO_TYPE:
615  case SO_PROTOCOL:
616  case SO_DOMAIN:
617  case SO_ERROR:
618  ret = -ENOPROTOOPT;
619  break;
620  case SO_DONTROUTE:
621  sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
622  break;
623  case SO_BROADCAST:
624  sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
625  break;
626  case SO_SNDBUF:
627  /* Don't error on this; BSD doesn't, and if you think
628  * about it this is right. Otherwise apps have to
629  * play 'guess the biggest size' games. RCVBUF/SNDBUF
630  * are treated in BSD as hints
631  */
632  val = min_t(u32, val, sysctl_wmem_max);
633 set_sndbuf:
634  sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
635  sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
636  /* Wake up sending tasks if we upped the value. */
637  sk->sk_write_space(sk);
638  break;
639 
640  case SO_SNDBUFFORCE:
641  if (!capable(CAP_NET_ADMIN)) {
642  ret = -EPERM;
643  break;
644  }
645  goto set_sndbuf;
646 
647  case SO_RCVBUF:
648  /* Don't error on this; BSD doesn't, and if you think
649  * about it this is right. Otherwise apps have to
650  * play 'guess the biggest size' games. RCVBUF/SNDBUF
651  * are treated in BSD as hints
652  */
653  val = min_t(u32, val, sysctl_rmem_max);
654 set_rcvbuf:
655  sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
656  /*
657  * We double it on the way in to account for
658  * "struct sk_buff" etc. overhead. Applications
659  * assume that the SO_RCVBUF setting they make will
660  * allow that much actual data to be received on that
661  * socket.
662  *
663  * Applications are unaware that "struct sk_buff" and
664  * other overheads allocate from the receive buffer
665  * during socket buffer allocation.
666  *
667  * And after considering the possible alternatives,
668  * returning the value we actually used in getsockopt
669  * is the most desirable behavior.
670  */
671  sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
672  break;
673 
674  case SO_RCVBUFFORCE:
675  if (!capable(CAP_NET_ADMIN)) {
676  ret = -EPERM;
677  break;
678  }
679  goto set_rcvbuf;
680 
681  case SO_KEEPALIVE:
682 #ifdef CONFIG_INET
683  if (sk->sk_protocol == IPPROTO_TCP &&
684  sk->sk_type == SOCK_STREAM)
685  tcp_set_keepalive(sk, valbool);
686 #endif
687  sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
688  break;
689 
690  case SO_OOBINLINE:
691  sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
692  break;
693 
694  case SO_NO_CHECK:
695  sk->sk_no_check = valbool;
696  break;
697 
698  case SO_PRIORITY:
699  if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
700  sk->sk_priority = val;
701  else
702  ret = -EPERM;
703  break;
704 
705  case SO_LINGER:
706  if (optlen < sizeof(ling)) {
707  ret = -EINVAL; /* 1003.1g */
708  break;
709  }
710  if (copy_from_user(&ling, optval, sizeof(ling))) {
711  ret = -EFAULT;
712  break;
713  }
714  if (!ling.l_onoff)
715  sock_reset_flag(sk, SOCK_LINGER);
716  else {
717 #if (BITS_PER_LONG == 32)
718  if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
719  sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
720  else
721 #endif
722  sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
723  sock_set_flag(sk, SOCK_LINGER);
724  }
725  break;
726 
727  case SO_BSDCOMPAT:
728  sock_warn_obsolete_bsdism("setsockopt");
729  break;
730 
731  case SO_PASSCRED:
732  if (valbool)
733  set_bit(SOCK_PASSCRED, &sock->flags);
734  else
735  clear_bit(SOCK_PASSCRED, &sock->flags);
736  break;
737 
738  case SO_TIMESTAMP:
739  case SO_TIMESTAMPNS:
740  if (valbool) {
741  if (optname == SO_TIMESTAMP)
742  sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
743  else
744  sock_set_flag(sk, SOCK_RCVTSTAMPNS);
745  sock_set_flag(sk, SOCK_RCVTSTAMP);
746  sock_enable_timestamp(sk, SOCK_TIMESTAMP);
747  } else {
748  sock_reset_flag(sk, SOCK_RCVTSTAMP);
749  sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
750  }
751  break;
752 
753  case SO_TIMESTAMPING:
754  if (val & ~SOF_TIMESTAMPING_MASK) {
755  ret = -EINVAL;
756  break;
757  }
758  sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
759  val & SOF_TIMESTAMPING_TX_HARDWARE);
760  sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
761  val & SOF_TIMESTAMPING_TX_SOFTWARE);
762  sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
763  val & SOF_TIMESTAMPING_RX_HARDWARE);
764  if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
765  sock_enable_timestamp(sk,
766  SOCK_TIMESTAMPING_RX_SOFTWARE);
767  else
768  sock_disable_timestamp(sk,
769  (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
770  sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
771  val & SOF_TIMESTAMPING_SOFTWARE);
772  sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
773  val & SOF_TIMESTAMPING_SYS_HARDWARE);
774  sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
775  val & SOF_TIMESTAMPING_RAW_HARDWARE);
776  break;
777 
778  case SO_RCVLOWAT:
779  if (val < 0)
780  val = INT_MAX;
781  sk->sk_rcvlowat = val ? : 1;
782  break;
783 
784  case SO_RCVTIMEO:
785  ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
786  break;
787 
788  case SO_SNDTIMEO:
789  ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
790  break;
791 
792  case SO_ATTACH_FILTER:
793  ret = -EINVAL;
794  if (optlen == sizeof(struct sock_fprog)) {
795  struct sock_fprog fprog;
796 
797  ret = -EFAULT;
798  if (copy_from_user(&fprog, optval, sizeof(fprog)))
799  break;
800 
801  ret = sk_attach_filter(&fprog, sk);
802  }
803  break;
804 
805  case SO_DETACH_FILTER:
806  ret = sk_detach_filter(sk);
807  break;
808 
809  case SO_PASSSEC:
810  if (valbool)
811  set_bit(SOCK_PASSSEC, &sock->flags);
812  else
813  clear_bit(SOCK_PASSSEC, &sock->flags);
814  break;
815  case SO_MARK:
816  if (!capable(CAP_NET_ADMIN))
817  ret = -EPERM;
818  else
819  sk->sk_mark = val;
820  break;
821 
822  /* We implement the SO_SNDLOWAT etc to
823  not be settable (1003.1g 5.3) */
824  case SO_RXQ_OVFL:
825  sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
826  break;
827 
828  case SO_WIFI_STATUS:
829  sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
830  break;
831 
832  case SO_PEEK_OFF:
833  if (sock->ops->set_peek_off)
834  sock->ops->set_peek_off(sk, val);
835  else
836  ret = -EOPNOTSUPP;
837  break;
838 
839  case SO_NOFCS:
840  sock_valbool_flag(sk, SOCK_NOFCS, valbool);
841  break;
842 
843  default:
844  ret = -ENOPROTOOPT;
845  break;
846  }
847  release_sock(sk);
848  return ret;
849 }
850 EXPORT_SYMBOL(sock_setsockopt);
851 
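/* Editor's note (illustrative userspace usage, not part of the original
 * source): because sock_setsockopt() doubles the requested value to
 * cover struct sk_buff overhead, getsockopt() typically reports twice
 * what was asked for:
 *
 *	int req = 65536, got = 0;
 *	socklen_t len = sizeof(got);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &got, &len);
 *	// got is usually 131072 here (2 * 65536), capped by
 *	// /proc/sys/net/core/rmem_max unless SO_RCVBUFFORCE and
 *	// CAP_NET_ADMIN are used.
 */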
852 
853 void cred_to_ucred(struct pid *pid, const struct cred *cred,
854  struct ucred *ucred)
855 {
856  ucred->pid = pid_vnr(pid);
857  ucred->uid = ucred->gid = -1;
858  if (cred) {
859  struct user_namespace *current_ns = current_user_ns();
860 
861  ucred->uid = from_kuid_munged(current_ns, cred->euid);
862  ucred->gid = from_kgid_munged(current_ns, cred->egid);
863  }
864 }
865 EXPORT_SYMBOL_GPL(cred_to_ucred);
866 
867 int sock_getsockopt(struct socket *sock, int level, int optname,
868  char __user *optval, int __user *optlen)
869 {
870  struct sock *sk = sock->sk;
871 
872  union {
873  int val;
874  struct linger ling;
875  struct timeval tm;
876  } v;
877 
878  int lv = sizeof(int);
879  int len;
880 
881  if (get_user(len, optlen))
882  return -EFAULT;
883  if (len < 0)
884  return -EINVAL;
885 
886  memset(&v, 0, sizeof(v));
887 
888  switch (optname) {
889  case SO_DEBUG:
890  v.val = sock_flag(sk, SOCK_DBG);
891  break;
892 
893  case SO_DONTROUTE:
894  v.val = sock_flag(sk, SOCK_LOCALROUTE);
895  break;
896 
897  case SO_BROADCAST:
898  v.val = sock_flag(sk, SOCK_BROADCAST);
899  break;
900 
901  case SO_SNDBUF:
902  v.val = sk->sk_sndbuf;
903  break;
904 
905  case SO_RCVBUF:
906  v.val = sk->sk_rcvbuf;
907  break;
908 
909  case SO_REUSEADDR:
910  v.val = sk->sk_reuse;
911  break;
912 
913  case SO_KEEPALIVE:
914  v.val = sock_flag(sk, SOCK_KEEPOPEN);
915  break;
916 
917  case SO_TYPE:
918  v.val = sk->sk_type;
919  break;
920 
921  case SO_PROTOCOL:
922  v.val = sk->sk_protocol;
923  break;
924 
925  case SO_DOMAIN:
926  v.val = sk->sk_family;
927  break;
928 
929  case SO_ERROR:
930  v.val = -sock_error(sk);
931  if (v.val == 0)
932  v.val = xchg(&sk->sk_err_soft, 0);
933  break;
934 
935  case SO_OOBINLINE:
936  v.val = sock_flag(sk, SOCK_URGINLINE);
937  break;
938 
939  case SO_NO_CHECK:
940  v.val = sk->sk_no_check;
941  break;
942 
943  case SO_PRIORITY:
944  v.val = sk->sk_priority;
945  break;
946 
947  case SO_LINGER:
948  lv = sizeof(v.ling);
949  v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
950  v.ling.l_linger = sk->sk_lingertime / HZ;
951  break;
952 
953  case SO_BSDCOMPAT:
954  sock_warn_obsolete_bsdism("getsockopt");
955  break;
956 
957  case SO_TIMESTAMP:
958  v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
959  !sock_flag(sk, SOCK_RCVTSTAMPNS);
960  break;
961 
962  case SO_TIMESTAMPNS:
963  v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
964  break;
965 
966  case SO_TIMESTAMPING:
967  v.val = 0;
968  if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
969  v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
970  if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
971  v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
972  if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
973  v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
974  if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
975  v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
976  if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
977  v.val |= SOF_TIMESTAMPING_SOFTWARE;
978  if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
979  v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
980  if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
981  v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
982  break;
983 
984  case SO_RCVTIMEO:
985  lv = sizeof(struct timeval);
986  if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
987  v.tm.tv_sec = 0;
988  v.tm.tv_usec = 0;
989  } else {
990  v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
991  v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
992  }
993  break;
994 
995  case SO_SNDTIMEO:
996  lv = sizeof(struct timeval);
997  if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
998  v.tm.tv_sec = 0;
999  v.tm.tv_usec = 0;
1000  } else {
1001  v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1002  v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1003  }
1004  break;
1005 
1006  case SO_RCVLOWAT:
1007  v.val = sk->sk_rcvlowat;
1008  break;
1009 
1010  case SO_SNDLOWAT:
1011  v.val = 1;
1012  break;
1013 
1014  case SO_PASSCRED:
1015  v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1016  break;
1017 
1018  case SO_PEERCRED:
1019  {
1020  struct ucred peercred;
1021  if (len > sizeof(peercred))
1022  len = sizeof(peercred);
1023  cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1024  if (copy_to_user(optval, &peercred, len))
1025  return -EFAULT;
1026  goto lenout;
1027  }
1028 
1029  case SO_PEERNAME:
1030  {
1031  char address[128];
1032 
1033  if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1034  return -ENOTCONN;
1035  if (lv < len)
1036  return -EINVAL;
1037  if (copy_to_user(optval, address, len))
1038  return -EFAULT;
1039  goto lenout;
1040  }
1041 
1042  /* Dubious BSD thing... Probably nobody even uses it, but
1043  * the UNIX standard wants it for whatever reason... -DaveM
1044  */
1045  case SO_ACCEPTCONN:
1046  v.val = sk->sk_state == TCP_LISTEN;
1047  break;
1048 
1049  case SO_PASSSEC:
1050  v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1051  break;
1052 
1053  case SO_PEERSEC:
1054  return security_socket_getpeersec_stream(sock, optval, optlen, len);
1055 
1056  case SO_MARK:
1057  v.val = sk->sk_mark;
1058  break;
1059 
1060  case SO_RXQ_OVFL:
1061  v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1062  break;
1063 
1064  case SO_WIFI_STATUS:
1065  v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1066  break;
1067 
1068  case SO_PEEK_OFF:
1069  if (!sock->ops->set_peek_off)
1070  return -EOPNOTSUPP;
1071 
1072  v.val = sk->sk_peek_off;
1073  break;
1074  case SO_NOFCS:
1075  v.val = sock_flag(sk, SOCK_NOFCS);
1076  break;
1077  default:
1078  return -ENOPROTOOPT;
1079  }
1080 
1081  if (len > lv)
1082  len = lv;
1083  if (copy_to_user(optval, &v, len))
1084  return -EFAULT;
1085 lenout:
1086  if (put_user(len, optlen))
1087  return -EFAULT;
1088  return 0;
1089 }
1090 
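/* Editor's note (illustrative userspace usage, not part of the original
 * source): the SO_PEERCRED branch of sock_getsockopt() returns the
 * struct ucred built by cred_to_ucred() above; on a connected AF_UNIX
 * socket:
 *
 *	struct ucred cr;
 *	socklen_t len = sizeof(cr);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cr, &len) == 0)
 *		printf("peer pid=%d uid=%u gid=%u\n",
 *		       cr.pid, cr.uid, cr.gid);
 *
 * The ids are translated into the caller's user namespace by
 * from_kuid_munged()/from_kgid_munged(), so they may read as the
 * overflow id if the peer is not mappable there.
 */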
1091 /*
1092  * Initialize an sk_lock.
1093  *
1094  * (We also register the sk_lock with the lock validator.)
1095  */
1096 static inline void sock_lock_init(struct sock *sk)
1097 {
1098  sock_lock_init_class_and_name(sk,
1099  af_family_slock_key_strings[sk->sk_family],
1100  af_family_slock_keys + sk->sk_family,
1101  af_family_key_strings[sk->sk_family],
1102  af_family_keys + sk->sk_family);
1103 }
1104 
1105 /*
1106  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1107  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1108  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1109  */
1110 static void sock_copy(struct sock *nsk, const struct sock *osk)
1111 {
1112 #ifdef CONFIG_SECURITY_NETWORK
1113  void *sptr = nsk->sk_security;
1114 #endif
1115  memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1116 
1117  memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1118  osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1119 
1120 #ifdef CONFIG_SECURITY_NETWORK
1121  nsk->sk_security = sptr;
1122  security_sk_clone(osk, nsk);
1123 #endif
1124 }
1125 
1126 /*
1127  * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes
1128  * un-modified. Special care is taken when initializing object to zero.
1129  */
1130 static inline void sk_prot_clear_nulls(struct sock *sk, int size)
1131 {
1132  if (offsetof(struct sock, sk_node.next) != 0)
1133  memset(sk, 0, offsetof(struct sock, sk_node.next));
1134  memset(&sk->sk_node.pprev, 0,
1135  size - offsetof(struct sock, sk_node.pprev));
1136 }
1137 
1138 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1139 {
1140  unsigned long nulls1, nulls2;
1141 
1142  nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1143  nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1144  if (nulls1 > nulls2)
1145  swap(nulls1, nulls2);
1146 
1147  if (nulls1 != 0)
1148  memset((char *)sk, 0, nulls1);
1149  memset((char *)sk + nulls1 + sizeof(void *), 0,
1150  nulls2 - nulls1 - sizeof(void *));
1151  memset((char *)sk + nulls2 + sizeof(void *), 0,
1152  size - nulls2 - sizeof(void *));
1153 }
1154 EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1155 
1156 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1157  int family)
1158 {
1159  struct sock *sk;
1160  struct kmem_cache *slab;
1161 
1162  slab = prot->slab;
1163  if (slab != NULL) {
1164  sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1165  if (!sk)
1166  return sk;
1167  if (priority & __GFP_ZERO) {
1168  if (prot->clear_sk)
1169  prot->clear_sk(sk, prot->obj_size);
1170  else
1171  sk_prot_clear_nulls(sk, prot->obj_size);
1172  }
1173  } else
1174  sk = kmalloc(prot->obj_size, priority);
1175 
1176  if (sk != NULL) {
1177  kmemcheck_annotate_bitfield(sk, flags);
1178 
1179  if (security_sk_alloc(sk, family, priority))
1180  goto out_free;
1181 
1182  if (!try_module_get(prot->owner))
1183  goto out_free_sec;
1184  sk_tx_queue_clear(sk);
1185  }
1186 
1187  return sk;
1188 
1189 out_free_sec:
1190  security_sk_free(sk);
1191 out_free:
1192  if (slab != NULL)
1193  kmem_cache_free(slab, sk);
1194  else
1195  kfree(sk);
1196  return NULL;
1197 }
1198 
1199 static void sk_prot_free(struct proto *prot, struct sock *sk)
1200 {
1201  struct kmem_cache *slab;
1202  struct module *owner;
1203 
1204  owner = prot->owner;
1205  slab = prot->slab;
1206 
1207  security_sk_free(sk);
1208  if (slab != NULL)
1209  kmem_cache_free(slab, sk);
1210  else
1211  kfree(sk);
1212  module_put(owner);
1213 }
1214 
1215 #ifdef CONFIG_CGROUPS
1216 #if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
1217 void sock_update_classid(struct sock *sk)
1218 {
1219  u32 classid;
1220 
1221  rcu_read_lock(); /* doing current task, which cannot vanish. */
1222  classid = task_cls_classid(current);
1223  rcu_read_unlock();
1224  if (classid != sk->sk_classid)
1225  sk->sk_classid = classid;
1226 }
1227 EXPORT_SYMBOL(sock_update_classid);
1228 #endif
1229 
1230 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
1231 void sock_update_netprioidx(struct sock *sk, struct task_struct *task)
1232 {
1233  if (in_interrupt())
1234  return;
1235 
1236  sk->sk_cgrp_prioidx = task_netprioidx(task);
1237 }
1238 EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1239 #endif
1240 #endif
1241 
1249 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1250  struct proto *prot)
1251 {
1252  struct sock *sk;
1253 
1254  sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1255  if (sk) {
1256  sk->sk_family = family;
1257  /*
1258  * See comment in struct sock definition to understand
1259  * why we need sk_prot_creator -acme
1260  */
1261  sk->sk_prot = sk->sk_prot_creator = prot;
1262  sock_lock_init(sk);
1263  sock_net_set(sk, get_net(net));
1264  atomic_set(&sk->sk_wmem_alloc, 1);
1265 
1266  sock_update_classid(sk);
1267  sock_update_netprioidx(sk, current);
1268  }
1269 
1270  return sk;
1271 }
1272 EXPORT_SYMBOL(sk_alloc);
1273 
1274 static void __sk_free(struct sock *sk)
1275 {
1276  struct sk_filter *filter;
1277 
1278  if (sk->sk_destruct)
1279  sk->sk_destruct(sk);
1280 
1281  filter = rcu_dereference_check(sk->sk_filter,
1282  atomic_read(&sk->sk_wmem_alloc) == 0);
1283  if (filter) {
1284  sk_filter_uncharge(sk, filter);
1285  RCU_INIT_POINTER(sk->sk_filter, NULL);
1286  }
1287 
1288  sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1289 
1290  if (atomic_read(&sk->sk_omem_alloc))
1291  pr_debug("%s: optmem leakage (%d bytes) detected\n",
1292  __func__, atomic_read(&sk->sk_omem_alloc));
1293 
1294  if (sk->sk_peer_cred)
1295  put_cred(sk->sk_peer_cred);
1296  put_pid(sk->sk_peer_pid);
1297  put_net(sock_net(sk));
1298  sk_prot_free(sk->sk_prot_creator, sk);
1299 }
1300 
1301 void sk_free(struct sock *sk)
1302 {
1303  /*
1304  * We subtract one from sk_wmem_alloc and can know if
1305  * some packets are still in some tx queue.
1306  * If not null, sock_wfree() will call __sk_free(sk) later
1307  */
1308  if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1309  __sk_free(sk);
1310 }
1311 EXPORT_SYMBOL(sk_free);
1312 
1313 /*
1314  * Last sock_put should drop reference to sk->sk_net. It has already
1315  * been dropped in sk_change_net. Taking reference to stopping namespace
1316  * is not an option.
1317  * Take reference to a socket to remove it from hash _alive_ and after that
1318  * destroy it in the context of init_net.
1319  */
1320 void sk_release_kernel(struct sock *sk)
1321 {
1322  if (sk == NULL || sk->sk_socket == NULL)
1323  return;
1324 
1325  sock_hold(sk);
1326  sock_release(sk->sk_socket);
1327  release_net(sock_net(sk));
1328  sock_net_set(sk, get_net(&init_net));
1329  sock_put(sk);
1330 }
1331 EXPORT_SYMBOL(sk_release_kernel);
1332 
1333 static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1334 {
1335  if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1336  sock_update_memcg(newsk);
1337 }
1338 
1346 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1347 {
1348  struct sock *newsk;
1349 
1350  newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1351  if (newsk != NULL) {
1352  struct sk_filter *filter;
1353 
1354  sock_copy(newsk, sk);
1355 
1356  /* SANITY */
1357  get_net(sock_net(newsk));
1358  sk_node_init(&newsk->sk_node);
1359  sock_lock_init(newsk);
1360  bh_lock_sock(newsk);
1361  newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
1362  newsk->sk_backlog.len = 0;
1363 
1364  atomic_set(&newsk->sk_rmem_alloc, 0);
1365  /*
1366  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1367  */
1368  atomic_set(&newsk->sk_wmem_alloc, 1);
1369  atomic_set(&newsk->sk_omem_alloc, 0);
1370  skb_queue_head_init(&newsk->sk_receive_queue);
1371  skb_queue_head_init(&newsk->sk_write_queue);
1372 #ifdef CONFIG_NET_DMA
1373  skb_queue_head_init(&newsk->sk_async_wait_queue);
1374 #endif
1375 
1376  spin_lock_init(&newsk->sk_dst_lock);
1377  rwlock_init(&newsk->sk_callback_lock);
1378  lockdep_set_class_and_name(&newsk->sk_callback_lock,
1379  af_callback_keys + newsk->sk_family,
1380  af_family_clock_key_strings[newsk->sk_family]);
1381 
1382  newsk->sk_dst_cache = NULL;
1383  newsk->sk_wmem_queued = 0;
1384  newsk->sk_forward_alloc = 0;
1385  newsk->sk_send_head = NULL;
1386  newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1387 
1388  sock_reset_flag(newsk, SOCK_DONE);
1389  skb_queue_head_init(&newsk->sk_error_queue);
1390 
1391  filter = rcu_dereference_protected(newsk->sk_filter, 1);
1392  if (filter != NULL)
1393  sk_filter_charge(newsk, filter);
1394 
1395  if (unlikely(xfrm_sk_clone_policy(newsk))) {
1396  /* It is still raw copy of parent, so invalidate
1397  * destructor and make plain sk_free() */
1398  newsk->sk_destruct = NULL;
1399  bh_unlock_sock(newsk);
1400  sk_free(newsk);
1401  newsk = NULL;
1402  goto out;
1403  }
1404 
1405  newsk->sk_err = 0;
1406  newsk->sk_priority = 0;
1407  /*
1408  * Before updating sk_refcnt, we must commit prior changes to memory
1409  * (Documentation/RCU/rculist_nulls.txt for details)
1410  */
1411  smp_wmb();
1412  atomic_set(&newsk->sk_refcnt, 2);
1413 
1414  /*
1415  * Increment the counter in the same struct proto as the master
1416  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1417  * is the same as sk->sk_prot->socks, as this field was copied
1418  * with memcpy).
1419  *
1420  * This _changes_ the previous behaviour, where
1421  * tcp_create_openreq_child always was incrementing the
1422  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1423  * to be taken into account in all callers. -acme
1424  */
1425  sk_refcnt_debug_inc(newsk);
1426  sk_set_socket(newsk, NULL);
1427  newsk->sk_wq = NULL;
1428 
1429  sk_update_clone(sk, newsk);
1430 
1431  if (newsk->sk_prot->sockets_allocated)
1432  sk_sockets_allocated_inc(newsk);
1433 
1434  if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1435  net_enable_timestamp();
1436  }
1437 out:
1438  return newsk;
1439 }
1440 EXPORT_SYMBOL_GPL(sk_clone_lock);
1441 
1442 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1443 {
1444  __sk_dst_set(sk, dst);
1445  sk->sk_route_caps = dst->dev->features;
1446  if (sk->sk_route_caps & NETIF_F_GSO)
1447  sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1448  sk->sk_route_caps &= ~sk->sk_route_nocaps;
1449  if (sk_can_gso(sk)) {
1450  if (dst->header_len) {
1451  sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1452  } else {
1453  sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1454  sk->sk_gso_max_size = dst->dev->gso_max_size;
1455  sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1456  }
1457  }
1458 }
1460 
1461 /*
1462  * Simple resource managers for sockets.
1463  */
1464 
1465 
1466 /*
1467  * Write buffer destructor automatically called from kfree_skb.
1468  */
1469 void sock_wfree(struct sk_buff *skb)
1470 {
1471  struct sock *sk = skb->sk;
1472  unsigned int len = skb->truesize;
1473 
1474  if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1475  /*
1476  * Keep a reference on sk_wmem_alloc, this will be released
1477  * after sk_write_space() call
1478  */
1479  atomic_sub(len - 1, &sk->sk_wmem_alloc);
1480  sk->sk_write_space(sk);
1481  len = 1;
1482  }
1483  /*
1484  * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1485  * could not do because of in-flight packets
1486  */
1487  if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1488  __sk_free(sk);
1489 }
1490 EXPORT_SYMBOL(sock_wfree);
1491 
1492 /*
1493  * Read buffer destructor automatically called from kfree_skb.
1494  */
1495 void sock_rfree(struct sk_buff *skb)
1496 {
1497  struct sock *sk = skb->sk;
1498  unsigned int len = skb->truesize;
1499 
1500  atomic_sub(len, &sk->sk_rmem_alloc);
1501  sk_mem_uncharge(sk, len);
1502 }
1503 EXPORT_SYMBOL(sock_rfree);
1504 
1505 void sock_edemux(struct sk_buff *skb)
1506 {
1507  struct sock *sk = skb->sk;
1508 
1509 #ifdef CONFIG_INET
1510  if (sk->sk_state == TCP_TIME_WAIT)
1511  inet_twsk_put(inet_twsk(sk));
1512  else
1513 #endif
1514  sock_put(sk);
1515 }
1516 EXPORT_SYMBOL(sock_edemux);
1517 
1518 kuid_t sock_i_uid(struct sock *sk)
1519 {
1520  kuid_t uid;
1521 
1522  read_lock_bh(&sk->sk_callback_lock);
1523  uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1524  read_unlock_bh(&sk->sk_callback_lock);
1525  return uid;
1526 }
1527 EXPORT_SYMBOL(sock_i_uid);
1528 
1529 unsigned long sock_i_ino(struct sock *sk)
1530 {
1531  unsigned long ino;
1532 
1533  read_lock_bh(&sk->sk_callback_lock);
1534  ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1535  read_unlock_bh(&sk->sk_callback_lock);
1536  return ino;
1537 }
1538 EXPORT_SYMBOL(sock_i_ino);
1539 
1540 /*
1541  * Allocate a skb from the socket's send buffer.
1542  */
1543 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1544  gfp_t priority)
1545 {
1546  if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1547  struct sk_buff *skb = alloc_skb(size, priority);
1548  if (skb) {
1549  skb_set_owner_w(skb, sk);
1550  return skb;
1551  }
1552  }
1553  return NULL;
1554 }
1556 
1557 /*
1558  * Allocate a skb from the socket's receive buffer.
1559  */
1560 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1561  gfp_t priority)
1562 {
1563  if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1564  struct sk_buff *skb = alloc_skb(size, priority);
1565  if (skb) {
1566  skb_set_owner_r(skb, sk);
1567  return skb;
1568  }
1569  }
1570  return NULL;
1571 }
1572 
1573 /*
1574  * Allocate a memory block from the socket's option memory buffer.
1575  */
1576 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1577 {
1578  if ((unsigned int)size <= sysctl_optmem_max &&
1579  atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1580  void *mem;
1581  /* First do the add, to avoid the race if kmalloc
1582  * might sleep.
1583  */
1584  atomic_add(size, &sk->sk_omem_alloc);
1585  mem = kmalloc(size, priority);
1586  if (mem)
1587  return mem;
1588  atomic_sub(size, &sk->sk_omem_alloc);
1589  }
1590  return NULL;
1591 }
1593 
1594 /*
1595  * Free an option memory block.
1596  */
1597 void sock_kfree_s(struct sock *sk, void *mem, int size)
1598 {
1599  kfree(mem);
1600  atomic_sub(size, &sk->sk_omem_alloc);
1601 }
1602 EXPORT_SYMBOL(sock_kfree_s);
1603 
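/* Editor's note (illustrative kernel-side sketch, not part of the
 * original source): a protocol that keeps per-socket option state
 * would pair these helpers so the allocation is charged against
 * sk_omem_alloc and bounded by sysctl_optmem_max, e.g.:
 *
 *	struct foo_opt *opt;			// hypothetical structure
 *
 *	opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, sizeof(*opt));	// must pass the same size
 *
 * Any imbalance shows up as the "optmem leakage" pr_debug() message in
 * __sk_free() above.
 */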
1604 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1605  I think, these locks should be removed for datagram sockets.
1606  */
1607 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1608 {
1609  DEFINE_WAIT(wait);
1610 
1611  clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1612  for (;;) {
1613  if (!timeo)
1614  break;
1615  if (signal_pending(current))
1616  break;
1617  set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1618  prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1619  if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1620  break;
1621  if (sk->sk_shutdown & SEND_SHUTDOWN)
1622  break;
1623  if (sk->sk_err)
1624  break;
1625  timeo = schedule_timeout(timeo);
1626  }
1627  finish_wait(sk_sleep(sk), &wait);
1628  return timeo;
1629 }
1630 
1631 
1632 /*
1633  * Generic send/receive buffer handlers
1634  */
1635 
1636 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1637  unsigned long data_len, int noblock,
1638  int *errcode)
1639 {
1640  struct sk_buff *skb;
1641  gfp_t gfp_mask;
1642  long timeo;
1643  int err;
1644  int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1645 
1646  err = -EMSGSIZE;
1647  if (npages > MAX_SKB_FRAGS)
1648  goto failure;
1649 
1650  gfp_mask = sk->sk_allocation;
1651  if (gfp_mask & __GFP_WAIT)
1652  gfp_mask |= __GFP_REPEAT;
1653 
1654  timeo = sock_sndtimeo(sk, noblock);
1655  while (1) {
1656  err = sock_error(sk);
1657  if (err != 0)
1658  goto failure;
1659 
1660  err = -EPIPE;
1661  if (sk->sk_shutdown & SEND_SHUTDOWN)
1662  goto failure;
1663 
1664  if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1665  skb = alloc_skb(header_len, gfp_mask);
1666  if (skb) {
1667  int i;
1668 
1669  /* No pages, we're done... */
1670  if (!data_len)
1671  break;
1672 
1673  skb->truesize += data_len;
1674  skb_shinfo(skb)->nr_frags = npages;
1675  for (i = 0; i < npages; i++) {
1676  struct page *page;
1677 
1678  page = alloc_pages(sk->sk_allocation, 0);
1679  if (!page) {
1680  err = -ENOBUFS;
1681  skb_shinfo(skb)->nr_frags = i;
1682  kfree_skb(skb);
1683  goto failure;
1684  }
1685 
1686  __skb_fill_page_desc(skb, i,
1687  page, 0,
1688  (data_len >= PAGE_SIZE ?
1689  PAGE_SIZE :
1690  data_len));
1691  data_len -= PAGE_SIZE;
1692  }
1693 
1694  /* Full success... */
1695  break;
1696  }
1697  err = -ENOBUFS;
1698  goto failure;
1699  }
1700  set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1701  set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1702  err = -EAGAIN;
1703  if (!timeo)
1704  goto failure;
1705  if (signal_pending(current))
1706  goto interrupted;
1707  timeo = sock_wait_for_wmem(sk, timeo);
1708  }
1709 
1710  skb_set_owner_w(skb, sk);
1711  return skb;
1712 
1713 interrupted:
1714  err = sock_intr_errno(timeo);
1715 failure:
1716  *errcode = err;
1717  return NULL;
1718 }
1720 
1721 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1722  int noblock, int *errcode)
1723 {
1724  return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1725 }
1726 EXPORT_SYMBOL(sock_alloc_send_skb);
1727 
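/* Editor's note (illustrative kernel-side sketch, not part of the
 * original source): datagram protocols typically build their sendmsg()
 * path on sock_alloc_send_skb(), which blocks (subject to
 * sock_sndtimeo()) until the write allocation fits under sk_sndbuf:
 *
 *	skb = sock_alloc_send_skb(sk, len + hlen,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		return err;		// -EAGAIN, -EPIPE, -EMSGSIZE, ...
 *	skb_reserve(skb, hlen);		// hlen: protocol headroom (assumed)
 *	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
 *
 * The skb comes back already charged to sk_wmem_alloc via
 * skb_set_owner_w(), so sock_wfree() releases the space when it is
 * finally freed.
 */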
1728 /* On 32bit arches, an skb frag is limited to 2^15 */
1729 #define SKB_FRAG_PAGE_ORDER get_order(32768)
1730 
1731 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1732 {
1733  int order;
1734 
1735  if (pfrag->page) {
1736  if (atomic_read(&pfrag->page->_count) == 1) {
1737  pfrag->offset = 0;
1738  return true;
1739  }
1740  if (pfrag->offset < pfrag->size)
1741  return true;
1742  put_page(pfrag->page);
1743  }
1744 
1745  /* We restrict high order allocations to users that can afford to wait */
1746  order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
1747 
1748  do {
1749  gfp_t gfp = sk->sk_allocation;
1750 
1751  if (order)
1752  gfp |= __GFP_COMP | __GFP_NOWARN;
1753  pfrag->page = alloc_pages(gfp, order);
1754  if (likely(pfrag->page)) {
1755  pfrag->offset = 0;
1756  pfrag->size = PAGE_SIZE << order;
1757  return true;
1758  }
1759  } while (--order >= 0);
1760 
1761  sk_enter_memory_pressure(sk);
1762  sk_stream_moderate_sndbuf(sk);
1763  return false;
1764 }
1766 
1767 static void __lock_sock(struct sock *sk)
1768  __releases(&sk->sk_lock.slock)
1769  __acquires(&sk->sk_lock.slock)
1770 {
1771  DEFINE_WAIT(wait);
1772 
1773  for (;;) {
1774  prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1775  TASK_UNINTERRUPTIBLE);
1776  spin_unlock_bh(&sk->sk_lock.slock);
1777  schedule();
1778  spin_lock_bh(&sk->sk_lock.slock);
1779  if (!sock_owned_by_user(sk))
1780  break;
1781  }
1782  finish_wait(&sk->sk_lock.wq, &wait);
1783 }
1784 
1785 static void __release_sock(struct sock *sk)
1786  __releases(&sk->sk_lock.slock)
1787  __acquires(&sk->sk_lock.slock)
1788 {
1789  struct sk_buff *skb = sk->sk_backlog.head;
1790 
1791  do {
1792  sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1793  bh_unlock_sock(sk);
1794 
1795  do {
1796  struct sk_buff *next = skb->next;
1797 
1798  prefetch(next);
1799  WARN_ON_ONCE(skb_dst_is_noref(skb));
1800  skb->next = NULL;
1801  sk_backlog_rcv(sk, skb);
1802 
1803  /*
1804  * We are in process context here with softirqs
1805  * disabled, use cond_resched_softirq() to preempt.
1806  * This is safe to do because we've taken the backlog
1807  * queue private:
1808  */
1809  cond_resched_softirq();
1810 
1811  skb = next;
1812  } while (skb != NULL);
1813 
1814  bh_lock_sock(sk);
1815  } while ((skb = sk->sk_backlog.head) != NULL);
1816 
1817  /*
1818  * Doing the zeroing here guarantees we cannot loop forever
1819  * while a wild producer attempts to flood us.
1820  */
1821  sk->sk_backlog.len = 0;
1822 }
1823 
1834 int sk_wait_data(struct sock *sk, long *timeo)
1835 {
1836  int rc;
1837  DEFINE_WAIT(wait);
1838 
1839  prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1840  set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1841  rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1842  clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1843  finish_wait(sk_sleep(sk), &wait);
1844  return rc;
1845 }
1847 
1858 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1859 {
1860  struct proto *prot = sk->sk_prot;
1861  int amt = sk_mem_pages(size);
1862  long allocated;
1863  int parent_status = UNDER_LIMIT;
1864 
1865  sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1866 
1867  allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1868 
1869  /* Under limit. */
1870  if (parent_status == UNDER_LIMIT &&
1871  allocated <= sk_prot_mem_limits(sk, 0)) {
1872  sk_leave_memory_pressure(sk);
1873  return 1;
1874  }
1875 
1876  /* Under pressure. (we or our parents) */
1877  if ((parent_status > SOFT_LIMIT) ||
1878  allocated > sk_prot_mem_limits(sk, 1))
1879  sk_enter_memory_pressure(sk);
1880 
1881  /* Over hard limit (we or our parents) */
1882  if ((parent_status == OVER_LIMIT) ||
1883  (allocated > sk_prot_mem_limits(sk, 2)))
1884  goto suppress_allocation;
1885 
1886  /* guarantee minimum buffer size under pressure */
1887  if (kind == SK_MEM_RECV) {
1888  if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1889  return 1;
1890 
1891  } else { /* SK_MEM_SEND */
1892  if (sk->sk_type == SOCK_STREAM) {
1893  if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1894  return 1;
1895  } else if (atomic_read(&sk->sk_wmem_alloc) <
1896  prot->sysctl_wmem[0])
1897  return 1;
1898  }
1899 
1900  if (sk_has_memory_pressure(sk)) {
1901  int alloc;
1902 
1903  if (!sk_under_memory_pressure(sk))
1904  return 1;
1905  alloc = sk_sockets_allocated_read_positive(sk);
1906  if (sk_prot_mem_limits(sk, 2) > alloc *
1907  sk_mem_pages(sk->sk_wmem_queued +
1908  atomic_read(&sk->sk_rmem_alloc) +
1909  sk->sk_forward_alloc))
1910  return 1;
1911  }
1912 
1913 suppress_allocation:
1914 
1915  if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1916  sk_stream_moderate_sndbuf(sk);
1917 
1918  /* Fail only if socket is _under_ its sndbuf.
1919  * In this case we cannot block, so that we have to fail.
1920  */
1921  if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1922  return 1;
1923  }
1924 
1925  trace_sock_exceed_buf_limit(sk, prot, allocated);
1926 
1927  /* Alas. Undo changes. */
1928  sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1929 
1930  sk_memory_allocated_sub(sk, amt);
1931 
1932  return 0;
1933 }
1934 EXPORT_SYMBOL(__sk_mem_schedule);
1935 
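/* Editor's note (illustrative, not part of the original source): the
 * accounting above is done in SK_MEM_QUANTUM (one page) units. For
 * example, charging a 3000 byte skb on a 4 KB-page system means
 * sk_mem_pages(3000) == 1, so sk_forward_alloc grows by 4096 and the
 * protocol-wide total (sk_memory_allocated_add()) grows by one page;
 * whether that page is accepted then depends on which of the three
 * limits (sk_prot_mem_limits(sk, 0..2): min, pressure, max) the new
 * total falls under.
 */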
1940 void __sk_mem_reclaim(struct sock *sk)
1941 {
1942  sk_memory_allocated_sub(sk,
1943  sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
1944  sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1945 
1946  if (sk_under_memory_pressure(sk) &&
1947  (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
1948  sk_leave_memory_pressure(sk);
1949 }
1951 
1952 
1953 /*
1954  * Set of default routines for initialising struct proto_ops when
1955  * the protocol does not support a particular function. In certain
1956  * cases where it makes no sense for a protocol to have a "do nothing"
1957  * function, some default processing is provided.
1958  */
1959 
1960 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1961 {
1962  return -EOPNOTSUPP;
1963 }
1965 
1966 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1967  int len, int flags)
1968 {
1969  return -EOPNOTSUPP;
1970 }
1972 
1973 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1974 {
1975  return -EOPNOTSUPP;
1976 }
1978 
1979 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1980 {
1981  return -EOPNOTSUPP;
1982 }
1984 
1985 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1986  int *len, int peer)
1987 {
1988  return -EOPNOTSUPP;
1989 }
1991 
1992 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1993 {
1994  return 0;
1995 }
1997 
1998 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1999 {
2000  return -EOPNOTSUPP;
2001 }
2003 
2004 int sock_no_listen(struct socket *sock, int backlog)
2005 {
2006  return -EOPNOTSUPP;
2007 }
2009 
2010 int sock_no_shutdown(struct socket *sock, int how)
2011 {
2012  return -EOPNOTSUPP;
2013 }
2015 
2016 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2017  char __user *optval, unsigned int optlen)
2018 {
2019  return -EOPNOTSUPP;
2020 }
2022 
2023 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2024  char __user *optval, int __user *optlen)
2025 {
2026  return -EOPNOTSUPP;
2027 }
2029 
2030 int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2031  size_t len)
2032 {
2033  return -EOPNOTSUPP;
2034 }
2036 
2037 int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2038  size_t len, int flags)
2039 {
2040  return -EOPNOTSUPP;
2041 }
2043 
2044 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2045 {
2046  /* Mirror missing mmap method error code */
2047  return -ENODEV;
2048 }
2050 
2051 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2052 {
2053  ssize_t res;
2054  struct msghdr msg = {.msg_flags = flags};
2055  struct kvec iov;
2056  char *kaddr = kmap(page);
2057  iov.iov_base = kaddr + offset;
2058  iov.iov_len = size;
2059  res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2060  kunmap(page);
2061  return res;
2062 }
2064 
2065 /*
2066  * Default Socket Callbacks
2067  */
2068 
2069 static void sock_def_wakeup(struct sock *sk)
2070 {
2071  struct socket_wq *wq;
2072 
2073  rcu_read_lock();
2074  wq = rcu_dereference(sk->sk_wq);
2075  if (wq_has_sleeper(wq))
2076  wake_up_interruptible_all(&wq->wait);
2077  rcu_read_unlock();
2078 }
2079 
2080 static void sock_def_error_report(struct sock *sk)
2081 {
2082  struct socket_wq *wq;
2083 
2084  rcu_read_lock();
2085  wq = rcu_dereference(sk->sk_wq);
2086  if (wq_has_sleeper(wq))
2087  wake_up_interruptible_poll(&wq->wait, POLLERR);
2088  sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2089  rcu_read_unlock();
2090 }
2091 
2092 static void sock_def_readable(struct sock *sk, int len)
2093 {
2094  struct socket_wq *wq;
2095 
2096  rcu_read_lock();
2097  wq = rcu_dereference(sk->sk_wq);
2098  if (wq_has_sleeper(wq))
2099  wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2100  POLLRDNORM | POLLRDBAND);
2101  sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2102  rcu_read_unlock();
2103 }
2104 
2105 static void sock_def_write_space(struct sock *sk)
2106 {
2107  struct socket_wq *wq;
2108 
2109  rcu_read_lock();
2110 
2111  /* Do not wake up a writer until he can make "significant"
2112  * progress. --DaveM
2113  */
2114  if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2115  wq = rcu_dereference(sk->sk_wq);
2116  if (wq_has_sleeper(wq))
2117  wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2118  POLLWRNORM | POLLWRBAND);
2119 
2120  /* Should agree with poll, otherwise some programs break */
2121  if (sock_writeable(sk))
2122  sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2123  }
2124 
2125  rcu_read_unlock();
2126 }
2127 
2128 static void sock_def_destruct(struct sock *sk)
2129 {
2130  kfree(sk->sk_protinfo);
2131 }
2132 
2133 void sk_send_sigurg(struct sock *sk)
2134 {
2135  if (sk->sk_socket && sk->sk_socket->file)
2136  if (send_sigurg(&sk->sk_socket->file->f_owner))
2137  sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2138 }
2140 
2141 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2142  unsigned long expires)
2143 {
2144  if (!mod_timer(timer, expires))
2145  sock_hold(sk);
2146 }
2148 
2149 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2150 {
2151  if (timer_pending(timer) && del_timer(timer))
2152  __sock_put(sk);
2153 }
2155 
2156 void sock_init_data(struct socket *sock, struct sock *sk)
2157 {
2158  skb_queue_head_init(&sk->sk_receive_queue);
2159  skb_queue_head_init(&sk->sk_write_queue);
2160  skb_queue_head_init(&sk->sk_error_queue);
2161 #ifdef CONFIG_NET_DMA
2162  skb_queue_head_init(&sk->sk_async_wait_queue);
2163 #endif
2164 
2165  sk->sk_send_head = NULL;
2166 
2167  init_timer(&sk->sk_timer);
2168 
2169  sk->sk_allocation = GFP_KERNEL;
2170  sk->sk_rcvbuf = sysctl_rmem_default;
2171  sk->sk_sndbuf = sysctl_wmem_default;
2172  sk->sk_state = TCP_CLOSE;
2173  sk_set_socket(sk, sock);
2174 
2175  sock_set_flag(sk, SOCK_ZAPPED);
2176 
2177  if (sock) {
2178  sk->sk_type = sock->type;
2179  sk->sk_wq = sock->wq;
2180  sock->sk = sk;
2181  } else
2182  sk->sk_wq = NULL;
2183 
2184  spin_lock_init(&sk->sk_dst_lock);
2185  rwlock_init(&sk->sk_callback_lock);
2186  lockdep_set_class_and_name(&sk->sk_callback_lock,
2187  af_callback_keys + sk->sk_family,
2188  af_family_clock_key_strings[sk->sk_family]);
2189 
2190  sk->sk_state_change = sock_def_wakeup;
2191  sk->sk_data_ready = sock_def_readable;
2192  sk->sk_write_space = sock_def_write_space;
2193  sk->sk_error_report = sock_def_error_report;
2194  sk->sk_destruct = sock_def_destruct;
2195 
2196  sk->sk_frag.page = NULL;
2197  sk->sk_frag.offset = 0;
2198  sk->sk_peek_off = -1;
2199 
2200  sk->sk_peer_pid = NULL;
2201  sk->sk_peer_cred = NULL;
2202  sk->sk_write_pending = 0;
2203  sk->sk_rcvlowat = 1;
2204  sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
2205  sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
2206 
2207  sk->sk_stamp = ktime_set(-1L, 0);
2208 
2209  /*
2210  * Before updating sk_refcnt, we must commit prior changes to memory
2211  * (Documentation/RCU/rculist_nulls.txt for details)
2212  */
2213  smp_wmb();
2214  atomic_set(&sk->sk_refcnt, 1);
2215  atomic_set(&sk->sk_drops, 0);
2216 }
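/* [Editor's illustration, not part of sock.c] A hedged sketch of the usual
 * pattern around sock_init_data(): a protocol initializes the generic
 * defaults first and then overrides the callbacks it cares about.
 * my_proto_data_ready and my_proto_init_sock are hypothetical.
 */
static void my_proto_data_ready(struct sock *sk, int bytes)
{
	/* custom notification; the default behaviour is sock_def_readable() */
}

static void my_proto_init_sock(struct socket *sock, struct sock *sk)
{
	sock_init_data(sock, sk);	/* generic queues, timer, default callbacks */
	sk->sk_data_ready = my_proto_data_ready;
	sk->sk_protinfo   = NULL;	/* sock_def_destruct() will kfree() this */
}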
2218 
2219 void lock_sock_nested(struct sock *sk, int subclass)
2220 {
2221  might_sleep();
2222  spin_lock_bh(&sk->sk_lock.slock);
2223  if (sk->sk_lock.owned)
2224  __lock_sock(sk);
2225  sk->sk_lock.owned = 1;
2226  spin_unlock(&sk->sk_lock.slock);
2227  /*
2228  * The sk_lock has mutex_lock() semantics here:
2229  */
2230  mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2231  local_bh_enable();
2232 }
2234 
2235 void release_sock(struct sock *sk)
2236 {
2237  /*
2238  * The sk_lock has mutex_unlock() semantics:
2239  */
2240  mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2241 
2242  spin_lock_bh(&sk->sk_lock.slock);
2243  if (sk->sk_backlog.tail)
2244  __release_sock(sk);
2245 
2246  if (sk->sk_prot->release_cb)
2247  sk->sk_prot->release_cb(sk);
2248 
2249  sk->sk_lock.owned = 0;
2250  if (waitqueue_active(&sk->sk_lock.wq))
2251  wake_up(&sk->sk_lock.wq);
2252  spin_unlock_bh(&sk->sk_lock.slock);
2253 }
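/* [Editor's illustration, not part of sock.c] Hedged usage sketch:
 * process-context protocol code brackets socket state changes with
 * lock_sock()/release_sock(), giving sk_lock the mutex-like semantics
 * noted above; release_sock() also flushes the backlog queued by softirq
 * receivers. my_proto_shutdown is hypothetical.
 */
static int my_proto_shutdown(struct sock *sk, int how)
{
	int err = 0;

	lock_sock(sk);			/* may sleep */
	if (sk->sk_state == TCP_CLOSE)
		err = -ENOTCONN;
	else
		sk->sk_shutdown |= how;
	release_sock(sk);		/* processes backlog, wakes lock waiters */
	return err;
}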
2255 
2266 bool lock_sock_fast(struct sock *sk)
2267 {
2268  might_sleep();
2269  spin_lock_bh(&sk->sk_lock.slock);
2270 
2271  if (!sk->sk_lock.owned)
2272  /*
2273  * Note : fast path; keep BH disabled - we return with sk_lock.slock held
2274  */
2275  return false;
2276 
2277  __lock_sock(sk);
2278  sk->sk_lock.owned = 1;
2279  spin_unlock(&sk->sk_lock.slock);
2280  /*
2281  * The sk_lock has mutex_lock() semantics here:
2282  */
2283  mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2284  local_bh_enable();
2285  return true;
2286 }
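/* [Editor's illustration, not part of sock.c] Hedged usage sketch:
 * lock_sock_fast() is paired with unlock_sock_fast() (include/net/sock.h),
 * which needs the returned "slow" flag to pick the matching unlock path.
 * my_proto_flush_queue is hypothetical.
 */
static void my_proto_flush_queue(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);

	__skb_queue_purge(&sk->sk_receive_queue);	/* short, non-blocking work */
	unlock_sock_fast(sk, slow);
}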
2288 
2289 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2290 {
2291  struct timeval tv;
2292  if (!sock_flag(sk, SOCK_TIMESTAMP))
2293  sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2294  tv = ktime_to_timeval(sk->sk_stamp);
2295  if (tv.tv_sec == -1)
2296  return -ENOENT;
2297  if (tv.tv_sec == 0) {
2298  sk->sk_stamp = ktime_get_real();
2299  tv = ktime_to_timeval(sk->sk_stamp);
2300  }
2301  return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2302 }
2304 
2305 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2306 {
2307  struct timespec ts;
2308  if (!sock_flag(sk, SOCK_TIMESTAMP))
2309  sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2310  ts = ktime_to_timespec(sk->sk_stamp);
2311  if (ts.tv_sec == -1)
2312  return -ENOENT;
2313  if (ts.tv_sec == 0) {
2314  sk->sk_stamp = ktime_get_real();
2315  ts = ktime_to_timespec(sk->sk_stamp);
2316  }
2317  return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2318 }
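/* [Editor's illustration, not part of sock.c] The two helpers above back
 * the SIOCGSTAMP and SIOCGSTAMPNS ioctls. A hedged userspace sketch of the
 * corresponding call; last_rx_timestamp is a hypothetical helper.
 */
#include <sys/ioctl.h>
#include <sys/time.h>
#include <linux/sockios.h>

static int last_rx_timestamp(int fd, struct timeval *tv)
{
	/* fills *tv with the receive time of the last packet read on fd */
	return ioctl(fd, SIOCGSTAMP, tv);	/* 0 on success, -1 with errno set */
}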
2320 
2321 void sock_enable_timestamp(struct sock *sk, int flag)
2322 {
2323  if (!sock_flag(sk, flag)) {
2324  unsigned long previous_flags = sk->sk_flags;
2325 
2326  sock_set_flag(sk, flag);
2327  /*
2328  * we just set one of the two flags which require net
2329  * time stamping, but time stamping might have been on
2330  * already because of the other one
2331  */
2332  if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2333  net_enable_timestamp();
2334  }
2335 }
2336 
2337 /*
2338  * Get a socket option on a socket.
2339  *
2340  * FIX: POSIX 1003.1g is very ambiguous here. It states that
2341  * asynchronous errors should be reported by getsockopt. We assume
2342  * this means if you specify SO_ERROR (otherwise what's the point of it?).
2343  */
2344 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2345  char __user *optval, int __user *optlen)
2346 {
2347  struct sock *sk = sock->sk;
2348 
2349  return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2350 }
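/* [Editor's illustration, not part of sock.c] A hedged userspace sketch of
 * the SO_ERROR convention discussed in the comment above: the pending
 * asynchronous error is fetched (and cleared) with getsockopt().
 * pending_sock_error is a hypothetical helper.
 */
#include <sys/socket.h>

static int pending_sock_error(int fd)
{
	int err = 0;
	socklen_t len = sizeof(err);

	if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) < 0)
		return -1;	/* the getsockopt() call itself failed */
	return err;		/* 0 if no asynchronous error is pending */
}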
2352 
2353 #ifdef CONFIG_COMPAT
2354 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2355  char __user *optval, int __user *optlen)
2356 {
2357  struct sock *sk = sock->sk;
2358 
2359  if (sk->sk_prot->compat_getsockopt != NULL)
2360  return sk->sk_prot->compat_getsockopt(sk, level, optname,
2361  optval, optlen);
2362  return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2363 }
2365 #endif
2366 
2367 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2368  struct msghdr *msg, size_t size, int flags)
2369 {
2370  struct sock *sk = sock->sk;
2371  int addr_len = 0;
2372  int err;
2373 
2374  err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2375  flags & ~MSG_DONTWAIT, &addr_len);
2376  if (err >= 0)
2377  msg->msg_namelen = addr_len;
2378  return err;
2379 }
2381 
2382 /*
2383  * Set socket options on an inet socket.
2384  */
2385 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2386  char __user *optval, unsigned int optlen)
2387 {
2388  struct sock *sk = sock->sk;
2389 
2390  return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2391 }
2393 
2394 #ifdef CONFIG_COMPAT
2395 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2396  char __user *optval, unsigned int optlen)
2397 {
2398  struct sock *sk = sock->sk;
2399 
2400  if (sk->sk_prot->compat_setsockopt != NULL)
2401  return sk->sk_prot->compat_setsockopt(sk, level, optname,
2402  optval, optlen);
2403  return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2404 }
2406 #endif
2407 
2408 void sk_common_release(struct sock *sk)
2409 {
2410  if (sk->sk_prot->destroy)
2411  sk->sk_prot->destroy(sk);
2412 
2413  /*
2414  * Observation: when sk_common_release() is called, processes no
2415  * longer have access to the socket, but the network stack still does.
2416  * Step one, detach it from networking:
2417  *
2418  * A. Remove from hash tables.
2419  */
2420 
2421  sk->sk_prot->unhash(sk);
2422 
2423  /*
2424  * At this point the socket cannot receive new packets, but packets
2425  * may still be in flight: another CPU running the receiver may have
2426  * done its hash table lookup before we unhashed the socket. Such
2427  * packets will reach the receive queue and be purged by the destructor.
2428  *
2429  * We may also still have packets pending on the receive queue and,
2430  * probably, our own packets waiting in device queues. sock_destroy()
2431  * will drain the receive queue, but transmitted packets will delay
2432  * socket destruction until the last reference is released.
2433  */
2434 
2435  sock_orphan(sk);
2436 
2437  xfrm_sk_free_policy(sk);
2438 
2439  sk_refcnt_debug_release(sk);
2440 
2441  if (sk->sk_frag.page) {
2442  put_page(sk->sk_frag.page);
2443  sk->sk_frag.page = NULL;
2444  }
2445 
2446  sock_put(sk);
2447 }
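/* [Editor's illustration, not part of sock.c] Hedged sketch: simple
 * datagram-style protocols often implement their ->close() by deferring to
 * sk_common_release() for the unhash/orphan/put sequence described above.
 * my_proto_close is hypothetical.
 */
static void my_proto_close(struct sock *sk, long timeout)
{
	sk_common_release(sk);
}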
2449 
2450 #ifdef CONFIG_PROC_FS
2451 #define PROTO_INUSE_NR 64 /* should be enough for the first time */
2452 struct prot_inuse {
2453  int val[PROTO_INUSE_NR];
2454 };
2455 
2456 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2457 
2458 #ifdef CONFIG_NET_NS
2459 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2460 {
2461  __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2462 }
2463 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2464 
2465 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2466 {
2467  int cpu, idx = prot->inuse_idx;
2468  int res = 0;
2469 
2470  for_each_possible_cpu(cpu)
2471  res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2472 
2473  return res >= 0 ? res : 0;
2474 }
2475 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2476 
2477 static int __net_init sock_inuse_init_net(struct net *net)
2478 {
2479  net->core.inuse = alloc_percpu(struct prot_inuse);
2480  return net->core.inuse ? 0 : -ENOMEM;
2481 }
2482 
2483 static void __net_exit sock_inuse_exit_net(struct net *net)
2484 {
2485  free_percpu(net->core.inuse);
2486 }
2487 
2488 static struct pernet_operations net_inuse_ops = {
2489  .init = sock_inuse_init_net,
2490  .exit = sock_inuse_exit_net,
2491 };
2492 
2493 static __init int net_inuse_init(void)
2494 {
2495  if (register_pernet_subsys(&net_inuse_ops))
2496  panic("Cannot initialize net inuse counters");
2497 
2498  return 0;
2499 }
2500 
2501 core_initcall(net_inuse_init);
2502 #else
2503 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2504 
2505 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2506 {
2507  __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2508 }
2509 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2510 
2511 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2512 {
2513  int cpu, idx = prot->inuse_idx;
2514  int res = 0;
2515 
2516  for_each_possible_cpu(cpu)
2517  res += per_cpu(prot_inuse, cpu).val[idx];
2518 
2519  return res >= 0 ? res : 0;
2520 }
2521 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2522 #endif
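/* [Editor's illustration, not part of sock.c] Hedged usage sketch: the
 * per-cpu "inuse" counters above are meant to be updated from a protocol's
 * hash/unhash paths, roughly as follows (my_proto_hash and my_proto_unhash
 * are hypothetical).
 */
static void my_proto_hash(struct sock *sk)
{
	/* ... insert sk into the protocol's lookup structures ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
}

static void my_proto_unhash(struct sock *sk)
{
	/* ... remove sk from the lookup structures ... */
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}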
2523 
2524 static void assign_proto_idx(struct proto *prot)
2525 {
2526  prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2527 
2528  if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2529  pr_err("PROTO_INUSE_NR exhausted\n");
2530  return;
2531  }
2532 
2533  set_bit(prot->inuse_idx, proto_inuse_idx);
2534 }
2535 
2536 static void release_proto_idx(struct proto *prot)
2537 {
2538  if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2539  clear_bit(prot->inuse_idx, proto_inuse_idx);
2540 }
2541 #else
2542 static inline void assign_proto_idx(struct proto *prot)
2543 {
2544 }
2545 
2546 static inline void release_proto_idx(struct proto *prot)
2547 {
2548 }
2549 #endif
2550 
2551 int proto_register(struct proto *prot, int alloc_slab)
2552 {
2553  if (alloc_slab) {
2554  prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2555  SLAB_HWCACHE_ALIGN | prot->slab_flags,
2556  NULL);
2557 
2558  if (prot->slab == NULL) {
2559  pr_crit("%s: Can't create sock SLAB cache!\n",
2560  prot->name);
2561  goto out;
2562  }
2563 
2564  if (prot->rsk_prot != NULL) {
2565  prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2566  if (prot->rsk_prot->slab_name == NULL)
2567  goto out_free_sock_slab;
2568 
2569  prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2570  prot->rsk_prot->obj_size, 0,
2571  SLAB_HWCACHE_ALIGN, NULL);
2572 
2573  if (prot->rsk_prot->slab == NULL) {
2574  pr_crit("%s: Can't create request sock SLAB cache!\n",
2575  prot->name);
2576  goto out_free_request_sock_slab_name;
2577  }
2578  }
2579 
2580  if (prot->twsk_prot != NULL) {
2581  prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2582 
2583  if (prot->twsk_prot->twsk_slab_name == NULL)
2584  goto out_free_request_sock_slab;
2585 
2586  prot->twsk_prot->twsk_slab =
2587  kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2588  prot->twsk_prot->twsk_obj_size,
2589  0,
2590  SLAB_HWCACHE_ALIGN |
2591  prot->slab_flags,
2592  NULL);
2593  if (prot->twsk_prot->twsk_slab == NULL)
2594  goto out_free_timewait_sock_slab_name;
2595  }
2596  }
2597 
2598  mutex_lock(&proto_list_mutex);
2599  list_add(&prot->node, &proto_list);
2600  assign_proto_idx(prot);
2601  mutex_unlock(&proto_list_mutex);
2602  return 0;
2603 
2604 out_free_timewait_sock_slab_name:
2605  kfree(prot->twsk_prot->twsk_slab_name);
2606 out_free_request_sock_slab:
2607  if (prot->rsk_prot && prot->rsk_prot->slab) {
2608  kmem_cache_destroy(prot->rsk_prot->slab);
2609  prot->rsk_prot->slab = NULL;
2610  }
2611 out_free_request_sock_slab_name:
2612  if (prot->rsk_prot)
2613  kfree(prot->rsk_prot->slab_name);
2614 out_free_sock_slab:
2615  kmem_cache_destroy(prot->slab);
2616  prot->slab = NULL;
2617 out:
2618  return -ENOBUFS;
2619 }
2621 
2622 void proto_unregister(struct proto *prot)
2623 {
2624  mutex_lock(&proto_list_mutex);
2625  release_proto_idx(prot);
2626  list_del(&prot->node);
2627  mutex_unlock(&proto_list_mutex);
2628 
2629  if (prot->slab != NULL) {
2630  kmem_cache_destroy(prot->slab);
2631  prot->slab = NULL;
2632  }
2633 
2634  if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2635  kmem_cache_destroy(prot->rsk_prot->slab);
2636  kfree(prot->rsk_prot->slab_name);
2637  prot->rsk_prot->slab = NULL;
2638  }
2639 
2640  if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2641  kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2642  kfree(prot->twsk_prot->twsk_slab_name);
2643  prot->twsk_prot->twsk_slab = NULL;
2644  }
2645 }
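/* [Editor's illustration, not part of sock.c] Hedged sketch of a module
 * registering and unregistering its struct proto with the functions above;
 * struct my_sock and the my_proto_* names are hypothetical, and a real
 * protocol would also fill in its operation callbacks.
 */
struct my_sock {
	struct sock sk;		/* must come first */
	/* protocol-private state would follow */
};

static struct proto my_proto = {
	.name		= "MYPROTO",
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct my_sock),
};

static int __init my_proto_module_init(void)
{
	return proto_register(&my_proto, 1);	/* 1 = create a slab cache */
}

static void __exit my_proto_module_exit(void)
{
	proto_unregister(&my_proto);
}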
2647 
2648 #ifdef CONFIG_PROC_FS
2649 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2650  __acquires(proto_list_mutex)
2651 {
2652  mutex_lock(&proto_list_mutex);
2653  return seq_list_start_head(&proto_list, *pos);
2654 }
2655 
2656 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2657 {
2658  return seq_list_next(v, &proto_list, pos);
2659 }
2660 
2661 static void proto_seq_stop(struct seq_file *seq, void *v)
2662  __releases(proto_list_mutex)
2663 {
2664  mutex_unlock(&proto_list_mutex);
2665 }
2666 
2667 static char proto_method_implemented(const void *method)
2668 {
2669  return method == NULL ? 'n' : 'y';
2670 }
2671 static long sock_prot_memory_allocated(struct proto *proto)
2672 {
2673  return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2674 }
2675 
2676 static char *sock_prot_memory_pressure(struct proto *proto)
2677 {
2678  return proto->memory_pressure != NULL ?
2679  proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2680 }
2681 
2682 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2683 {
2684 
2685  seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
2686  "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2687  proto->name,
2688  proto->obj_size,
2689  sock_prot_inuse_get(seq_file_net(seq), proto),
2690  sock_prot_memory_allocated(proto),
2691  sock_prot_memory_pressure(proto),
2692  proto->max_header,
2693  proto->slab == NULL ? "no" : "yes",
2694  module_name(proto->owner),
2695  proto_method_implemented(proto->close),
2696  proto_method_implemented(proto->connect),
2697  proto_method_implemented(proto->disconnect),
2698  proto_method_implemented(proto->accept),
2699  proto_method_implemented(proto->ioctl),
2700  proto_method_implemented(proto->init),
2701  proto_method_implemented(proto->destroy),
2702  proto_method_implemented(proto->shutdown),
2703  proto_method_implemented(proto->setsockopt),
2704  proto_method_implemented(proto->getsockopt),
2705  proto_method_implemented(proto->sendmsg),
2706  proto_method_implemented(proto->recvmsg),
2707  proto_method_implemented(proto->sendpage),
2708  proto_method_implemented(proto->bind),
2709  proto_method_implemented(proto->backlog_rcv),
2710  proto_method_implemented(proto->hash),
2711  proto_method_implemented(proto->unhash),
2712  proto_method_implemented(proto->get_port),
2713  proto_method_implemented(proto->enter_memory_pressure));
2714 }
2715 
2716 static int proto_seq_show(struct seq_file *seq, void *v)
2717 {
2718  if (v == &proto_list)
2719  seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2720  "protocol",
2721  "size",
2722  "sockets",
2723  "memory",
2724  "press",
2725  "maxhdr",
2726  "slab",
2727  "module",
2728  "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2729  else
2730  proto_seq_printf(seq, list_entry(v, struct proto, node));
2731  return 0;
2732 }
2733 
2734 static const struct seq_operations proto_seq_ops = {
2735  .start = proto_seq_start,
2736  .next = proto_seq_next,
2737  .stop = proto_seq_stop,
2738  .show = proto_seq_show,
2739 };
2740 
2741 static int proto_seq_open(struct inode *inode, struct file *file)
2742 {
2743  return seq_open_net(inode, file, &proto_seq_ops,
2744  sizeof(struct seq_net_private));
2745 }
2746 
2747 static const struct file_operations proto_seq_fops = {
2748  .owner = THIS_MODULE,
2749  .open = proto_seq_open,
2750  .read = seq_read,
2751  .llseek = seq_lseek,
2752  .release = seq_release_net,
2753 };
2754 
2755 static __net_init int proto_init_net(struct net *net)
2756 {
2757  if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2758  return -ENOMEM;
2759 
2760  return 0;
2761 }
2762 
2763 static __net_exit void proto_exit_net(struct net *net)
2764 {
2765  proc_net_remove(net, "protocols");
2766 }
2767 
2768 
2769 static __net_initdata struct pernet_operations proto_net_ops = {
2770  .init = proto_init_net,
2771  .exit = proto_exit_net,
2772 };
2773 
2774 static int __init proto_init(void)
2775 {
2776  return register_pernet_subsys(&proto_net_ops);
2777 }
2778 
2779 subsys_initcall(proto_init);
2780 
2781 #endif /* PROC_FS */