Linux Kernel  3.7.1
tcp_ipv4.c
1 /*
2  * INET An implementation of the TCP/IP protocol suite for the LINUX
3  * operating system. INET is implemented using the BSD Socket
4  * interface as the means of communication with the user level.
5  *
6  * Implementation of the Transmission Control Protocol(TCP).
7  *
8  * IPv4 specific functions
9  *
10  *
11  * code split from:
12  * linux/ipv4/tcp.c
13  * linux/ipv4/tcp_input.c
14  * linux/ipv4/tcp_output.c
15  *
16  * See tcp.c for author information
17  *
18  * This program is free software; you can redistribute it and/or
19  * modify it under the terms of the GNU General Public License
20  * as published by the Free Software Foundation; either version
21  * 2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  * David S. Miller : New socket lookup architecture.
27  * This code is dedicated to John Dyson.
28  * David S. Miller : Change semantics of established hash,
29  * half is devoted to TIME_WAIT sockets
30  * and the rest go in the other half.
31  * Andi Kleen : Add support for syncookies and fixed
32  * some bugs: ip options weren't passed to
33  * the TCP layer, missed a check for an
34  * ACK bit.
35  * Andi Kleen : Implemented fast path mtu discovery.
36  * Fixed many serious bugs in the
37  * request_sock handling and moved
38  * most of it into the af independent code.
39  * Added tail drop and some other bugfixes.
40  * Added new listen semantics.
41  * Mike McLagan : Routing by source
42  * Juan Jose Ciarlante: ip_dynaddr bits
43  * Andi Kleen: various fixes.
44  * Vitaly E. Lavrov : Transparent proxy revived after year
45  * coma.
46  * Andi Kleen : Fix new listen.
47  * Andi Kleen : Fix accept error reporting.
48  * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
49  * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
50  * a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/netdma.h>
76 #include <net/secure_seq.h>
77 #include <net/tcp_memcontrol.h>
78 
79 #include <linux/inet.h>
80 #include <linux/ipv6.h>
81 #include <linux/stddef.h>
82 #include <linux/proc_fs.h>
83 #include <linux/seq_file.h>
84 
85 #include <linux/crypto.h>
86 #include <linux/scatterlist.h>
87 
88 int sysctl_tcp_tw_reuse __read_mostly;
89 int sysctl_tcp_low_latency __read_mostly;
90 EXPORT_SYMBOL(sysctl_tcp_low_latency);
91 
92 
93 #ifdef CONFIG_TCP_MD5SIG
94 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
95  __be32 daddr, __be32 saddr, const struct tcphdr *th);
96 #endif
97 
98 struct inet_hashinfo tcp_hashinfo;
99 EXPORT_SYMBOL(tcp_hashinfo);
100 
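/*
 * tcp_v4_init_sequence() picks the initial sequence number for an outgoing
 * SYN from the connection 4-tuple (saddr, daddr, sport, dport) via
 * secure_tcp_sequence_number(), keeping ISNs hard to predict.
 */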
101 static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
102 {
103  return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
104  ip_hdr(skb)->saddr,
105  tcp_hdr(skb)->dest,
106  tcp_hdr(skb)->source);
107 }
108 
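/*
 * tcp_twsk_unique() decides whether a new connection may take over the
 * address/port pair still held by a TIME-WAIT socket.  It returns 1 when
 * the timestamp state makes reuse safe, after seeding write_seq and the
 * ts_recent options from the TIME-WAIT bucket; otherwise it returns 0.
 */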
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111  const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
112  struct tcp_sock *tp = tcp_sk(sk);
113 
114  /* With PAWS, it is safe from the viewpoint
115  of data integrity. Even without PAWS it is safe provided sequence
116  spaces do not overlap i.e. at data rates <= 80Mbit/sec.
117 
118  Actually, the idea is close to VJ's one, only timestamp cache is
119  held not per host, but per port pair and TW bucket is used as state
120  holder.
121 
122  If TW bucket has been already destroyed we fall back to VJ's scheme
123  and use initial timestamp retrieved from peer table.
124  */
125  if (tcptw->tw_ts_recent_stamp &&
126  (twp == NULL || (sysctl_tcp_tw_reuse &&
127  get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
128  tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
129  if (tp->write_seq == 0)
130  tp->write_seq = 1;
131  tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
132  tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
133  sock_hold(sktw);
134  return 1;
135  }
136 
137  return 0;
138 }
139 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
140 
141 static int tcp_repair_connect(struct sock *sk)
142 {
143  tcp_connect_init(sk);
144  tcp_finish_connect(sk, NULL);
145 
146  return 0;
147 }
148 
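/*
 * Outline of the connect path below: resolve a route to the destination,
 * pick the source address and port, hash the socket into the established
 * table via inet_hash_connect(), choose write_seq, and finally emit the
 * SYN with tcp_connect().
 */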
149 /* This will initiate an outgoing connection. */
150 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
151 {
152  struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
153  struct inet_sock *inet = inet_sk(sk);
154  struct tcp_sock *tp = tcp_sk(sk);
155  __be16 orig_sport, orig_dport;
156  __be32 daddr, nexthop;
157  struct flowi4 *fl4;
158  struct rtable *rt;
159  int err;
160  struct ip_options_rcu *inet_opt;
161 
162  if (addr_len < sizeof(struct sockaddr_in))
163  return -EINVAL;
164 
165  if (usin->sin_family != AF_INET)
166  return -EAFNOSUPPORT;
167 
168  nexthop = daddr = usin->sin_addr.s_addr;
169  inet_opt = rcu_dereference_protected(inet->inet_opt,
170  sock_owned_by_user(sk));
171  if (inet_opt && inet_opt->opt.srr) {
172  if (!daddr)
173  return -EINVAL;
174  nexthop = inet_opt->opt.faddr;
175  }
176 
177  orig_sport = inet->inet_sport;
178  orig_dport = usin->sin_port;
179  fl4 = &inet->cork.fl.u.ip4;
180  rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
181  RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
182  IPPROTO_TCP,
183  orig_sport, orig_dport, sk, true);
184  if (IS_ERR(rt)) {
185  err = PTR_ERR(rt);
186  if (err == -ENETUNREACH)
187  IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
188  return err;
189  }
190 
191  if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
192  ip_rt_put(rt);
193  return -ENETUNREACH;
194  }
195 
196  if (!inet_opt || !inet_opt->opt.srr)
197  daddr = fl4->daddr;
198 
199  if (!inet->inet_saddr)
200  inet->inet_saddr = fl4->saddr;
201  inet->inet_rcv_saddr = inet->inet_saddr;
202 
203  if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
204  /* Reset inherited state */
205  tp->rx_opt.ts_recent = 0;
206  tp->rx_opt.ts_recent_stamp = 0;
207  if (likely(!tp->repair))
208  tp->write_seq = 0;
209  }
210 
211  if (tcp_death_row.sysctl_tw_recycle &&
212  !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
213  tcp_fetch_timewait_stamp(sk, &rt->dst);
214 
215  inet->inet_dport = usin->sin_port;
216  inet->inet_daddr = daddr;
217 
218  inet_csk(sk)->icsk_ext_hdr_len = 0;
219  if (inet_opt)
220  inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
221 
222  tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
223 
224  /* Socket identity is still unknown (sport may be zero).
225  * However we set state to SYN-SENT and not releasing socket
226  * lock select source port, enter ourselves into the hash tables and
227  * complete initialization after this.
228  */
229  tcp_set_state(sk, TCP_SYN_SENT);
230  err = inet_hash_connect(&tcp_death_row, sk);
231  if (err)
232  goto failure;
233 
234  rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
235  inet->inet_sport, inet->inet_dport, sk);
236  if (IS_ERR(rt)) {
237  err = PTR_ERR(rt);
238  rt = NULL;
239  goto failure;
240  }
241  /* OK, now commit destination to socket. */
242  sk->sk_gso_type = SKB_GSO_TCPV4;
243  sk_setup_caps(sk, &rt->dst);
244 
245  if (!tp->write_seq && likely(!tp->repair))
246  tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
247  inet->inet_daddr,
248  inet->inet_sport,
249  usin->sin_port);
250 
251  inet->inet_id = tp->write_seq ^ jiffies;
252 
253  if (likely(!tp->repair))
254  err = tcp_connect(sk);
255  else
256  err = tcp_repair_connect(sk);
257 
258  rt = NULL;
259  if (err)
260  goto failure;
261 
262  return 0;
263 
264 failure:
265  /*
266  * This unhashes the socket and releases the local port,
267  * if necessary.
268  */
269  tcp_set_state(sk, TCP_CLOSE);
270  ip_rt_put(rt);
271  sk->sk_route_caps = 0;
272  inet->inet_dport = 0;
273  return err;
274 }
275 EXPORT_SYMBOL(tcp_v4_connect);
276 
277 /*
278  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
279  * It can be called through tcp_release_cb() if socket was owned by user
280  * at the time tcp_v4_err() was called to handle ICMP message.
281  */
282 static void tcp_v4_mtu_reduced(struct sock *sk)
283 {
284  struct dst_entry *dst;
285  struct inet_sock *inet = inet_sk(sk);
286  u32 mtu = tcp_sk(sk)->mtu_info;
287 
288  /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
289  * send out by Linux are always <576bytes so they should go through
290  * unfragmented).
291  */
292  if (sk->sk_state == TCP_LISTEN)
293  return;
294 
295  dst = inet_csk_update_pmtu(sk, mtu);
296  if (!dst)
297  return;
298 
299  /* Something is about to be wrong... Remember soft error
300  * for the case, if this connection will not be able to recover.
301  */
302  if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
303  sk->sk_err_soft = EMSGSIZE;
304 
305  mtu = dst_mtu(dst);
306 
307  if (inet->pmtudisc != IP_PMTUDISC_DONT &&
308  inet_csk(sk)->icsk_pmtu_cookie > mtu) {
309  tcp_sync_mss(sk, mtu);
310 
311  /* Resend the TCP packet because it's
312  * clear that the old packet has been
313  * dropped. This is the new "fast" path mtu
314  * discovery.
315  */
316  tcp_simple_retransmit(sk);
317  } /* else let the usual retransmit timer handle it */
318 }
319 
320 static void do_redirect(struct sk_buff *skb, struct sock *sk)
321 {
322  struct dst_entry *dst = __sk_dst_check(sk, 0);
323 
324  if (dst)
325  dst->ops->redirect(dst, sk, skb);
326 }
327 
328 /*
329  * This routine is called by the ICMP module when it gets some
330  * sort of error condition. If err < 0 then the socket should
331  * be closed and the error returned to the user. If err > 0
332  * it's just the icmp type << 8 | icmp code. After adjustment
333  * header points to the first 8 bytes of the tcp header. We need
334  * to find the appropriate port.
335  *
336  * The locking strategy used here is very "optimistic". When
337  * someone else accesses the socket the ICMP is just dropped
338  * and for some paths there is no check at all.
339  * A more general error queue to queue errors for later handling
340  * is probably better.
341  *
342  */
343 
344 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
345 {
346  const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
347  struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
348  struct inet_connection_sock *icsk;
349  struct tcp_sock *tp;
350  struct inet_sock *inet;
351  const int type = icmp_hdr(icmp_skb)->type;
352  const int code = icmp_hdr(icmp_skb)->code;
353  struct sock *sk;
354  struct sk_buff *skb;
355  struct request_sock *req;
356  __u32 seq;
357  __u32 remaining;
358  int err;
359  struct net *net = dev_net(icmp_skb->dev);
360 
361  if (icmp_skb->len < (iph->ihl << 2) + 8) {
362  ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
363  return;
364  }
365 
366  sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
367  iph->saddr, th->source, inet_iif(icmp_skb));
368  if (!sk) {
369  ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
370  return;
371  }
372  if (sk->sk_state == TCP_TIME_WAIT) {
373  inet_twsk_put(inet_twsk(sk));
374  return;
375  }
376 
377  bh_lock_sock(sk);
378  /* If too many ICMPs get dropped on busy
379  * servers this needs to be solved differently.
380  * We do take care of PMTU discovery (RFC1191) special case :
381  * we can receive locally generated ICMP messages while socket is held.
382  */
383  if (sock_owned_by_user(sk) &&
384  type != ICMP_DEST_UNREACH &&
385  code != ICMP_FRAG_NEEDED)
386  NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
387 
388  if (sk->sk_state == TCP_CLOSE)
389  goto out;
390 
391  if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
392  NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
393  goto out;
394  }
395 
396  icsk = inet_csk(sk);
397  tp = tcp_sk(sk);
398  req = tp->fastopen_rsk;
399  seq = ntohl(th->seq);
400  if (sk->sk_state != TCP_LISTEN &&
401  !between(seq, tp->snd_una, tp->snd_nxt) &&
402  (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
403  /* For a Fast Open socket, allow seq to be snt_isn. */
404  NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
405  goto out;
406  }
407 
408  switch (type) {
409  case ICMP_REDIRECT:
410  do_redirect(icmp_skb, sk);
411  goto out;
412  case ICMP_SOURCE_QUENCH:
413  /* Just silently ignore these. */
414  goto out;
415  case ICMP_PARAMETERPROB:
416  err = EPROTO;
417  break;
418  case ICMP_DEST_UNREACH:
419  if (code > NR_ICMP_UNREACH)
420  goto out;
421 
422  if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
423  tp->mtu_info = info;
424  if (!sock_owned_by_user(sk)) {
425  tcp_v4_mtu_reduced(sk);
426  } else {
427  if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
428  sock_hold(sk);
429  }
430  goto out;
431  }
432 
433  err = icmp_err_convert[code].errno;
434  /* check if icmp_skb allows revert of backoff
435  * (see draft-zimmermann-tcp-lcd) */
436  if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
437  break;
438  if (seq != tp->snd_una || !icsk->icsk_retransmits ||
439  !icsk->icsk_backoff)
440  break;
441 
442  /* XXX (TFO) - revisit the following logic for TFO */
443 
444  if (sock_owned_by_user(sk))
445  break;
446 
447  icsk->icsk_backoff--;
448  inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
449  TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
450  tcp_bound_rto(sk);
451 
452  skb = tcp_write_queue_head(sk);
453  BUG_ON(!skb);
454 
455  remaining = icsk->icsk_rto - min(icsk->icsk_rto,
456  tcp_time_stamp - TCP_SKB_CB(skb)->when);
457 
458  if (remaining) {
459  inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
460  remaining, TCP_RTO_MAX);
461  } else {
462  /* RTO revert clocked out retransmission.
463  * Will retransmit now */
464  tcp_retransmit_timer(sk);
465  }
466 
467  break;
468  case ICMP_TIME_EXCEEDED:
469  err = EHOSTUNREACH;
470  break;
471  default:
472  goto out;
473  }
474 
475  /* XXX (TFO) - if it's a TFO socket and has been accepted, rather
476  * than following the TCP_SYN_RECV case and closing the socket,
477  * we ignore the ICMP error and keep trying like a fully established
478  * socket. Is this the right thing to do?
479  */
480  if (req && req->sk == NULL)
481  goto out;
482 
483  switch (sk->sk_state) {
484  struct request_sock *req, **prev;
485  case TCP_LISTEN:
486  if (sock_owned_by_user(sk))
487  goto out;
488 
489  req = inet_csk_search_req(sk, &prev, th->dest,
490  iph->daddr, iph->saddr);
491  if (!req)
492  goto out;
493 
494  /* ICMPs are not backlogged, hence we cannot get
495  an established socket here.
496  */
497  WARN_ON(req->sk);
498 
499  if (seq != tcp_rsk(req)->snt_isn) {
500  NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
501  goto out;
502  }
503 
504  /*
505  * Still in SYN_RECV, just remove it silently.
506  * There is no good way to pass the error to the newly
507  * created socket, and POSIX does not want network
508  * errors returned from accept().
509  */
510  inet_csk_reqsk_queue_drop(sk, req, prev);
511  goto out;
512 
513  case TCP_SYN_SENT:
514  case TCP_SYN_RECV: /* Cannot happen.
515  It can f.e. if SYNs crossed,
516  or Fast Open.
517  */
518  if (!sock_owned_by_user(sk)) {
519  sk->sk_err = err;
520 
521  sk->sk_error_report(sk);
522 
523  tcp_done(sk);
524  } else {
525  sk->sk_err_soft = err;
526  }
527  goto out;
528  }
529 
530  /* If we've already connected we will keep trying
531  * until we time out, or the user gives up.
532  *
533  * rfc1122 4.2.3.9 allows to consider as hard errors
534  * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
535  * but it is obsoleted by pmtu discovery).
536  *
537  * Note, that in modern internet, where routing is unreliable
538  * and in each dark corner broken firewalls sit, sending random
539  * errors ordered by their masters even this two messages finally lose
540  * their original sense (even Linux sends invalid PORT_UNREACHs)
541  *
542  * Now we are in compliance with RFCs.
543  * --ANK (980905)
544  */
545 
546  inet = inet_sk(sk);
547  if (!sock_owned_by_user(sk) && inet->recverr) {
548  sk->sk_err = err;
549  sk->sk_error_report(sk);
550  } else { /* Only an error on timeout */
551  sk->sk_err_soft = err;
552  }
553 
554 out:
555  bh_unlock_sock(sk);
556  sock_put(sk);
557 }
558 
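/*
 * Checksum helpers: when the device offers checksum offload the skb is
 * marked CHECKSUM_PARTIAL and only the pseudo-header sum is filled in
 * here; otherwise the full TCP checksum is computed in software.
 */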
559 static void __tcp_v4_send_check(struct sk_buff *skb,
560  __be32 saddr, __be32 daddr)
561 {
562  struct tcphdr *th = tcp_hdr(skb);
563 
564  if (skb->ip_summed == CHECKSUM_PARTIAL) {
565  th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
566  skb->csum_start = skb_transport_header(skb) - skb->head;
567  skb->csum_offset = offsetof(struct tcphdr, check);
568  } else {
569  th->check = tcp_v4_check(skb->len, saddr, daddr,
570  csum_partial(th,
571  th->doff << 2,
572  skb->csum));
573  }
574 }
575 
576 /* This routine computes an IPv4 TCP checksum. */
577 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
578 {
579  const struct inet_sock *inet = inet_sk(sk);
580 
581  __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
582 }
583 EXPORT_SYMBOL(tcp_v4_send_check);
584 
585 int tcp_v4_gso_send_check(struct sk_buff *skb)
586 {
587  const struct iphdr *iph;
588  struct tcphdr *th;
589 
590  if (!pskb_may_pull(skb, sizeof(*th)))
591  return -EINVAL;
592 
593  iph = ip_hdr(skb);
594  th = tcp_hdr(skb);
595 
596  th->check = 0;
597  skb->ip_summed = CHECKSUM_PARTIAL;
598  __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
599  return 0;
600 }
601 
602 /*
603  * This routine will send an RST to the other tcp.
604  *
605  * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
606  * for reset.
607  * Answer: if a packet caused RST, it is not for a socket
608  * existing in our system, if it is matched to a socket,
609  * it is just duplicate segment or bug in other side's TCP.
610  * So that we build reply only basing on parameters
611  * arrived with segment.
612  * Exception: precedence violation. We do not implement it in any case.
613  */
614 
615 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
616 {
617  const struct tcphdr *th = tcp_hdr(skb);
618  struct {
619  struct tcphdr th;
620 #ifdef CONFIG_TCP_MD5SIG
621  __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
622 #endif
623  } rep;
624  struct ip_reply_arg arg;
625 #ifdef CONFIG_TCP_MD5SIG
626  struct tcp_md5sig_key *key;
627  const __u8 *hash_location = NULL;
628  unsigned char newhash[16];
629  int genhash;
630  struct sock *sk1 = NULL;
631 #endif
632  struct net *net;
633 
634  /* Never send a reset in response to a reset. */
635  if (th->rst)
636  return;
637 
638  if (skb_rtable(skb)->rt_type != RTN_LOCAL)
639  return;
640 
641  /* Swap the send and the receive. */
642  memset(&rep, 0, sizeof(rep));
643  rep.th.dest = th->source;
644  rep.th.source = th->dest;
645  rep.th.doff = sizeof(struct tcphdr) / 4;
646  rep.th.rst = 1;
647 
648  if (th->ack) {
649  rep.th.seq = th->ack_seq;
650  } else {
651  rep.th.ack = 1;
652  rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
653  skb->len - (th->doff << 2));
654  }
655 
656  memset(&arg, 0, sizeof(arg));
657  arg.iov[0].iov_base = (unsigned char *)&rep;
658  arg.iov[0].iov_len = sizeof(rep.th);
659 
660 #ifdef CONFIG_TCP_MD5SIG
661  hash_location = tcp_parse_md5sig_option(th);
662  if (!sk && hash_location) {
663  /*
664  * active side is lost. Try to find listening socket through
665  * source port, and then find md5 key through listening socket.
666  * we are not losing security here:
667  * Incoming packet is checked with md5 hash with finding key,
668  * no RST generated if md5 hash doesn't match.
669  */
670  sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
671  &tcp_hashinfo, ip_hdr(skb)->daddr,
672  ntohs(th->source), inet_iif(skb));
673  /* don't send rst if it can't find key */
674  if (!sk1)
675  return;
676  rcu_read_lock();
677  key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
678  &ip_hdr(skb)->saddr, AF_INET);
679  if (!key)
680  goto release_sk1;
681 
682  genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
683  if (genhash || memcmp(hash_location, newhash, 16) != 0)
684  goto release_sk1;
685  } else {
686  key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
687  &ip_hdr(skb)->saddr,
688  AF_INET) : NULL;
689  }
690 
691  if (key) {
692  rep.opt[0] = htonl((TCPOPT_NOP << 24) |
693  (TCPOPT_NOP << 16) |
694  (TCPOPT_MD5SIG << 8) |
695  TCPOLEN_MD5SIG);
696  /* Update length and the length the header thinks exists */
697  arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
698  rep.th.doff = arg.iov[0].iov_len / 4;
699 
700  tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
701  key, ip_hdr(skb)->saddr,
702  ip_hdr(skb)->daddr, &rep.th);
703  }
704 #endif
705  arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
706  ip_hdr(skb)->saddr, /* XXX */
707  arg.iov[0].iov_len, IPPROTO_TCP, 0);
708  arg.csumoffset = offsetof(struct tcphdr, check) / 2;
709  arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
710  /* When socket is gone, all binding information is lost.
711  * routing might fail in this case. No choice here, if we choose to force
712  * input interface, we will misroute in case of asymmetric route.
713  */
714  if (sk)
715  arg.bound_dev_if = sk->sk_bound_dev_if;
716 
717  net = dev_net(skb_dst(skb)->dev);
718  arg.tos = ip_hdr(skb)->tos;
719  ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
720  ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
721 
722  TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
723  TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
724 
725 #ifdef CONFIG_TCP_MD5SIG
726 release_sk1:
727  if (sk1) {
728  rcu_read_unlock();
729  sock_put(sk1);
730  }
731 #endif
732 }
733 
734 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
735  outside socket context is ugly, certainly. What can I do?
736  */
737 
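/*
 * tcp_v4_send_ack() builds a bare ACK (optionally carrying TCP timestamp
 * and MD5 signature options) on the stack and transmits it with
 * ip_send_unicast_reply(), without needing a full socket.
 */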
738 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
739  u32 win, u32 ts, int oif,
740  struct tcp_md5sig_key *key,
741  int reply_flags, u8 tos)
742 {
743  const struct tcphdr *th = tcp_hdr(skb);
744  struct {
745  struct tcphdr th;
746  __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
747 #ifdef CONFIG_TCP_MD5SIG
748  + (TCPOLEN_MD5SIG_ALIGNED >> 2)
749 #endif
750  ];
751  } rep;
752  struct ip_reply_arg arg;
753  struct net *net = dev_net(skb_dst(skb)->dev);
754 
755  memset(&rep.th, 0, sizeof(struct tcphdr));
756  memset(&arg, 0, sizeof(arg));
757 
758  arg.iov[0].iov_base = (unsigned char *)&rep;
759  arg.iov[0].iov_len = sizeof(rep.th);
760  if (ts) {
761  rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
762  (TCPOPT_TIMESTAMP << 8) |
763  TCPOLEN_TIMESTAMP);
764  rep.opt[1] = htonl(tcp_time_stamp);
765  rep.opt[2] = htonl(ts);
766  arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
767  }
768 
769  /* Swap the send and the receive. */
770  rep.th.dest = th->source;
771  rep.th.source = th->dest;
772  rep.th.doff = arg.iov[0].iov_len / 4;
773  rep.th.seq = htonl(seq);
774  rep.th.ack_seq = htonl(ack);
775  rep.th.ack = 1;
776  rep.th.window = htons(win);
777 
778 #ifdef CONFIG_TCP_MD5SIG
779  if (key) {
780  int offset = (ts) ? 3 : 0;
781 
782  rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
783  (TCPOPT_NOP << 16) |
784  (TCPOPT_MD5SIG << 8) |
785  TCPOLEN_MD5SIG);
786  arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
787  rep.th.doff = arg.iov[0].iov_len/4;
788 
789  tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
790  key, ip_hdr(skb)->saddr,
791  ip_hdr(skb)->daddr, &rep.th);
792  }
793 #endif
794  arg.flags = reply_flags;
795  arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
796  ip_hdr(skb)->saddr, /* XXX */
797  arg.iov[0].iov_len, IPPROTO_TCP, 0);
798  arg.csumoffset = offsetof(struct tcphdr, check) / 2;
799  if (oif)
800  arg.bound_dev_if = oif;
801  arg.tos = tos;
802  ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
803  ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
804 
805  TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
806 }
807 
808 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
809 {
810  struct inet_timewait_sock *tw = inet_twsk(sk);
811  struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
812 
813  tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
814  tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
815  tcptw->tw_ts_recent,
816  tw->tw_bound_dev_if,
817  tcp_twsk_md5_key(tcptw),
818  tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
819  tw->tw_tos
820  );
821 
822  inet_twsk_put(tw);
823 }
824 
825 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
826  struct request_sock *req)
827 {
828  /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
829  * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
830  */
831  tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
832  tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
833  tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
834  req->ts_recent,
835  0,
836  tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
837  AF_INET),
838  inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
839  ip_hdr(skb)->tos);
840 }
841 
842 /*
843  * Send a SYN-ACK after having received a SYN.
844  * This still operates on a request_sock only, not on a big
845  * socket.
846  */
847 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
848  struct request_sock *req,
849  struct request_values *rvp,
850  u16 queue_mapping,
851  bool nocache)
852 {
853  const struct inet_request_sock *ireq = inet_rsk(req);
854  struct flowi4 fl4;
855  int err = -1;
856  struct sk_buff * skb;
857 
858  /* First, grab a route. */
859  if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
860  return -1;
861 
862  skb = tcp_make_synack(sk, dst, req, rvp, NULL);
863 
864  if (skb) {
865  __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
866 
867  skb_set_queue_mapping(skb, queue_mapping);
868  err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
869  ireq->rmt_addr,
870  ireq->opt);
871  err = net_xmit_eval(err);
872  if (!tcp_rsk(req)->snt_synack && !err)
873  tcp_rsk(req)->snt_synack = tcp_time_stamp;
874  }
875 
876  return err;
877 }
878 
879 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
880  struct request_values *rvp)
881 {
882  TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
883  return tcp_v4_send_synack(sk, NULL, req, rvp, 0, false);
884 }
885 
886 /*
887  * IPv4 request_sock destructor.
888  */
889 static void tcp_v4_reqsk_destructor(struct request_sock *req)
890 {
891  kfree(inet_rsk(req)->opt);
892 }
893 
894 /*
895  * Return true if a syncookie should be sent
896  */
897 bool tcp_syn_flood_action(struct sock *sk,
898  const struct sk_buff *skb,
899  const char *proto)
900 {
901  const char *msg = "Dropping request";
902  bool want_cookie = false;
903  struct listen_sock *lopt;
904 
905 
906 
907 #ifdef CONFIG_SYN_COOKIES
908  if (sysctl_tcp_syncookies) {
909  msg = "Sending cookies";
910  want_cookie = true;
911  NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
912  } else
913 #endif
914  NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
915 
916  lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
917  if (!lopt->synflood_warned) {
918  lopt->synflood_warned = 1;
919  pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
920  proto, ntohs(tcp_hdr(skb)->dest), msg);
921  }
922  return want_cookie;
923 }
924 EXPORT_SYMBOL(tcp_syn_flood_action);
925 
926 /*
927  * Save and compile IPv4 options into the request_sock if needed.
928  */
929 static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
930 {
931  const struct ip_options *opt = &(IPCB(skb)->opt);
932  struct ip_options_rcu *dopt = NULL;
933 
934  if (opt && opt->optlen) {
935  int opt_size = sizeof(*dopt) + opt->optlen;
936 
937  dopt = kmalloc(opt_size, GFP_ATOMIC);
938  if (dopt) {
939  if (ip_options_echo(&dopt->opt, skb)) {
940  kfree(dopt);
941  dopt = NULL;
942  }
943  }
944  }
945  return dopt;
946 }
947 
948 #ifdef CONFIG_TCP_MD5SIG
949 /*
950  * RFC2385 MD5 checksumming requires a mapping of
951  * IP address->MD5 Key.
952  * We need to maintain these in the sk structure.
953  */
954 
955 /* Find the Key structure for an address. */
956 struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
957  const union tcp_md5_addr *addr,
958  int family)
959 {
960  struct tcp_sock *tp = tcp_sk(sk);
961  struct tcp_md5sig_key *key;
962  struct hlist_node *pos;
963  unsigned int size = sizeof(struct in_addr);
964  struct tcp_md5sig_info *md5sig;
965 
966  /* caller either holds rcu_read_lock() or socket lock */
967  md5sig = rcu_dereference_check(tp->md5sig_info,
968  sock_owned_by_user(sk) ||
969  lockdep_is_held(&sk->sk_lock.slock));
970  if (!md5sig)
971  return NULL;
972 #if IS_ENABLED(CONFIG_IPV6)
973  if (family == AF_INET6)
974  size = sizeof(struct in6_addr);
975 #endif
976  hlist_for_each_entry_rcu(key, pos, &md5sig->head, node) {
977  if (key->family != family)
978  continue;
979  if (!memcmp(&key->addr, addr, size))
980  return key;
981  }
982  return NULL;
983 }
984 EXPORT_SYMBOL(tcp_md5_do_lookup);
985 
986 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
987  struct sock *addr_sk)
988 {
989  union tcp_md5_addr *addr;
990 
991  addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
992  return tcp_md5_do_lookup(sk, addr, AF_INET);
993 }
994 EXPORT_SYMBOL(tcp_v4_md5_lookup);
995 
996 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
997  struct request_sock *req)
998 {
999  union tcp_md5_addr *addr;
1000 
1001  addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
1002  return tcp_md5_do_lookup(sk, addr, AF_INET);
1003 }
1004 
1005 /* This can be called on a newly created socket, from other files */
1006 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1007  int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
1008 {
1009  /* Add Key to the list */
1010  struct tcp_md5sig_key *key;
1011  struct tcp_sock *tp = tcp_sk(sk);
1012  struct tcp_md5sig_info *md5sig;
1013 
1014  key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
1015  if (key) {
1016  /* Pre-existing entry - just update that one. */
1017  memcpy(key->key, newkey, newkeylen);
1018  key->keylen = newkeylen;
1019  return 0;
1020  }
1021 
1022  md5sig = rcu_dereference_protected(tp->md5sig_info,
1023  sock_owned_by_user(sk));
1024  if (!md5sig) {
1025  md5sig = kmalloc(sizeof(*md5sig), gfp);
1026  if (!md5sig)
1027  return -ENOMEM;
1028 
1029  sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1030  INIT_HLIST_HEAD(&md5sig->head);
1031  rcu_assign_pointer(tp->md5sig_info, md5sig);
1032  }
1033 
1034  key = sock_kmalloc(sk, sizeof(*key), gfp);
1035  if (!key)
1036  return -ENOMEM;
1037  if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
1038  sock_kfree_s(sk, key, sizeof(*key));
1039  return -ENOMEM;
1040  }
1041 
1042  memcpy(key->key, newkey, newkeylen);
1043  key->keylen = newkeylen;
1044  key->family = family;
1045  memcpy(&key->addr, addr,
1046  (family == AF_INET6) ? sizeof(struct in6_addr) :
1047  sizeof(struct in_addr));
1048  hlist_add_head_rcu(&key->node, &md5sig->head);
1049  return 0;
1050 }
1051 EXPORT_SYMBOL(tcp_md5_do_add);
1052 
1053 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
1054 {
1055  struct tcp_sock *tp = tcp_sk(sk);
1056  struct tcp_md5sig_key *key;
1057  struct tcp_md5sig_info *md5sig;
1058 
1059  key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
1060  if (!key)
1061  return -ENOENT;
1062  hlist_del_rcu(&key->node);
1063  atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1064  kfree_rcu(key, rcu);
1065  md5sig = rcu_dereference_protected(tp->md5sig_info,
1066  sock_owned_by_user(sk));
1067  if (hlist_empty(&md5sig->head))
1068  tcp_free_md5sig_pool();
1069  return 0;
1070 }
1071 EXPORT_SYMBOL(tcp_md5_do_del);
1072 
1073 void tcp_clear_md5_list(struct sock *sk)
1074 {
1075  struct tcp_sock *tp = tcp_sk(sk);
1076  struct tcp_md5sig_key *key;
1077  struct hlist_node *pos, *n;
1078  struct tcp_md5sig_info *md5sig;
1079 
1080  md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1081 
1082  if (!hlist_empty(&md5sig->head))
1083  tcp_free_md5sig_pool();
1084  hlist_for_each_entry_safe(key, pos, n, &md5sig->head, node) {
1085  hlist_del_rcu(&key->node);
1086  atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1087  kfree_rcu(key, rcu);
1088  }
1089 }
1090 
1091 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1092  int optlen)
1093 {
1094  struct tcp_md5sig cmd;
1095  struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1096 
1097  if (optlen < sizeof(cmd))
1098  return -EINVAL;
1099 
1100  if (copy_from_user(&cmd, optval, sizeof(cmd)))
1101  return -EFAULT;
1102 
1103  if (sin->sin_family != AF_INET)
1104  return -EINVAL;
1105 
1106  if (!cmd.tcpm_key || !cmd.tcpm_keylen)
1107  return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1108  AF_INET);
1109 
1110  if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1111  return -EINVAL;
1112 
1113  return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1114  AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1115  GFP_KERNEL);
1116 }
1117 
1118 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1119  __be32 daddr, __be32 saddr, int nbytes)
1120 {
1121  struct tcp4_pseudohdr *bp;
1122  struct scatterlist sg;
1123 
1124  bp = &hp->md5_blk.ip4;
1125 
1126  /*
1127  * 1. the TCP pseudo-header (in the order: source IP address,
1128  * destination IP address, zero-padded protocol number, and
1129  * segment length)
1130  */
1131  bp->saddr = saddr;
1132  bp->daddr = daddr;
1133  bp->pad = 0;
1134  bp->protocol = IPPROTO_TCP;
1135  bp->len = cpu_to_be16(nbytes);
1136 
1137  sg_init_one(&sg, bp, sizeof(*bp));
1138  return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1139 }
1140 
1141 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1142  __be32 daddr, __be32 saddr, const struct tcphdr *th)
1143 {
1144  struct tcp_md5sig_pool *hp;
1145  struct hash_desc *desc;
1146 
1147  hp = tcp_get_md5sig_pool();
1148  if (!hp)
1149  goto clear_hash_noput;
1150  desc = &hp->md5_desc;
1151 
1152  if (crypto_hash_init(desc))
1153  goto clear_hash;
1154  if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1155  goto clear_hash;
1156  if (tcp_md5_hash_header(hp, th))
1157  goto clear_hash;
1158  if (tcp_md5_hash_key(hp, key))
1159  goto clear_hash;
1160  if (crypto_hash_final(desc, md5_hash))
1161  goto clear_hash;
1162 
1163  tcp_put_md5sig_pool();
1164  return 0;
1165 
1166 clear_hash:
1167  tcp_put_md5sig_pool();
1168 clear_hash_noput:
1169  memset(md5_hash, 0, 16);
1170  return 1;
1171 }
1172 
1173 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1174  const struct sock *sk, const struct request_sock *req,
1175  const struct sk_buff *skb)
1176 {
1177  struct tcp_md5sig_pool *hp;
1178  struct hash_desc *desc;
1179  const struct tcphdr *th = tcp_hdr(skb);
1180  __be32 saddr, daddr;
1181 
1182  if (sk) {
1183  saddr = inet_sk(sk)->inet_saddr;
1184  daddr = inet_sk(sk)->inet_daddr;
1185  } else if (req) {
1186  saddr = inet_rsk(req)->loc_addr;
1187  daddr = inet_rsk(req)->rmt_addr;
1188  } else {
1189  const struct iphdr *iph = ip_hdr(skb);
1190  saddr = iph->saddr;
1191  daddr = iph->daddr;
1192  }
1193 
1194  hp = tcp_get_md5sig_pool();
1195  if (!hp)
1196  goto clear_hash_noput;
1197  desc = &hp->md5_desc;
1198 
1199  if (crypto_hash_init(desc))
1200  goto clear_hash;
1201 
1202  if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1203  goto clear_hash;
1204  if (tcp_md5_hash_header(hp, th))
1205  goto clear_hash;
1206  if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1207  goto clear_hash;
1208  if (tcp_md5_hash_key(hp, key))
1209  goto clear_hash;
1210  if (crypto_hash_final(desc, md5_hash))
1211  goto clear_hash;
1212 
1213  tcp_put_md5sig_pool();
1214  return 0;
1215 
1216 clear_hash:
1217  tcp_put_md5sig_pool();
1218 clear_hash_noput:
1219  memset(md5_hash, 0, 16);
1220  return 1;
1221 }
1222 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1223 
1224 static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1225 {
1226  /*
1227  * This gets called for each TCP segment that arrives
1228  * so we want to be efficient.
1229  * We have 3 drop cases:
1230  * o No MD5 hash and one expected.
1231  * o MD5 hash and we're not expecting one.
1232  * o MD5 hash and its wrong.
1233  */
1234  const __u8 *hash_location = NULL;
1235  struct tcp_md5sig_key *hash_expected;
1236  const struct iphdr *iph = ip_hdr(skb);
1237  const struct tcphdr *th = tcp_hdr(skb);
1238  int genhash;
1239  unsigned char newhash[16];
1240 
1241  hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1242  AF_INET);
1243  hash_location = tcp_parse_md5sig_option(th);
1244 
1245  /* We've parsed the options - do we have a hash? */
1246  if (!hash_expected && !hash_location)
1247  return false;
1248 
1249  if (hash_expected && !hash_location) {
1250  NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1251  return true;
1252  }
1253 
1254  if (!hash_expected && hash_location) {
1255  NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1256  return true;
1257  }
1258 
1259  /* Okay, so this is hash_expected and hash_location -
1260  * so we need to calculate the checksum.
1261  */
1262  genhash = tcp_v4_md5_hash_skb(newhash,
1263  hash_expected,
1264  NULL, NULL, skb);
1265 
1266  if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1267  net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1268  &iph->saddr, ntohs(th->source),
1269  &iph->daddr, ntohs(th->dest),
1270  genhash ? " tcp_v4_calc_md5_hash failed"
1271  : "");
1272  return true;
1273  }
1274  return false;
1275 }
1276 
1277 #endif
1278 
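/*
 * Ops table handed to the generic request_sock code for IPv4: how to
 * (re)send a SYN-ACK, ACK or RST for an embryonic connection, and how to
 * destroy the request once it is no longer needed.
 */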
1279 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1280  .family = PF_INET,
1281  .obj_size = sizeof(struct tcp_request_sock),
1282  .rtx_syn_ack = tcp_v4_rtx_synack,
1283  .send_ack = tcp_v4_reqsk_send_ack,
1284  .destructor = tcp_v4_reqsk_destructor,
1285  .send_reset = tcp_v4_send_reset,
1286  .syn_ack_timeout = tcp_syn_ack_timeout,
1287 };
1288 
1289 #ifdef CONFIG_TCP_MD5SIG
1290 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1291  .md5_lookup = tcp_v4_reqsk_md5_lookup,
1292  .calc_md5_hash = tcp_v4_md5_hash_skb,
1293 };
1294 #endif
1295 
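/*
 * tcp_fastopen_check() validates a TCP Fast Open attempt: it enforces the
 * sysctl_tcp_fastopen server knobs and the per-listener TFO queue limit,
 * verifies (or generates) the Fast Open cookie, and returns true only when
 * the data carried in the SYN may be accepted immediately.
 */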
1296 static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
1297  struct request_sock *req,
1298  struct tcp_fastopen_cookie *foc,
1299  struct tcp_fastopen_cookie *valid_foc)
1300 {
1301  bool skip_cookie = false;
1302  struct fastopen_queue *fastopenq;
1303 
1304  if (likely(!fastopen_cookie_present(foc))) {
1305  /* See include/net/tcp.h for the meaning of these knobs */
1306  if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
1307  ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
1308  (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
1309  skip_cookie = true; /* no cookie to validate */
1310  else
1311  return false;
1312  }
1313  fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
1314  /* A FO option is present; bump the counter. */
1315  NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
1316 
1317  /* Make sure the listener has enabled fastopen, and we don't
1318  * exceed the max # of pending TFO requests allowed before trying
1319  * to validating the cookie in order to avoid burning CPU cycles
1320  * unnecessarily.
1321  *
1322  * XXX (TFO) - The implication of checking the max_qlen before
1323  * processing a cookie request is that clients can't differentiate
1324  * between qlen overflow causing Fast Open to be disabled
1325  * temporarily vs a server not supporting Fast Open at all.
1326  */
1327  if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
1328  fastopenq == NULL || fastopenq->max_qlen == 0)
1329  return false;
1330 
1331  if (fastopenq->qlen >= fastopenq->max_qlen) {
1332  struct request_sock *req1;
1333  spin_lock(&fastopenq->lock);
1334  req1 = fastopenq->rskq_rst_head;
1335  if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
1336  spin_unlock(&fastopenq->lock);
1337  NET_INC_STATS_BH(sock_net(sk),
1338  LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
1339  /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL*/
1340  foc->len = -1;
1341  return false;
1342  }
1343  fastopenq->rskq_rst_head = req1->dl_next;
1344  fastopenq->qlen--;
1345  spin_unlock(&fastopenq->lock);
1346  reqsk_free(req1);
1347  }
1348  if (skip_cookie) {
1349  tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1350  return true;
1351  }
1352  if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
1353  if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
1354  tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1355  if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
1356  memcmp(&foc->val[0], &valid_foc->val[0],
1357  TCP_FASTOPEN_COOKIE_SIZE))
1358  return false;
1359  valid_foc->len = -1;
1360  }
1361  /* Acknowledge the data received from the peer. */
1362  tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1363  return true;
1364  } else if (foc->len == 0) { /* Client requesting a cookie */
1365  tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1366  NET_INC_STATS_BH(sock_net(sk),
1367  LINUX_MIB_TCPFASTOPENCOOKIEREQD);
1368  } else {
1369  /* Client sent a cookie with wrong size. Treat it
1370  * the same as invalid and return a valid one.
1371  */
1372  tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1373  }
1374  return false;
1375 }
1376 
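/*
 * For an accepted Fast Open request the child socket is created right away
 * (before the three-way handshake completes), the SYN-ACK is sent, the data
 * carried in the SYN is queued to the child, and the child is placed
 * directly on the listener's accept queue.
 */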
1377 static int tcp_v4_conn_req_fastopen(struct sock *sk,
1378  struct sk_buff *skb,
1379  struct sk_buff *skb_synack,
1380  struct request_sock *req,
1381  struct request_values *rvp)
1382 {
1383  struct tcp_sock *tp = tcp_sk(sk);
1384  struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
1385  const struct inet_request_sock *ireq = inet_rsk(req);
1386  struct sock *child;
1387  int err;
1388 
1389  req->retrans = 0;
1390  req->sk = NULL;
1391 
1392  child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
1393  if (child == NULL) {
1394  NET_INC_STATS_BH(sock_net(sk),
1395  LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1396  kfree_skb(skb_synack);
1397  return -1;
1398  }
1399  err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1400  ireq->rmt_addr, ireq->opt);
1401  err = net_xmit_eval(err);
1402  if (!err)
1403  tcp_rsk(req)->snt_synack = tcp_time_stamp;
1404  /* XXX (TFO) - is it ok to ignore error and continue? */
1405 
1406  spin_lock(&queue->fastopenq->lock);
1407  queue->fastopenq->qlen++;
1408  spin_unlock(&queue->fastopenq->lock);
1409 
1410  /* Initialize the child socket. Have to fix some values to take
1411  * into account the child is a Fast Open socket and is created
1412  * only out of the bits carried in the SYN packet.
1413  */
1414  tp = tcp_sk(child);
1415 
1416  tp->fastopen_rsk = req;
1417  /* Do a hold on the listener sk so that if the listener is being
1418  * closed, the child that has been accepted can live on and still
1419  * access listen_lock.
1420  */
1421  sock_hold(sk);
1422  tcp_rsk(req)->listener = sk;
1423 
1424  /* RFC1323: The window in SYN & SYN/ACK segments is never
1425  * scaled. So correct it appropriately.
1426  */
1427  tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
1428 
1429  /* Activate the retrans timer so that SYNACK can be retransmitted.
1430  * The request socket is not added to the SYN table of the parent
1431  * because it's been added to the accept queue directly.
1432  */
1433  inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
1434  TCP_TIMEOUT_INIT, TCP_RTO_MAX);
1435 
1436  /* Add the child socket directly into the accept queue */
1437  inet_csk_reqsk_queue_add(sk, req, child);
1438 
1439  /* Now finish processing the fastopen child socket. */
1440  inet_csk(child)->icsk_af_ops->rebuild_header(child);
1441  tcp_init_congestion_control(child);
1442  tcp_mtup_init(child);
1443  tcp_init_buffer_space(child);
1444  tcp_init_metrics(child);
1445 
1446  /* Queue the data carried in the SYN packet. We need to first
1447  * bump skb's refcnt because the caller will attempt to free it.
1448  *
1449  * XXX (TFO) - we honor a zero-payload TFO request for now.
1450  * (Any reason not to?)
1451  */
1452  if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
1453  /* Don't queue the skb if there is no payload in SYN.
1454  * XXX (TFO) - How about SYN+FIN?
1455  */
1456  tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1457  } else {
1458  skb = skb_get(skb);
1459  skb_dst_drop(skb);
1460  __skb_pull(skb, tcp_hdr(skb)->doff * 4);
1461  skb_set_owner_r(skb, child);
1462  __skb_queue_tail(&child->sk_receive_queue, skb);
1463  tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1464  tp->syn_data_acked = 1;
1465  }
1466  sk->sk_data_ready(sk, 0);
1467  bh_unlock_sock(child);
1468  sock_put(child);
1469  WARN_ON(req->sk == NULL);
1470  return 0;
1471 }
1472 
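/*
 * tcp_v4_conn_request() handles an incoming SYN on a listening socket:
 * parse options, allocate a request_sock, apply the syncookie/tw_recycle
 * policies when the queues are under pressure, then answer with a SYN-ACK
 * (or hand off to the Fast Open path above).
 */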
1473 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1474 {
1475  struct tcp_extend_values tmp_ext;
1476  struct tcp_options_received tmp_opt;
1477  const u8 *hash_location;
1478  struct request_sock *req;
1479  struct inet_request_sock *ireq;
1480  struct tcp_sock *tp = tcp_sk(sk);
1481  struct dst_entry *dst = NULL;
1482  __be32 saddr = ip_hdr(skb)->saddr;
1483  __be32 daddr = ip_hdr(skb)->daddr;
1484  __u32 isn = TCP_SKB_CB(skb)->when;
1485  bool want_cookie = false;
1486  struct flowi4 fl4;
1487  struct tcp_fastopen_cookie foc = { .len = -1 };
1488  struct tcp_fastopen_cookie valid_foc = { .len = -1 };
1489  struct sk_buff *skb_synack;
1490  int do_fastopen;
1491 
1492  /* Never answer SYNs sent to broadcast or multicast */
1493  if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1494  goto drop;
1495 
1496  /* TW buckets are converted to open requests without
1497  * limitations, they conserve resources and peer is
1498  * evidently real one.
1499  */
1500  if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1501  want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1502  if (!want_cookie)
1503  goto drop;
1504  }
1505 
1506  /* Accept backlog is full. If we have already queued enough
1507  * of warm entries in syn queue, drop request. It is better than
1508  * clogging syn queue with openreqs with exponentially increasing
1509  * timeout.
1510  */
1511  if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1512  goto drop;
1513 
1514  req = inet_reqsk_alloc(&tcp_request_sock_ops);
1515  if (!req)
1516  goto drop;
1517 
1518 #ifdef CONFIG_TCP_MD5SIG
1519  tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1520 #endif
1521 
1522  tcp_clear_options(&tmp_opt);
1523  tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1524  tmp_opt.user_mss = tp->rx_opt.user_mss;
1525  tcp_parse_options(skb, &tmp_opt, &hash_location, 0,
1526  want_cookie ? NULL : &foc);
1527 
1528  if (tmp_opt.cookie_plus > 0 &&
1529  tmp_opt.saw_tstamp &&
1530  !tp->rx_opt.cookie_out_never &&
1531  (sysctl_tcp_cookie_size > 0 ||
1532  (tp->cookie_values != NULL &&
1533  tp->cookie_values->cookie_desired > 0))) {
1534  u8 *c;
1535  u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1536  int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1537 
1538  if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1539  goto drop_and_release;
1540 
1541  /* Secret recipe starts with IP addresses */
1542  *mess++ ^= (__force u32)daddr;
1543  *mess++ ^= (__force u32)saddr;
1544 
1545  /* plus variable length Initiator Cookie */
1546  c = (u8 *)mess;
1547  while (l-- > 0)
1548  *c++ ^= *hash_location++;
1549 
1550  want_cookie = false; /* not our kind of cookie */
1551  tmp_ext.cookie_out_never = 0; /* false */
1552  tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1553  } else if (!tp->rx_opt.cookie_in_always) {
1554  /* redundant indications, but ensure initialization. */
1555  tmp_ext.cookie_out_never = 1; /* true */
1556  tmp_ext.cookie_plus = 0;
1557  } else {
1558  goto drop_and_release;
1559  }
1560  tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1561 
1562  if (want_cookie && !tmp_opt.saw_tstamp)
1563  tcp_clear_options(&tmp_opt);
1564 
1565  tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1566  tcp_openreq_init(req, &tmp_opt, skb);
1567 
1568  ireq = inet_rsk(req);
1569  ireq->loc_addr = daddr;
1570  ireq->rmt_addr = saddr;
1571  ireq->no_srccheck = inet_sk(sk)->transparent;
1572  ireq->opt = tcp_v4_save_options(skb);
1573 
1574  if (security_inet_conn_request(sk, skb, req))
1575  goto drop_and_free;
1576 
1577  if (!want_cookie || tmp_opt.tstamp_ok)
1578  TCP_ECN_create_request(req, skb);
1579 
1580  if (want_cookie) {
1581  isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1582  req->cookie_ts = tmp_opt.tstamp_ok;
1583  } else if (!isn) {
1584  /* VJ's idea. We save last timestamp seen
1585  * from the destination in peer table, when entering
1586  * state TIME-WAIT, and check against it before
1587  * accepting new connection request.
1588  *
1589  * If "isn" is not zero, this request hit alive
1590  * timewait bucket, so that all the necessary checks
1591  * are made in the function processing timewait state.
1592  */
1593  if (tmp_opt.saw_tstamp &&
1594  tcp_death_row.sysctl_tw_recycle &&
1595  (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1596  fl4.daddr == saddr) {
1597  if (!tcp_peer_is_proven(req, dst, true)) {
1598  NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1599  goto drop_and_release;
1600  }
1601  }
1602  /* Kill the following clause, if you dislike this way. */
1603  else if (!sysctl_tcp_syncookies &&
1604  (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1605  (sysctl_max_syn_backlog >> 2)) &&
1606  !tcp_peer_is_proven(req, dst, false)) {
1607  /* Without syncookies last quarter of
1608  * backlog is filled with destinations,
1609  * proven to be alive.
1610  * It means that we continue to communicate
1611  * to destinations, already remembered
1612  * to the moment of synflood.
1613  */
1614  LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
1615  &saddr, ntohs(tcp_hdr(skb)->source));
1616  goto drop_and_release;
1617  }
1618 
1619  isn = tcp_v4_init_sequence(skb);
1620  }
1621  tcp_rsk(req)->snt_isn = isn;
1622 
1623  if (dst == NULL) {
1624  dst = inet_csk_route_req(sk, &fl4, req);
1625  if (dst == NULL)
1626  goto drop_and_free;
1627  }
1628  do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);
1629 
1630  /* We don't call tcp_v4_send_synack() directly because we need
1631  * to make sure a child socket can be created successfully before
1632  * sending back synack!
1633  *
1634  * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
1635  * (or better yet, call tcp_send_synack() in the child context
1636  * directly, but will have to fix bunch of other code first)
1637  * after syn_recv_sock() except one will need to first fix the
1638  * latter to remove its dependency on the current implementation
1639  * of tcp_v4_send_synack()->tcp_select_initial_window().
1640  */
1641  skb_synack = tcp_make_synack(sk, dst, req,
1642  (struct request_values *)&tmp_ext,
1643  fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
1644 
1645  if (skb_synack) {
1646  __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
1647  skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
1648  } else
1649  goto drop_and_free;
1650 
1651  if (likely(!do_fastopen)) {
1652  int err;
1653  err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1654  ireq->rmt_addr, ireq->opt);
1655  err = net_xmit_eval(err);
1656  if (err || want_cookie)
1657  goto drop_and_free;
1658 
1659  tcp_rsk(req)->snt_synack = tcp_time_stamp;
1660  tcp_rsk(req)->listener = NULL;
1661  /* Add the request_sock to the SYN table */
1662  inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1663  if (fastopen_cookie_present(&foc) && foc.len != 0)
1664  NET_INC_STATS_BH(sock_net(sk),
1665  LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1666  } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req,
1667  (struct request_values *)&tmp_ext))
1668  goto drop_and_free;
1669 
1670  return 0;
1671 
1672 drop_and_release:
1673  dst_release(dst);
1674 drop_and_free:
1675  reqsk_free(req);
1676 drop:
1677  return 0;
1678 }
1679 EXPORT_SYMBOL(tcp_v4_conn_request);
1680 
1681 
1682 /*
1683  * The three way handshake has completed - we got a valid synack -
1684  * now create the new socket.
1685  */
1686 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1687  struct request_sock *req,
1688  struct dst_entry *dst)
1689 {
1690  struct inet_request_sock *ireq;
1691  struct inet_sock *newinet;
1692  struct tcp_sock *newtp;
1693  struct sock *newsk;
1694 #ifdef CONFIG_TCP_MD5SIG
1695  struct tcp_md5sig_key *key;
1696 #endif
1697  struct ip_options_rcu *inet_opt;
1698 
1699  if (sk_acceptq_is_full(sk))
1700  goto exit_overflow;
1701 
1702  newsk = tcp_create_openreq_child(sk, req, skb);
1703  if (!newsk)
1704  goto exit_nonewsk;
1705 
1706  newsk->sk_gso_type = SKB_GSO_TCPV4;
1707  inet_sk_rx_dst_set(newsk, skb);
1708 
1709  newtp = tcp_sk(newsk);
1710  newinet = inet_sk(newsk);
1711  ireq = inet_rsk(req);
1712  newinet->inet_daddr = ireq->rmt_addr;
1713  newinet->inet_rcv_saddr = ireq->loc_addr;
1714  newinet->inet_saddr = ireq->loc_addr;
1715  inet_opt = ireq->opt;
1716  rcu_assign_pointer(newinet->inet_opt, inet_opt);
1717  ireq->opt = NULL;
1718  newinet->mc_index = inet_iif(skb);
1719  newinet->mc_ttl = ip_hdr(skb)->ttl;
1720  newinet->rcv_tos = ip_hdr(skb)->tos;
1721  inet_csk(newsk)->icsk_ext_hdr_len = 0;
1722  if (inet_opt)
1723  inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1724  newinet->inet_id = newtp->write_seq ^ jiffies;
1725 
1726  if (!dst) {
1727  dst = inet_csk_route_child_sock(sk, newsk, req);
1728  if (!dst)
1729  goto put_and_exit;
1730  } else {
1731  /* syncookie case : see end of cookie_v4_check() */
1732  }
1733  sk_setup_caps(newsk, dst);
1734 
1735  tcp_mtup_init(newsk);
1736  tcp_sync_mss(newsk, dst_mtu(dst));
1737  newtp->advmss = dst_metric_advmss(dst);
1738  if (tcp_sk(sk)->rx_opt.user_mss &&
1739  tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1740  newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1741 
1742  tcp_initialize_rcv_mss(newsk);
1743  tcp_synack_rtt_meas(newsk, req);
1744  newtp->total_retrans = req->retrans;
1745 
1746 #ifdef CONFIG_TCP_MD5SIG
1747  /* Copy over the MD5 key from the original socket */
1748  key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1749  AF_INET);
1750  if (key != NULL) {
1751  /*
1752  * We're using one, so create a matching key
1753  * on the newsk structure. If we fail to get
1754  * memory, then we end up not copying the key
1755  * across. Shucks.
1756  */
1757  tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1758  AF_INET, key->key, key->keylen, GFP_ATOMIC);
1759  sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1760  }
1761 #endif
1762 
1763  if (__inet_inherit_port(sk, newsk) < 0)
1764  goto put_and_exit;
1765  __inet_hash_nolisten(newsk, NULL);
1766 
1767  return newsk;
1768 
1769 exit_overflow:
1770  NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1771 exit_nonewsk:
1772  dst_release(dst);
1773 exit:
1774  NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1775  return NULL;
1776 put_and_exit:
1777  tcp_clear_xmit_timers(newsk);
1778  tcp_cleanup_congestion_control(newsk);
1779  bh_unlock_sock(newsk);
1780  sock_put(newsk);
1781  goto exit;
1782 }
1783 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1784 
1785 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1786 {
1787  struct tcphdr *th = tcp_hdr(skb);
1788  const struct iphdr *iph = ip_hdr(skb);
1789  struct sock *nsk;
1790  struct request_sock **prev;
1791  /* Find possible connection requests. */
1792  struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1793  iph->saddr, iph->daddr);
1794  if (req)
1795  return tcp_check_req(sk, skb, req, prev, false);
1796 
1797  nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1798  th->source, iph->daddr, th->dest, inet_iif(skb));
1799 
1800  if (nsk) {
1801  if (nsk->sk_state != TCP_TIME_WAIT) {
1802  bh_lock_sock(nsk);
1803  return nsk;
1804  }
1805  inet_twsk_put(inet_twsk(nsk));
1806  return NULL;
1807  }
1808 
1809 #ifdef CONFIG_SYN_COOKIES
1810  if (!th->syn)
1811  sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1812 #endif
1813  return sk;
1814 }
1815 
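/*
 * Receive-side checksum setup: trust CHECKSUM_COMPLETE from the driver
 * when it verifies, fully check short packets right away, and for the rest
 * store the pseudo-header sum so the checksum can be completed later.
 */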
1816 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1817 {
1818  const struct iphdr *iph = ip_hdr(skb);
1819 
1820  if (skb->ip_summed == CHECKSUM_COMPLETE) {
1821  if (!tcp_v4_check(skb->len, iph->saddr,
1822  iph->daddr, skb->csum)) {
1823  skb->ip_summed = CHECKSUM_UNNECESSARY;
1824  return 0;
1825  }
1826  }
1827 
1828  skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1829  skb->len, IPPROTO_TCP, 0);
1830 
1831  if (skb->len <= 76) {
1832  return __skb_checksum_complete(skb);
1833  }
1834  return 0;
1835 }
1836 
1837 
1838 /* The socket must have its spinlock held when we get
1839  * here.
1840  *
1841  * We have a potential double-lock case here, so even when
1842  * doing backlog processing we use the BH locking scheme.
1843  * This is because we cannot sleep with the original spinlock
1844  * held.
1845  */
1846 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1847 {
1848  struct sock *rsk;
1849 #ifdef CONFIG_TCP_MD5SIG
1850  /*
1851  * We really want to reject the packet as early as possible
1852  * if:
1853  * o We're expecting an MD5'd packet and this is no MD5 tcp option
1854  * o There is an MD5 option and we're not expecting one
1855  */
1856  if (tcp_v4_inbound_md5_hash(sk, skb))
1857  goto discard;
1858 #endif
1859 
1860  if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1861  struct dst_entry *dst = sk->sk_rx_dst;
1862 
1863  sock_rps_save_rxhash(sk, skb);
1864  if (dst) {
1865  if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1866  dst->ops->check(dst, 0) == NULL) {
1867  dst_release(dst);
1868  sk->sk_rx_dst = NULL;
1869  }
1870  }
1871  if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1872  rsk = sk;
1873  goto reset;
1874  }
1875  return 0;
1876  }
1877 
1878  if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1879  goto csum_err;
1880 
1881  if (sk->sk_state == TCP_LISTEN) {
1882  struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1883  if (!nsk)
1884  goto discard;
1885 
1886  if (nsk != sk) {
1887  sock_rps_save_rxhash(nsk, skb);
1888  if (tcp_child_process(sk, nsk, skb)) {
1889  rsk = nsk;
1890  goto reset;
1891  }
1892  return 0;
1893  }
1894  } else
1895  sock_rps_save_rxhash(sk, skb);
1896 
1897  if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1898  rsk = sk;
1899  goto reset;
1900  }
1901  return 0;
1902 
1903 reset:
1904  tcp_v4_send_reset(rsk, skb);
1905 discard:
1906  kfree_skb(skb);
1907  /* Be careful here. If this function gets more complicated and
1908  * gcc suffers from register pressure on the x86, sk (in %ebx)
1909  * might be destroyed here. This current version compiles correctly,
1910  * but you have been warned.
1911  */
1912  return 0;
1913 
1914 csum_err:
1915  TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1916  goto discard;
1917 }
1918 EXPORT_SYMBOL(tcp_v4_do_rcv);
1919 
1920 void tcp_v4_early_demux(struct sk_buff *skb)
1921 {
1922  struct net *net = dev_net(skb->dev);
1923  const struct iphdr *iph;
1924  const struct tcphdr *th;
1925  struct sock *sk;
1926 
1927  if (skb->pkt_type != PACKET_HOST)
1928  return;
1929 
1930  if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
1931  return;
1932 
1933  iph = ip_hdr(skb);
1934  th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
1935 
1936  if (th->doff < sizeof(struct tcphdr) / 4)
1937  return;
1938 
1939  sk = __inet_lookup_established(net, &tcp_hashinfo,
1940  iph->saddr, th->source,
1941  iph->daddr, ntohs(th->dest),
1942  skb->skb_iif);
1943  if (sk) {
1944  skb->sk = sk;
1945  skb->destructor = sock_edemux;
1946  if (sk->sk_state != TCP_TIME_WAIT) {
1947  struct dst_entry *dst = sk->sk_rx_dst;
1948 
1949  if (dst)
1950  dst = dst_check(dst, 0);
1951  if (dst &&
1952  inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1953  skb_dst_set_noref(skb, dst);
1954  }
1955  }
1956 }
1957 
1958 /*
1959  * From tcp_input.c
1960  */
1961 
1962 int tcp_v4_rcv(struct sk_buff *skb)
1963 {
1964  const struct iphdr *iph;
1965  const struct tcphdr *th;
1966  struct sock *sk;
1967  int ret;
1968  struct net *net = dev_net(skb->dev);
1969 
1970  if (skb->pkt_type != PACKET_HOST)
1971  goto discard_it;
1972 
1973  /* Count it even if it's bad */
1974  TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1975 
1976  if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1977  goto discard_it;
1978 
1979  th = tcp_hdr(skb);
1980 
1981  if (th->doff < sizeof(struct tcphdr) / 4)
1982  goto bad_packet;
1983  if (!pskb_may_pull(skb, th->doff * 4))
1984  goto discard_it;
1985 
1986  /* An explanation is required here, I think.
1987  * Packet length and doff are validated by header prediction,
1988  * provided the case of th->doff == 0 is eliminated.
1989  * So, we defer the checks. */
1990  if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1991  goto bad_packet;
1992 
1993  th = tcp_hdr(skb);
1994  iph = ip_hdr(skb);
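 	/* Note on the arithmetic below: end_seq counts the SYN and FIN flags
 	 * as one unit of sequence space each, so for example a bare SYN with
 	 * no payload ends up with end_seq == seq + 1.
 	 */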
1995  TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1996  TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1997  skb->len - th->doff * 4);
1998  TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1999  TCP_SKB_CB(skb)->when = 0;
2000  TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2001  TCP_SKB_CB(skb)->sacked = 0;
2002 
2003  sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
2004  if (!sk)
2005  goto no_tcp_socket;
2006 
2007 process:
2008  if (sk->sk_state == TCP_TIME_WAIT)
2009  goto do_time_wait;
2010 
2011  if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2012  NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
2013  goto discard_and_relse;
2014  }
2015 
2016  if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2017  goto discard_and_relse;
2018  nf_reset(skb);
2019 
2020  if (sk_filter(sk, skb))
2021  goto discard_and_relse;
2022 
2023  skb->dev = NULL;
2024 
2025  bh_lock_sock_nested(sk);
2026  ret = 0;
2027  if (!sock_owned_by_user(sk)) {
2028 #ifdef CONFIG_NET_DMA
2029  struct tcp_sock *tp = tcp_sk(sk);
2030  if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
2031  tp->ucopy.dma_chan = net_dma_find_channel();
2032  if (tp->ucopy.dma_chan)
2033  ret = tcp_v4_do_rcv(sk, skb);
2034  else
2035 #endif
2036  {
2037  if (!tcp_prequeue(sk, skb))
2038  ret = tcp_v4_do_rcv(sk, skb);
2039  }
2040  } else if (unlikely(sk_add_backlog(sk, skb,
2041  sk->sk_rcvbuf + sk->sk_sndbuf))) {
2042  bh_unlock_sock(sk);
2043  NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
2044  goto discard_and_relse;
2045  }
2046  bh_unlock_sock(sk);
2047 
2048  sock_put(sk);
2049 
2050  return ret;
2051 
2052 no_tcp_socket:
2053  if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2054  goto discard_it;
2055 
2056  if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
2057 bad_packet:
2058  TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
2059  } else {
2060  tcp_v4_send_reset(NULL, skb);
2061  }
2062 
2063 discard_it:
2064  /* Discard frame. */
2065  kfree_skb(skb);
2066  return 0;
2067 
2068 discard_and_relse:
2069  sock_put(sk);
2070  goto discard_it;
2071 
2072 do_time_wait:
2073  if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2074  inet_twsk_put(inet_twsk(sk));
2075  goto discard_it;
2076  }
2077 
2078  if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
2079  TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
2080  inet_twsk_put(inet_twsk(sk));
2081  goto discard_it;
2082  }
2083  switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2084  case TCP_TW_SYN: {
2085  struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2086  &tcp_hashinfo,
2087  iph->daddr, th->dest,
2088  inet_iif(skb));
2089  if (sk2) {
2090  inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
2091  inet_twsk_put(inet_twsk(sk));
2092  sk = sk2;
2093  goto process;
2094  }
2095  /* Fall through to ACK */
2096  }
2097  case TCP_TW_ACK:
2098  tcp_v4_timewait_ack(sk, skb);
2099  break;
2100  case TCP_TW_RST:
2101  goto no_tcp_socket;
2102  case TCP_TW_SUCCESS:;
2103  }
2104  goto discard_it;
2105 }
2106 
2107 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2108  .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2109  .twsk_unique = tcp_twsk_unique,
2110  .twsk_destructor= tcp_twsk_destructor,
2111 };
2112 
2113 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2114 {
2115  struct dst_entry *dst = skb_dst(skb);
2116 
2117  dst_hold(dst);
2118  sk->sk_rx_dst = dst;
2119  inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2120 }
2121 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2122 
2123 const struct inet_connection_sock_af_ops ipv4_specific = {
2124  .queue_xmit = ip_queue_xmit,
2125  .send_check = tcp_v4_send_check,
2126  .rebuild_header = inet_sk_rebuild_header,
2127  .sk_rx_dst_set = inet_sk_rx_dst_set,
2128  .conn_request = tcp_v4_conn_request,
2129  .syn_recv_sock = tcp_v4_syn_recv_sock,
2130  .net_header_len = sizeof(struct iphdr),
2131  .setsockopt = ip_setsockopt,
2132  .getsockopt = ip_getsockopt,
2133  .addr2sockaddr = inet_csk_addr2sockaddr,
2134  .sockaddr_len = sizeof(struct sockaddr_in),
2135  .bind_conflict = inet_csk_bind_conflict,
2136 #ifdef CONFIG_COMPAT
2137  .compat_setsockopt = compat_ip_setsockopt,
2138  .compat_getsockopt = compat_ip_getsockopt,
2139 #endif
2140 };
2141 EXPORT_SYMBOL(ipv4_specific);
2142 
2143 #ifdef CONFIG_TCP_MD5SIG
2144 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2145  .md5_lookup = tcp_v4_md5_lookup,
2146  .calc_md5_hash = tcp_v4_md5_hash_skb,
2147  .md5_parse = tcp_v4_parse_md5_keys,
2148 };
2149 #endif
2150 
2151 /* NOTE: A lot of things are set to zero explicitly by the call to
2152  * sk_alloc(), so they need not be done here.
2153  */
2154 static int tcp_v4_init_sock(struct sock *sk)
2155 {
2156  struct inet_connection_sock *icsk = inet_csk(sk);
2157 
2158  tcp_init_sock(sk);
2159 
2160  icsk->icsk_af_ops = &ipv4_specific;
2161 
2162 #ifdef CONFIG_TCP_MD5SIG
2163  tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2164 #endif
2165 
2166  return 0;
2167 }
2168 
2169 void tcp_v4_destroy_sock(struct sock *sk)
2170 {
2171  struct tcp_sock *tp = tcp_sk(sk);
2172 
2173  tcp_clear_xmit_timers(sk);
2174 
2175  tcp_cleanup_congestion_control(sk);
2176 
2177  /* Clean up the write buffer. */
2178  tcp_write_queue_purge(sk);
2179 
2180  /* Cleans up our, hopefully empty, out_of_order_queue. */
2181  __skb_queue_purge(&tp->out_of_order_queue);
2182 
2183 #ifdef CONFIG_TCP_MD5SIG
2184  /* Clean up the MD5 key list, if any */
2185  if (tp->md5sig_info) {
2186  tcp_clear_md5_list(sk);
2187  kfree_rcu(tp->md5sig_info, rcu);
2188  tp->md5sig_info = NULL;
2189  }
2190 #endif
2191 
2192 #ifdef CONFIG_NET_DMA
2193  /* Cleans up our sk_async_wait_queue */
2194  __skb_queue_purge(&sk->sk_async_wait_queue);
2195 #endif
2196 
2197  /* Clean up the prequeue; it should really be empty already. */
2198  __skb_queue_purge(&tp->ucopy.prequeue);
2199 
2200  /* Clean up a referenced TCP bind bucket. */
2201  if (inet_csk(sk)->icsk_bind_hash)
2202  inet_put_port(sk);
2203 
2204  /* TCP Cookie Transactions */
2205  if (tp->cookie_values != NULL) {
2206  kref_put(&tp->cookie_values->kref,
2207  tcp_cookie_values_release);
2208  tp->cookie_values = NULL;
2209  }
2210  BUG_ON(tp->fastopen_rsk != NULL);
2211 
2212  /* If socket is aborted during connect operation */
2213  tcp_free_fastopen_req(tp);
2214 
2215  sk_sockets_allocated_dec(sk);
2216  sock_release_memcg(sk);
2217 }
2218 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2219 
2220 #ifdef CONFIG_PROC_FS
2221 /* Proc filesystem TCP sock list dumping. */
2222 
2223 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
2224 {
2225  return hlist_nulls_empty(head) ? NULL :
2226  list_entry(head->first, struct inet_timewait_sock, tw_node);
2227 }
2228 
2229 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
2230 {
2231  return !is_a_nulls(tw->tw_node.next) ?
2232  hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2233 }
2234 
2235 /*
2236  * Get the next listener socket following cur. If cur is NULL, get the first socket
2237  * starting from bucket given in st->bucket; when st->bucket is zero the
2238  * very first socket in the hash table is returned.
2239  */
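/*
 * Overview of the /proc iteration below: it is driven by st->state.
 * TCP_SEQ_STATE_LISTENING walks the listening hash buckets,
 * TCP_SEQ_STATE_OPENREQ walks the SYN (open-request) table of the listener
 * found there, and TCP_SEQ_STATE_ESTABLISHED / TCP_SEQ_STATE_TIME_WAIT walk
 * the established hash, chain first and then twchain, bucket by bucket.
 */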
2240 static void *listening_get_next(struct seq_file *seq, void *cur)
2241 {
2242  struct inet_connection_sock *icsk;
2243  struct hlist_nulls_node *node;
2244  struct sock *sk = cur;
2245  struct inet_listen_hashbucket *ilb;
2246  struct tcp_iter_state *st = seq->private;
2247  struct net *net = seq_file_net(seq);
2248 
2249  if (!sk) {
2250  ilb = &tcp_hashinfo.listening_hash[st->bucket];
2251  spin_lock_bh(&ilb->lock);
2252  sk = sk_nulls_head(&ilb->head);
2253  st->offset = 0;
2254  goto get_sk;
2255  }
2256  ilb = &tcp_hashinfo.listening_hash[st->bucket];
2257  ++st->num;
2258  ++st->offset;
2259 
2260  if (st->state == TCP_SEQ_STATE_OPENREQ) {
2261  struct request_sock *req = cur;
2262 
2263  icsk = inet_csk(st->syn_wait_sk);
2264  req = req->dl_next;
2265  while (1) {
2266  while (req) {
2267  if (req->rsk_ops->family == st->family) {
2268  cur = req;
2269  goto out;
2270  }
2271  req = req->dl_next;
2272  }
2273  if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2274  break;
2275 get_req:
2276  req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2277  }
2278  sk = sk_nulls_next(st->syn_wait_sk);
2279  st->state = TCP_SEQ_STATE_LISTENING;
2280  read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2281  } else {
2282  icsk = inet_csk(sk);
2283  read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2284  if (reqsk_queue_len(&icsk->icsk_accept_queue))
2285  goto start_req;
2286  read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2287  sk = sk_nulls_next(sk);
2288  }
2289 get_sk:
2290  sk_nulls_for_each_from(sk, node) {
2291  if (!net_eq(sock_net(sk), net))
2292  continue;
2293  if (sk->sk_family == st->family) {
2294  cur = sk;
2295  goto out;
2296  }
2297  icsk = inet_csk(sk);
2298  read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2299  if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2300 start_req:
2301  st->uid = sock_i_uid(sk);
2302  st->syn_wait_sk = sk;
2303  st->state = TCP_SEQ_STATE_OPENREQ;
2304  st->sbucket = 0;
2305  goto get_req;
2306  }
2307  read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2308  }
2309  spin_unlock_bh(&ilb->lock);
2310  st->offset = 0;
2311  if (++st->bucket < INET_LHTABLE_SIZE) {
2312  ilb = &tcp_hashinfo.listening_hash[st->bucket];
2313  spin_lock_bh(&ilb->lock);
2314  sk = sk_nulls_head(&ilb->head);
2315  goto get_sk;
2316  }
2317  cur = NULL;
2318 out:
2319  return cur;
2320 }
2321 
2322 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2323 {
2324  struct tcp_iter_state *st = seq->private;
2325  void *rc;
2326 
2327  st->bucket = 0;
2328  st->offset = 0;
2329  rc = listening_get_next(seq, NULL);
2330 
2331  while (rc && *pos) {
2332  rc = listening_get_next(seq, rc);
2333  --*pos;
2334  }
2335  return rc;
2336 }
2337 
2338 static inline bool empty_bucket(struct tcp_iter_state *st)
2339 {
2340  return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2341  hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2342 }
2343 
2344 /*
2345  * Get first established socket starting from bucket given in st->bucket.
2346  * If st->bucket is zero, the very first socket in the hash is returned.
2347  */
2348 static void *established_get_first(struct seq_file *seq)
2349 {
2350  struct tcp_iter_state *st = seq->private;
2351  struct net *net = seq_file_net(seq);
2352  void *rc = NULL;
2353 
2354  st->offset = 0;
2355  for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2356  struct sock *sk;
2357  struct hlist_nulls_node *node;
2358  struct inet_timewait_sock *tw;
2359  spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2360 
2361  /* Lockless fast path for the common case of empty buckets */
2362  if (empty_bucket(st))
2363  continue;
2364 
2365  spin_lock_bh(lock);
2366  sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2367  if (sk->sk_family != st->family ||
2368  !net_eq(sock_net(sk), net)) {
2369  continue;
2370  }
2371  rc = sk;
2372  goto out;
2373  }
2374  st->state = TCP_SEQ_STATE_TIME_WAIT;
2375  inet_twsk_for_each(tw, node,
2376  &tcp_hashinfo.ehash[st->bucket].twchain) {
2377  if (tw->tw_family != st->family ||
2378  !net_eq(twsk_net(tw), net)) {
2379  continue;
2380  }
2381  rc = tw;
2382  goto out;
2383  }
2384  spin_unlock_bh(lock);
2385  st->state = TCP_SEQ_STATE_ESTABLISHED;
2386  }
2387 out:
2388  return rc;
2389 }
2390 
2391 static void *established_get_next(struct seq_file *seq, void *cur)
2392 {
2393  struct sock *sk = cur;
2394  struct inet_timewait_sock *tw;
2395  struct hlist_nulls_node *node;
2396  struct tcp_iter_state *st = seq->private;
2397  struct net *net = seq_file_net(seq);
2398 
2399  ++st->num;
2400  ++st->offset;
2401 
2402  if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2403  tw = cur;
2404  tw = tw_next(tw);
2405 get_tw:
2406  while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2407  tw = tw_next(tw);
2408  }
2409  if (tw) {
2410  cur = tw;
2411  goto out;
2412  }
2413  spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2414  st->state = TCP_SEQ_STATE_ESTABLISHED;
2415 
2416  /* Look for the next non-empty bucket */
2417  st->offset = 0;
2418  while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2419  empty_bucket(st))
2420  ;
2421  if (st->bucket > tcp_hashinfo.ehash_mask)
2422  return NULL;
2423 
2424  spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2425  sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2426  } else
2427  sk = sk_nulls_next(sk);
2428 
2429  sk_nulls_for_each_from(sk, node) {
2430  if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2431  goto found;
2432  }
2433 
2434  st->state = TCP_SEQ_STATE_TIME_WAIT;
2435  tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2436  goto get_tw;
2437 found:
2438  cur = sk;
2439 out:
2440  return cur;
2441 }
2442 
2443 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2444 {
2445  struct tcp_iter_state *st = seq->private;
2446  void *rc;
2447 
2448  st->bucket = 0;
2449  rc = established_get_first(seq);
2450 
2451  while (rc && pos) {
2452  rc = established_get_next(seq, rc);
2453  --pos;
2454  }
2455  return rc;
2456 }
2457 
2458 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2459 {
2460  void *rc;
2461  struct tcp_iter_state *st = seq->private;
2462 
2463  st->state = TCP_SEQ_STATE_LISTENING;
2464  rc = listening_get_idx(seq, &pos);
2465 
2466  if (!rc) {
2467  st->state = TCP_SEQ_STATE_ESTABLISHED;
2468  rc = established_get_idx(seq, pos);
2469  }
2470 
2471  return rc;
2472 }
2473 
2474 static void *tcp_seek_last_pos(struct seq_file *seq)
2475 {
2476  struct tcp_iter_state *st = seq->private;
2477  int offset = st->offset;
2478  int orig_num = st->num;
2479  void *rc = NULL;
2480 
2481  switch (st->state) {
2482  case TCP_SEQ_STATE_OPENREQ:
2483  case TCP_SEQ_STATE_LISTENING:
2484  if (st->bucket >= INET_LHTABLE_SIZE)
2485  break;
2486  st->state = TCP_SEQ_STATE_LISTENING;
2487  rc = listening_get_next(seq, NULL);
2488  while (offset-- && rc)
2489  rc = listening_get_next(seq, rc);
2490  if (rc)
2491  break;
2492  st->bucket = 0;
2493  /* Fallthrough */
2494  case TCP_SEQ_STATE_ESTABLISHED:
2495  case TCP_SEQ_STATE_TIME_WAIT:
2496  st->state = TCP_SEQ_STATE_ESTABLISHED;
2497  if (st->bucket > tcp_hashinfo.ehash_mask)
2498  break;
2499  rc = established_get_first(seq);
2500  while (offset-- && rc)
2501  rc = established_get_next(seq, rc);
2502  }
2503 
2504  st->num = orig_num;
2505 
2506  return rc;
2507 }
2508 
2509 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2510 {
2511  struct tcp_iter_state *st = seq->private;
2512  void *rc;
2513 
2514  if (*pos && *pos == st->last_pos) {
2515  rc = tcp_seek_last_pos(seq);
2516  if (rc)
2517  goto out;
2518  }
2519 
2520  st->state = TCP_SEQ_STATE_LISTENING;
2521  st->num = 0;
2522  st->bucket = 0;
2523  st->offset = 0;
2524  rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2525 
2526 out:
2527  st->last_pos = *pos;
2528  return rc;
2529 }
2530 
2531 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2532 {
2533  struct tcp_iter_state *st = seq->private;
2534  void *rc = NULL;
2535 
2536  if (v == SEQ_START_TOKEN) {
2537  rc = tcp_get_idx(seq, 0);
2538  goto out;
2539  }
2540 
2541  switch (st->state) {
2542  case TCP_SEQ_STATE_OPENREQ:
2543  case TCP_SEQ_STATE_LISTENING:
2544  rc = listening_get_next(seq, v);
2545  if (!rc) {
2546  st->state = TCP_SEQ_STATE_ESTABLISHED;
2547  st->bucket = 0;
2548  st->offset = 0;
2549  rc = established_get_first(seq);
2550  }
2551  break;
2552  case TCP_SEQ_STATE_ESTABLISHED:
2553  case TCP_SEQ_STATE_TIME_WAIT:
2554  rc = established_get_next(seq, v);
2555  break;
2556  }
2557 out:
2558  ++*pos;
2559  st->last_pos = *pos;
2560  return rc;
2561 }
2562 
2563 static void tcp_seq_stop(struct seq_file *seq, void *v)
2564 {
2565  struct tcp_iter_state *st = seq->private;
2566 
2567  switch (st->state) {
2568  case TCP_SEQ_STATE_OPENREQ:
2569  if (v) {
2570  struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2571  read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2572  }
2573  case TCP_SEQ_STATE_LISTENING:
2574  if (v != SEQ_START_TOKEN)
2575  spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2576  break;
2577  case TCP_SEQ_STATE_TIME_WAIT:
2578  case TCP_SEQ_STATE_ESTABLISHED:
2579  if (v)
2580  spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2581  break;
2582  }
2583 }
2584 
2585 int tcp_seq_open(struct inode *inode, struct file *file)
2586 {
2587  struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2588  struct tcp_iter_state *s;
2589  int err;
2590 
2591  err = seq_open_net(inode, file, &afinfo->seq_ops,
2592  sizeof(struct tcp_iter_state));
2593  if (err < 0)
2594  return err;
2595 
2596  s = ((struct seq_file *)file->private_data)->private;
2597  s->family = afinfo->family;
2598  s->last_pos = 0;
2599  return 0;
2600 }
2601 EXPORT_SYMBOL(tcp_seq_open);
2602 
2603 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2604 {
2605  int rc = 0;
2606  struct proc_dir_entry *p;
2607 
2608  afinfo->seq_ops.start = tcp_seq_start;
2609  afinfo->seq_ops.next = tcp_seq_next;
2610  afinfo->seq_ops.stop = tcp_seq_stop;
2611 
2612  p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2613  afinfo->seq_fops, afinfo);
2614  if (!p)
2615  rc = -ENOMEM;
2616  return rc;
2617 }
2618 EXPORT_SYMBOL(tcp_proc_register);
2619 
2620 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2621 {
2622  proc_net_remove(net, afinfo->name);
2623 }
2624 EXPORT_SYMBOL(tcp_proc_unregister);
2625 
2626 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2627  struct seq_file *f, int i, kuid_t uid, int *len)
2628 {
2629  const struct inet_request_sock *ireq = inet_rsk(req);
2630  long delta = req->expires - jiffies;
2631 
2632  seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2633  " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2634  i,
2635  ireq->loc_addr,
2636  ntohs(inet_sk(sk)->inet_sport),
2637  ireq->rmt_addr,
2638  ntohs(ireq->rmt_port),
2639  TCP_SYN_RECV,
2640  0, 0, /* could print option size, but that is af dependent. */
2641  1, /* timers active (only the expire timer) */
2642  jiffies_delta_to_clock_t(delta),
2643  req->retrans,
2644  from_kuid_munged(seq_user_ns(f), uid),
2645  0, /* non standard timer */
2646  0, /* open_requests have no inode */
2647  atomic_read(&sk->sk_refcnt),
2648  req,
2649  len);
2650 }
2651 
2652 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2653 {
2654  int timer_active;
2655  unsigned long timer_expires;
2656  const struct tcp_sock *tp = tcp_sk(sk);
2657  const struct inet_connection_sock *icsk = inet_csk(sk);
2658  const struct inet_sock *inet = inet_sk(sk);
2659  struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
2660  __be32 dest = inet->inet_daddr;
2661  __be32 src = inet->inet_rcv_saddr;
2662  __u16 destp = ntohs(inet->inet_dport);
2663  __u16 srcp = ntohs(inet->inet_sport);
2664  int rx_queue;
2665 
2666  if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2667  timer_active = 1;
2668  timer_expires = icsk->icsk_timeout;
2669  } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2670  timer_active = 4;
2671  timer_expires = icsk->icsk_timeout;
2672  } else if (timer_pending(&sk->sk_timer)) {
2673  timer_active = 2;
2674  timer_expires = sk->sk_timer.expires;
2675  } else {
2676  timer_active = 0;
2677  timer_expires = jiffies;
2678  }
2679 
2680  if (sk->sk_state == TCP_LISTEN)
2681  rx_queue = sk->sk_ack_backlog;
2682  else
2683  /*
2684  * because we don't lock the socket, we might find a transient negative value
2685  */
2686  rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2687 
2688  seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2689  "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2690  i, src, srcp, dest, destp, sk->sk_state,
2691  tp->write_seq - tp->snd_una,
2692  rx_queue,
2693  timer_active,
2694  jiffies_delta_to_clock_t(timer_expires - jiffies),
2695  icsk->icsk_retransmits,
2696  from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2697  icsk->icsk_probes_out,
2698  sock_i_ino(sk),
2699  atomic_read(&sk->sk_refcnt), sk,
2700  jiffies_to_clock_t(icsk->icsk_rto),
2701  jiffies_to_clock_t(icsk->icsk_ack.ato),
2702  (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2703  tp->snd_cwnd,
2704  sk->sk_state == TCP_LISTEN ?
2705  (fastopenq ? fastopenq->max_qlen : 0) :
2706  (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
2707  len);
2708 }
2709 
2710 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2711  struct seq_file *f, int i, int *len)
2712 {
2713  __be32 dest, src;
2714  __u16 destp, srcp;
2715  long delta = tw->tw_ttd - jiffies;
2716 
2717  dest = tw->tw_daddr;
2718  src = tw->tw_rcv_saddr;
2719  destp = ntohs(tw->tw_dport);
2720  srcp = ntohs(tw->tw_sport);
2721 
2722  seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2723  " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2724  i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2725  3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2726  atomic_read(&tw->tw_refcnt), tw, len);
2727 }
2728 
2729 #define TMPSZ 150
2730 
2731 static int tcp4_seq_show(struct seq_file *seq, void *v)
2732 {
2733  struct tcp_iter_state *st;
2734  int len;
2735 
2736  if (v == SEQ_START_TOKEN) {
2737  seq_printf(seq, "%-*s\n", TMPSZ - 1,
2738  " sl local_address rem_address st tx_queue "
2739  "rx_queue tr tm->when retrnsmt uid timeout "
2740  "inode");
2741  goto out;
2742  }
2743  st = seq->private;
2744 
2745  switch (st->state) {
2746  case TCP_SEQ_STATE_LISTENING:
2747  case TCP_SEQ_STATE_ESTABLISHED:
2748  get_tcp4_sock(v, seq, st->num, &len);
2749  break;
2750  case TCP_SEQ_STATE_OPENREQ:
2751  get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2752  break;
2753  case TCP_SEQ_STATE_TIME_WAIT:
2754  get_timewait4_sock(v, seq, st->num, &len);
2755  break;
2756  }
2757  seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2758 out:
2759  return 0;
2760 }
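/*
 * Illustrative userspace sketch (not part of the kernel): a minimal reader
 * for the /proc/net/tcp format emitted by get_tcp4_sock() and friends above.
 * Addresses and ports are printed in hex; the 32-bit address is the raw
 * __be32 formatted as a native integer, so scanning it back into a 32-bit
 * unsigned on the same machine recreates the network-byte-order bytes that
 * inet_ntop() expects.  Assumes a Linux host with /proc mounted; all names
 * below are invented for illustration.
 */
#include <stdio.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/net/tcp", "r");

	if (!f)
		return 1;

	if (!fgets(line, sizeof(line), f)) {	/* skip the header row */
		fclose(f);
		return 1;
	}

	while (fgets(line, sizeof(line), f)) {
		unsigned int slot, lport, rport, state;
		unsigned int laddr, raddr;	/* 32 bits on Linux targets */
		char lbuf[INET_ADDRSTRLEN], rbuf[INET_ADDRSTRLEN];

		if (sscanf(line, " %u: %x:%x %x:%x %x",
			   &slot, &laddr, &lport, &raddr, &rport, &state) != 6)
			continue;

		inet_ntop(AF_INET, &laddr, lbuf, sizeof(lbuf));
		inet_ntop(AF_INET, &raddr, rbuf, sizeof(rbuf));
		printf("%s:%u -> %s:%u state %#x\n",
		       lbuf, lport, rbuf, rport, state);
	}

	fclose(f);
	return 0;
}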
2761 
2762 static const struct file_operations tcp_afinfo_seq_fops = {
2763  .owner = THIS_MODULE,
2764  .open = tcp_seq_open,
2765  .read = seq_read,
2766  .llseek = seq_lseek,
2767  .release = seq_release_net
2768 };
2769 
2770 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2771  .name = "tcp",
2772  .family = AF_INET,
2773  .seq_fops = &tcp_afinfo_seq_fops,
2774  .seq_ops = {
2775  .show = tcp4_seq_show,
2776  },
2777 };
2778 
2779 static int __net_init tcp4_proc_init_net(struct net *net)
2780 {
2781  return tcp_proc_register(net, &tcp4_seq_afinfo);
2782 }
2783 
2784 static void __net_exit tcp4_proc_exit_net(struct net *net)
2785 {
2786  tcp_proc_unregister(net, &tcp4_seq_afinfo);
2787 }
2788 
2789 static struct pernet_operations tcp4_net_ops = {
2790  .init = tcp4_proc_init_net,
2791  .exit = tcp4_proc_exit_net,
2792 };
2793 
2794 int __init tcp4_proc_init(void)
2795 {
2796  return register_pernet_subsys(&tcp4_net_ops);
2797 }
2798 
2799 void tcp4_proc_exit(void)
2800 {
2801  unregister_pernet_subsys(&tcp4_net_ops);
2802 }
2803 #endif /* CONFIG_PROC_FS */
2804 
2805 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2806 {
2807  const struct iphdr *iph = skb_gro_network_header(skb);
2808  __wsum wsum;
2809  __sum16 sum;
2810 
2811  switch (skb->ip_summed) {
2812  case CHECKSUM_COMPLETE:
2813  if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2814  skb->csum)) {
2815  skb->ip_summed = CHECKSUM_UNNECESSARY;
2816  break;
2817  }
2818 flush:
2819  NAPI_GRO_CB(skb)->flush = 1;
2820  return NULL;
2821 
2822  case CHECKSUM_NONE:
2823  wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
2824  skb_gro_len(skb), IPPROTO_TCP, 0);
2825  sum = csum_fold(skb_checksum(skb,
2826  skb_gro_offset(skb),
2827  skb_gro_len(skb),
2828  wsum));
2829  if (sum)
2830  goto flush;
2831 
2832  skb->ip_summed = CHECKSUM_UNNECESSARY;
2833  break;
2834  }
2835 
2836  return tcp_gro_receive(head, skb);
2837 }
2838 
2839 int tcp4_gro_complete(struct sk_buff *skb)
2840 {
2841  const struct iphdr *iph = ip_hdr(skb);
2842  struct tcphdr *th = tcp_hdr(skb);
2843 
2844  th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2845  iph->saddr, iph->daddr, 0);
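 	/* th->check is seeded with only the complemented pseudo-header sum
 	 * (note the 0 passed as the payload checksum): tcp_gro_complete() in
 	 * tcp.c then marks the skb CHECKSUM_PARTIAL, so the per-segment
 	 * checksums can be finished later if the aggregate is re-segmented.
 	 */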
2846  skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2847 
2848  return tcp_gro_complete(skb);
2849 }
2850 
2851 struct proto tcp_prot = {
2852  .name = "TCP",
2853  .owner = THIS_MODULE,
2854  .close = tcp_close,
2855  .connect = tcp_v4_connect,
2856  .disconnect = tcp_disconnect,
2857  .accept = inet_csk_accept,
2858  .ioctl = tcp_ioctl,
2859  .init = tcp_v4_init_sock,
2860  .destroy = tcp_v4_destroy_sock,
2861  .shutdown = tcp_shutdown,
2862  .setsockopt = tcp_setsockopt,
2863  .getsockopt = tcp_getsockopt,
2864  .recvmsg = tcp_recvmsg,
2865  .sendmsg = tcp_sendmsg,
2866  .sendpage = tcp_sendpage,
2867  .backlog_rcv = tcp_v4_do_rcv,
2868  .release_cb = tcp_release_cb,
2869  .mtu_reduced = tcp_v4_mtu_reduced,
2870  .hash = inet_hash,
2871  .unhash = inet_unhash,
2872  .get_port = inet_csk_get_port,
2873  .enter_memory_pressure = tcp_enter_memory_pressure,
2874  .sockets_allocated = &tcp_sockets_allocated,
2875  .orphan_count = &tcp_orphan_count,
2876  .memory_allocated = &tcp_memory_allocated,
2877  .memory_pressure = &tcp_memory_pressure,
2878  .sysctl_wmem = sysctl_tcp_wmem,
2879  .sysctl_rmem = sysctl_tcp_rmem,
2880  .max_header = MAX_TCP_HEADER,
2881  .obj_size = sizeof(struct tcp_sock),
2882  .slab_flags = SLAB_DESTROY_BY_RCU,
2883  .twsk_prot = &tcp_timewait_sock_ops,
2884  .rsk_prot = &tcp_request_sock_ops,
2885  .h.hashinfo = &tcp_hashinfo,
2886  .no_autobind = true,
2887 #ifdef CONFIG_COMPAT
2888  .compat_setsockopt = compat_tcp_setsockopt,
2889  .compat_getsockopt = compat_tcp_getsockopt,
2890 #endif
2891 #ifdef CONFIG_MEMCG_KMEM
2892  .init_cgroup = tcp_init_cgroup,
2893  .destroy_cgroup = tcp_destroy_cgroup,
2894  .proto_cgroup = tcp_proto_cgroup,
2895 #endif
2896 };
2897 EXPORT_SYMBOL(tcp_prot);
2898 
2899 static int __net_init tcp_sk_init(struct net *net)
2900 {
2901  return 0;
2902 }
2903 
2904 static void __net_exit tcp_sk_exit(struct net *net)
2905 {
2906 }
2907 
2908 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2909 {
2910  inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2911 }
2912 
2913 static struct pernet_operations __net_initdata tcp_sk_ops = {
2914  .init = tcp_sk_init,
2915  .exit = tcp_sk_exit,
2916  .exit_batch = tcp_sk_exit_batch,
2917 };
2918 
2919 void __init tcp_v4_init(void)
2920 {
2921  inet_hashinfo_init(&tcp_hashinfo);
2922  if (register_pernet_subsys(&tcp_sk_ops))
2923  panic("Failed to create the TCP control socket.\n");
2924 }