Linux Kernel  3.7.1
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
nf_conntrack_proto_tcp.c
Go to the documentation of this file.
1 /* (C) 1999-2001 Paul `Rusty' Russell
2  * (C) 2002-2004 Netfilter Core Team <[email protected]>
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License version 2 as
6  * published by the Free Software Foundation.
7  */
8 
9 #include <linux/types.h>
10 #include <linux/timer.h>
11 #include <linux/module.h>
12 #include <linux/in.h>
13 #include <linux/tcp.h>
14 #include <linux/spinlock.h>
15 #include <linux/skbuff.h>
16 #include <linux/ipv6.h>
17 #include <net/ip6_checksum.h>
18 #include <asm/unaligned.h>
19 
20 #include <net/tcp.h>
21 
22 #include <linux/netfilter.h>
23 #include <linux/netfilter_ipv4.h>
24 #include <linux/netfilter_ipv6.h>
28 #include <net/netfilter/nf_log.h>
31 
32 /* "Be conservative in what you do,
33  be liberal in what you accept from others."
34  If it's non-zero, we mark only out of window RST segments as INVALID. */
35 static int nf_ct_tcp_be_liberal __read_mostly = 0;
36 
37 /* If it is set to zero, we disable picking up already established
38  connections. */
39 static int nf_ct_tcp_loose __read_mostly = 1;
40 
41 /* Max number of the retransmitted packets without receiving an (acceptable)
42  ACK from the destination. If this number is reached, a shorter timer
43  will be started. */
44 static int nf_ct_tcp_max_retrans __read_mostly = 3;
45 
46  /* FIXME: Examine ipfilter's timeouts and conntrack transitions more
47  closely. They're more complex. --RR */
48 
/*
 * Printable names of the TCP conntrack states.  The table is indexed by
 * the numeric state value (see tcp_print_conntrack()).
 */
static const char *const tcp_conntrack_names[] = {
	/* 0 */ "NONE",
	/* 1 */ "SYN_SENT",
	/* 2 */ "SYN_RECV",
	/* 3 */ "ESTABLISHED",
	/* 4 */ "FIN_WAIT",
	/* 5 */ "CLOSE_WAIT",
	/* 6 */ "LAST_ACK",
	/* 7 */ "TIME_WAIT",
	/* 8 */ "CLOSE",
	/* 9 */ "SYN_SENT2",
};
61 
/*
 * Postfix time-unit helpers: each one expands to a multiplication, so a
 * literal can be written as e.g. "2 MINS", which becomes 2 * 60 * HZ.
 */
62 #define SECS * HZ
63 #define MINS * 60 SECS
64 #define HOURS * 60 MINS
65 #define DAYS * 24 HOURS
66 
/*
 * Timeout table with TCP_CONNTRACK_TIMEOUT_MAX slots.
 * NOTE(review): the listing numbers below jump 67 -> 77 and 79 -> 82, so
 * the initializer entries appear to be missing from this copy of the
 * file -- restore them from the upstream source before relying on it.
 */
67 static unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] __read_mostly = {
77 /* RFC1122 says the R2 limit should be at least 100 seconds.
78  Linux uses 15 packets as limit, which corresponds
79  to ~13-30min depending on RTO. */
82 };
83 
/*
 * Two-letter shorthands for the conntrack states; they keep the state
 * transition table readable.
 */
#define sNO	TCP_CONNTRACK_NONE
#define sSS	TCP_CONNTRACK_SYN_SENT
#define sSR	TCP_CONNTRACK_SYN_RECV
#define sES	TCP_CONNTRACK_ESTABLISHED
#define sFW	TCP_CONNTRACK_FIN_WAIT
#define sCW	TCP_CONNTRACK_CLOSE_WAIT
#define sLA	TCP_CONNTRACK_LAST_ACK
#define sTW	TCP_CONNTRACK_TIME_WAIT
#define sCL	TCP_CONNTRACK_CLOSE
#define sS2	TCP_CONNTRACK_SYN_SENT2
#define sIV	TCP_CONNTRACK_MAX	/* packet regarded as invalid */
#define sIG	TCP_CONNTRACK_IGNORE	/* packet to be ignored */
96 
/*
 * What TCP flags are set from RST/SYN/FIN/ACK.
 *
 * The value doubles as the second index of the state transition table, so
 * the enumerator order must match the row order used there:
 * syn, synack, fin, ack, rst, none.
 */
enum tcp_bit_set {
	TCP_SYN_SET,
	TCP_SYNACK_SET,
	TCP_FIN_SET,
	TCP_ACK_SET,
	TCP_RST_SET,
	TCP_NONE_SET,
};
106 
107 /*
108  * The TCP state transition table needs a few words...
109  *
110  * We are the man in the middle. All the packets go through us
111  * but might get lost in transit to the destination.
112  * It is assumed that the destinations can't receive segments
113  * we haven't seen.
114  *
115  * The checked segment is in window, but our windows are *not*
116  * equivalent with the ones of the sender/receiver. We always
117  * try to guess the state of the current sender.
118  *
119  * The meaning of the states are:
120  *
121  * NONE: initial state
122  * SYN_SENT: SYN-only packet seen
123  * SYN_SENT2: SYN-only packet seen from reply dir, simultaneous open
124  * SYN_RECV: SYN-ACK packet seen
125  * ESTABLISHED: ACK packet seen
126  * FIN_WAIT: FIN packet seen
127  * CLOSE_WAIT: ACK seen (after FIN)
128  * LAST_ACK: FIN seen (after FIN)
129  * TIME_WAIT: last ACK seen
130  * CLOSE: closed connection (RST)
131  *
132  * Packets marked as IGNORED (sIG):
133  * if they may be either invalid or valid
134  * and the receiver may send back a connection
135  * closing RST or a SYN/ACK.
136  *
137  * Packets marked as INVALID (sIV):
138  * if we regard them as truly invalid packets
139  */
/*
 * State transition table.  It is looked up as
 * tcp_conntracks[direction][flag class][current state] and yields the next
 * state, or one of the sIV (invalid) / sIG (ignore) markers.  The first
 * half covers packets in the ORIGINAL direction, the second half packets
 * in the REPLY direction; each half has one row per flag class
 * (syn, synack, fin, ack, rst, none).
 */
140 static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
141  {
142 /* ORIGINAL */
143 /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
144 /*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sS2 },
145 /*
146  * sNO -> sSS Initialize a new connection
147  * sSS -> sSS Retransmitted SYN
148  * sS2 -> sS2 Late retransmitted SYN
149  * sSR -> sIG
150  * sES -> sIG Error: SYNs in window outside the SYN_SENT state
151  * are errors. Receiver will reply with RST
152  * and close the connection.
153  * Or we are not in sync and hold a dead connection.
154  * sFW -> sIG
155  * sCW -> sIG
156  * sLA -> sIG
157  * sTW -> sSS Reopened connection (RFC 1122).
158  * sCL -> sSS
159  */
160 /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
161 /*synack*/ { sIV, sIV, sSR, sIV, sIV, sIV, sIV, sIV, sIV, sSR },
162 /*
163  * sNO -> sIV Too late and no reason to do anything
164  * sSS -> sIV Client can't send SYN and then SYN/ACK
165  * sS2 -> sSR SYN/ACK sent to SYN2 in simultaneous open
166  * sSR -> sSR Late retransmitted SYN/ACK in simultaneous open
167  * sES -> sIV Invalid SYN/ACK packets sent by the client
168  * sFW -> sIV
169  * sCW -> sIV
170  * sLA -> sIV
171  * sTW -> sIV
172  * sCL -> sIV
173  */
174 /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
175 /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
176 /*
177  * sNO -> sIV Too late and no reason to do anything...
178  * sSS -> sIV Client might not send FIN in this state:
179  * we enforce waiting for a SYN/ACK reply first.
180  * sS2 -> sIV
181  * sSR -> sFW Close started.
182  * sES -> sFW
183  * sFW -> sLA FIN seen in both directions, waiting for
184  * the last ACK.
185  * Might be a retransmitted FIN as well...
186  * sCW -> sLA
187  * sLA -> sLA Retransmitted FIN. Remain in the same state.
188  * sTW -> sTW
189  * sCL -> sCL
190  */
191 /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
192 /*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV },
193 /*
194  * sNO -> sES Assumed.
195  * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet.
196  * sS2 -> sIV
197  * sSR -> sES Established state is reached.
198  * sES -> sES :-)
199  * sFW -> sCW Normal close request answered by ACK.
200  * sCW -> sCW
201  * sLA -> sTW Last ACK detected.
202  * sTW -> sTW Retransmitted last ACK. Remain in the same state.
203  * sCL -> sCL
204  */
205 /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
206 /*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL },
207 /*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
208  },
209  {
210 /* REPLY */
211 /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
212 /*syn*/ { sIV, sS2, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sS2 },
213 /*
214  * sNO -> sIV Never reached.
215  * sSS -> sS2 Simultaneous open
216  * sS2 -> sS2 Retransmitted simultaneous SYN
217  * sSR -> sIV Invalid SYN packets sent by the server
218  * sES -> sIV
219  * sFW -> sIV
220  * sCW -> sIV
221  * sLA -> sIV
222  * sTW -> sIV Reopened connection, but server may not do it.
223  * sCL -> sIV
224  */
225 /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
226 /*synack*/ { sIV, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR },
227 /*
228  * sSS -> sSR Standard open.
229  * sS2 -> sSR Simultaneous open
230  * sSR -> sIG Retransmitted SYN/ACK, ignore it.
231  * sES -> sIG Late retransmitted SYN/ACK?
232  * sFW -> sIG Might be SYN/ACK answering ignored SYN
233  * sCW -> sIG
234  * sLA -> sIG
235  * sTW -> sIG
236  * sCL -> sIG
237  */
238 /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
239 /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
240 /*
241  * sSS -> sIV Server might not send FIN in this state.
242  * sS2 -> sIV
243  * sSR -> sFW Close started.
244  * sES -> sFW
245  * sFW -> sLA FIN seen in both directions.
246  * sCW -> sLA
247  * sLA -> sLA Retransmitted FIN.
248  * sTW -> sTW
249  * sCL -> sCL
250  */
251 /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
252 /*ack*/ { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIG },
253 /*
254  * sSS -> sIG Might be a half-open connection.
255  * sS2 -> sIG
256  * sSR -> sSR Might answer late resent SYN.
257  * sES -> sES :-)
258  * sFW -> sCW Normal close request answered by ACK.
259  * sCW -> sCW
260  * sLA -> sTW Last ACK detected.
261  * sTW -> sTW Retransmitted last ACK.
262  * sCL -> sCL
263  */
264 /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */
265 /*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL },
266 /*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
267  }
268 };
269 
270 static inline struct nf_tcp_net *tcp_pernet(struct net *net)
271 {
272  return &net->ct.nf_ct_proto.tcp;
273 }
274 
275 static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
276  struct nf_conntrack_tuple *tuple)
277 {
278  const struct tcphdr *hp;
279  struct tcphdr _hdr;
280 
281  /* Actually only need first 8 bytes. */
282  hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
283  if (hp == NULL)
284  return false;
285 
286  tuple->src.u.tcp.port = hp->source;
287  tuple->dst.u.tcp.port = hp->dest;
288 
289  return true;
290 }
291 
292 static bool tcp_invert_tuple(struct nf_conntrack_tuple *tuple,
293  const struct nf_conntrack_tuple *orig)
294 {
295  tuple->src.u.tcp.port = orig->dst.u.tcp.port;
296  tuple->dst.u.tcp.port = orig->src.u.tcp.port;
297  return true;
298 }
299 
300 /* Print out the per-protocol part of the tuple. */
301 static int tcp_print_tuple(struct seq_file *s,
302  const struct nf_conntrack_tuple *tuple)
303 {
304  return seq_printf(s, "sport=%hu dport=%hu ",
305  ntohs(tuple->src.u.tcp.port),
306  ntohs(tuple->dst.u.tcp.port));
307 }
308 
309 /* Print out the private part of the conntrack. */
310 static int tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
311 {
312  enum tcp_conntrack state;
313 
314  spin_lock_bh(&ct->lock);
315  state = ct->proto.tcp.state;
316  spin_unlock_bh(&ct->lock);
317 
318  return seq_printf(s, "%s ", tcp_conntrack_names[state]);
319 }
320 
321 static unsigned int get_conntrack_index(const struct tcphdr *tcph)
322 {
323  if (tcph->rst) return TCP_RST_SET;
324  else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET);
325  else if (tcph->fin) return TCP_FIN_SET;
326  else if (tcph->ack) return TCP_ACK_SET;
327  else return TCP_NONE_SET;
328 }
329 
330 /* TCP connection tracking based on 'Real Stateful TCP Packet Filtering
331  in IP Filter' by Guido van Rooij.
332 
333  http://www.sane.nl/events/sane2000/papers.html
334  http://www.darkart.com/mirrors/www.obfuscation.org/ipf/
335 
336  The boundaries and the conditions are changed according to RFC793:
337  the packet must intersect the window (i.e. segments may be
338  after the right or before the left edge) and thus receivers may ACK
339  segments after the right edge of the window.
340 
341  td_maxend = max(sack + max(win,1)) seen in reply packets
342  td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets
343  td_maxwin += seq + len - sender.td_maxend
344  if seq + len > sender.td_maxend
345  td_end = max(seq + len) seen in sent packets
346 
347  I. Upper bound for valid data: seq <= sender.td_maxend
348  II. Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin
349  III. Upper bound for valid (s)ack: sack <= receiver.td_end
350  IV. Lower bound for valid (s)ack: sack >= receiver.td_end - MAXACKWINDOW
351 
352  where sack is the highest right edge of sack block found in the packet
353  or ack in the case of packet without SACK option.
354 
355  The upper bound limit for a valid (s)ack is not ignored -
356  we don't have to deal with fragments.
357 */
358 
359 static inline __u32 segment_seq_plus_len(__u32 seq,
360  size_t len,
361  unsigned int dataoff,
362  const struct tcphdr *tcph)
363 {
364  /* XXX Should I use payload length field in IP/IPv6 header ?
365  * - YK */
366  return (seq + len - dataoff - tcph->doff*4
367  + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0));
368 }
369 
/* Fixme: what about big packets? */
#define MAXACKWINCONST			66000
/* The larger of (sender)->td_maxwin and MAXACKWINCONST. */
#define MAXACKWINDOW(sender)						\
	((sender)->td_maxwin <= MAXACKWINCONST ? MAXACKWINCONST	\
					       : (sender)->td_maxwin)
375 
376 /*
377  * Simplified tcp_parse_options routine from tcp_input.c
378  */
379 static void tcp_options(const struct sk_buff *skb,
380  unsigned int dataoff,
381  const struct tcphdr *tcph,
382  struct ip_ct_tcp_state *state)
383 {
384  unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
385  const unsigned char *ptr;
386  int length = (tcph->doff*4) - sizeof(struct tcphdr);
387 
388  if (!length)
389  return;
390 
391  ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
392  length, buff);
393  BUG_ON(ptr == NULL);
394 
395  state->td_scale =
396  state->flags = 0;
397 
398  while (length > 0) {
399  int opcode=*ptr++;
400  int opsize;
401 
402  switch (opcode) {
403  case TCPOPT_EOL:
404  return;
405  case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
406  length--;
407  continue;
408  default:
409  opsize=*ptr++;
410  if (opsize < 2) /* "silly options" */
411  return;
412  if (opsize > length)
413  return; /* don't parse partial options */
414 
415  if (opcode == TCPOPT_SACK_PERM
416  && opsize == TCPOLEN_SACK_PERM)
418  else if (opcode == TCPOPT_WINDOW
419  && opsize == TCPOLEN_WINDOW) {
420  state->td_scale = *(u_int8_t *)ptr;
421 
422  if (state->td_scale > 14) {
423  /* See RFC1323 */
424  state->td_scale = 14;
425  }
426  state->flags |=
428  }
429  ptr += opsize - 2;
430  length -= opsize;
431  }
432  }
433 }
434 
435 static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
436  const struct tcphdr *tcph, __u32 *sack)
437 {
438  unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
439  const unsigned char *ptr;
440  int length = (tcph->doff*4) - sizeof(struct tcphdr);
441  __u32 tmp;
442 
443  if (!length)
444  return;
445 
446  ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
447  length, buff);
448  BUG_ON(ptr == NULL);
449 
450  /* Fast path for timestamp-only option */
451  if (length == TCPOLEN_TSTAMP_ALIGNED
452  && *(__be32 *)ptr == htonl((TCPOPT_NOP << 24)
453  | (TCPOPT_NOP << 16)
454  | (TCPOPT_TIMESTAMP << 8)
456  return;
457 
458  while (length > 0) {
459  int opcode = *ptr++;
460  int opsize, i;
461 
462  switch (opcode) {
463  case TCPOPT_EOL:
464  return;
465  case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
466  length--;
467  continue;
468  default:
469  opsize = *ptr++;
470  if (opsize < 2) /* "silly options" */
471  return;
472  if (opsize > length)
473  return; /* don't parse partial options */
474 
475  if (opcode == TCPOPT_SACK
476  && opsize >= (TCPOLEN_SACK_BASE
478  && !((opsize - TCPOLEN_SACK_BASE)
480  for (i = 0;
481  i < (opsize - TCPOLEN_SACK_BASE);
482  i += TCPOLEN_SACK_PERBLOCK) {
483  tmp = get_unaligned_be32((__be32 *)(ptr+i)+1);
484 
485  if (after(tmp, *sack))
486  *sack = tmp;
487  }
488  return;
489  }
490  ptr += opsize - 2;
491  length -= opsize;
492  }
493  }
494 }
495 
/*
 * NAT_OFFSET(ct, dir, seq) evaluates to nat_offset(ct, dir, seq) when
 * CONFIG_NF_NAT_NEEDED is defined and to the constant 0 otherwise.
 */
496 #ifdef CONFIG_NF_NAT_NEEDED
/*
 * Returns get_offset(ct, dir, seq) when the get_offset hook is non-NULL,
 * otherwise 0.
 */
497 static inline s16 nat_offset(const struct nf_conn *ct,
498  enum ip_conntrack_dir dir,
499  u32 seq)
500 {
/*
 * NOTE(review): listing line 501 is missing from this copy; the
 * declaration/initialisation of get_offset is expected here -- restore it
 * from the upstream file.
 */
502 
503  return get_offset != NULL ? get_offset(ct, dir, seq) : 0;
504 }
505 #define NAT_OFFSET(ct, dir, seq) \
506  (nat_offset(ct, dir, seq))
507 #else
508 #define NAT_OFFSET(ct, dir, seq) 0
509 #endif
510 
/*
 * Window tracking: decide whether the segment described by @tcph fits the
 * windows recorded for the connection and, if so, update that record.
 *
 * @state holds the per-direction tracking data; the entry for @dir is the
 * "sender", the entry for the opposite direction is the "receiver".
 * seq/ack/window are read from the header, the segment end is computed with
 * segment_seq_plus_len(), SACK blocks are taken into account when the
 * receiver has IP_CT_TCP_FLAG_SACK_PERM set, and ack/sack are corrected by
 * NAT_OFFSET().
 *
 * The segment is accepted when the four bounds I-IV hold (see the
 * before()/after() tests below).  On acceptance the sender/receiver data
 * and the retransmission bookkeeping (for @index == TCP_ACK_SET) are
 * updated and true is returned.  Otherwise false is returned and the reason
 * is logged if LOG_INVALID() is enabled -- unless the sender carries
 * IP_CT_TCP_FLAG_BE_LIBERAL or tn->tcp_be_liberal is set, in which case the
 * result is forced to true.
 *
 * NOTE(review): this copy of the function has lost listing lines 624, 674
 * and 695 (see the markers below); it does not parse as written.
 */
511 static bool tcp_in_window(const struct nf_conn *ct,
512  struct ip_ct_tcp *state,
513  enum ip_conntrack_dir dir,
514  unsigned int index,
515  const struct sk_buff *skb,
516  unsigned int dataoff,
517  const struct tcphdr *tcph,
518  u_int8_t pf)
519 {
520  struct net *net = nf_ct_net(ct);
521  struct nf_tcp_net *tn = tcp_pernet(net);
522  struct ip_ct_tcp_state *sender = &state->seen[dir];
523  struct ip_ct_tcp_state *receiver = &state->seen[!dir];
524  const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;
525  __u32 seq, ack, sack, end, win, swin;
526  s16 receiver_offset;
527  bool res;
528 
529  /*
530  * Get the required data from the packet.
531  */
532  seq = ntohl(tcph->seq);
533  ack = sack = ntohl(tcph->ack_seq);
534  win = ntohs(tcph->window);
535  end = segment_seq_plus_len(seq, skb->len, dataoff, tcph);
536 
537  if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
538  tcp_sack(skb, dataoff, tcph, &sack);
539 
540  /* Take into account NAT sequence number mangling */
541  receiver_offset = NAT_OFFSET(ct, !dir, ack - 1);
542  ack -= receiver_offset;
543  sack -= receiver_offset;
544 
545  pr_debug("tcp_in_window: START\n");
546  pr_debug("tcp_in_window: ");
547  nf_ct_dump_tuple(tuple);
548  pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
549  seq, ack, receiver_offset, sack, receiver_offset, win, end);
550  pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
551  "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
552  sender->td_end, sender->td_maxend, sender->td_maxwin,
553  sender->td_scale,
554  receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
555  receiver->td_scale);
556 
/* td_maxwin == 0 is used as "no data recorded for this direction yet". */
557  if (sender->td_maxwin == 0) {
558  /*
559  * Initialize sender data.
560  */
561  if (tcph->syn) {
562  /*
563  * SYN-ACK in reply to a SYN
564  * or SYN from reply direction in simultaneous open.
565  */
566  sender->td_end =
567  sender->td_maxend = end;
568  sender->td_maxwin = (win == 0 ? 1 : win);
569 
570  tcp_options(skb, dataoff, tcph, sender);
571  /*
572  * RFC 1323:
573  * Both sides must send the Window Scale option
574  * to enable window scaling in either direction.
575  */
576  if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE
577  && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE))
578  sender->td_scale =
579  receiver->td_scale = 0;
580  if (!tcph->ack)
581  /* Simultaneous open */
582  return true;
583  } else {
584  /*
585  * We are in the middle of a connection,
586  * its history is lost for us.
587  * Let's try to use the data from the packet.
588  */
589  sender->td_end = end;
590  swin = win << sender->td_scale;
591  sender->td_maxwin = (swin == 0 ? 1 : swin);
592  sender->td_maxend = end + sender->td_maxwin;
593  /*
594  * We haven't seen traffic in the other direction yet
595  * but we have to tweak window tracking to pass III
596  * and IV until that happens.
597  */
598  if (receiver->td_maxwin == 0)
599  receiver->td_end = receiver->td_maxend = sack;
600  }
601  } else if (((state->state == TCP_CONNTRACK_SYN_SENT
602  && dir == IP_CT_DIR_ORIGINAL)
603  || (state->state == TCP_CONNTRACK_SYN_RECV
604  && dir == IP_CT_DIR_REPLY))
605  && after(end, sender->td_end)) {
606  /*
607  * RFC 793: "if a TCP is reinitialized ... then it need
608  * not wait at all; it must only be sure to use sequence
609  * numbers larger than those recently used."
610  */
611  sender->td_end =
612  sender->td_maxend = end;
613  sender->td_maxwin = (win == 0 ? 1 : win);
614 
615  tcp_options(skb, dataoff, tcph, sender);
616  }
617 
618  if (!(tcph->ack)) {
619  /*
620  * If there is no ACK, just pretend it was set and OK.
621  */
622  ack = sack = receiver->td_end;
/* NOTE(review): listing line 624 (right-hand side of the '==' below) is missing. */
623  } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) ==
625  && (ack == 0)) {
626  /*
627  * Broken TCP stacks, that set ACK in RST packets as well
628  * with zero ack value.
629  */
630  ack = sack = receiver->td_end;
631  }
632 
633  if (tcph->rst && seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT)
634  /*
635  * RST sent answering SYN.
636  */
637  seq = end = sender->td_end;
638 
639  pr_debug("tcp_in_window: ");
640  nf_ct_dump_tuple(tuple);
641  pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
642  seq, ack, receiver_offset, sack, receiver_offset, win, end);
643  pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
644  "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
645  sender->td_end, sender->td_maxend, sender->td_maxwin,
646  sender->td_scale,
647  receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
648  receiver->td_scale);
649 
650  pr_debug("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
651  before(seq, sender->td_maxend + 1),
652  after(end, sender->td_end - receiver->td_maxwin - 1),
653  before(sack, receiver->td_end + 1),
654  after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1));
655 
/* The four tests below are the bounds I, II, III and IV, in that order. */
656  if (before(seq, sender->td_maxend + 1) &&
657  after(end, sender->td_end - receiver->td_maxwin - 1) &&
658  before(sack, receiver->td_end + 1) &&
659  after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) {
660  /*
661  * Take into account window scaling (RFC 1323).
662  */
663  if (!tcph->syn)
664  win <<= sender->td_scale;
665 
666  /*
667  * Update sender data.
668  */
669  swin = win + (sack - ack);
670  if (sender->td_maxwin < swin)
671  sender->td_maxwin = swin;
672  if (after(end, sender->td_end)) {
673  sender->td_end = end;
/* NOTE(review): listing line 674 is missing here. */
675  }
676  if (tcph->ack) {
677  if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) {
678  sender->td_maxack = ack;
679  sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET;
680  } else if (after(ack, sender->td_maxack))
681  sender->td_maxack = ack;
682  }
683 
684  /*
685  * Update receiver data.
686  */
687  if (receiver->td_maxwin != 0 && after(end, sender->td_maxend))
688  receiver->td_maxwin += end - sender->td_maxend;
689  if (after(sack + win, receiver->td_maxend - 1)) {
690  receiver->td_maxend = sack + win;
691  if (win == 0)
692  receiver->td_maxend++;
693  }
/*
 * NOTE(review): listing line 695 (the body of the 'if' below) is missing;
 * as written the 'if' would wrongly govern the retransmission check.
 */
694  if (ack == receiver->td_end)
696 
697  /*
698  * Check retransmissions.
699  */
700  if (index == TCP_ACK_SET) {
701  if (state->last_dir == dir
702  && state->last_seq == seq
703  && state->last_ack == ack
704  && state->last_end == end
705  && state->last_win == win)
706  state->retrans++;
707  else {
708  state->last_dir = dir;
709  state->last_seq = seq;
710  state->last_ack = ack;
711  state->last_end = end;
712  state->last_win = win;
713  state->retrans = 0;
714  }
715  }
716  res = true;
717  } else {
718  res = false;
719  if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL ||
720  tn->tcp_be_liberal)
721  res = true;
/* The nested conditional picks the message for the first bound that failed. */
722  if (!res && LOG_INVALID(net, IPPROTO_TCP))
723  nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
724  "nf_ct_tcp: %s ",
725  before(seq, sender->td_maxend + 1) ?
726  after(end, sender->td_end - receiver->td_maxwin - 1) ?
727  before(sack, receiver->td_end + 1) ?
728  after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? "BUG"
729  : "ACK is under the lower bound (possible overly delayed ACK)"
730  : "ACK is over the upper bound (ACKed data not seen yet)"
731  : "SEQ is under the lower bound (already ACKed data retransmitted)"
732  : "SEQ is over the upper bound (over the window of the receiver)");
733  }
734 
735  pr_debug("tcp_in_window: res=%u sender end=%u maxend=%u maxwin=%u "
736  "receiver end=%u maxend=%u maxwin=%u\n",
737  res, sender->td_end, sender->td_maxend, sender->td_maxwin,
738  receiver->td_end, receiver->td_maxend, receiver->td_maxwin);
739 
740  return res;
741 }
742 
743 /* table of valid flag combinations - PUSH, ECE and CWR are always valid */
744 static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK|
745  TCPHDR_URG) + 1] =
746 {
747  [TCPHDR_SYN] = 1,
748  [TCPHDR_SYN|TCPHDR_URG] = 1,
749  [TCPHDR_SYN|TCPHDR_ACK] = 1,
750  [TCPHDR_RST] = 1,
751  [TCPHDR_RST|TCPHDR_ACK] = 1,
752  [TCPHDR_FIN|TCPHDR_ACK] = 1,
754  [TCPHDR_ACK] = 1,
755  [TCPHDR_ACK|TCPHDR_URG] = 1,
756 };
757 
758 /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */
759 static int tcp_error(struct net *net, struct nf_conn *tmpl,
760  struct sk_buff *skb,
761  unsigned int dataoff,
762  enum ip_conntrack_info *ctinfo,
763  u_int8_t pf,
764  unsigned int hooknum)
765 {
766  const struct tcphdr *th;
767  struct tcphdr _tcph;
768  unsigned int tcplen = skb->len - dataoff;
769  u_int8_t tcpflags;
770 
771  /* Smaller that minimal TCP header? */
772  th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
773  if (th == NULL) {
774  if (LOG_INVALID(net, IPPROTO_TCP))
775  nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
776  "nf_ct_tcp: short packet ");
777  return -NF_ACCEPT;
778  }
779 
780  /* Not whole TCP header or malformed packet */
781  if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
782  if (LOG_INVALID(net, IPPROTO_TCP))
783  nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
784  "nf_ct_tcp: truncated/malformed packet ");
785  return -NF_ACCEPT;
786  }
787 
788  /* Checksum invalid? Ignore.
789  * We skip checking packets on the outgoing path
790  * because the checksum is assumed to be correct.
791  */
792  /* FIXME: Source route IP option packets --RR */
793  if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
794  nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) {
795  if (LOG_INVALID(net, IPPROTO_TCP))
796  nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
797  "nf_ct_tcp: bad TCP checksum ");
798  return -NF_ACCEPT;
799  }
800 
801  /* Check TCP flags. */
802  tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH));
803  if (!tcp_valid_flags[tcpflags]) {
804  if (LOG_INVALID(net, IPPROTO_TCP))
805  nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
806  "nf_ct_tcp: invalid TCP flag combination ");
807  return -NF_ACCEPT;
808  }
809 
810  return NF_ACCEPT;
811 }
812 
813 static unsigned int *tcp_get_timeouts(struct net *net)
814 {
815  return tcp_pernet(net)->timeouts;
816 }
817 
/*
 * Per-packet state machine for a tracked TCP connection.
 *
 * Under ct->lock the next state is looked up in
 * tcp_conntracks[dir][index][old_state], where @dir comes from @ctinfo and
 * the flag class from get_conntrack_index().  A few next states get special
 * treatment in the switch below (connection re-open, ignored packets,
 * invalid packets, RST handling); everything else -- and RSTs that are not
 * short-cut to in_window -- must pass tcp_in_window().  For accepted packets
 * the new state is stored, a timeout is chosen from @timeouts and the
 * conntrack is refreshed via nf_ct_refresh_acct().
 *
 * Return values visible in this function: NF_ACCEPT, -NF_ACCEPT (invalid
 * packet), -NF_REPEAT (the entry was killed, look the packet up again) and
 * NF_DROP.
 *
 * NOTE(review): this copy has lost listing lines 848, 865, 881, 942, 947,
 * 977, 1025 and 1051 (see the markers below); it does not parse as written.
 */
818 /* Returns verdict for packet, or -1 for invalid. */
819 static int tcp_packet(struct nf_conn *ct,
820  const struct sk_buff *skb,
821  unsigned int dataoff,
822  enum ip_conntrack_info ctinfo,
823  u_int8_t pf,
824  unsigned int hooknum,
825  unsigned int *timeouts)
826 {
827  struct net *net = nf_ct_net(ct);
828  struct nf_tcp_net *tn = tcp_pernet(net);
829  struct nf_conntrack_tuple *tuple;
830  enum tcp_conntrack new_state, old_state;
831  enum ip_conntrack_dir dir;
832  const struct tcphdr *th;
833  struct tcphdr _tcph;
834  unsigned long timeout;
835  unsigned int index;
836 
837  th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
838  BUG_ON(th == NULL);
839 
840  spin_lock_bh(&ct->lock);
841  old_state = ct->proto.tcp.state;
842  dir = CTINFO2DIR(ctinfo);
843  index = get_conntrack_index(th);
844  new_state = tcp_conntracks[dir][index][old_state];
845  tuple = &ct->tuplehash[dir].tuple;
846 
847  switch (new_state) {
/* NOTE(review): listing line 848 (a case label) is missing here. */
849  if (old_state < TCP_CONNTRACK_TIME_WAIT)
850  break;
851  /* RFC 1122: "When a connection is closed actively,
852  * it MUST linger in TIME-WAIT state for a time 2xMSL
853  * (Maximum Segment Lifetime). However, it MAY accept
854  * a new SYN from the remote TCP to reopen the connection
855  * directly from TIME-WAIT state, if..."
856  * We ignore the conditions because we are in the
857  * TIME-WAIT state anyway.
858  *
859  * Handle aborted connections: we and the server
860  * think there is an existing connection but the client
861  * aborts it and starts a new one.
862  */
/* NOTE(review): listing line 865 (the mask applied to the OR-ed flags) is missing. */
863  if (((ct->proto.tcp.seen[dir].flags
864  | ct->proto.tcp.seen[!dir].flags)
866  || (ct->proto.tcp.last_dir == dir
867  && ct->proto.tcp.last_index == TCP_RST_SET)) {
868  /* Attempt to reopen a closed/aborted connection.
869  * Delete this connection and look up again. */
870  spin_unlock_bh(&ct->lock);
871 
872  /* Only repeat if we can actually remove the timer.
873  * Destruction may already be in progress in process
874  * context and we must give it a chance to terminate.
875  */
876  if (nf_ct_kill(ct))
877  return -NF_REPEAT;
878  return NF_DROP;
879  }
880  /* Fall through */
/* NOTE(review): listing line 881 (a case label) is missing here. */
882  /* Ignored packets:
883  *
884  * Our connection entry may be out of sync, so ignore
885  * packets which may signal the real connection between
886  * the client and the server.
887  *
888  * a) SYN in ORIGINAL
889  * b) SYN/ACK in REPLY
890  * c) ACK in reply direction after initial SYN in original.
891  *
892  * If the ignored packet is invalid, the receiver will send
893  * a RST we'll catch below.
894  */
895  if (index == TCP_SYNACK_SET
896  && ct->proto.tcp.last_index == TCP_SYN_SET
897  && ct->proto.tcp.last_dir != dir
898  && ntohl(th->ack_seq) == ct->proto.tcp.last_end) {
899  /* b) This SYN/ACK acknowledges a SYN that we earlier
900  * ignored as invalid. This means that the client and
901  * the server are both in sync, while the firewall is
902  * not. We get in sync from the previously annotated
903  * values.
904  */
905  old_state = TCP_CONNTRACK_SYN_SENT;
906  new_state = TCP_CONNTRACK_SYN_RECV;
907  ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_end =
908  ct->proto.tcp.last_end;
909  ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxend =
910  ct->proto.tcp.last_end;
911  ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxwin =
912  ct->proto.tcp.last_win == 0 ?
913  1 : ct->proto.tcp.last_win;
914  ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale =
915  ct->proto.tcp.last_wscale;
916  ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags =
917  ct->proto.tcp.last_flags;
918  memset(&ct->proto.tcp.seen[dir], 0,
919  sizeof(struct ip_ct_tcp_state));
920  break;
921  }
/* Remember this packet so that a later SYN/ACK or RST can be matched against it. */
922  ct->proto.tcp.last_index = index;
923  ct->proto.tcp.last_dir = dir;
924  ct->proto.tcp.last_seq = ntohl(th->seq);
925  ct->proto.tcp.last_end =
926  segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th);
927  ct->proto.tcp.last_win = ntohs(th->window);
928 
929  /* a) This is a SYN in ORIGINAL. The client and the server
930  * may be in sync but we are not. In that case, we annotate
931  * the TCP options and let the packet go through. If it is a
932  * valid SYN packet, the server will reply with a SYN/ACK, and
933  * then we'll get in sync. Otherwise, the server ignores it. */
934  if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) {
935  struct ip_ct_tcp_state seen = {};
936 
937  ct->proto.tcp.last_flags =
938  ct->proto.tcp.last_wscale = 0;
939  tcp_options(skb, dataoff, th, &seen);
940  if (seen.flags & IP_CT_TCP_FLAG_WINDOW_SCALE) {
/* NOTE(review): listing line 942 (right-hand side of the '|=' below) is missing. */
941  ct->proto.tcp.last_flags |=
943  ct->proto.tcp.last_wscale = seen.td_scale;
944  }
945  if (seen.flags & IP_CT_TCP_FLAG_SACK_PERM) {
/* NOTE(review): listing line 947 (right-hand side of the '|=' below) is missing. */
946  ct->proto.tcp.last_flags |=
948  }
949  }
950  spin_unlock_bh(&ct->lock);
951  if (LOG_INVALID(net, IPPROTO_TCP))
952  nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
953  "nf_ct_tcp: invalid packet ignored in "
954  "state %s ", tcp_conntrack_names[old_state]);
955  return NF_ACCEPT;
956  case TCP_CONNTRACK_MAX:
957  /* Invalid packet */
958  pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
959  dir, get_conntrack_index(th), old_state);
960  spin_unlock_bh(&ct->lock);
961  if (LOG_INVALID(net, IPPROTO_TCP))
962  nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
963  "nf_ct_tcp: invalid state ");
964  return -NF_ACCEPT;
965  case TCP_CONNTRACK_CLOSE:
966  if (index == TCP_RST_SET
967  && (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET)
968  && before(ntohl(th->seq), ct->proto.tcp.seen[!dir].td_maxack)) {
969  /* Invalid RST */
970  spin_unlock_bh(&ct->lock);
971  if (LOG_INVALID(net, IPPROTO_TCP))
972  nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
973  "nf_ct_tcp: invalid RST ");
974  return -NF_ACCEPT;
975  }
/* NOTE(review): listing line 977 (start of the parenthesised condition) is missing. */
976  if (index == TCP_RST_SET
978  && ct->proto.tcp.last_index == TCP_SYN_SET)
979  || (!test_bit(IPS_ASSURED_BIT, &ct->status)
980  && ct->proto.tcp.last_index == TCP_ACK_SET))
981  && ntohl(th->ack_seq) == ct->proto.tcp.last_end) {
982  /* RST sent to invalid SYN or ACK we had let through
983  * at a) and c) above:
984  *
985  * a) SYN was in window then
986  * c) we hold a half-open connection.
987  *
988  * Delete our connection entry.
989  * We skip window checking, because packet might ACK
990  * segments we ignored. */
991  goto in_window;
992  }
993  /* Just fall through */
994  default:
995  /* Keep compilers happy. */
996  break;
997  }
998 
999  if (!tcp_in_window(ct, &ct->proto.tcp, dir, index,
1000  skb, dataoff, th, pf)) {
1001  spin_unlock_bh(&ct->lock);
1002  return -NF_ACCEPT;
1003  }
1004  in_window:
1005  /* From now on we have got in-window packets */
1006  ct->proto.tcp.last_index = index;
1007  ct->proto.tcp.last_dir = dir;
1008 
1009  pr_debug("tcp_conntracks: ");
1010  nf_ct_dump_tuple(tuple);
1011  pr_debug("syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
1012  (th->syn ? 1 : 0), (th->ack ? 1 : 0),
1013  (th->fin ? 1 : 0), (th->rst ? 1 : 0),
1014  old_state, new_state);
1015 
1016  ct->proto.tcp.state = new_state;
1017  if (old_state != new_state
1018  && new_state == TCP_CONNTRACK_FIN_WAIT)
1019  ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
1020 
/*
 * Timeout selection: the RETRANS and UNACK timeouts are only used when
 * they are shorter than the timeout of the new state.
 */
1021  if (ct->proto.tcp.retrans >= tn->tcp_max_retrans &&
1022  timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS])
1023  timeout = timeouts[TCP_CONNTRACK_RETRANS];
/* NOTE(review): listing line 1025 (the flag mask and '&&') is missing below. */
1024  else if ((ct->proto.tcp.seen[0].flags | ct->proto.tcp.seen[1].flags) &
1026  timeouts[new_state] > timeouts[TCP_CONNTRACK_UNACK])
1027  timeout = timeouts[TCP_CONNTRACK_UNACK];
1028  else
1029  timeout = timeouts[new_state];
1030  spin_unlock_bh(&ct->lock);
1031 
1032  if (new_state != old_state)
1033  nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
1034 
1035  if (!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1036  /* If only reply is a RST, we can consider ourselves not to
1037  have an established connection: this is a fairly common
1038  problem case, so we can delete the conntrack
1039  immediately. --RR */
1040  if (th->rst) {
1041  nf_ct_kill_acct(ct, ctinfo, skb);
1042  return NF_ACCEPT;
1043  }
1044  } else if (!test_bit(IPS_ASSURED_BIT, &ct->status)
1045  && (old_state == TCP_CONNTRACK_SYN_RECV
1046  || old_state == TCP_CONNTRACK_ESTABLISHED)
1047  && new_state == TCP_CONNTRACK_ESTABLISHED) {
1048  /* Set ASSURED if we see a valid ack in ESTABLISHED
1049  after SYN_RECV or a valid answer for a picked up
1050  connection. */
/* NOTE(review): listing line 1051 (the statement that sets ASSURED) is missing. */
1052  nf_conntrack_event_cache(IPCT_ASSURED, ct);
1053  }
1054  nf_ct_refresh_acct(ct, ctinfo, skb, timeout);
1055 
1056  return NF_ACCEPT;
1057 }
1058 
1059 /* Called when a new connection for this protocol found. */
1060 static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
1061  unsigned int dataoff, unsigned int *timeouts)
1062 {
1063  enum tcp_conntrack new_state;
1064  const struct tcphdr *th;
1065  struct tcphdr _tcph;
1066  struct net *net = nf_ct_net(ct);
1067  struct nf_tcp_net *tn = tcp_pernet(net);
1068  const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0];
1069  const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1];
1070 
1071  th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
1072  BUG_ON(th == NULL);
1073 
1074  /* Don't need lock here: this conntrack not in circulation yet */
1075  new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE];
1076 
1077  /* Invalid: delete conntrack */
1078  if (new_state >= TCP_CONNTRACK_MAX) {
1079  pr_debug("nf_ct_tcp: invalid new deleting.\n");
1080  return false;
1081  }
1082 
1083  if (new_state == TCP_CONNTRACK_SYN_SENT) {
1084  memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
1085  /* SYN packet */
1086  ct->proto.tcp.seen[0].td_end =
1087  segment_seq_plus_len(ntohl(th->seq), skb->len,
1088  dataoff, th);
1089  ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
1090  if (ct->proto.tcp.seen[0].td_maxwin == 0)
1091  ct->proto.tcp.seen[0].td_maxwin = 1;
1092  ct->proto.tcp.seen[0].td_maxend =
1093  ct->proto.tcp.seen[0].td_end;
1094 
1095  tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]);
1096  } else if (tn->tcp_loose == 0) {
1097  /* Don't try to pick up connections. */
1098  return false;
1099  } else {
1100  memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
1101  /*
1102  * We are in the middle of a connection,
1103  * its history is lost for us.
1104  * Let's try to use the data from the packet.
1105  */
1106  ct->proto.tcp.seen[0].td_end =
1107  segment_seq_plus_len(ntohl(th->seq), skb->len,
1108  dataoff, th);
1109  ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
1110  if (ct->proto.tcp.seen[0].td_maxwin == 0)
1111  ct->proto.tcp.seen[0].td_maxwin = 1;
1112  ct->proto.tcp.seen[0].td_maxend =
1113  ct->proto.tcp.seen[0].td_end +
1114  ct->proto.tcp.seen[0].td_maxwin;
1115 
1116  /* We assume SACK and liberal window checking to handle
1117  * window scaling */
1118  ct->proto.tcp.seen[0].flags =
1119  ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM |
1121  }
1122 
1123  /* tcp_packet will set them */
1124  ct->proto.tcp.last_index = TCP_NONE_SET;
1125 
1126  pr_debug("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i "
1127  "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
1128  sender->td_end, sender->td_maxend, sender->td_maxwin,
1129  sender->td_scale,
1130  receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
1131  receiver->td_scale);
1132  return true;
1133 }
1134 
1135 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
1136 
1137 #include <linux/netfilter/nfnetlink.h>
1139 
1140 static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
1141  struct nf_conn *ct)
1142 {
1143  struct nlattr *nest_parms;
1144  struct nf_ct_tcp_flags tmp = {};
1145 
1146  spin_lock_bh(&ct->lock);
1147  nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP | NLA_F_NESTED);
1148  if (!nest_parms)
1149  goto nla_put_failure;
1150 
1151  if (nla_put_u8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state) ||
1152  nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL,
1153  ct->proto.tcp.seen[0].td_scale) ||
1154  nla_put_u8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY,
1155  ct->proto.tcp.seen[1].td_scale))
1156  goto nla_put_failure;
1157 
1158  tmp.flags = ct->proto.tcp.seen[0].flags;
1160  sizeof(struct nf_ct_tcp_flags), &tmp))
1161  goto nla_put_failure;
1162 
1163  tmp.flags = ct->proto.tcp.seen[1].flags;
1165  sizeof(struct nf_ct_tcp_flags), &tmp))
1166  goto nla_put_failure;
1167  spin_unlock_bh(&ct->lock);
1168 
1169  nla_nest_end(skb, nest_parms);
1170 
1171  return 0;
1172 
1173 nla_put_failure:
1174  spin_unlock_bh(&ct->lock);
1175  return -1;
1176 }
1177 
1178 static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = {
1179  [CTA_PROTOINFO_TCP_STATE] = { .type = NLA_U8 },
1180  [CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] = { .type = NLA_U8 },
1181  [CTA_PROTOINFO_TCP_WSCALE_REPLY] = { .type = NLA_U8 },
1182  [CTA_PROTOINFO_TCP_FLAGS_ORIGINAL] = { .len = sizeof(struct nf_ct_tcp_flags) },
1183  [CTA_PROTOINFO_TCP_FLAGS_REPLY] = { .len = sizeof(struct nf_ct_tcp_flags) },
1184 };
1185 
1186 static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct)
1187 {
1188  struct nlattr *pattr = cda[CTA_PROTOINFO_TCP];
1189  struct nlattr *tb[CTA_PROTOINFO_TCP_MAX+1];
1190  int err;
1191 
1192  /* updates could not contain anything about the private
1193  * protocol info, in that case skip the parsing */
1194  if (!pattr)
1195  return 0;
1196 
1197  err = nla_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, pattr, tcp_nla_policy);
1198  if (err < 0)
1199  return err;
1200 
1201  if (tb[CTA_PROTOINFO_TCP_STATE] &&
1202  nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX)
1203  return -EINVAL;
1204 
1205  spin_lock_bh(&ct->lock);
1206  if (tb[CTA_PROTOINFO_TCP_STATE])
1207  ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]);
1208 
1210  struct nf_ct_tcp_flags *attr =
1211  nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]);
1212  ct->proto.tcp.seen[0].flags &= ~attr->mask;
1213  ct->proto.tcp.seen[0].flags |= attr->flags & attr->mask;
1214  }
1215 
1217  struct nf_ct_tcp_flags *attr =
1218  nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]);
1219  ct->proto.tcp.seen[1].flags &= ~attr->mask;
1220  ct->proto.tcp.seen[1].flags |= attr->flags & attr->mask;
1221  }
1222 
1225  ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_WINDOW_SCALE &&
1226  ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_WINDOW_SCALE) {
1227  ct->proto.tcp.seen[0].td_scale =
1228  nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL]);
1229  ct->proto.tcp.seen[1].td_scale =
1230  nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]);
1231  }
1232  spin_unlock_bh(&ct->lock);
1233 
1234  return 0;
1235 }
1236 
1237 static int tcp_nlattr_size(void)
1238 {
1239  return nla_total_size(0) /* CTA_PROTOINFO_TCP */
1240  + nla_policy_len(tcp_nla_policy, CTA_PROTOINFO_TCP_MAX + 1);
1241 }
1242 
1243 static int tcp_nlattr_tuple_size(void)
1244 {
1245  return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
1246 }
1247 #endif
1248 
1249 #if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
1250 
1251 #include <linux/netfilter/nfnetlink.h>
1253 
1254 static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[],
1255  struct net *net, void *data)
1256 {
1257  unsigned int *timeouts = data;
1258  struct nf_tcp_net *tn = tcp_pernet(net);
1259  int i;
1260 
1261  /* set default TCP timeouts. */
1262  for (i=0; i<TCP_CONNTRACK_TIMEOUT_MAX; i++)
1263  timeouts[i] = tn->timeouts[i];
1264 
1266  timeouts[TCP_CONNTRACK_SYN_SENT] =
1267  ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT]))*HZ;
1268  }
1269  if (tb[CTA_TIMEOUT_TCP_SYN_RECV]) {
1270  timeouts[TCP_CONNTRACK_SYN_RECV] =
1271  ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_RECV]))*HZ;
1272  }
1273  if (tb[CTA_TIMEOUT_TCP_ESTABLISHED]) {
1274  timeouts[TCP_CONNTRACK_ESTABLISHED] =
1275  ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_ESTABLISHED]))*HZ;
1276  }
1277  if (tb[CTA_TIMEOUT_TCP_FIN_WAIT]) {
1278  timeouts[TCP_CONNTRACK_FIN_WAIT] =
1279  ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_FIN_WAIT]))*HZ;
1280  }
1281  if (tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]) {
1282  timeouts[TCP_CONNTRACK_CLOSE_WAIT] =
1283  ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE_WAIT]))*HZ;
1284  }
1285  if (tb[CTA_TIMEOUT_TCP_LAST_ACK]) {
1286  timeouts[TCP_CONNTRACK_LAST_ACK] =
1287  ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_LAST_ACK]))*HZ;
1288  }
1289  if (tb[CTA_TIMEOUT_TCP_TIME_WAIT]) {
1290  timeouts[TCP_CONNTRACK_TIME_WAIT] =
1291  ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_TIME_WAIT]))*HZ;
1292  }
1293  if (tb[CTA_TIMEOUT_TCP_CLOSE]) {
1294  timeouts[TCP_CONNTRACK_CLOSE] =
1295  ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_CLOSE]))*HZ;
1296  }
1297  if (tb[CTA_TIMEOUT_TCP_SYN_SENT2]) {
1298  timeouts[TCP_CONNTRACK_SYN_SENT2] =
1299  ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_SYN_SENT2]))*HZ;
1300  }
1301  if (tb[CTA_TIMEOUT_TCP_RETRANS]) {
1302  timeouts[TCP_CONNTRACK_RETRANS] =
1303  ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_RETRANS]))*HZ;
1304  }
1305  if (tb[CTA_TIMEOUT_TCP_UNACK]) {
1306  timeouts[TCP_CONNTRACK_UNACK] =
1307  ntohl(nla_get_be32(tb[CTA_TIMEOUT_TCP_UNACK]))*HZ;
1308  }
1309  return 0;
1310 }
1311 
1312 static int
1313 tcp_timeout_obj_to_nlattr(struct sk_buff *skb, const void *data)
1314 {
1315  const unsigned int *timeouts = data;
1316 
1317  if (nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT,
1318  htonl(timeouts[TCP_CONNTRACK_SYN_SENT] / HZ)) ||
1319  nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_RECV,
1320  htonl(timeouts[TCP_CONNTRACK_SYN_RECV] / HZ)) ||
1321  nla_put_be32(skb, CTA_TIMEOUT_TCP_ESTABLISHED,
1322  htonl(timeouts[TCP_CONNTRACK_ESTABLISHED] / HZ)) ||
1323  nla_put_be32(skb, CTA_TIMEOUT_TCP_FIN_WAIT,
1324  htonl(timeouts[TCP_CONNTRACK_FIN_WAIT] / HZ)) ||
1325  nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE_WAIT,
1326  htonl(timeouts[TCP_CONNTRACK_CLOSE_WAIT] / HZ)) ||
1327  nla_put_be32(skb, CTA_TIMEOUT_TCP_LAST_ACK,
1328  htonl(timeouts[TCP_CONNTRACK_LAST_ACK] / HZ)) ||
1329  nla_put_be32(skb, CTA_TIMEOUT_TCP_TIME_WAIT,
1330  htonl(timeouts[TCP_CONNTRACK_TIME_WAIT] / HZ)) ||
1331  nla_put_be32(skb, CTA_TIMEOUT_TCP_CLOSE,
1332  htonl(timeouts[TCP_CONNTRACK_CLOSE] / HZ)) ||
1333  nla_put_be32(skb, CTA_TIMEOUT_TCP_SYN_SENT2,
1334  htonl(timeouts[TCP_CONNTRACK_SYN_SENT2] / HZ)) ||
1335  nla_put_be32(skb, CTA_TIMEOUT_TCP_RETRANS,
1336  htonl(timeouts[TCP_CONNTRACK_RETRANS] / HZ)) ||
1337  nla_put_be32(skb, CTA_TIMEOUT_TCP_UNACK,
1338  htonl(timeouts[TCP_CONNTRACK_UNACK] / HZ)))
1339  goto nla_put_failure;
1340  return 0;
1341 
1342 nla_put_failure:
1343  return -ENOSPC;
1344 }
1345 
1346 static const struct nla_policy tcp_timeout_nla_policy[CTA_TIMEOUT_TCP_MAX+1] = {
1348  [CTA_TIMEOUT_TCP_SYN_RECV] = { .type = NLA_U32 },
1349  [CTA_TIMEOUT_TCP_ESTABLISHED] = { .type = NLA_U32 },
1350  [CTA_TIMEOUT_TCP_FIN_WAIT] = { .type = NLA_U32 },
1351  [CTA_TIMEOUT_TCP_CLOSE_WAIT] = { .type = NLA_U32 },
1352  [CTA_TIMEOUT_TCP_LAST_ACK] = { .type = NLA_U32 },
1353  [CTA_TIMEOUT_TCP_TIME_WAIT] = { .type = NLA_U32 },
1354  [CTA_TIMEOUT_TCP_CLOSE] = { .type = NLA_U32 },
1355  [CTA_TIMEOUT_TCP_SYN_SENT2] = { .type = NLA_U32 },
1356 };
1357 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
1358 
1359 #ifdef CONFIG_SYSCTL
1360 static struct ctl_table tcp_sysctl_table[] = {
1361  {
1362  .procname = "nf_conntrack_tcp_timeout_syn_sent",
1363  .maxlen = sizeof(unsigned int),
1364  .mode = 0644,
1366  },
1367  {
1368  .procname = "nf_conntrack_tcp_timeout_syn_recv",
1369  .maxlen = sizeof(unsigned int),
1370  .mode = 0644,
1372  },
1373  {
1374  .procname = "nf_conntrack_tcp_timeout_established",
1375  .maxlen = sizeof(unsigned int),
1376  .mode = 0644,
1378  },
1379  {
1380  .procname = "nf_conntrack_tcp_timeout_fin_wait",
1381  .maxlen = sizeof(unsigned int),
1382  .mode = 0644,
1384  },
1385  {
1386  .procname = "nf_conntrack_tcp_timeout_close_wait",
1387  .maxlen = sizeof(unsigned int),
1388  .mode = 0644,
1390  },
1391  {
1392  .procname = "nf_conntrack_tcp_timeout_last_ack",
1393  .maxlen = sizeof(unsigned int),
1394  .mode = 0644,
1396  },
1397  {
1398  .procname = "nf_conntrack_tcp_timeout_time_wait",
1399  .maxlen = sizeof(unsigned int),
1400  .mode = 0644,
1402  },
1403  {
1404  .procname = "nf_conntrack_tcp_timeout_close",
1405  .maxlen = sizeof(unsigned int),
1406  .mode = 0644,
1408  },
1409  {
1410  .procname = "nf_conntrack_tcp_timeout_max_retrans",
1411  .maxlen = sizeof(unsigned int),
1412  .mode = 0644,
1414  },
1415  {
1416  .procname = "nf_conntrack_tcp_timeout_unacknowledged",
1417  .maxlen = sizeof(unsigned int),
1418  .mode = 0644,
1420  },
1421  {
1422  .procname = "nf_conntrack_tcp_loose",
1423  .maxlen = sizeof(unsigned int),
1424  .mode = 0644,
1426  },
1427  {
1428  .procname = "nf_conntrack_tcp_be_liberal",
1429  .maxlen = sizeof(unsigned int),
1430  .mode = 0644,
1432  },
1433  {
1434  .procname = "nf_conntrack_tcp_max_retrans",
1435  .maxlen = sizeof(unsigned int),
1436  .mode = 0644,
1438  },
1439  { }
1440 };
1441 
1442 #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
1443 static struct ctl_table tcp_compat_sysctl_table[] = {
1444  {
1445  .procname = "ip_conntrack_tcp_timeout_syn_sent",
1446  .maxlen = sizeof(unsigned int),
1447  .mode = 0644,
1449  },
1450  {
1451  .procname = "ip_conntrack_tcp_timeout_syn_sent2",
1452  .maxlen = sizeof(unsigned int),
1453  .mode = 0644,
1455  },
1456  {
1457  .procname = "ip_conntrack_tcp_timeout_syn_recv",
1458  .maxlen = sizeof(unsigned int),
1459  .mode = 0644,
1461  },
1462  {
1463  .procname = "ip_conntrack_tcp_timeout_established",
1464  .maxlen = sizeof(unsigned int),
1465  .mode = 0644,
1467  },
1468  {
1469  .procname = "ip_conntrack_tcp_timeout_fin_wait",
1470  .maxlen = sizeof(unsigned int),
1471  .mode = 0644,
1473  },
1474  {
1475  .procname = "ip_conntrack_tcp_timeout_close_wait",
1476  .maxlen = sizeof(unsigned int),
1477  .mode = 0644,
1479  },
1480  {
1481  .procname = "ip_conntrack_tcp_timeout_last_ack",
1482  .maxlen = sizeof(unsigned int),
1483  .mode = 0644,
1485  },
1486  {
1487  .procname = "ip_conntrack_tcp_timeout_time_wait",
1488  .maxlen = sizeof(unsigned int),
1489  .mode = 0644,
1491  },
1492  {
1493  .procname = "ip_conntrack_tcp_timeout_close",
1494  .maxlen = sizeof(unsigned int),
1495  .mode = 0644,
1497  },
1498  {
1499  .procname = "ip_conntrack_tcp_timeout_max_retrans",
1500  .maxlen = sizeof(unsigned int),
1501  .mode = 0644,
1503  },
1504  {
1505  .procname = "ip_conntrack_tcp_loose",
1506  .maxlen = sizeof(unsigned int),
1507  .mode = 0644,
1509  },
1510  {
1511  .procname = "ip_conntrack_tcp_be_liberal",
1512  .maxlen = sizeof(unsigned int),
1513  .mode = 0644,
1515  },
1516  {
1517  .procname = "ip_conntrack_tcp_max_retrans",
1518  .maxlen = sizeof(unsigned int),
1519  .mode = 0644,
1521  },
1522  { }
1523 };
1524 #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
1525 #endif /* CONFIG_SYSCTL */
1526 
/*
 * Give @pn its own copy of tcp_sysctl_table and point every entry at the
 * matching field of @tn.
 *
 * Does nothing when a table is already attached or when sysctl support
 * is compiled out. The indices below follow the entry order of
 * tcp_sysctl_table.
 *
 * Returns 0 on success, -ENOMEM if the copy cannot be allocated.
 */
static int tcp_kmemdup_sysctl_table(struct nf_proto_net *pn,
				    struct nf_tcp_net *tn)
{
#ifdef CONFIG_SYSCTL
	struct ctl_table *tbl;

	if (pn->ctl_table)
		return 0;

	pn->ctl_table = kmemdup(tcp_sysctl_table,
				sizeof(tcp_sysctl_table),
				GFP_KERNEL);
	tbl = pn->ctl_table;
	if (!tbl)
		return -ENOMEM;

	/* Per-state timeouts. */
	tbl[0].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT];
	tbl[1].data = &tn->timeouts[TCP_CONNTRACK_SYN_RECV];
	tbl[2].data = &tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
	tbl[3].data = &tn->timeouts[TCP_CONNTRACK_FIN_WAIT];
	tbl[4].data = &tn->timeouts[TCP_CONNTRACK_CLOSE_WAIT];
	tbl[5].data = &tn->timeouts[TCP_CONNTRACK_LAST_ACK];
	tbl[6].data = &tn->timeouts[TCP_CONNTRACK_TIME_WAIT];
	tbl[7].data = &tn->timeouts[TCP_CONNTRACK_CLOSE];
	tbl[8].data = &tn->timeouts[TCP_CONNTRACK_RETRANS];
	tbl[9].data = &tn->timeouts[TCP_CONNTRACK_UNACK];

	/* Plain integer tunables. */
	tbl[10].data = &tn->tcp_loose;
	tbl[11].data = &tn->tcp_be_liberal;
	tbl[12].data = &tn->tcp_max_retrans;
#endif
	return 0;
}
1556 
/*
 * Give @pn its own copy of tcp_compat_sysctl_table and point every entry
 * at the matching field of @tn.
 *
 * Compiled to a no-op unless both CONFIG_SYSCTL and
 * CONFIG_NF_CONNTRACK_PROC_COMPAT are enabled. The indices below follow
 * the entry order of tcp_compat_sysctl_table.
 *
 * Returns 0 on success, -ENOMEM if the copy cannot be allocated.
 */
static int tcp_kmemdup_compat_sysctl_table(struct nf_proto_net *pn,
					   struct nf_tcp_net *tn)
{
#ifdef CONFIG_SYSCTL
#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
	struct ctl_table *tbl;

	pn->ctl_compat_table = kmemdup(tcp_compat_sysctl_table,
				       sizeof(tcp_compat_sysctl_table),
				       GFP_KERNEL);
	tbl = pn->ctl_compat_table;
	if (!tbl)
		return -ENOMEM;

	/* Per-state timeouts. */
	tbl[0].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT];
	tbl[1].data = &tn->timeouts[TCP_CONNTRACK_SYN_SENT2];
	tbl[2].data = &tn->timeouts[TCP_CONNTRACK_SYN_RECV];
	tbl[3].data = &tn->timeouts[TCP_CONNTRACK_ESTABLISHED];
	tbl[4].data = &tn->timeouts[TCP_CONNTRACK_FIN_WAIT];
	tbl[5].data = &tn->timeouts[TCP_CONNTRACK_CLOSE_WAIT];
	tbl[6].data = &tn->timeouts[TCP_CONNTRACK_LAST_ACK];
	tbl[7].data = &tn->timeouts[TCP_CONNTRACK_TIME_WAIT];
	tbl[8].data = &tn->timeouts[TCP_CONNTRACK_CLOSE];
	tbl[9].data = &tn->timeouts[TCP_CONNTRACK_RETRANS];

	/* Plain integer tunables. */
	tbl[10].data = &tn->tcp_loose;
	tbl[11].data = &tn->tcp_be_liberal;
	tbl[12].data = &tn->tcp_max_retrans;
#endif
#endif
	return 0;
}
1585 
1586 static int tcp_init_net(struct net *net, u_int16_t proto)
1587 {
1588  int ret;
1589  struct nf_tcp_net *tn = tcp_pernet(net);
1590  struct nf_proto_net *pn = &tn->pn;
1591 
1592  if (!pn->users) {
1593  int i;
1594 
1595  for (i = 0; i < TCP_CONNTRACK_TIMEOUT_MAX; i++)
1596  tn->timeouts[i] = tcp_timeouts[i];
1597 
1598  tn->tcp_loose = nf_ct_tcp_loose;
1599  tn->tcp_be_liberal = nf_ct_tcp_be_liberal;
1600  tn->tcp_max_retrans = nf_ct_tcp_max_retrans;
1601  }
1602 
1603  if (proto == AF_INET) {
1604  ret = tcp_kmemdup_compat_sysctl_table(pn, tn);
1605  if (ret < 0)
1606  return ret;
1607 
1608  ret = tcp_kmemdup_sysctl_table(pn, tn);
1609  if (ret < 0)
1610  nf_ct_kfree_compat_sysctl_table(pn);
1611  } else
1612  ret = tcp_kmemdup_sysctl_table(pn, tn);
1613 
1614  return ret;
1615 }
1616 
1617 static struct nf_proto_net *tcp_get_net_proto(struct net *net)
1618 {
1619  return &net->ct.nf_ct_proto.tcp.pn;
1620 }
1621 
1623 {
1624  .l3proto = PF_INET,
1625  .l4proto = IPPROTO_TCP,
1626  .name = "tcp",
1627  .pkt_to_tuple = tcp_pkt_to_tuple,
1628  .invert_tuple = tcp_invert_tuple,
1629  .print_tuple = tcp_print_tuple,
1630  .print_conntrack = tcp_print_conntrack,
1631  .packet = tcp_packet,
1632  .get_timeouts = tcp_get_timeouts,
1633  .new = tcp_new,
1634  .error = tcp_error,
1635 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
1636  .to_nlattr = tcp_to_nlattr,
1637  .nlattr_size = tcp_nlattr_size,
1638  .from_nlattr = nlattr_to_tcp,
1639  .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
1640  .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
1641  .nlattr_tuple_size = tcp_nlattr_tuple_size,
1642  .nla_policy = nf_ct_port_nla_policy,
1643 #endif
1644 #if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
1645  .ctnl_timeout = {
1646  .nlattr_to_obj = tcp_timeout_nlattr_to_obj,
1647  .obj_to_nlattr = tcp_timeout_obj_to_nlattr,
1648  .nlattr_max = CTA_TIMEOUT_TCP_MAX,
1649  .obj_size = sizeof(unsigned int) *
1650  TCP_CONNTRACK_TIMEOUT_MAX,
1651  .nla_policy = tcp_timeout_nla_policy,
1652  },
1653 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
1654  .init_net = tcp_init_net,
1655  .get_net_proto = tcp_get_net_proto,
1656 };
1657 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp4);
1658 
1659 struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly =
1660 {
1661  .l3proto = PF_INET6,
1662  .l4proto = IPPROTO_TCP,
1663  .name = "tcp",
1664  .pkt_to_tuple = tcp_pkt_to_tuple,
1665  .invert_tuple = tcp_invert_tuple,
1666  .print_tuple = tcp_print_tuple,
1667  .print_conntrack = tcp_print_conntrack,
1668  .packet = tcp_packet,
1669  .get_timeouts = tcp_get_timeouts,
1670  .new = tcp_new,
1671  .error = tcp_error,
1672 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
1673  .to_nlattr = tcp_to_nlattr,
1674  .nlattr_size = tcp_nlattr_size,
1675  .from_nlattr = nlattr_to_tcp,
1676  .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
1677  .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
1678  .nlattr_tuple_size = tcp_nlattr_tuple_size,
1679  .nla_policy = nf_ct_port_nla_policy,
1680 #endif
1681 #if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
1682  .ctnl_timeout = {
1683  .nlattr_to_obj = tcp_timeout_nlattr_to_obj,
1684  .obj_to_nlattr = tcp_timeout_obj_to_nlattr,
1685  .nlattr_max = CTA_TIMEOUT_TCP_MAX,
1686  .obj_size = sizeof(unsigned int) *
1687  TCP_CONNTRACK_TIMEOUT_MAX,
1688  .nla_policy = tcp_timeout_nla_policy,
1689  },
1690 #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
1691  .init_net = tcp_init_net,
1692  .get_net_proto = tcp_get_net_proto,
1693 };
1694 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp6);