tcp_cong.c (Linux kernel 3.7.1)
/*
 * Pluggable TCP congestion control support and newReno
 * congestion control.
 * Based on ideas from I/O scheduler support and Web100.
 *
 * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/module.h>
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/list.h>
#include <linux/gfp.h>
#include <net/tcp.h>

int sysctl_tcp_max_ssthresh = 0;

static DEFINE_SPINLOCK(tcp_cong_list_lock);
static LIST_HEAD(tcp_cong_list);

/* Simple linear search, don't expect many entries! */
static struct tcp_congestion_ops *tcp_ca_find(const char *name)
{
	struct tcp_congestion_ops *e;

	list_for_each_entry_rcu(e, &tcp_cong_list, list) {
		if (strcmp(e->name, name) == 0)
			return e;
	}

	return NULL;
}

/*
 * Attach new congestion control algorithm to the list
 * of available options.
 */
int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
{
	int ret = 0;

	/* all algorithms must implement ssthresh and cong_avoid ops */
	if (!ca->ssthresh || !ca->cong_avoid) {
		pr_err("%s does not implement required ops\n", ca->name);
		return -EINVAL;
	}

	spin_lock(&tcp_cong_list_lock);
	if (tcp_ca_find(ca->name)) {
		pr_notice("%s already registered\n", ca->name);
		ret = -EEXIST;
	} else {
		list_add_tail_rcu(&ca->list, &tcp_cong_list);
		pr_info("%s registered\n", ca->name);
	}
	spin_unlock(&tcp_cong_list_lock);

	return ret;
}
EXPORT_SYMBOL_GPL(tcp_register_congestion_control);

/*
 * Remove congestion control algorithm, called from
 * the module's remove function. Module ref counts are used
 * to ensure that this can't be done till all sockets using
 * that method are closed.
 */
void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
{
	spin_lock(&tcp_cong_list_lock);
	list_del_rcu(&ca->list);
	spin_unlock(&tcp_cong_list_lock);
}
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);

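/*
 * Example (editor's illustration, not part of tcp_cong.c): a minimal
 * out-of-tree congestion control module written against the register/
 * unregister API above. It supplies the two required ops (ssthresh and
 * cong_avoid) by reusing the exported Reno helpers defined later in
 * this file, registers on load and unregisters on unload. The name
 * "trivial" and the module itself are hypothetical.
 */
#if 0	/* illustrative sketch, not part of this file */
#include <linux/module.h>
#include <net/tcp.h>

static struct tcp_congestion_ops tcp_trivial = {
	.name		= "trivial",
	.owner		= THIS_MODULE,
	.ssthresh	= tcp_reno_ssthresh,	/* halve cwnd on loss */
	.cong_avoid	= tcp_reno_cong_avoid,	/* slow start + AIMD */
};

static int __init tcp_trivial_init(void)
{
	/* fails with -EINVAL if a required op were missing,
	 * -EEXIST if the name is already registered */
	return tcp_register_congestion_control(&tcp_trivial);
}

static void __exit tcp_trivial_exit(void)
{
	tcp_unregister_congestion_control(&tcp_trivial);
}

module_init(tcp_trivial_init);
module_exit(tcp_trivial_exit);
MODULE_LICENSE("GPL");
#endif
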
/* Assign choice of congestion control. */
void tcp_init_congestion_control(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_congestion_ops *ca;

	/* if no choice made yet assign the current value set as default;
	 * the default is kept at the head of tcp_cong_list */
	if (icsk->icsk_ca_ops == &tcp_init_congestion_ops) {
		rcu_read_lock();
		list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
			if (try_module_get(ca->owner)) {
				icsk->icsk_ca_ops = ca;
				break;
			}

			/* fallback to next available */
		}
		rcu_read_unlock();
	}

	if (icsk->icsk_ca_ops->init)
		icsk->icsk_ca_ops->init(sk);
}

/* Manage refcounts on socket close. */
void tcp_cleanup_congestion_control(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (icsk->icsk_ca_ops->release)
		icsk->icsk_ca_ops->release(sk);
	module_put(icsk->icsk_ca_ops->owner);
}

/* Used by sysctl to change default congestion control */
int tcp_set_default_congestion_control(const char *name)
{
	struct tcp_congestion_ops *ca;
	int ret = -ENOENT;

	spin_lock(&tcp_cong_list_lock);
	ca = tcp_ca_find(name);
#ifdef CONFIG_MODULES
	if (!ca && capable(CAP_NET_ADMIN)) {
		spin_unlock(&tcp_cong_list_lock);

		request_module("tcp_%s", name);
		spin_lock(&tcp_cong_list_lock);
		ca = tcp_ca_find(name);
	}
#endif

	if (ca) {
		ca->flags |= TCP_CONG_NON_RESTRICTED;	/* default is always allowed */
		list_move(&ca->list, &tcp_cong_list);	/* move to head: head entry is the default */
		ret = 0;
	}
	spin_unlock(&tcp_cong_list_lock);

	return ret;
}
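
/*
 * Example (editor's illustration): this function backs the standard
 * sysctl, so an administrator switches the system-wide default with
 *
 *	sysctl -w net.ipv4.tcp_congestion_control=reno
 *
 * which, per the CONFIG_MODULES branch above, may autoload a
 * tcp_<name> module for names not yet registered.
 */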

/* Set default value from kernel configuration at bootup */
static int __init tcp_congestion_default(void)
{
	return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG);
}
late_initcall(tcp_congestion_default);


/* Build string with list of available congestion control values */
void tcp_get_available_congestion_control(char *buf, size_t maxlen)
{
	struct tcp_congestion_ops *ca;
	size_t offs = 0;

	rcu_read_lock();
	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
		offs += snprintf(buf + offs, maxlen - offs,
				 "%s%s",
				 offs == 0 ? "" : " ", ca->name);
	}
	rcu_read_unlock();
}

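/*
 * Example (editor's illustration): the strings built by the get_*
 * helpers here surface through procfs, e.g.
 *
 *	cat /proc/sys/net/ipv4/tcp_available_congestion_control
 *	cubic reno
 *
 * (the exact output depends on the configured default and on which
 * congestion control modules are loaded).
 */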
/* Get current default congestion control */
void tcp_get_default_congestion_control(char *name)
{
	struct tcp_congestion_ops *ca;
	/* We will always have reno... */
	BUG_ON(list_empty(&tcp_cong_list));

	rcu_read_lock();
	ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list);
	strncpy(name, ca->name, TCP_CA_NAME_MAX);
	rcu_read_unlock();
}

/* Build list of non-restricted congestion control values */
void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
{
	struct tcp_congestion_ops *ca;
	size_t offs = 0;

	*buf = '\0';
	rcu_read_lock();
	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
		if (!(ca->flags & TCP_CONG_NON_RESTRICTED))
			continue;
		offs += snprintf(buf + offs, maxlen - offs,
				 "%s%s",
				 offs == 0 ? "" : " ", ca->name);
	}
	rcu_read_unlock();
}

/* Change list of non-restricted congestion control */
int tcp_set_allowed_congestion_control(char *val)
{
	struct tcp_congestion_ops *ca;
	char *saved_clone, *clone, *name;
	int ret = 0;

	saved_clone = clone = kstrdup(val, GFP_USER);
	if (!clone)
		return -ENOMEM;

	spin_lock(&tcp_cong_list_lock);
	/* pass 1 check for bad entries */
	while ((name = strsep(&clone, " ")) && *name) {
		ca = tcp_ca_find(name);
		if (!ca) {
			ret = -ENOENT;
			goto out;
		}
	}

	/* pass 2 clear old values */
	list_for_each_entry_rcu(ca, &tcp_cong_list, list)
		ca->flags &= ~TCP_CONG_NON_RESTRICTED;

	/* pass 3 mark as allowed */
	while ((name = strsep(&val, " ")) && *name) {
		ca = tcp_ca_find(name);
		WARN_ON(!ca);
		if (ca)
			ca->flags |= TCP_CONG_NON_RESTRICTED;
	}
out:
	spin_unlock(&tcp_cong_list_lock);
	kfree(saved_clone);

	return ret;
}
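
/*
 * Example (editor's illustration): the allowed list is what the
 * corresponding sysctl writes through, restricting which algorithms
 * sockets without CAP_NET_ADMIN may select:
 *
 *	sysctl -w net.ipv4.tcp_allowed_congestion_control="reno cubic"
 */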


/* Change congestion control for socket */
int tcp_set_congestion_control(struct sock *sk, const char *name)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_congestion_ops *ca;
	int err = 0;

	rcu_read_lock();
	ca = tcp_ca_find(name);

	/* no change asking for existing value */
	if (ca == icsk->icsk_ca_ops)
		goto out;

#ifdef CONFIG_MODULES
	/* not found attempt to autoload module */
	if (!ca && capable(CAP_NET_ADMIN)) {
		rcu_read_unlock();
		request_module("tcp_%s", name);
		rcu_read_lock();
		ca = tcp_ca_find(name);
	}
#endif
	if (!ca)
		err = -ENOENT;

	else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || capable(CAP_NET_ADMIN)))
		err = -EPERM;

	else if (!try_module_get(ca->owner))
		err = -EBUSY;

	else {
		tcp_cleanup_congestion_control(sk);
		icsk->icsk_ca_ops = ca;

		if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init)
			icsk->icsk_ca_ops->init(sk);
	}
 out:
	rcu_read_unlock();
	return err;
}
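
/*
 * Example (editor's illustration): tcp_set_congestion_control() is
 * reached from userspace via the TCP_CONGESTION socket option. A
 * minimal sketch of a caller:
 */
#if 0	/* userspace sketch, not part of this file */
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

int use_reno(int fd)
{
	const char name[] = "reno";

	/* fails with ENOENT for unknown names, EPERM if the algorithm
	 * is restricted and the caller lacks CAP_NET_ADMIN */
	return setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
			  name, strlen(name));
}
#endif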

/* RFC2861 Check whether we are limited by application or congestion window
 * This is the inverse of cwnd check in tcp_tso_should_defer
 */
bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u32 left;

	if (in_flight >= tp->snd_cwnd)
		return true;

	left = tp->snd_cwnd - in_flight;
	if (sk_can_gso(sk) &&
	    left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd &&
	    left * tp->mss_cache < sk->sk_gso_max_size &&
	    left < sk->sk_gso_max_segs)
		return true;
	return left <= tcp_max_tso_deferred_mss(tp);
}
EXPORT_SYMBOL_GPL(tcp_is_cwnd_limited);

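/*
 * Example (editor's illustration): with snd_cwnd = 100 and
 * in_flight = 99 there is only one segment of headroom, so the sender
 * is effectively cwnd-limited and the function returns true, allowing
 * congestion avoidance to grow the window. With in_flight = 10 the
 * large remaining headroom means the sender is application-limited and
 * the function returns false, leaving the window alone.
 */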

/*
 * Slow start is used when congestion window is less than slow start
 * threshold. This version implements the basic RFC2581 version
 * and optionally supports:
 * RFC3742 Limited Slow Start	     - growth limited to max_ssthresh
 * RFC3465 Appropriate Byte Counting - growth limited by bytes acknowledged
 */
void tcp_slow_start(struct tcp_sock *tp)
{
	int cnt; /* increase in packets */
	unsigned int delta = 0;

	/* RFC3465: ABC Slow start
	 * Increase only after a full MSS of bytes is acked
	 *
	 * TCP sender SHOULD increase cwnd by the number of
	 * previously unacknowledged bytes ACKed by each incoming
	 * acknowledgment, provided the increase is not more than L
	 */
	if (sysctl_tcp_abc && tp->bytes_acked < tp->mss_cache)
		return;

	if (sysctl_tcp_max_ssthresh > 0 && tp->snd_cwnd > sysctl_tcp_max_ssthresh)
		cnt = sysctl_tcp_max_ssthresh >> 1;	/* limited slow start */
	else
		cnt = tp->snd_cwnd;			/* exponential increase */

	/* RFC3465: ABC
	 * We MAY increase by 2 if discovered delayed ack
	 */
	if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2*tp->mss_cache)
		cnt <<= 1;
	tp->bytes_acked = 0;

	tp->snd_cwnd_cnt += cnt;
	while (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
		tp->snd_cwnd_cnt -= tp->snd_cwnd;
		delta++;
	}
	tp->snd_cwnd = min(tp->snd_cwnd + delta, tp->snd_cwnd_clamp);
}
EXPORT_SYMBOL_GPL(tcp_slow_start);

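/*
 * Example (editor's illustration): with ABC and limited slow start
 * disabled, cnt = snd_cwnd, so each ACK pushes snd_cwnd_cnt past
 * snd_cwnd exactly once and delta = 1: the window grows by one segment
 * per ACK. With snd_cwnd = 10, one window's worth of ACKs (10) raises
 * snd_cwnd to 20, i.e. the classic doubling per round trip.
 */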
/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */
void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w)
{
	if (tp->snd_cwnd_cnt >= w) {
		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
			tp->snd_cwnd++;
		tp->snd_cwnd_cnt = 0;
	} else {
		tp->snd_cwnd_cnt++;
	}
}
EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);

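/*
 * Example (editor's illustration): called with w = snd_cwnd, as Reno
 * does below, snd_cwnd_cnt must count all the way up to snd_cwnd before
 * the window grows by one segment. So a full window's worth of ACKs
 * yields a single increment: additive increase of roughly one segment
 * per round trip.
 */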
/*
 * TCP Reno congestion control
 * This is a special case used for fallback as well.
 */
/* This is Jacobson's slow start and congestion avoidance.
 * SIGCOMM '88, p. 328.
 */
void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!tcp_is_cwnd_limited(sk, in_flight))
		return;

	/* In "safe" area, increase. */
	if (tp->snd_cwnd <= tp->snd_ssthresh)
		tcp_slow_start(tp);

	/* In dangerous area, increase slowly. */
	else if (sysctl_tcp_abc) {
		/* RFC3465: Appropriate Byte Counting
		 * increase once for each full cwnd acked
		 */
		if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) {
			tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache;
			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
				tp->snd_cwnd++;
		}
	} else {
		tcp_cong_avoid_ai(tp, tp->snd_cwnd);
	}
}
EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);

/* Slow start threshold is half the congestion window (min 2) */
u32 tcp_reno_ssthresh(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	return max(tp->snd_cwnd >> 1U, 2U);
}
EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);

/* Lower bound on congestion window with halving. */
u32 tcp_reno_min_cwnd(const struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	return tp->snd_ssthresh/2;
}
EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd);

struct tcp_congestion_ops tcp_reno = {
	.flags		= TCP_CONG_NON_RESTRICTED,
	.name		= "reno",
	.owner		= THIS_MODULE,
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.min_cwnd	= tcp_reno_min_cwnd,
};

/* Initial congestion control used (until SYN)
 * really reno under another name so we can tell the difference
 * during tcp_set_default_congestion_control
 */
struct tcp_congestion_ops tcp_init_congestion_ops = {
	.name		= "",
	.owner		= THIS_MODULE,
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.min_cwnd	= tcp_reno_min_cwnd,
};
EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);