linux/net/sched/sch_netem.c

1302 lines
31 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/*
* net/sched/sch_netem.c Network emulator
*
* Many of the algorithms and ideas for this came from
* NIST Net which is not copyrighted.
*
* Authors: Stephen Hemminger <shemminger@osdl.org>
* Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
*/
#include <linux/mm.h>
#include <linux/module.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 16:04:11 +08:00
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/rtnetlink.h>
#include <linux/reciprocal_div.h>
#include <linux/rbtree.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
#define VERSION "1.3"
/* Network Emulation Queuing algorithm.
====================================
Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
Network Emulation Tool
[2] Luigi Rizzo, DummyNet for FreeBSD
----------------------------------------------------------------
This started out as a simple way to delay outgoing packets to
test TCP but has grown to include most of the functionality
of a full blown network emulator like NISTnet. It can delay
packets and add random jitter (and correlation). The random
distribution can be loaded from a table as well to provide
normal, Pareto, or experimental curves. Packet loss,
duplication, and reordering can also be emulated.
This qdisc does not do classification that can be handled in
layering other disciplines. It does not need to do bandwidth
control either since that can be handled by using token
bucket or other rate control.
Correlated Loss Generator models
Added generation of correlated loss according to the
"Gilbert-Elliot" model, a 4-state markov model.
References:
[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
and intuitive loss model for packet networks and its implementation
in the Netem module in the Linux kernel", available in [1]
Authors: Stefano Salsano <stefano.salsano at uniroma2.it
Fabio Ludovici <fabio.ludovici at yahoo.it>
*/
struct disttable {
u32 size;
s16 table[];
};
struct netem_sched_data {
/* internal t(ime)fifo qdisc uses t_root and sch->limit */
struct rb_root t_root;
/* a linear queue; reduces rbtree rebalancing when jitter is low */
struct sk_buff *t_head;
struct sk_buff *t_tail;
/* optional qdisc for classful handling (NULL at netem init) */
struct Qdisc *qdisc;
struct qdisc_watchdog watchdog;
s64 latency;
s64 jitter;
u32 loss;
u32 ecn;
u32 limit;
u32 counter;
u32 gap;
u32 duplicate;
u32 reorder;
u32 corrupt;
u64 rate;
s32 packet_overhead;
u32 cell_size;
reciprocal_divide: update/correction of the algorithm Jakub Zawadzki noticed that some divisions by reciprocal_divide() were not correct [1][2], which he could also show with BPF code after divisions are transformed into reciprocal_value() for runtime invariance which can be passed to reciprocal_divide() later on; reverse in BPF dump ended up with a different, off-by-one K in some situations. This has been fixed by Eric Dumazet in commit aee636c4809fa5 ("bpf: do not use reciprocal divide"). This follow-up patch improves reciprocal_value() and reciprocal_divide() to work in all cases by using Granlund and Montgomery method, so that also future use is safe and without any non-obvious side-effects. Known problems with the old implementation were that division by 1 always returned 0 and some off-by-ones when the dividend and divisor where very large. This seemed to not be problematic with its current users, as far as we can tell. Eric Dumazet checked for the slab usage, we cannot surely say so in the case of flex_array. Still, in order to fix that, we propose an extension from the original implementation from commit 6a2d7a955d8d resp. [3][4], by using the algorithm proposed in "Division by Invariant Integers Using Multiplication" [5], Torbjörn Granlund and Peter L. Montgomery, that is, pseudocode for q = n/d where q, n, d is in u32 universe: 1) Initialization: int l = ceil(log_2 d) uword m' = floor((1<<32)*((1<<l)-d)/d)+1 int sh_1 = min(l,1) int sh_2 = max(l-1,0) 2) For q = n/d, all uword: uword t = (n*m')>>32 q = (t+((n-t)>>sh_1))>>sh_2 The assembler implementation from Agner Fog [6] also helped a lot while implementing. We have tested the implementation on x86_64, ppc64, i686, s390x; on x86_64/haswell we're still half the latency compared to normal divide. Joint work with Daniel Borkmann. [1] http://www.wireshark.org/~darkjames/reciprocal-buggy.c [2] http://www.wireshark.org/~darkjames/set-and-dump-filter-k-bug.c [3] https://gmplib.org/~tege/division-paper.pdf [4] http://homepage.cs.uiowa.edu/~jones/bcd/divide.html [5] http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556 [6] http://www.agner.org/optimize/asmlib.zip Reported-by: Jakub Zawadzki <darkjames-ws@darkjames.pl> Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: Austin S Hemmelgarn <ahferroin7@gmail.com> Cc: linux-kernel@vger.kernel.org Cc: Jesse Gross <jesse@nicira.com> Cc: Jamal Hadi Salim <jhs@mojatatu.com> Cc: Stephen Hemminger <stephen@networkplumber.org> Cc: Matt Mackall <mpm@selenic.com> Cc: Pekka Enberg <penberg@kernel.org> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Andy Gospodarek <andy@greyhouse.net> Cc: Veaceslav Falico <vfalico@redhat.com> Cc: Jay Vosburgh <fubar@us.ibm.com> Cc: Jakub Zawadzki <darkjames-ws@darkjames.pl> Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-22 09:29:41 +08:00
struct reciprocal_value cell_size_reciprocal;
s32 cell_overhead;
struct crndstate {
u32 last;
u32 rho;
} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
struct disttable *delay_dist;
enum {
CLG_RANDOM,
CLG_4_STATES,
CLG_GILB_ELL,
} loss_model;
enum {
TX_IN_GAP_PERIOD = 1,
TX_IN_BURST_PERIOD,
LOST_IN_GAP_PERIOD,
LOST_IN_BURST_PERIOD,
} _4_state_model;
enum {
GOOD_STATE = 1,
BAD_STATE,
} GE_state_model;
/* Correlated Loss Generation models */
struct clgstate {
/* state of the Markov chain */
u8 state;
/* 4-states and Gilbert-Elliot models */
u32 a1; /* p13 for 4-states or p for GE */
u32 a2; /* p31 for 4-states or r for GE */
u32 a3; /* p32 for 4-states or h for GE */
u32 a4; /* p14 for 4-states or 1-k for GE */
u32 a5; /* p23 used only in 4-states */
} clg;
struct tc_netem_slot slot_config;
struct slotstate {
u64 slot_next;
s32 packets_left;
s32 bytes_left;
} slot;
struct disttable *slot_dist;
};
/* Time stamp put into socket buffer control block
* Only valid when skbs are in our internal t(ime)fifo queue.
*
* As skb->rbnode uses same storage than skb->next, skb->prev and skb->tstamp,
* and skb->next & skb->prev are scratch space for a qdisc,
* we save skb->tstamp value in skb->cb[] before destroying it.
*/
struct netem_skb_cb {
u64 time_to_send;
};
static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
{
/* we assume we can use skb next/prev/tstamp as storage for rb_node */
qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
}
/* init_crandom - initialize correlated random number generator
* Use entropy source for initial seed.
*/
static void init_crandom(struct crndstate *state, unsigned long rho)
{
state->rho = rho;
state->last = prandom_u32();
}
/* get_crandom - correlated random number generator
* Next number depends on last value.
* rho is scaled to avoid floating point.
*/
static u32 get_crandom(struct crndstate *state)
{
u64 value, rho;
unsigned long answer;
if (!state || state->rho == 0) /* no correlation */
return prandom_u32();
value = prandom_u32();
rho = (u64)state->rho + 1;
answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
state->last = answer;
return answer;
}
/* loss_4state - 4-state model loss generator
* Generates losses according to the 4-state Markov chain adopted in
* the GI (General and Intuitive) loss model.
*/
static bool loss_4state(struct netem_sched_data *q)
{
struct clgstate *clg = &q->clg;
u32 rnd = prandom_u32();
/*
* Makes a comparison between rnd and the transition
* probabilities outgoing from the current state, then decides the
* next state and if the next packet has to be transmitted or lost.
* The four states correspond to:
* TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
* LOST_IN_BURST_PERIOD => isolated losses within a gap period
* LOST_IN_GAP_PERIOD => lost packets within a burst period
* TX_IN_GAP_PERIOD => successfully transmitted packets within a burst period
*/
switch (clg->state) {
case TX_IN_GAP_PERIOD:
if (rnd < clg->a4) {
clg->state = LOST_IN_BURST_PERIOD;
return true;
} else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
clg->state = LOST_IN_GAP_PERIOD;
return true;
} else if (clg->a1 + clg->a4 < rnd) {
clg->state = TX_IN_GAP_PERIOD;
}
break;
case TX_IN_BURST_PERIOD:
if (rnd < clg->a5) {
clg->state = LOST_IN_GAP_PERIOD;
return true;
} else {
clg->state = TX_IN_BURST_PERIOD;
}
break;
case LOST_IN_GAP_PERIOD:
if (rnd < clg->a3)
clg->state = TX_IN_BURST_PERIOD;
else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
clg->state = TX_IN_GAP_PERIOD;
} else if (clg->a2 + clg->a3 < rnd) {
clg->state = LOST_IN_GAP_PERIOD;
return true;
}
break;
case LOST_IN_BURST_PERIOD:
clg->state = TX_IN_GAP_PERIOD;
break;
}
return false;
}
/* loss_gilb_ell - Gilbert-Elliot model loss generator
* Generates losses according to the Gilbert-Elliot loss model or
* its special cases (Gilbert or Simple Gilbert)
*
* Makes a comparison between random number and the transition
* probabilities outgoing from the current state, then decides the
* next state. A second random number is extracted and the comparison
* with the loss probability of the current state decides if the next
* packet will be transmitted or lost.
*/
static bool loss_gilb_ell(struct netem_sched_data *q)
{
struct clgstate *clg = &q->clg;
switch (clg->state) {
case GOOD_STATE:
if (prandom_u32() < clg->a1)
clg->state = BAD_STATE;
if (prandom_u32() < clg->a4)
return true;
break;
case BAD_STATE:
if (prandom_u32() < clg->a2)
clg->state = GOOD_STATE;
if (prandom_u32() > clg->a3)
return true;
}
return false;
}
static bool loss_event(struct netem_sched_data *q)
{
switch (q->loss_model) {
case CLG_RANDOM:
/* Random packet drop 0 => none, ~0 => all */
return q->loss && q->loss >= get_crandom(&q->loss_cor);
case CLG_4_STATES:
/* 4state loss model algorithm (used also for GI model)
* Extracts a value from the markov 4 state loss generator,
* if it is 1 drops a packet and if needed writes the event in
* the kernel logs
*/
return loss_4state(q);
case CLG_GILB_ELL:
/* Gilbert-Elliot loss model algorithm
* Extracts a value from the Gilbert-Elliot loss generator,
* if it is 1 drops a packet and if needed writes the event in
* the kernel logs
*/
return loss_gilb_ell(q);
}
return false; /* not reached */
}
/* tabledist - return a pseudo-randomly distributed value with mean mu and
* std deviation sigma. Uses table lookup to approximate the desired
* distribution, and a uniformly-distributed pseudo-random source.
*/
static s64 tabledist(s64 mu, s32 sigma,
struct crndstate *state,
const struct disttable *dist)
{
s64 x;
long t;
u32 rnd;
if (sigma == 0)
return mu;
rnd = get_crandom(state);
/* default uniform distribution */
if (dist == NULL)
return ((rnd % (2 * (u32)sigma)) + mu) - sigma;
t = dist->table[rnd % dist->size];
x = (sigma % NETEM_DIST_SCALE) * t;
if (x >= 0)
x += NETEM_DIST_SCALE/2;
else
x -= NETEM_DIST_SCALE/2;
return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}
static u64 packet_time_ns(u64 len, const struct netem_sched_data *q)
netem: rate extension Currently netem is not in the ability to emulate channel bandwidth. Only static delay (and optional random jitter) can be configured. To emulate the channel rate the token bucket filter (sch_tbf) can be used. But TBF has some major emulation flaws. The buffer (token bucket depth/rate) cannot be 0. Also the idea behind TBF is that the credit (token in buckets) fills if no packet is transmitted. So that there is always a "positive" credit for new packets. In real life this behavior contradicts the law of nature where nothing can travel faster as speed of light. E.g.: on an emulated 1000 byte/s link a small IPv4/TCP SYN packet with ~50 byte require ~0.05 seconds - not 0 seconds. Netem is an excellent place to implement a rate limiting feature: static delay is already implemented, tfifo already has time information and the user can skip TBF configuration completely. This patch implement rate feature which can be configured via tc. e.g: tc qdisc add dev eth0 root netem rate 10kbit To emulate a link of 5000byte/s and add an additional static delay of 10ms: tc qdisc add dev eth0 root netem delay 10ms rate 5KBps Note: similar to TBF the rate extension is bounded to the kernel timing system. Depending on the architecture timer granularity, higher rates (e.g. 10mbit/s and higher) tend to transmission bursts. Also note: further queues living in network adaptors; see ethtool(8). Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net> Acked-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@drr.davemloft.net>
2011-11-30 20:20:26 +08:00
{
len += q->packet_overhead;
if (q->cell_size) {
u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
if (len > cells * q->cell_size) /* extra cell needed for remainder */
cells++;
len = cells * (q->cell_size + q->cell_overhead);
}
return div64_u64(len * NSEC_PER_SEC, q->rate);
netem: rate extension Currently netem is not in the ability to emulate channel bandwidth. Only static delay (and optional random jitter) can be configured. To emulate the channel rate the token bucket filter (sch_tbf) can be used. But TBF has some major emulation flaws. The buffer (token bucket depth/rate) cannot be 0. Also the idea behind TBF is that the credit (token in buckets) fills if no packet is transmitted. So that there is always a "positive" credit for new packets. In real life this behavior contradicts the law of nature where nothing can travel faster as speed of light. E.g.: on an emulated 1000 byte/s link a small IPv4/TCP SYN packet with ~50 byte require ~0.05 seconds - not 0 seconds. Netem is an excellent place to implement a rate limiting feature: static delay is already implemented, tfifo already has time information and the user can skip TBF configuration completely. This patch implement rate feature which can be configured via tc. e.g: tc qdisc add dev eth0 root netem rate 10kbit To emulate a link of 5000byte/s and add an additional static delay of 10ms: tc qdisc add dev eth0 root netem delay 10ms rate 5KBps Note: similar to TBF the rate extension is bounded to the kernel timing system. Depending on the architecture timer granularity, higher rates (e.g. 10mbit/s and higher) tend to transmission bursts. Also note: further queues living in network adaptors; see ethtool(8). Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net> Acked-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@drr.davemloft.net>
2011-11-30 20:20:26 +08:00
}
static void tfifo_reset(struct Qdisc *sch)
{
struct netem_sched_data *q = qdisc_priv(sch);
sch_netem: faster rb tree removal While running TCP tests involving netem storing millions of packets, I had the idea to speed up tfifo_reset() and did experiments. I tried the rbtree_postorder_for_each_entry_safe() method that is used in skb_rbtree_purge() but discovered it was slower than the current tfifo_reset() method. I measured time taken to release skbs with three occupation levels : 10^4, 10^5 and 10^6 skbs with three methods : 1) (current 'naive' method) while ((p = rb_first(&q->t_root))) { struct sk_buff *skb = netem_rb_to_skb(p); rb_erase(p, &q->t_root); rtnl_kfree_skbs(skb, skb); } 2) Use rb_next() instead of rb_first() in the loop : p = rb_first(&q->t_root); while (p) { struct sk_buff *skb = netem_rb_to_skb(p); p = rb_next(p); rb_erase(&skb->rbnode, &q->t_root); rtnl_kfree_skbs(skb, skb); } 3) "optimized" method using rbtree_postorder_for_each_entry_safe() struct sk_buff *skb, *next; rbtree_postorder_for_each_entry_safe(skb, next, &q->t_root, rbnode) { rtnl_kfree_skbs(skb, skb); } q->t_root = RB_ROOT; Results : method_1:while (rb_first()) rb_erase() 10000 skbs in 690378 ns (69 ns per skb) method_2:rb_first; while (p) { p = rb_next(p); ...} 10000 skbs in 541846 ns (54 ns per skb) method_3:rbtree_postorder_for_each_entry_safe() 10000 skbs in 868307 ns (86 ns per skb) method_1:while (rb_first()) rb_erase() 99996 skbs in 7804021 ns (78 ns per skb) method_2:rb_first; while (p) { p = rb_next(p); ...} 100000 skbs in 5942456 ns (59 ns per skb) method_3:rbtree_postorder_for_each_entry_safe() 100000 skbs in 11584940 ns (115 ns per skb) method_1:while (rb_first()) rb_erase() 1000000 skbs in 108577838 ns (108 ns per skb) method_2:rb_first; while (p) { p = rb_next(p); ...} 1000000 skbs in 82619635 ns (82 ns per skb) method_3:rbtree_postorder_for_each_entry_safe() 1000000 skbs in 127328743 ns (127 ns per skb) Method 2) is simply faster, probably because it maintains a smaller working size set. Note that this is the method we use in tcp_ofo_queue() already. I will also change skb_rbtree_purge() in a second patch. Signed-off-by: Eric Dumazet <edumazet@google.com> Acked-by: David Ahern <dsahern@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-09-24 02:07:28 +08:00
struct rb_node *p = rb_first(&q->t_root);
sch_netem: faster rb tree removal While running TCP tests involving netem storing millions of packets, I had the idea to speed up tfifo_reset() and did experiments. I tried the rbtree_postorder_for_each_entry_safe() method that is used in skb_rbtree_purge() but discovered it was slower than the current tfifo_reset() method. I measured time taken to release skbs with three occupation levels : 10^4, 10^5 and 10^6 skbs with three methods : 1) (current 'naive' method) while ((p = rb_first(&q->t_root))) { struct sk_buff *skb = netem_rb_to_skb(p); rb_erase(p, &q->t_root); rtnl_kfree_skbs(skb, skb); } 2) Use rb_next() instead of rb_first() in the loop : p = rb_first(&q->t_root); while (p) { struct sk_buff *skb = netem_rb_to_skb(p); p = rb_next(p); rb_erase(&skb->rbnode, &q->t_root); rtnl_kfree_skbs(skb, skb); } 3) "optimized" method using rbtree_postorder_for_each_entry_safe() struct sk_buff *skb, *next; rbtree_postorder_for_each_entry_safe(skb, next, &q->t_root, rbnode) { rtnl_kfree_skbs(skb, skb); } q->t_root = RB_ROOT; Results : method_1:while (rb_first()) rb_erase() 10000 skbs in 690378 ns (69 ns per skb) method_2:rb_first; while (p) { p = rb_next(p); ...} 10000 skbs in 541846 ns (54 ns per skb) method_3:rbtree_postorder_for_each_entry_safe() 10000 skbs in 868307 ns (86 ns per skb) method_1:while (rb_first()) rb_erase() 99996 skbs in 7804021 ns (78 ns per skb) method_2:rb_first; while (p) { p = rb_next(p); ...} 100000 skbs in 5942456 ns (59 ns per skb) method_3:rbtree_postorder_for_each_entry_safe() 100000 skbs in 11584940 ns (115 ns per skb) method_1:while (rb_first()) rb_erase() 1000000 skbs in 108577838 ns (108 ns per skb) method_2:rb_first; while (p) { p = rb_next(p); ...} 1000000 skbs in 82619635 ns (82 ns per skb) method_3:rbtree_postorder_for_each_entry_safe() 1000000 skbs in 127328743 ns (127 ns per skb) Method 2) is simply faster, probably because it maintains a smaller working size set. Note that this is the method we use in tcp_ofo_queue() already. I will also change skb_rbtree_purge() in a second patch. Signed-off-by: Eric Dumazet <edumazet@google.com> Acked-by: David Ahern <dsahern@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-09-24 02:07:28 +08:00
while (p) {
struct sk_buff *skb = rb_to_skb(p);
sch_netem: faster rb tree removal While running TCP tests involving netem storing millions of packets, I had the idea to speed up tfifo_reset() and did experiments. I tried the rbtree_postorder_for_each_entry_safe() method that is used in skb_rbtree_purge() but discovered it was slower than the current tfifo_reset() method. I measured time taken to release skbs with three occupation levels : 10^4, 10^5 and 10^6 skbs with three methods : 1) (current 'naive' method) while ((p = rb_first(&q->t_root))) { struct sk_buff *skb = netem_rb_to_skb(p); rb_erase(p, &q->t_root); rtnl_kfree_skbs(skb, skb); } 2) Use rb_next() instead of rb_first() in the loop : p = rb_first(&q->t_root); while (p) { struct sk_buff *skb = netem_rb_to_skb(p); p = rb_next(p); rb_erase(&skb->rbnode, &q->t_root); rtnl_kfree_skbs(skb, skb); } 3) "optimized" method using rbtree_postorder_for_each_entry_safe() struct sk_buff *skb, *next; rbtree_postorder_for_each_entry_safe(skb, next, &q->t_root, rbnode) { rtnl_kfree_skbs(skb, skb); } q->t_root = RB_ROOT; Results : method_1:while (rb_first()) rb_erase() 10000 skbs in 690378 ns (69 ns per skb) method_2:rb_first; while (p) { p = rb_next(p); ...} 10000 skbs in 541846 ns (54 ns per skb) method_3:rbtree_postorder_for_each_entry_safe() 10000 skbs in 868307 ns (86 ns per skb) method_1:while (rb_first()) rb_erase() 99996 skbs in 7804021 ns (78 ns per skb) method_2:rb_first; while (p) { p = rb_next(p); ...} 100000 skbs in 5942456 ns (59 ns per skb) method_3:rbtree_postorder_for_each_entry_safe() 100000 skbs in 11584940 ns (115 ns per skb) method_1:while (rb_first()) rb_erase() 1000000 skbs in 108577838 ns (108 ns per skb) method_2:rb_first; while (p) { p = rb_next(p); ...} 1000000 skbs in 82619635 ns (82 ns per skb) method_3:rbtree_postorder_for_each_entry_safe() 1000000 skbs in 127328743 ns (127 ns per skb) Method 2) is simply faster, probably because it maintains a smaller working size set. Note that this is the method we use in tcp_ofo_queue() already. I will also change skb_rbtree_purge() in a second patch. Signed-off-by: Eric Dumazet <edumazet@google.com> Acked-by: David Ahern <dsahern@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-09-24 02:07:28 +08:00
p = rb_next(p);
rb_erase(&skb->rbnode, &q->t_root);
rtnl_kfree_skbs(skb, skb);
}
rtnl_kfree_skbs(q->t_head, q->t_tail);
q->t_head = NULL;
q->t_tail = NULL;
}
static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
{
struct netem_sched_data *q = qdisc_priv(sch);
u64 tnext = netem_skb_cb(nskb)->time_to_send;
if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) {
if (q->t_tail)
q->t_tail->next = nskb;
else
q->t_head = nskb;
q->t_tail = nskb;
} else {
struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
while (*p) {
struct sk_buff *skb;
parent = *p;
skb = rb_to_skb(parent);
if (tnext >= netem_skb_cb(skb)->time_to_send)
p = &parent->rb_right;
else
p = &parent->rb_left;
}
rb_link_node(&nskb->rbnode, parent, p);
rb_insert_color(&nskb->rbnode, &q->t_root);
}
sch->q.qlen++;
}
netem: Segment GSO packets on enqueue This was recently reported to me, and reproduced on the latest net kernel, when attempting to run netperf from a host that had a netem qdisc attached to the egress interface: [ 788.073771] ---------------------[ cut here ]--------------------------- [ 788.096716] WARNING: at net/core/dev.c:2253 skb_warn_bad_offload+0xcd/0xda() [ 788.129521] bnx2: caps=(0x00000001801949b3, 0x0000000000000000) len=2962 data_len=0 gso_size=1448 gso_type=1 ip_summed=3 [ 788.182150] Modules linked in: sch_netem kvm_amd kvm crc32_pclmul ipmi_ssif ghash_clmulni_intel sp5100_tco amd64_edac_mod aesni_intel lrw gf128mul glue_helper ablk_helper edac_mce_amd cryptd pcspkr sg edac_core hpilo ipmi_si i2c_piix4 k10temp fam15h_power hpwdt ipmi_msghandler shpchp acpi_power_meter pcc_cpufreq nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sd_mod crc_t10dif crct10dif_generic mgag200 syscopyarea sysfillrect sysimgblt i2c_algo_bit drm_kms_helper ahci ata_generic pata_acpi ttm libahci crct10dif_pclmul pata_atiixp tg3 libata crct10dif_common drm crc32c_intel ptp serio_raw bnx2 r8169 hpsa pps_core i2c_core mii dm_mirror dm_region_hash dm_log dm_mod [ 788.465294] CPU: 16 PID: 0 Comm: swapper/16 Tainted: G W ------------ 3.10.0-327.el7.x86_64 #1 [ 788.511521] Hardware name: HP ProLiant DL385p Gen8, BIOS A28 12/17/2012 [ 788.542260] ffff880437c036b8 f7afc56532a53db9 ffff880437c03670 ffffffff816351f1 [ 788.576332] ffff880437c036a8 ffffffff8107b200 ffff880633e74200 ffff880231674000 [ 788.611943] 0000000000000001 0000000000000003 0000000000000000 ffff880437c03710 [ 788.647241] Call Trace: [ 788.658817] <IRQ> [<ffffffff816351f1>] dump_stack+0x19/0x1b [ 788.686193] [<ffffffff8107b200>] warn_slowpath_common+0x70/0xb0 [ 788.713803] [<ffffffff8107b29c>] warn_slowpath_fmt+0x5c/0x80 [ 788.741314] [<ffffffff812f92f3>] ? ___ratelimit+0x93/0x100 [ 788.767018] [<ffffffff81637f49>] skb_warn_bad_offload+0xcd/0xda [ 788.796117] [<ffffffff8152950c>] skb_checksum_help+0x17c/0x190 [ 788.823392] [<ffffffffa01463a1>] netem_enqueue+0x741/0x7c0 [sch_netem] [ 788.854487] [<ffffffff8152cb58>] dev_queue_xmit+0x2a8/0x570 [ 788.880870] [<ffffffff8156ae1d>] ip_finish_output+0x53d/0x7d0 ... The problem occurs because netem is not prepared to handle GSO packets (as it uses skb_checksum_help in its enqueue path, which cannot manipulate these frames). The solution I think is to simply segment the skb in a simmilar fashion to the way we do in __dev_queue_xmit (via validate_xmit_skb), with some minor changes. When we decide to corrupt an skb, if the frame is GSO, we segment it, corrupt the first segment, and enqueue the remaining ones. tested successfully by myself on the latest net kernel, to which this applies Signed-off-by: Neil Horman <nhorman@tuxdriver.com> CC: Jamal Hadi Salim <jhs@mojatatu.com> CC: "David S. Miller" <davem@davemloft.net> CC: netem@lists.linux-foundation.org CC: eric.dumazet@gmail.com CC: stephen@networkplumber.org Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-05-03 00:20:15 +08:00
/* netem can't properly corrupt a megapacket (like we get from GSO), so instead
* when we statistically choose to corrupt one, we instead segment it, returning
* the first packet to be corrupted, and re-enqueue the remaining frames
*/
static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
netem: Segment GSO packets on enqueue This was recently reported to me, and reproduced on the latest net kernel, when attempting to run netperf from a host that had a netem qdisc attached to the egress interface: [ 788.073771] ---------------------[ cut here ]--------------------------- [ 788.096716] WARNING: at net/core/dev.c:2253 skb_warn_bad_offload+0xcd/0xda() [ 788.129521] bnx2: caps=(0x00000001801949b3, 0x0000000000000000) len=2962 data_len=0 gso_size=1448 gso_type=1 ip_summed=3 [ 788.182150] Modules linked in: sch_netem kvm_amd kvm crc32_pclmul ipmi_ssif ghash_clmulni_intel sp5100_tco amd64_edac_mod aesni_intel lrw gf128mul glue_helper ablk_helper edac_mce_amd cryptd pcspkr sg edac_core hpilo ipmi_si i2c_piix4 k10temp fam15h_power hpwdt ipmi_msghandler shpchp acpi_power_meter pcc_cpufreq nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sd_mod crc_t10dif crct10dif_generic mgag200 syscopyarea sysfillrect sysimgblt i2c_algo_bit drm_kms_helper ahci ata_generic pata_acpi ttm libahci crct10dif_pclmul pata_atiixp tg3 libata crct10dif_common drm crc32c_intel ptp serio_raw bnx2 r8169 hpsa pps_core i2c_core mii dm_mirror dm_region_hash dm_log dm_mod [ 788.465294] CPU: 16 PID: 0 Comm: swapper/16 Tainted: G W ------------ 3.10.0-327.el7.x86_64 #1 [ 788.511521] Hardware name: HP ProLiant DL385p Gen8, BIOS A28 12/17/2012 [ 788.542260] ffff880437c036b8 f7afc56532a53db9 ffff880437c03670 ffffffff816351f1 [ 788.576332] ffff880437c036a8 ffffffff8107b200 ffff880633e74200 ffff880231674000 [ 788.611943] 0000000000000001 0000000000000003 0000000000000000 ffff880437c03710 [ 788.647241] Call Trace: [ 788.658817] <IRQ> [<ffffffff816351f1>] dump_stack+0x19/0x1b [ 788.686193] [<ffffffff8107b200>] warn_slowpath_common+0x70/0xb0 [ 788.713803] [<ffffffff8107b29c>] warn_slowpath_fmt+0x5c/0x80 [ 788.741314] [<ffffffff812f92f3>] ? ___ratelimit+0x93/0x100 [ 788.767018] [<ffffffff81637f49>] skb_warn_bad_offload+0xcd/0xda [ 788.796117] [<ffffffff8152950c>] skb_checksum_help+0x17c/0x190 [ 788.823392] [<ffffffffa01463a1>] netem_enqueue+0x741/0x7c0 [sch_netem] [ 788.854487] [<ffffffff8152cb58>] dev_queue_xmit+0x2a8/0x570 [ 788.880870] [<ffffffff8156ae1d>] ip_finish_output+0x53d/0x7d0 ... The problem occurs because netem is not prepared to handle GSO packets (as it uses skb_checksum_help in its enqueue path, which cannot manipulate these frames). The solution I think is to simply segment the skb in a simmilar fashion to the way we do in __dev_queue_xmit (via validate_xmit_skb), with some minor changes. When we decide to corrupt an skb, if the frame is GSO, we segment it, corrupt the first segment, and enqueue the remaining ones. tested successfully by myself on the latest net kernel, to which this applies Signed-off-by: Neil Horman <nhorman@tuxdriver.com> CC: Jamal Hadi Salim <jhs@mojatatu.com> CC: "David S. Miller" <davem@davemloft.net> CC: netem@lists.linux-foundation.org CC: eric.dumazet@gmail.com CC: stephen@networkplumber.org Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-05-03 00:20:15 +08:00
{
struct sk_buff *segs;
netdev_features_t features = netif_skb_features(skb);
segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
if (IS_ERR_OR_NULL(segs)) {
qdisc_drop(skb, sch, to_free);
netem: Segment GSO packets on enqueue This was recently reported to me, and reproduced on the latest net kernel, when attempting to run netperf from a host that had a netem qdisc attached to the egress interface: [ 788.073771] ---------------------[ cut here ]--------------------------- [ 788.096716] WARNING: at net/core/dev.c:2253 skb_warn_bad_offload+0xcd/0xda() [ 788.129521] bnx2: caps=(0x00000001801949b3, 0x0000000000000000) len=2962 data_len=0 gso_size=1448 gso_type=1 ip_summed=3 [ 788.182150] Modules linked in: sch_netem kvm_amd kvm crc32_pclmul ipmi_ssif ghash_clmulni_intel sp5100_tco amd64_edac_mod aesni_intel lrw gf128mul glue_helper ablk_helper edac_mce_amd cryptd pcspkr sg edac_core hpilo ipmi_si i2c_piix4 k10temp fam15h_power hpwdt ipmi_msghandler shpchp acpi_power_meter pcc_cpufreq nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sd_mod crc_t10dif crct10dif_generic mgag200 syscopyarea sysfillrect sysimgblt i2c_algo_bit drm_kms_helper ahci ata_generic pata_acpi ttm libahci crct10dif_pclmul pata_atiixp tg3 libata crct10dif_common drm crc32c_intel ptp serio_raw bnx2 r8169 hpsa pps_core i2c_core mii dm_mirror dm_region_hash dm_log dm_mod [ 788.465294] CPU: 16 PID: 0 Comm: swapper/16 Tainted: G W ------------ 3.10.0-327.el7.x86_64 #1 [ 788.511521] Hardware name: HP ProLiant DL385p Gen8, BIOS A28 12/17/2012 [ 788.542260] ffff880437c036b8 f7afc56532a53db9 ffff880437c03670 ffffffff816351f1 [ 788.576332] ffff880437c036a8 ffffffff8107b200 ffff880633e74200 ffff880231674000 [ 788.611943] 0000000000000001 0000000000000003 0000000000000000 ffff880437c03710 [ 788.647241] Call Trace: [ 788.658817] <IRQ> [<ffffffff816351f1>] dump_stack+0x19/0x1b [ 788.686193] [<ffffffff8107b200>] warn_slowpath_common+0x70/0xb0 [ 788.713803] [<ffffffff8107b29c>] warn_slowpath_fmt+0x5c/0x80 [ 788.741314] [<ffffffff812f92f3>] ? ___ratelimit+0x93/0x100 [ 788.767018] [<ffffffff81637f49>] skb_warn_bad_offload+0xcd/0xda [ 788.796117] [<ffffffff8152950c>] skb_checksum_help+0x17c/0x190 [ 788.823392] [<ffffffffa01463a1>] netem_enqueue+0x741/0x7c0 [sch_netem] [ 788.854487] [<ffffffff8152cb58>] dev_queue_xmit+0x2a8/0x570 [ 788.880870] [<ffffffff8156ae1d>] ip_finish_output+0x53d/0x7d0 ... The problem occurs because netem is not prepared to handle GSO packets (as it uses skb_checksum_help in its enqueue path, which cannot manipulate these frames). The solution I think is to simply segment the skb in a simmilar fashion to the way we do in __dev_queue_xmit (via validate_xmit_skb), with some minor changes. When we decide to corrupt an skb, if the frame is GSO, we segment it, corrupt the first segment, and enqueue the remaining ones. tested successfully by myself on the latest net kernel, to which this applies Signed-off-by: Neil Horman <nhorman@tuxdriver.com> CC: Jamal Hadi Salim <jhs@mojatatu.com> CC: "David S. Miller" <davem@davemloft.net> CC: netem@lists.linux-foundation.org CC: eric.dumazet@gmail.com CC: stephen@networkplumber.org Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-05-03 00:20:15 +08:00
return NULL;
}
consume_skb(skb);
return segs;
}
/*
* Insert one skb into qdisc.
* Note: parent depends on return value to account for queue length.
* NET_XMIT_DROP: queue length didn't change.
* NET_XMIT_SUCCESS: one skb was queued.
*/
static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
struct netem_sched_data *q = qdisc_priv(sch);
/* We don't fill cb now as skb_unshare() may invalidate it */
struct netem_skb_cb *cb;
struct sk_buff *skb2;
netem: Segment GSO packets on enqueue This was recently reported to me, and reproduced on the latest net kernel, when attempting to run netperf from a host that had a netem qdisc attached to the egress interface: [ 788.073771] ---------------------[ cut here ]--------------------------- [ 788.096716] WARNING: at net/core/dev.c:2253 skb_warn_bad_offload+0xcd/0xda() [ 788.129521] bnx2: caps=(0x00000001801949b3, 0x0000000000000000) len=2962 data_len=0 gso_size=1448 gso_type=1 ip_summed=3 [ 788.182150] Modules linked in: sch_netem kvm_amd kvm crc32_pclmul ipmi_ssif ghash_clmulni_intel sp5100_tco amd64_edac_mod aesni_intel lrw gf128mul glue_helper ablk_helper edac_mce_amd cryptd pcspkr sg edac_core hpilo ipmi_si i2c_piix4 k10temp fam15h_power hpwdt ipmi_msghandler shpchp acpi_power_meter pcc_cpufreq nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sd_mod crc_t10dif crct10dif_generic mgag200 syscopyarea sysfillrect sysimgblt i2c_algo_bit drm_kms_helper ahci ata_generic pata_acpi ttm libahci crct10dif_pclmul pata_atiixp tg3 libata crct10dif_common drm crc32c_intel ptp serio_raw bnx2 r8169 hpsa pps_core i2c_core mii dm_mirror dm_region_hash dm_log dm_mod [ 788.465294] CPU: 16 PID: 0 Comm: swapper/16 Tainted: G W ------------ 3.10.0-327.el7.x86_64 #1 [ 788.511521] Hardware name: HP ProLiant DL385p Gen8, BIOS A28 12/17/2012 [ 788.542260] ffff880437c036b8 f7afc56532a53db9 ffff880437c03670 ffffffff816351f1 [ 788.576332] ffff880437c036a8 ffffffff8107b200 ffff880633e74200 ffff880231674000 [ 788.611943] 0000000000000001 0000000000000003 0000000000000000 ffff880437c03710 [ 788.647241] Call Trace: [ 788.658817] <IRQ> [<ffffffff816351f1>] dump_stack+0x19/0x1b [ 788.686193] [<ffffffff8107b200>] warn_slowpath_common+0x70/0xb0 [ 788.713803] [<ffffffff8107b29c>] warn_slowpath_fmt+0x5c/0x80 [ 788.741314] [<ffffffff812f92f3>] ? ___ratelimit+0x93/0x100 [ 788.767018] [<ffffffff81637f49>] skb_warn_bad_offload+0xcd/0xda [ 788.796117] [<ffffffff8152950c>] skb_checksum_help+0x17c/0x190 [ 788.823392] [<ffffffffa01463a1>] netem_enqueue+0x741/0x7c0 [sch_netem] [ 788.854487] [<ffffffff8152cb58>] dev_queue_xmit+0x2a8/0x570 [ 788.880870] [<ffffffff8156ae1d>] ip_finish_output+0x53d/0x7d0 ... The problem occurs because netem is not prepared to handle GSO packets (as it uses skb_checksum_help in its enqueue path, which cannot manipulate these frames). The solution I think is to simply segment the skb in a simmilar fashion to the way we do in __dev_queue_xmit (via validate_xmit_skb), with some minor changes. When we decide to corrupt an skb, if the frame is GSO, we segment it, corrupt the first segment, and enqueue the remaining ones. tested successfully by myself on the latest net kernel, to which this applies Signed-off-by: Neil Horman <nhorman@tuxdriver.com> CC: Jamal Hadi Salim <jhs@mojatatu.com> CC: "David S. Miller" <davem@davemloft.net> CC: netem@lists.linux-foundation.org CC: eric.dumazet@gmail.com CC: stephen@networkplumber.org Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-05-03 00:20:15 +08:00
struct sk_buff *segs = NULL;
unsigned int prev_len = qdisc_pkt_len(skb);
int count = 1;
netem: Segment GSO packets on enqueue This was recently reported to me, and reproduced on the latest net kernel, when attempting to run netperf from a host that had a netem qdisc attached to the egress interface: [ 788.073771] ---------------------[ cut here ]--------------------------- [ 788.096716] WARNING: at net/core/dev.c:2253 skb_warn_bad_offload+0xcd/0xda() [ 788.129521] bnx2: caps=(0x00000001801949b3, 0x0000000000000000) len=2962 data_len=0 gso_size=1448 gso_type=1 ip_summed=3 [ 788.182150] Modules linked in: sch_netem kvm_amd kvm crc32_pclmul ipmi_ssif ghash_clmulni_intel sp5100_tco amd64_edac_mod aesni_intel lrw gf128mul glue_helper ablk_helper edac_mce_amd cryptd pcspkr sg edac_core hpilo ipmi_si i2c_piix4 k10temp fam15h_power hpwdt ipmi_msghandler shpchp acpi_power_meter pcc_cpufreq nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sd_mod crc_t10dif crct10dif_generic mgag200 syscopyarea sysfillrect sysimgblt i2c_algo_bit drm_kms_helper ahci ata_generic pata_acpi ttm libahci crct10dif_pclmul pata_atiixp tg3 libata crct10dif_common drm crc32c_intel ptp serio_raw bnx2 r8169 hpsa pps_core i2c_core mii dm_mirror dm_region_hash dm_log dm_mod [ 788.465294] CPU: 16 PID: 0 Comm: swapper/16 Tainted: G W ------------ 3.10.0-327.el7.x86_64 #1 [ 788.511521] Hardware name: HP ProLiant DL385p Gen8, BIOS A28 12/17/2012 [ 788.542260] ffff880437c036b8 f7afc56532a53db9 ffff880437c03670 ffffffff816351f1 [ 788.576332] ffff880437c036a8 ffffffff8107b200 ffff880633e74200 ffff880231674000 [ 788.611943] 0000000000000001 0000000000000003 0000000000000000 ffff880437c03710 [ 788.647241] Call Trace: [ 788.658817] <IRQ> [<ffffffff816351f1>] dump_stack+0x19/0x1b [ 788.686193] [<ffffffff8107b200>] warn_slowpath_common+0x70/0xb0 [ 788.713803] [<ffffffff8107b29c>] warn_slowpath_fmt+0x5c/0x80 [ 788.741314] [<ffffffff812f92f3>] ? ___ratelimit+0x93/0x100 [ 788.767018] [<ffffffff81637f49>] skb_warn_bad_offload+0xcd/0xda [ 788.796117] [<ffffffff8152950c>] skb_checksum_help+0x17c/0x190 [ 788.823392] [<ffffffffa01463a1>] netem_enqueue+0x741/0x7c0 [sch_netem] [ 788.854487] [<ffffffff8152cb58>] dev_queue_xmit+0x2a8/0x570 [ 788.880870] [<ffffffff8156ae1d>] ip_finish_output+0x53d/0x7d0 ... The problem occurs because netem is not prepared to handle GSO packets (as it uses skb_checksum_help in its enqueue path, which cannot manipulate these frames). The solution I think is to simply segment the skb in a simmilar fashion to the way we do in __dev_queue_xmit (via validate_xmit_skb), with some minor changes. When we decide to corrupt an skb, if the frame is GSO, we segment it, corrupt the first segment, and enqueue the remaining ones. tested successfully by myself on the latest net kernel, to which this applies Signed-off-by: Neil Horman <nhorman@tuxdriver.com> CC: Jamal Hadi Salim <jhs@mojatatu.com> CC: "David S. Miller" <davem@davemloft.net> CC: netem@lists.linux-foundation.org CC: eric.dumazet@gmail.com CC: stephen@networkplumber.org Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-05-03 00:20:15 +08:00
int rc = NET_XMIT_SUCCESS;
net: netem: fix skb length BUG_ON in __skb_to_sgvec It can be reproduced by following steps: 1. virtio_net NIC is configured with gso/tso on 2. configure nginx as http server with an index file bigger than 1M bytes 3. use tc netem to produce duplicate packets and delay: tc qdisc add dev eth0 root netem delay 100ms 10ms 30% duplicate 90% 4. continually curl the nginx http server to get index file on client 5. BUG_ON is seen quickly [10258690.371129] kernel BUG at net/core/skbuff.c:4028! [10258690.371748] invalid opcode: 0000 [#1] SMP PTI [10258690.372094] CPU: 5 PID: 0 Comm: swapper/5 Tainted: G W 5.0.0-rc6 #2 [10258690.372094] RSP: 0018:ffffa05797b43da0 EFLAGS: 00010202 [10258690.372094] RBP: 00000000000005ea R08: 0000000000000000 R09: 00000000000005ea [10258690.372094] R10: ffffa0579334d800 R11: 00000000000002c0 R12: 0000000000000002 [10258690.372094] R13: 0000000000000000 R14: ffffa05793122900 R15: ffffa0578f7cb028 [10258690.372094] FS: 0000000000000000(0000) GS:ffffa05797b40000(0000) knlGS:0000000000000000 [10258690.372094] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [10258690.372094] CR2: 00007f1a6dc00868 CR3: 000000001000e000 CR4: 00000000000006e0 [10258690.372094] Call Trace: [10258690.372094] <IRQ> [10258690.372094] skb_to_sgvec+0x11/0x40 [10258690.372094] start_xmit+0x38c/0x520 [virtio_net] [10258690.372094] dev_hard_start_xmit+0x9b/0x200 [10258690.372094] sch_direct_xmit+0xff/0x260 [10258690.372094] __qdisc_run+0x15e/0x4e0 [10258690.372094] net_tx_action+0x137/0x210 [10258690.372094] __do_softirq+0xd6/0x2a9 [10258690.372094] irq_exit+0xde/0xf0 [10258690.372094] smp_apic_timer_interrupt+0x74/0x140 [10258690.372094] apic_timer_interrupt+0xf/0x20 [10258690.372094] </IRQ> In __skb_to_sgvec(), the skb->len is not equal to the sum of the skb's linear data size and nonlinear data size, thus BUG_ON triggered. Because the skb is cloned and a part of nonlinear data is split off. Duplicate packet is cloned in netem_enqueue() and may be delayed some time in qdisc. When qdisc len reached the limit and returns NET_XMIT_DROP, the skb will be retransmit later in write queue. the skb will be fragmented by tso_fragment(), the limit size that depends on cwnd and mss decrease, the skb's nonlinear data will be split off. The length of the skb cloned by netem will not be updated. When we use virtio_net NIC and invoke skb_to_sgvec(), the BUG_ON trigger. To fix it, netem returns NET_XMIT_SUCCESS to upper stack when it clones a duplicate packet. Fixes: 35d889d1 ("sch_netem: fix skb leak in netem_enqueue()") Signed-off-by: Sheng Lan <lansheng@huawei.com> Reported-by: Qin Ji <jiqin.ji@huawei.com> Suggested-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-02-28 18:47:58 +08:00
int rc_drop = NET_XMIT_DROP;
net: Prevent invalid access to skb->prev in __qdisc_drop_all __qdisc_drop_all() accesses skb->prev to get to the tail of the segment-list. With commit 68d2f84a1368 ("net: gro: properly remove skb from list") the skb-list handling has been changed to set skb->next to NULL and set the list-poison on skb->prev. With that change, __qdisc_drop_all() will panic when it tries to dereference skb->prev. Since commit 992cba7e276d ("net: Add and use skb_list_del_init().") __list_del_entry is used, leaving skb->prev unchanged (thus, pointing to the list-head if it's the first skb of the list). This will make __qdisc_drop_all modify the next-pointer of the list-head and result in a panic later on: [ 34.501053] general protection fault: 0000 [#1] SMP KASAN PTI [ 34.501968] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.20.0-rc2.mptcp #108 [ 34.502887] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.5.1 01/01/2011 [ 34.504074] RIP: 0010:dev_gro_receive+0x343/0x1f90 [ 34.504751] Code: e0 48 c1 e8 03 42 80 3c 30 00 0f 85 4a 1c 00 00 4d 8b 24 24 4c 39 65 d0 0f 84 0a 04 00 00 49 8d 7c 24 38 48 89 f8 48 c1 e8 03 <42> 0f b6 04 30 84 c0 74 08 3c 04 [ 34.507060] RSP: 0018:ffff8883af507930 EFLAGS: 00010202 [ 34.507761] RAX: 0000000000000007 RBX: ffff8883970b2c80 RCX: 1ffff11072e165a6 [ 34.508640] RDX: 1ffff11075867008 RSI: ffff8883ac338040 RDI: 0000000000000038 [ 34.509493] RBP: ffff8883af5079d0 R08: ffff8883970b2d40 R09: 0000000000000062 [ 34.510346] R10: 0000000000000034 R11: 0000000000000000 R12: 0000000000000000 [ 34.511215] R13: 0000000000000000 R14: dffffc0000000000 R15: ffff8883ac338008 [ 34.512082] FS: 0000000000000000(0000) GS:ffff8883af500000(0000) knlGS:0000000000000000 [ 34.513036] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 34.513741] CR2: 000055ccc3e9d020 CR3: 00000003abf32000 CR4: 00000000000006e0 [ 34.514593] Call Trace: [ 34.514893] <IRQ> [ 34.515157] napi_gro_receive+0x93/0x150 [ 34.515632] receive_buf+0x893/0x3700 [ 34.516094] ? __netif_receive_skb+0x1f/0x1a0 [ 34.516629] ? virtnet_probe+0x1b40/0x1b40 [ 34.517153] ? __stable_node_chain+0x4d0/0x850 [ 34.517684] ? kfree+0x9a/0x180 [ 34.518067] ? __kasan_slab_free+0x171/0x190 [ 34.518582] ? detach_buf+0x1df/0x650 [ 34.519061] ? lapic_next_event+0x5a/0x90 [ 34.519539] ? virtqueue_get_buf_ctx+0x280/0x7f0 [ 34.520093] virtnet_poll+0x2df/0xd60 [ 34.520533] ? receive_buf+0x3700/0x3700 [ 34.521027] ? qdisc_watchdog_schedule_ns+0xd5/0x140 [ 34.521631] ? htb_dequeue+0x1817/0x25f0 [ 34.522107] ? sch_direct_xmit+0x142/0xf30 [ 34.522595] ? virtqueue_napi_schedule+0x26/0x30 [ 34.523155] net_rx_action+0x2f6/0xc50 [ 34.523601] ? napi_complete_done+0x2f0/0x2f0 [ 34.524126] ? kasan_check_read+0x11/0x20 [ 34.524608] ? _raw_spin_lock+0x7d/0xd0 [ 34.525070] ? _raw_spin_lock_bh+0xd0/0xd0 [ 34.525563] ? kvm_guest_apic_eoi_write+0x6b/0x80 [ 34.526130] ? apic_ack_irq+0x9e/0xe0 [ 34.526567] __do_softirq+0x188/0x4b5 [ 34.527015] irq_exit+0x151/0x180 [ 34.527417] do_IRQ+0xdb/0x150 [ 34.527783] common_interrupt+0xf/0xf [ 34.528223] </IRQ> This patch makes sure that skb->prev is set to NULL when entering netem_enqueue. Cc: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp> Cc: Tyler Hicks <tyhicks@canonical.com> Cc: Eric Dumazet <eric.dumazet@gmail.com> Fixes: 68d2f84a1368 ("net: gro: properly remove skb from list") Suggested-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: Christoph Paasch <cpaasch@apple.com> Reviewed-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-30 08:01:04 +08:00
/* Do not fool qdisc_drop_all() */
skb->prev = NULL;
/* Random duplication */
if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
++count;
/* Drop packet? */
if (loss_event(q)) {
if (q->ecn && INET_ECN_set_ce(skb))
qdisc_qstats_drop(sch); /* mark packet */
else
--count;
}
if (count == 0) {
qdisc_qstats_drop(sch);
__qdisc_drop(skb, to_free);
return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
}
/* If a delay is expected, orphan the skb. (orphaning usually takes
* place at TX completion time, so _before_ the link transit delay)
*/
netem: apply correct delay when rate throttling I recently reported on the netem list that iperf network benchmarks show unexpected results when a bandwidth throttling rate has been configured for netem. Specifically: 1) The measured link bandwidth *increases* when a higher delay is added 2) The measured link bandwidth appears higher than the specified limit 3) The measured link bandwidth for the same very slow settings varies significantly across machines The issue can be reproduced by using tc to configure netem with a 512kbit rate and various (none, 1us, 50ms, 100ms, 200ms) delays on a veth pair between network namespaces, and then using iperf (or any other network benchmarking tool) to test throughput. Complete detailed instructions are in the original email chain here: https://lists.linuxfoundation.org/pipermail/netem/2017-February/001672.html There appear to be two underlying bugs causing these effects: - The first issue causes long delays when the rate is slow and no delay is configured (e.g., "rate 512kbit"). This is because SKBs are not orphaned when no delay is configured, so orphaning does not occur until *after* the rate-induced delay has been applied. For this reason, adding a tiny delay (e.g., "rate 512kbit delay 1us") dramatically increases the measured bandwidth. - The second issue is that rate-induced delays are not correctly applied, allowing SKB delays to occur in parallel. The indended approach is to compute the delay for an SKB and to add this delay to the end of the current queue. However, the code does not detect existing SKBs in the queue due to improperly testing sch->q.qlen, which is nonzero even when packets exist only in the rbtree. Consequently, new SKBs do not wait for the current queue to empty. When packet delays vary significantly (e.g., if packet sizes are different), then this also causes unintended reordering. I modified the code to expect a delay (and orphan the SKB) when a rate is configured. I also added some defensive tests that correctly find the latest scheduled delivery time, even if it is (unexpectedly) for a packet in sch->q. I have tested these changes on the latest kernel (4.11.0-rc1+) and the iperf / ping test results are as expected. Signed-off-by: Nik Unger <njunger@uwaterloo.ca> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-14 01:16:58 +08:00
if (q->latency || q->jitter || q->rate)
skb_orphan_partial(skb);
/*
* If we need to duplicate packet, then re-insert at top of the
* qdisc tree, since parent queuer expects that only one
* skb will be queued.
*/
if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
sch_netem: fix rcu splat in netem_enqueue() qdisc_root() use from netem_enqueue() triggers a lockdep warning. __dev_queue_xmit() uses rcu_read_lock_bh() which is not equivalent to rcu_read_lock() + local_bh_disable_bh as far as lockdep is concerned. WARNING: suspicious RCU usage 5.3.0-rc7+ #0 Not tainted ----------------------------- include/net/sch_generic.h:492 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 3 locks held by syz-executor427/8855: #0: 00000000b5525c01 (rcu_read_lock_bh){....}, at: lwtunnel_xmit_redirect include/net/lwtunnel.h:92 [inline] #0: 00000000b5525c01 (rcu_read_lock_bh){....}, at: ip_finish_output2+0x2dc/0x2570 net/ipv4/ip_output.c:214 #1: 00000000b5525c01 (rcu_read_lock_bh){....}, at: __dev_queue_xmit+0x20a/0x3650 net/core/dev.c:3804 #2: 00000000364bae92 (&(&sch->q.lock)->rlock){+.-.}, at: spin_lock include/linux/spinlock.h:338 [inline] #2: 00000000364bae92 (&(&sch->q.lock)->rlock){+.-.}, at: __dev_xmit_skb net/core/dev.c:3502 [inline] #2: 00000000364bae92 (&(&sch->q.lock)->rlock){+.-.}, at: __dev_queue_xmit+0x14b8/0x3650 net/core/dev.c:3838 stack backtrace: CPU: 0 PID: 8855 Comm: syz-executor427 Not tainted 5.3.0-rc7+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 lockdep_rcu_suspicious+0x153/0x15d kernel/locking/lockdep.c:5357 qdisc_root include/net/sch_generic.h:492 [inline] netem_enqueue+0x1cfb/0x2d80 net/sched/sch_netem.c:479 __dev_xmit_skb net/core/dev.c:3527 [inline] __dev_queue_xmit+0x15d2/0x3650 net/core/dev.c:3838 dev_queue_xmit+0x18/0x20 net/core/dev.c:3902 neigh_hh_output include/net/neighbour.h:500 [inline] neigh_output include/net/neighbour.h:509 [inline] ip_finish_output2+0x1726/0x2570 net/ipv4/ip_output.c:228 __ip_finish_output net/ipv4/ip_output.c:308 [inline] __ip_finish_output+0x5fc/0xb90 net/ipv4/ip_output.c:290 ip_finish_output+0x38/0x1f0 net/ipv4/ip_output.c:318 NF_HOOK_COND include/linux/netfilter.h:294 [inline] ip_mc_output+0x292/0xf40 net/ipv4/ip_output.c:417 dst_output include/net/dst.h:436 [inline] ip_local_out+0xbb/0x190 net/ipv4/ip_output.c:125 ip_send_skb+0x42/0xf0 net/ipv4/ip_output.c:1555 udp_send_skb.isra.0+0x6b2/0x1160 net/ipv4/udp.c:887 udp_sendmsg+0x1e96/0x2820 net/ipv4/udp.c:1174 inet_sendmsg+0x9e/0xe0 net/ipv4/af_inet.c:807 sock_sendmsg_nosec net/socket.c:637 [inline] sock_sendmsg+0xd7/0x130 net/socket.c:657 ___sys_sendmsg+0x3e2/0x920 net/socket.c:2311 __sys_sendmmsg+0x1bf/0x4d0 net/socket.c:2413 __do_sys_sendmmsg net/socket.c:2442 [inline] __se_sys_sendmmsg net/socket.c:2439 [inline] __x64_sys_sendmmsg+0x9d/0x100 net/socket.c:2439 do_syscall_64+0xfd/0x6a0 arch/x86/entry/common.c:296 entry_SYSCALL_64_after_hwframe+0x49/0xbe Signed-off-by: Eric Dumazet <edumazet@google.com> Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-09-25 04:11:26 +08:00
struct Qdisc *rootq = qdisc_root_bh(sch);
u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
q->duplicate = 0;
rootq->enqueue(skb2, rootq, to_free);
q->duplicate = dupsave;
net: netem: fix skb length BUG_ON in __skb_to_sgvec It can be reproduced by following steps: 1. virtio_net NIC is configured with gso/tso on 2. configure nginx as http server with an index file bigger than 1M bytes 3. use tc netem to produce duplicate packets and delay: tc qdisc add dev eth0 root netem delay 100ms 10ms 30% duplicate 90% 4. continually curl the nginx http server to get index file on client 5. BUG_ON is seen quickly [10258690.371129] kernel BUG at net/core/skbuff.c:4028! [10258690.371748] invalid opcode: 0000 [#1] SMP PTI [10258690.372094] CPU: 5 PID: 0 Comm: swapper/5 Tainted: G W 5.0.0-rc6 #2 [10258690.372094] RSP: 0018:ffffa05797b43da0 EFLAGS: 00010202 [10258690.372094] RBP: 00000000000005ea R08: 0000000000000000 R09: 00000000000005ea [10258690.372094] R10: ffffa0579334d800 R11: 00000000000002c0 R12: 0000000000000002 [10258690.372094] R13: 0000000000000000 R14: ffffa05793122900 R15: ffffa0578f7cb028 [10258690.372094] FS: 0000000000000000(0000) GS:ffffa05797b40000(0000) knlGS:0000000000000000 [10258690.372094] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [10258690.372094] CR2: 00007f1a6dc00868 CR3: 000000001000e000 CR4: 00000000000006e0 [10258690.372094] Call Trace: [10258690.372094] <IRQ> [10258690.372094] skb_to_sgvec+0x11/0x40 [10258690.372094] start_xmit+0x38c/0x520 [virtio_net] [10258690.372094] dev_hard_start_xmit+0x9b/0x200 [10258690.372094] sch_direct_xmit+0xff/0x260 [10258690.372094] __qdisc_run+0x15e/0x4e0 [10258690.372094] net_tx_action+0x137/0x210 [10258690.372094] __do_softirq+0xd6/0x2a9 [10258690.372094] irq_exit+0xde/0xf0 [10258690.372094] smp_apic_timer_interrupt+0x74/0x140 [10258690.372094] apic_timer_interrupt+0xf/0x20 [10258690.372094] </IRQ> In __skb_to_sgvec(), the skb->len is not equal to the sum of the skb's linear data size and nonlinear data size, thus BUG_ON triggered. Because the skb is cloned and a part of nonlinear data is split off. Duplicate packet is cloned in netem_enqueue() and may be delayed some time in qdisc. When qdisc len reached the limit and returns NET_XMIT_DROP, the skb will be retransmit later in write queue. the skb will be fragmented by tso_fragment(), the limit size that depends on cwnd and mss decrease, the skb's nonlinear data will be split off. The length of the skb cloned by netem will not be updated. When we use virtio_net NIC and invoke skb_to_sgvec(), the BUG_ON trigger. To fix it, netem returns NET_XMIT_SUCCESS to upper stack when it clones a duplicate packet. Fixes: 35d889d1 ("sch_netem: fix skb leak in netem_enqueue()") Signed-off-by: Sheng Lan <lansheng@huawei.com> Reported-by: Qin Ji <jiqin.ji@huawei.com> Suggested-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-02-28 18:47:58 +08:00
rc_drop = NET_XMIT_SUCCESS;
}
/*
* Randomized packet corruption.
* Make copy if needed since we are modifying
* If packet is going to be hardware checksummed, then
* do it now in software before we mangle it.
*/
if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
netem: Segment GSO packets on enqueue This was recently reported to me, and reproduced on the latest net kernel, when attempting to run netperf from a host that had a netem qdisc attached to the egress interface: [ 788.073771] ---------------------[ cut here ]--------------------------- [ 788.096716] WARNING: at net/core/dev.c:2253 skb_warn_bad_offload+0xcd/0xda() [ 788.129521] bnx2: caps=(0x00000001801949b3, 0x0000000000000000) len=2962 data_len=0 gso_size=1448 gso_type=1 ip_summed=3 [ 788.182150] Modules linked in: sch_netem kvm_amd kvm crc32_pclmul ipmi_ssif ghash_clmulni_intel sp5100_tco amd64_edac_mod aesni_intel lrw gf128mul glue_helper ablk_helper edac_mce_amd cryptd pcspkr sg edac_core hpilo ipmi_si i2c_piix4 k10temp fam15h_power hpwdt ipmi_msghandler shpchp acpi_power_meter pcc_cpufreq nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sd_mod crc_t10dif crct10dif_generic mgag200 syscopyarea sysfillrect sysimgblt i2c_algo_bit drm_kms_helper ahci ata_generic pata_acpi ttm libahci crct10dif_pclmul pata_atiixp tg3 libata crct10dif_common drm crc32c_intel ptp serio_raw bnx2 r8169 hpsa pps_core i2c_core mii dm_mirror dm_region_hash dm_log dm_mod [ 788.465294] CPU: 16 PID: 0 Comm: swapper/16 Tainted: G W ------------ 3.10.0-327.el7.x86_64 #1 [ 788.511521] Hardware name: HP ProLiant DL385p Gen8, BIOS A28 12/17/2012 [ 788.542260] ffff880437c036b8 f7afc56532a53db9 ffff880437c03670 ffffffff816351f1 [ 788.576332] ffff880437c036a8 ffffffff8107b200 ffff880633e74200 ffff880231674000 [ 788.611943] 0000000000000001 0000000000000003 0000000000000000 ffff880437c03710 [ 788.647241] Call Trace: [ 788.658817] <IRQ> [<ffffffff816351f1>] dump_stack+0x19/0x1b [ 788.686193] [<ffffffff8107b200>] warn_slowpath_common+0x70/0xb0 [ 788.713803] [<ffffffff8107b29c>] warn_slowpath_fmt+0x5c/0x80 [ 788.741314] [<ffffffff812f92f3>] ? ___ratelimit+0x93/0x100 [ 788.767018] [<ffffffff81637f49>] skb_warn_bad_offload+0xcd/0xda [ 788.796117] [<ffffffff8152950c>] skb_checksum_help+0x17c/0x190 [ 788.823392] [<ffffffffa01463a1>] netem_enqueue+0x741/0x7c0 [sch_netem] [ 788.854487] [<ffffffff8152cb58>] dev_queue_xmit+0x2a8/0x570 [ 788.880870] [<ffffffff8156ae1d>] ip_finish_output+0x53d/0x7d0 ... The problem occurs because netem is not prepared to handle GSO packets (as it uses skb_checksum_help in its enqueue path, which cannot manipulate these frames). The solution I think is to simply segment the skb in a simmilar fashion to the way we do in __dev_queue_xmit (via validate_xmit_skb), with some minor changes. When we decide to corrupt an skb, if the frame is GSO, we segment it, corrupt the first segment, and enqueue the remaining ones. tested successfully by myself on the latest net kernel, to which this applies Signed-off-by: Neil Horman <nhorman@tuxdriver.com> CC: Jamal Hadi Salim <jhs@mojatatu.com> CC: "David S. Miller" <davem@davemloft.net> CC: netem@lists.linux-foundation.org CC: eric.dumazet@gmail.com CC: stephen@networkplumber.org Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-05-03 00:20:15 +08:00
if (skb_is_gso(skb)) {
net: netem: fix use after free and double free with packet corruption Brendan reports that the use of netem's packet corruption capability leads to strange crashes. This seems to be caused by commit d66280b12bd7 ("net: netem: use a list in addition to rbtree") which uses skb->next pointer to construct a fast-path queue of in-order skbs. Packet corruption code has to invoke skb_gso_segment() in case of skbs in need of GSO. skb_gso_segment() returns a list of skbs. If next pointers of the skbs on that list do not get cleared fast path list may point to freed skbs or skbs which are also on the RB tree. Let's say skb gets segmented into 3 frames: A -> B -> C A gets hooked to the t_head t_tail list by tfifo_enqueue(), but it's next pointer didn't get cleared so we have: h t |/ A -> B -> C Now if B and C get also get enqueued successfully all is fine, because tfifo_enqueue() will overwrite the list in order. IOW: Enqueue B: h t | | A -> B C Enqueue C: h t | | A -> B -> C But if B and C get reordered we may end up with: h t RB tree |/ | A -> B -> C B \ C Or if they get dropped just: h t |/ A -> B -> C where A and B are already freed. To reproduce either limit has to be set low to cause freeing of segs or reorders have to happen (due to delay jitter). Note that we only have to mark the first segment as not on the list, "finish_segs" handling of other frags already does that. Another caveat is that qdisc_drop_all() still has to free all segments correctly in case of drop of first segment, therefore we re-link segs before calling it. v2: - re-link before drop, v1 was leaking non-first segs if limit was hit at the first seg - better commit message which lead to discovering the above :) Reported-by: Brendan Galloway <brendan.galloway@netronome.com> Fixes: d66280b12bd7 ("net: netem: use a list in addition to rbtree") Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com> Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com> Acked-by: Cong Wang <xiyou.wangcong@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-06-18 02:11:11 +08:00
skb = netem_segment(skb, sch, to_free);
if (!skb)
net: netem: fix skb length BUG_ON in __skb_to_sgvec It can be reproduced by following steps: 1. virtio_net NIC is configured with gso/tso on 2. configure nginx as http server with an index file bigger than 1M bytes 3. use tc netem to produce duplicate packets and delay: tc qdisc add dev eth0 root netem delay 100ms 10ms 30% duplicate 90% 4. continually curl the nginx http server to get index file on client 5. BUG_ON is seen quickly [10258690.371129] kernel BUG at net/core/skbuff.c:4028! [10258690.371748] invalid opcode: 0000 [#1] SMP PTI [10258690.372094] CPU: 5 PID: 0 Comm: swapper/5 Tainted: G W 5.0.0-rc6 #2 [10258690.372094] RSP: 0018:ffffa05797b43da0 EFLAGS: 00010202 [10258690.372094] RBP: 00000000000005ea R08: 0000000000000000 R09: 00000000000005ea [10258690.372094] R10: ffffa0579334d800 R11: 00000000000002c0 R12: 0000000000000002 [10258690.372094] R13: 0000000000000000 R14: ffffa05793122900 R15: ffffa0578f7cb028 [10258690.372094] FS: 0000000000000000(0000) GS:ffffa05797b40000(0000) knlGS:0000000000000000 [10258690.372094] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [10258690.372094] CR2: 00007f1a6dc00868 CR3: 000000001000e000 CR4: 00000000000006e0 [10258690.372094] Call Trace: [10258690.372094] <IRQ> [10258690.372094] skb_to_sgvec+0x11/0x40 [10258690.372094] start_xmit+0x38c/0x520 [virtio_net] [10258690.372094] dev_hard_start_xmit+0x9b/0x200 [10258690.372094] sch_direct_xmit+0xff/0x260 [10258690.372094] __qdisc_run+0x15e/0x4e0 [10258690.372094] net_tx_action+0x137/0x210 [10258690.372094] __do_softirq+0xd6/0x2a9 [10258690.372094] irq_exit+0xde/0xf0 [10258690.372094] smp_apic_timer_interrupt+0x74/0x140 [10258690.372094] apic_timer_interrupt+0xf/0x20 [10258690.372094] </IRQ> In __skb_to_sgvec(), the skb->len is not equal to the sum of the skb's linear data size and nonlinear data size, thus BUG_ON triggered. Because the skb is cloned and a part of nonlinear data is split off. Duplicate packet is cloned in netem_enqueue() and may be delayed some time in qdisc. When qdisc len reached the limit and returns NET_XMIT_DROP, the skb will be retransmit later in write queue. the skb will be fragmented by tso_fragment(), the limit size that depends on cwnd and mss decrease, the skb's nonlinear data will be split off. The length of the skb cloned by netem will not be updated. When we use virtio_net NIC and invoke skb_to_sgvec(), the BUG_ON trigger. To fix it, netem returns NET_XMIT_SUCCESS to upper stack when it clones a duplicate packet. Fixes: 35d889d1 ("sch_netem: fix skb leak in netem_enqueue()") Signed-off-by: Sheng Lan <lansheng@huawei.com> Reported-by: Qin Ji <jiqin.ji@huawei.com> Suggested-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-02-28 18:47:58 +08:00
return rc_drop;
net: netem: fix use after free and double free with packet corruption Brendan reports that the use of netem's packet corruption capability leads to strange crashes. This seems to be caused by commit d66280b12bd7 ("net: netem: use a list in addition to rbtree") which uses skb->next pointer to construct a fast-path queue of in-order skbs. Packet corruption code has to invoke skb_gso_segment() in case of skbs in need of GSO. skb_gso_segment() returns a list of skbs. If next pointers of the skbs on that list do not get cleared fast path list may point to freed skbs or skbs which are also on the RB tree. Let's say skb gets segmented into 3 frames: A -> B -> C A gets hooked to the t_head t_tail list by tfifo_enqueue(), but it's next pointer didn't get cleared so we have: h t |/ A -> B -> C Now if B and C get also get enqueued successfully all is fine, because tfifo_enqueue() will overwrite the list in order. IOW: Enqueue B: h t | | A -> B C Enqueue C: h t | | A -> B -> C But if B and C get reordered we may end up with: h t RB tree |/ | A -> B -> C B \ C Or if they get dropped just: h t |/ A -> B -> C where A and B are already freed. To reproduce either limit has to be set low to cause freeing of segs or reorders have to happen (due to delay jitter). Note that we only have to mark the first segment as not on the list, "finish_segs" handling of other frags already does that. Another caveat is that qdisc_drop_all() still has to free all segments correctly in case of drop of first segment, therefore we re-link segs before calling it. v2: - re-link before drop, v1 was leaking non-first segs if limit was hit at the first seg - better commit message which lead to discovering the above :) Reported-by: Brendan Galloway <brendan.galloway@netronome.com> Fixes: d66280b12bd7 ("net: netem: use a list in addition to rbtree") Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com> Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com> Acked-by: Cong Wang <xiyou.wangcong@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-06-18 02:11:11 +08:00
segs = skb->next;
skb_mark_not_on_list(skb);
qdisc_skb_cb(skb)->pkt_len = skb->len;
netem: Segment GSO packets on enqueue This was recently reported to me, and reproduced on the latest net kernel, when attempting to run netperf from a host that had a netem qdisc attached to the egress interface: [ 788.073771] ---------------------[ cut here ]--------------------------- [ 788.096716] WARNING: at net/core/dev.c:2253 skb_warn_bad_offload+0xcd/0xda() [ 788.129521] bnx2: caps=(0x00000001801949b3, 0x0000000000000000) len=2962 data_len=0 gso_size=1448 gso_type=1 ip_summed=3 [ 788.182150] Modules linked in: sch_netem kvm_amd kvm crc32_pclmul ipmi_ssif ghash_clmulni_intel sp5100_tco amd64_edac_mod aesni_intel lrw gf128mul glue_helper ablk_helper edac_mce_amd cryptd pcspkr sg edac_core hpilo ipmi_si i2c_piix4 k10temp fam15h_power hpwdt ipmi_msghandler shpchp acpi_power_meter pcc_cpufreq nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sd_mod crc_t10dif crct10dif_generic mgag200 syscopyarea sysfillrect sysimgblt i2c_algo_bit drm_kms_helper ahci ata_generic pata_acpi ttm libahci crct10dif_pclmul pata_atiixp tg3 libata crct10dif_common drm crc32c_intel ptp serio_raw bnx2 r8169 hpsa pps_core i2c_core mii dm_mirror dm_region_hash dm_log dm_mod [ 788.465294] CPU: 16 PID: 0 Comm: swapper/16 Tainted: G W ------------ 3.10.0-327.el7.x86_64 #1 [ 788.511521] Hardware name: HP ProLiant DL385p Gen8, BIOS A28 12/17/2012 [ 788.542260] ffff880437c036b8 f7afc56532a53db9 ffff880437c03670 ffffffff816351f1 [ 788.576332] ffff880437c036a8 ffffffff8107b200 ffff880633e74200 ffff880231674000 [ 788.611943] 0000000000000001 0000000000000003 0000000000000000 ffff880437c03710 [ 788.647241] Call Trace: [ 788.658817] <IRQ> [<ffffffff816351f1>] dump_stack+0x19/0x1b [ 788.686193] [<ffffffff8107b200>] warn_slowpath_common+0x70/0xb0 [ 788.713803] [<ffffffff8107b29c>] warn_slowpath_fmt+0x5c/0x80 [ 788.741314] [<ffffffff812f92f3>] ? ___ratelimit+0x93/0x100 [ 788.767018] [<ffffffff81637f49>] skb_warn_bad_offload+0xcd/0xda [ 788.796117] [<ffffffff8152950c>] skb_checksum_help+0x17c/0x190 [ 788.823392] [<ffffffffa01463a1>] netem_enqueue+0x741/0x7c0 [sch_netem] [ 788.854487] [<ffffffff8152cb58>] dev_queue_xmit+0x2a8/0x570 [ 788.880870] [<ffffffff8156ae1d>] ip_finish_output+0x53d/0x7d0 ... The problem occurs because netem is not prepared to handle GSO packets (as it uses skb_checksum_help in its enqueue path, which cannot manipulate these frames). The solution I think is to simply segment the skb in a simmilar fashion to the way we do in __dev_queue_xmit (via validate_xmit_skb), with some minor changes. When we decide to corrupt an skb, if the frame is GSO, we segment it, corrupt the first segment, and enqueue the remaining ones. tested successfully by myself on the latest net kernel, to which this applies Signed-off-by: Neil Horman <nhorman@tuxdriver.com> CC: Jamal Hadi Salim <jhs@mojatatu.com> CC: "David S. Miller" <davem@davemloft.net> CC: netem@lists.linux-foundation.org CC: eric.dumazet@gmail.com CC: stephen@networkplumber.org Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-05-03 00:20:15 +08:00
}
skb = skb_unshare(skb, GFP_ATOMIC);
if (unlikely(!skb)) {
qdisc_qstats_drop(sch);
goto finish_segs;
}
if (skb->ip_summed == CHECKSUM_PARTIAL &&
skb_checksum_help(skb)) {
qdisc_drop(skb, sch, to_free);
skb = NULL;
netem: Segment GSO packets on enqueue This was recently reported to me, and reproduced on the latest net kernel, when attempting to run netperf from a host that had a netem qdisc attached to the egress interface: [ 788.073771] ---------------------[ cut here ]--------------------------- [ 788.096716] WARNING: at net/core/dev.c:2253 skb_warn_bad_offload+0xcd/0xda() [ 788.129521] bnx2: caps=(0x00000001801949b3, 0x0000000000000000) len=2962 data_len=0 gso_size=1448 gso_type=1 ip_summed=3 [ 788.182150] Modules linked in: sch_netem kvm_amd kvm crc32_pclmul ipmi_ssif ghash_clmulni_intel sp5100_tco amd64_edac_mod aesni_intel lrw gf128mul glue_helper ablk_helper edac_mce_amd cryptd pcspkr sg edac_core hpilo ipmi_si i2c_piix4 k10temp fam15h_power hpwdt ipmi_msghandler shpchp acpi_power_meter pcc_cpufreq nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sd_mod crc_t10dif crct10dif_generic mgag200 syscopyarea sysfillrect sysimgblt i2c_algo_bit drm_kms_helper ahci ata_generic pata_acpi ttm libahci crct10dif_pclmul pata_atiixp tg3 libata crct10dif_common drm crc32c_intel ptp serio_raw bnx2 r8169 hpsa pps_core i2c_core mii dm_mirror dm_region_hash dm_log dm_mod [ 788.465294] CPU: 16 PID: 0 Comm: swapper/16 Tainted: G W ------------ 3.10.0-327.el7.x86_64 #1 [ 788.511521] Hardware name: HP ProLiant DL385p Gen8, BIOS A28 12/17/2012 [ 788.542260] ffff880437c036b8 f7afc56532a53db9 ffff880437c03670 ffffffff816351f1 [ 788.576332] ffff880437c036a8 ffffffff8107b200 ffff880633e74200 ffff880231674000 [ 788.611943] 0000000000000001 0000000000000003 0000000000000000 ffff880437c03710 [ 788.647241] Call Trace: [ 788.658817] <IRQ> [<ffffffff816351f1>] dump_stack+0x19/0x1b [ 788.686193] [<ffffffff8107b200>] warn_slowpath_common+0x70/0xb0 [ 788.713803] [<ffffffff8107b29c>] warn_slowpath_fmt+0x5c/0x80 [ 788.741314] [<ffffffff812f92f3>] ? ___ratelimit+0x93/0x100 [ 788.767018] [<ffffffff81637f49>] skb_warn_bad_offload+0xcd/0xda [ 788.796117] [<ffffffff8152950c>] skb_checksum_help+0x17c/0x190 [ 788.823392] [<ffffffffa01463a1>] netem_enqueue+0x741/0x7c0 [sch_netem] [ 788.854487] [<ffffffff8152cb58>] dev_queue_xmit+0x2a8/0x570 [ 788.880870] [<ffffffff8156ae1d>] ip_finish_output+0x53d/0x7d0 ... The problem occurs because netem is not prepared to handle GSO packets (as it uses skb_checksum_help in its enqueue path, which cannot manipulate these frames). The solution I think is to simply segment the skb in a simmilar fashion to the way we do in __dev_queue_xmit (via validate_xmit_skb), with some minor changes. When we decide to corrupt an skb, if the frame is GSO, we segment it, corrupt the first segment, and enqueue the remaining ones. tested successfully by myself on the latest net kernel, to which this applies Signed-off-by: Neil Horman <nhorman@tuxdriver.com> CC: Jamal Hadi Salim <jhs@mojatatu.com> CC: "David S. Miller" <davem@davemloft.net> CC: netem@lists.linux-foundation.org CC: eric.dumazet@gmail.com CC: stephen@networkplumber.org Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-05-03 00:20:15 +08:00
goto finish_segs;
}
skb->data[prandom_u32() % skb_headlen(skb)] ^=
1<<(prandom_u32() % 8);
}
net: netem: fix skb length BUG_ON in __skb_to_sgvec It can be reproduced by following steps: 1. virtio_net NIC is configured with gso/tso on 2. configure nginx as http server with an index file bigger than 1M bytes 3. use tc netem to produce duplicate packets and delay: tc qdisc add dev eth0 root netem delay 100ms 10ms 30% duplicate 90% 4. continually curl the nginx http server to get index file on client 5. BUG_ON is seen quickly [10258690.371129] kernel BUG at net/core/skbuff.c:4028! [10258690.371748] invalid opcode: 0000 [#1] SMP PTI [10258690.372094] CPU: 5 PID: 0 Comm: swapper/5 Tainted: G W 5.0.0-rc6 #2 [10258690.372094] RSP: 0018:ffffa05797b43da0 EFLAGS: 00010202 [10258690.372094] RBP: 00000000000005ea R08: 0000000000000000 R09: 00000000000005ea [10258690.372094] R10: ffffa0579334d800 R11: 00000000000002c0 R12: 0000000000000002 [10258690.372094] R13: 0000000000000000 R14: ffffa05793122900 R15: ffffa0578f7cb028 [10258690.372094] FS: 0000000000000000(0000) GS:ffffa05797b40000(0000) knlGS:0000000000000000 [10258690.372094] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [10258690.372094] CR2: 00007f1a6dc00868 CR3: 000000001000e000 CR4: 00000000000006e0 [10258690.372094] Call Trace: [10258690.372094] <IRQ> [10258690.372094] skb_to_sgvec+0x11/0x40 [10258690.372094] start_xmit+0x38c/0x520 [virtio_net] [10258690.372094] dev_hard_start_xmit+0x9b/0x200 [10258690.372094] sch_direct_xmit+0xff/0x260 [10258690.372094] __qdisc_run+0x15e/0x4e0 [10258690.372094] net_tx_action+0x137/0x210 [10258690.372094] __do_softirq+0xd6/0x2a9 [10258690.372094] irq_exit+0xde/0xf0 [10258690.372094] smp_apic_timer_interrupt+0x74/0x140 [10258690.372094] apic_timer_interrupt+0xf/0x20 [10258690.372094] </IRQ> In __skb_to_sgvec(), the skb->len is not equal to the sum of the skb's linear data size and nonlinear data size, thus BUG_ON triggered. Because the skb is cloned and a part of nonlinear data is split off. Duplicate packet is cloned in netem_enqueue() and may be delayed some time in qdisc. When qdisc len reached the limit and returns NET_XMIT_DROP, the skb will be retransmit later in write queue. the skb will be fragmented by tso_fragment(), the limit size that depends on cwnd and mss decrease, the skb's nonlinear data will be split off. The length of the skb cloned by netem will not be updated. When we use virtio_net NIC and invoke skb_to_sgvec(), the BUG_ON trigger. To fix it, netem returns NET_XMIT_SUCCESS to upper stack when it clones a duplicate packet. Fixes: 35d889d1 ("sch_netem: fix skb leak in netem_enqueue()") Signed-off-by: Sheng Lan <lansheng@huawei.com> Reported-by: Qin Ji <jiqin.ji@huawei.com> Suggested-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-02-28 18:47:58 +08:00
if (unlikely(sch->q.qlen >= sch->limit)) {
net: netem: fix use after free and double free with packet corruption Brendan reports that the use of netem's packet corruption capability leads to strange crashes. This seems to be caused by commit d66280b12bd7 ("net: netem: use a list in addition to rbtree") which uses skb->next pointer to construct a fast-path queue of in-order skbs. Packet corruption code has to invoke skb_gso_segment() in case of skbs in need of GSO. skb_gso_segment() returns a list of skbs. If next pointers of the skbs on that list do not get cleared fast path list may point to freed skbs or skbs which are also on the RB tree. Let's say skb gets segmented into 3 frames: A -> B -> C A gets hooked to the t_head t_tail list by tfifo_enqueue(), but it's next pointer didn't get cleared so we have: h t |/ A -> B -> C Now if B and C get also get enqueued successfully all is fine, because tfifo_enqueue() will overwrite the list in order. IOW: Enqueue B: h t | | A -> B C Enqueue C: h t | | A -> B -> C But if B and C get reordered we may end up with: h t RB tree |/ | A -> B -> C B \ C Or if they get dropped just: h t |/ A -> B -> C where A and B are already freed. To reproduce either limit has to be set low to cause freeing of segs or reorders have to happen (due to delay jitter). Note that we only have to mark the first segment as not on the list, "finish_segs" handling of other frags already does that. Another caveat is that qdisc_drop_all() still has to free all segments correctly in case of drop of first segment, therefore we re-link segs before calling it. v2: - re-link before drop, v1 was leaking non-first segs if limit was hit at the first seg - better commit message which lead to discovering the above :) Reported-by: Brendan Galloway <brendan.galloway@netronome.com> Fixes: d66280b12bd7 ("net: netem: use a list in addition to rbtree") Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com> Reviewed-by: Dirk van der Merwe <dirk.vandermerwe@netronome.com> Acked-by: Cong Wang <xiyou.wangcong@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-06-18 02:11:11 +08:00
/* re-link segs, so that qdisc_drop_all() frees them all */
skb->next = segs;
net: netem: fix skb length BUG_ON in __skb_to_sgvec It can be reproduced by following steps: 1. virtio_net NIC is configured with gso/tso on 2. configure nginx as http server with an index file bigger than 1M bytes 3. use tc netem to produce duplicate packets and delay: tc qdisc add dev eth0 root netem delay 100ms 10ms 30% duplicate 90% 4. continually curl the nginx http server to get index file on client 5. BUG_ON is seen quickly [10258690.371129] kernel BUG at net/core/skbuff.c:4028! [10258690.371748] invalid opcode: 0000 [#1] SMP PTI [10258690.372094] CPU: 5 PID: 0 Comm: swapper/5 Tainted: G W 5.0.0-rc6 #2 [10258690.372094] RSP: 0018:ffffa05797b43da0 EFLAGS: 00010202 [10258690.372094] RBP: 00000000000005ea R08: 0000000000000000 R09: 00000000000005ea [10258690.372094] R10: ffffa0579334d800 R11: 00000000000002c0 R12: 0000000000000002 [10258690.372094] R13: 0000000000000000 R14: ffffa05793122900 R15: ffffa0578f7cb028 [10258690.372094] FS: 0000000000000000(0000) GS:ffffa05797b40000(0000) knlGS:0000000000000000 [10258690.372094] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [10258690.372094] CR2: 00007f1a6dc00868 CR3: 000000001000e000 CR4: 00000000000006e0 [10258690.372094] Call Trace: [10258690.372094] <IRQ> [10258690.372094] skb_to_sgvec+0x11/0x40 [10258690.372094] start_xmit+0x38c/0x520 [virtio_net] [10258690.372094] dev_hard_start_xmit+0x9b/0x200 [10258690.372094] sch_direct_xmit+0xff/0x260 [10258690.372094] __qdisc_run+0x15e/0x4e0 [10258690.372094] net_tx_action+0x137/0x210 [10258690.372094] __do_softirq+0xd6/0x2a9 [10258690.372094] irq_exit+0xde/0xf0 [10258690.372094] smp_apic_timer_interrupt+0x74/0x140 [10258690.372094] apic_timer_interrupt+0xf/0x20 [10258690.372094] </IRQ> In __skb_to_sgvec(), the skb->len is not equal to the sum of the skb's linear data size and nonlinear data size, thus BUG_ON triggered. Because the skb is cloned and a part of nonlinear data is split off. Duplicate packet is cloned in netem_enqueue() and may be delayed some time in qdisc. When qdisc len reached the limit and returns NET_XMIT_DROP, the skb will be retransmit later in write queue. the skb will be fragmented by tso_fragment(), the limit size that depends on cwnd and mss decrease, the skb's nonlinear data will be split off. The length of the skb cloned by netem will not be updated. When we use virtio_net NIC and invoke skb_to_sgvec(), the BUG_ON trigger. To fix it, netem returns NET_XMIT_SUCCESS to upper stack when it clones a duplicate packet. Fixes: 35d889d1 ("sch_netem: fix skb leak in netem_enqueue()") Signed-off-by: Sheng Lan <lansheng@huawei.com> Reported-by: Qin Ji <jiqin.ji@huawei.com> Suggested-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-02-28 18:47:58 +08:00
qdisc_drop_all(skb, sch, to_free);
return rc_drop;
}
qdisc_qstats_backlog_inc(sch, skb);
cb = netem_skb_cb(skb);
if (q->gap == 0 || /* not doing reordering */
q->counter < q->gap - 1 || /* inside last reordering gap */
q->reorder < get_crandom(&q->reorder_cor)) {
u64 now;
s64 delay;
delay = tabledist(q->latency, q->jitter,
&q->delay_cor, q->delay_dist);
now = ktime_get_ns();
netem: rate extension Currently netem is not in the ability to emulate channel bandwidth. Only static delay (and optional random jitter) can be configured. To emulate the channel rate the token bucket filter (sch_tbf) can be used. But TBF has some major emulation flaws. The buffer (token bucket depth/rate) cannot be 0. Also the idea behind TBF is that the credit (token in buckets) fills if no packet is transmitted. So that there is always a "positive" credit for new packets. In real life this behavior contradicts the law of nature where nothing can travel faster as speed of light. E.g.: on an emulated 1000 byte/s link a small IPv4/TCP SYN packet with ~50 byte require ~0.05 seconds - not 0 seconds. Netem is an excellent place to implement a rate limiting feature: static delay is already implemented, tfifo already has time information and the user can skip TBF configuration completely. This patch implement rate feature which can be configured via tc. e.g: tc qdisc add dev eth0 root netem rate 10kbit To emulate a link of 5000byte/s and add an additional static delay of 10ms: tc qdisc add dev eth0 root netem delay 10ms rate 5KBps Note: similar to TBF the rate extension is bounded to the kernel timing system. Depending on the architecture timer granularity, higher rates (e.g. 10mbit/s and higher) tend to transmission bursts. Also note: further queues living in network adaptors; see ethtool(8). Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net> Acked-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@drr.davemloft.net>
2011-11-30 20:20:26 +08:00
if (q->rate) {
netem: apply correct delay when rate throttling I recently reported on the netem list that iperf network benchmarks show unexpected results when a bandwidth throttling rate has been configured for netem. Specifically: 1) The measured link bandwidth *increases* when a higher delay is added 2) The measured link bandwidth appears higher than the specified limit 3) The measured link bandwidth for the same very slow settings varies significantly across machines The issue can be reproduced by using tc to configure netem with a 512kbit rate and various (none, 1us, 50ms, 100ms, 200ms) delays on a veth pair between network namespaces, and then using iperf (or any other network benchmarking tool) to test throughput. Complete detailed instructions are in the original email chain here: https://lists.linuxfoundation.org/pipermail/netem/2017-February/001672.html There appear to be two underlying bugs causing these effects: - The first issue causes long delays when the rate is slow and no delay is configured (e.g., "rate 512kbit"). This is because SKBs are not orphaned when no delay is configured, so orphaning does not occur until *after* the rate-induced delay has been applied. For this reason, adding a tiny delay (e.g., "rate 512kbit delay 1us") dramatically increases the measured bandwidth. - The second issue is that rate-induced delays are not correctly applied, allowing SKB delays to occur in parallel. The indended approach is to compute the delay for an SKB and to add this delay to the end of the current queue. However, the code does not detect existing SKBs in the queue due to improperly testing sch->q.qlen, which is nonzero even when packets exist only in the rbtree. Consequently, new SKBs do not wait for the current queue to empty. When packet delays vary significantly (e.g., if packet sizes are different), then this also causes unintended reordering. I modified the code to expect a delay (and orphan the SKB) when a rate is configured. I also added some defensive tests that correctly find the latest scheduled delivery time, even if it is (unexpectedly) for a packet in sch->q. I have tested these changes on the latest kernel (4.11.0-rc1+) and the iperf / ping test results are as expected. Signed-off-by: Nik Unger <njunger@uwaterloo.ca> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-14 01:16:58 +08:00
struct netem_skb_cb *last = NULL;
if (sch->q.tail)
last = netem_skb_cb(sch->q.tail);
if (q->t_root.rb_node) {
struct sk_buff *t_skb;
struct netem_skb_cb *t_last;
t_skb = skb_rb_last(&q->t_root);
netem: apply correct delay when rate throttling I recently reported on the netem list that iperf network benchmarks show unexpected results when a bandwidth throttling rate has been configured for netem. Specifically: 1) The measured link bandwidth *increases* when a higher delay is added 2) The measured link bandwidth appears higher than the specified limit 3) The measured link bandwidth for the same very slow settings varies significantly across machines The issue can be reproduced by using tc to configure netem with a 512kbit rate and various (none, 1us, 50ms, 100ms, 200ms) delays on a veth pair between network namespaces, and then using iperf (or any other network benchmarking tool) to test throughput. Complete detailed instructions are in the original email chain here: https://lists.linuxfoundation.org/pipermail/netem/2017-February/001672.html There appear to be two underlying bugs causing these effects: - The first issue causes long delays when the rate is slow and no delay is configured (e.g., "rate 512kbit"). This is because SKBs are not orphaned when no delay is configured, so orphaning does not occur until *after* the rate-induced delay has been applied. For this reason, adding a tiny delay (e.g., "rate 512kbit delay 1us") dramatically increases the measured bandwidth. - The second issue is that rate-induced delays are not correctly applied, allowing SKB delays to occur in parallel. The indended approach is to compute the delay for an SKB and to add this delay to the end of the current queue. However, the code does not detect existing SKBs in the queue due to improperly testing sch->q.qlen, which is nonzero even when packets exist only in the rbtree. Consequently, new SKBs do not wait for the current queue to empty. When packet delays vary significantly (e.g., if packet sizes are different), then this also causes unintended reordering. I modified the code to expect a delay (and orphan the SKB) when a rate is configured. I also added some defensive tests that correctly find the latest scheduled delivery time, even if it is (unexpectedly) for a packet in sch->q. I have tested these changes on the latest kernel (4.11.0-rc1+) and the iperf / ping test results are as expected. Signed-off-by: Nik Unger <njunger@uwaterloo.ca> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-14 01:16:58 +08:00
t_last = netem_skb_cb(t_skb);
if (!last ||
t_last->time_to_send > last->time_to_send)
last = t_last;
}
if (q->t_tail) {
struct netem_skb_cb *t_last =
netem_skb_cb(q->t_tail);
if (!last ||
t_last->time_to_send > last->time_to_send)
netem: apply correct delay when rate throttling I recently reported on the netem list that iperf network benchmarks show unexpected results when a bandwidth throttling rate has been configured for netem. Specifically: 1) The measured link bandwidth *increases* when a higher delay is added 2) The measured link bandwidth appears higher than the specified limit 3) The measured link bandwidth for the same very slow settings varies significantly across machines The issue can be reproduced by using tc to configure netem with a 512kbit rate and various (none, 1us, 50ms, 100ms, 200ms) delays on a veth pair between network namespaces, and then using iperf (or any other network benchmarking tool) to test throughput. Complete detailed instructions are in the original email chain here: https://lists.linuxfoundation.org/pipermail/netem/2017-February/001672.html There appear to be two underlying bugs causing these effects: - The first issue causes long delays when the rate is slow and no delay is configured (e.g., "rate 512kbit"). This is because SKBs are not orphaned when no delay is configured, so orphaning does not occur until *after* the rate-induced delay has been applied. For this reason, adding a tiny delay (e.g., "rate 512kbit delay 1us") dramatically increases the measured bandwidth. - The second issue is that rate-induced delays are not correctly applied, allowing SKB delays to occur in parallel. The indended approach is to compute the delay for an SKB and to add this delay to the end of the current queue. However, the code does not detect existing SKBs in the queue due to improperly testing sch->q.qlen, which is nonzero even when packets exist only in the rbtree. Consequently, new SKBs do not wait for the current queue to empty. When packet delays vary significantly (e.g., if packet sizes are different), then this also causes unintended reordering. I modified the code to expect a delay (and orphan the SKB) when a rate is configured. I also added some defensive tests that correctly find the latest scheduled delivery time, even if it is (unexpectedly) for a packet in sch->q. I have tested these changes on the latest kernel (4.11.0-rc1+) and the iperf / ping test results are as expected. Signed-off-by: Nik Unger <njunger@uwaterloo.ca> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-14 01:16:58 +08:00
last = t_last;
}
netem: rate extension Currently netem is not in the ability to emulate channel bandwidth. Only static delay (and optional random jitter) can be configured. To emulate the channel rate the token bucket filter (sch_tbf) can be used. But TBF has some major emulation flaws. The buffer (token bucket depth/rate) cannot be 0. Also the idea behind TBF is that the credit (token in buckets) fills if no packet is transmitted. So that there is always a "positive" credit for new packets. In real life this behavior contradicts the law of nature where nothing can travel faster as speed of light. E.g.: on an emulated 1000 byte/s link a small IPv4/TCP SYN packet with ~50 byte require ~0.05 seconds - not 0 seconds. Netem is an excellent place to implement a rate limiting feature: static delay is already implemented, tfifo already has time information and the user can skip TBF configuration completely. This patch implement rate feature which can be configured via tc. e.g: tc qdisc add dev eth0 root netem rate 10kbit To emulate a link of 5000byte/s and add an additional static delay of 10ms: tc qdisc add dev eth0 root netem delay 10ms rate 5KBps Note: similar to TBF the rate extension is bounded to the kernel timing system. Depending on the architecture timer granularity, higher rates (e.g. 10mbit/s and higher) tend to transmission bursts. Also note: further queues living in network adaptors; see ethtool(8). Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net> Acked-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@drr.davemloft.net>
2011-11-30 20:20:26 +08:00
if (last) {
netem: rate extension Currently netem is not in the ability to emulate channel bandwidth. Only static delay (and optional random jitter) can be configured. To emulate the channel rate the token bucket filter (sch_tbf) can be used. But TBF has some major emulation flaws. The buffer (token bucket depth/rate) cannot be 0. Also the idea behind TBF is that the credit (token in buckets) fills if no packet is transmitted. So that there is always a "positive" credit for new packets. In real life this behavior contradicts the law of nature where nothing can travel faster as speed of light. E.g.: on an emulated 1000 byte/s link a small IPv4/TCP SYN packet with ~50 byte require ~0.05 seconds - not 0 seconds. Netem is an excellent place to implement a rate limiting feature: static delay is already implemented, tfifo already has time information and the user can skip TBF configuration completely. This patch implement rate feature which can be configured via tc. e.g: tc qdisc add dev eth0 root netem rate 10kbit To emulate a link of 5000byte/s and add an additional static delay of 10ms: tc qdisc add dev eth0 root netem delay 10ms rate 5KBps Note: similar to TBF the rate extension is bounded to the kernel timing system. Depending on the architecture timer granularity, higher rates (e.g. 10mbit/s and higher) tend to transmission bursts. Also note: further queues living in network adaptors; see ethtool(8). Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net> Acked-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@drr.davemloft.net>
2011-11-30 20:20:26 +08:00
/*
* Last packet in queue is reference point (now),
* calculate this time bonus and subtract
netem: rate extension Currently netem is not in the ability to emulate channel bandwidth. Only static delay (and optional random jitter) can be configured. To emulate the channel rate the token bucket filter (sch_tbf) can be used. But TBF has some major emulation flaws. The buffer (token bucket depth/rate) cannot be 0. Also the idea behind TBF is that the credit (token in buckets) fills if no packet is transmitted. So that there is always a "positive" credit for new packets. In real life this behavior contradicts the law of nature where nothing can travel faster as speed of light. E.g.: on an emulated 1000 byte/s link a small IPv4/TCP SYN packet with ~50 byte require ~0.05 seconds - not 0 seconds. Netem is an excellent place to implement a rate limiting feature: static delay is already implemented, tfifo already has time information and the user can skip TBF configuration completely. This patch implement rate feature which can be configured via tc. e.g: tc qdisc add dev eth0 root netem rate 10kbit To emulate a link of 5000byte/s and add an additional static delay of 10ms: tc qdisc add dev eth0 root netem delay 10ms rate 5KBps Note: similar to TBF the rate extension is bounded to the kernel timing system. Depending on the architecture timer granularity, higher rates (e.g. 10mbit/s and higher) tend to transmission bursts. Also note: further queues living in network adaptors; see ethtool(8). Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net> Acked-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@drr.davemloft.net>
2011-11-30 20:20:26 +08:00
* from delay.
*/
netem: apply correct delay when rate throttling I recently reported on the netem list that iperf network benchmarks show unexpected results when a bandwidth throttling rate has been configured for netem. Specifically: 1) The measured link bandwidth *increases* when a higher delay is added 2) The measured link bandwidth appears higher than the specified limit 3) The measured link bandwidth for the same very slow settings varies significantly across machines The issue can be reproduced by using tc to configure netem with a 512kbit rate and various (none, 1us, 50ms, 100ms, 200ms) delays on a veth pair between network namespaces, and then using iperf (or any other network benchmarking tool) to test throughput. Complete detailed instructions are in the original email chain here: https://lists.linuxfoundation.org/pipermail/netem/2017-February/001672.html There appear to be two underlying bugs causing these effects: - The first issue causes long delays when the rate is slow and no delay is configured (e.g., "rate 512kbit"). This is because SKBs are not orphaned when no delay is configured, so orphaning does not occur until *after* the rate-induced delay has been applied. For this reason, adding a tiny delay (e.g., "rate 512kbit delay 1us") dramatically increases the measured bandwidth. - The second issue is that rate-induced delays are not correctly applied, allowing SKB delays to occur in parallel. The indended approach is to compute the delay for an SKB and to add this delay to the end of the current queue. However, the code does not detect existing SKBs in the queue due to improperly testing sch->q.qlen, which is nonzero even when packets exist only in the rbtree. Consequently, new SKBs do not wait for the current queue to empty. When packet delays vary significantly (e.g., if packet sizes are different), then this also causes unintended reordering. I modified the code to expect a delay (and orphan the SKB) when a rate is configured. I also added some defensive tests that correctly find the latest scheduled delivery time, even if it is (unexpectedly) for a packet in sch->q. I have tested these changes on the latest kernel (4.11.0-rc1+) and the iperf / ping test results are as expected. Signed-off-by: Nik Unger <njunger@uwaterloo.ca> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-14 01:16:58 +08:00
delay -= last->time_to_send - now;
delay = max_t(s64, 0, delay);
netem: apply correct delay when rate throttling I recently reported on the netem list that iperf network benchmarks show unexpected results when a bandwidth throttling rate has been configured for netem. Specifically: 1) The measured link bandwidth *increases* when a higher delay is added 2) The measured link bandwidth appears higher than the specified limit 3) The measured link bandwidth for the same very slow settings varies significantly across machines The issue can be reproduced by using tc to configure netem with a 512kbit rate and various (none, 1us, 50ms, 100ms, 200ms) delays on a veth pair between network namespaces, and then using iperf (or any other network benchmarking tool) to test throughput. Complete detailed instructions are in the original email chain here: https://lists.linuxfoundation.org/pipermail/netem/2017-February/001672.html There appear to be two underlying bugs causing these effects: - The first issue causes long delays when the rate is slow and no delay is configured (e.g., "rate 512kbit"). This is because SKBs are not orphaned when no delay is configured, so orphaning does not occur until *after* the rate-induced delay has been applied. For this reason, adding a tiny delay (e.g., "rate 512kbit delay 1us") dramatically increases the measured bandwidth. - The second issue is that rate-induced delays are not correctly applied, allowing SKB delays to occur in parallel. The indended approach is to compute the delay for an SKB and to add this delay to the end of the current queue. However, the code does not detect existing SKBs in the queue due to improperly testing sch->q.qlen, which is nonzero even when packets exist only in the rbtree. Consequently, new SKBs do not wait for the current queue to empty. When packet delays vary significantly (e.g., if packet sizes are different), then this also causes unintended reordering. I modified the code to expect a delay (and orphan the SKB) when a rate is configured. I also added some defensive tests that correctly find the latest scheduled delivery time, even if it is (unexpectedly) for a packet in sch->q. I have tested these changes on the latest kernel (4.11.0-rc1+) and the iperf / ping test results are as expected. Signed-off-by: Nik Unger <njunger@uwaterloo.ca> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-03-14 01:16:58 +08:00
now = last->time_to_send;
netem: rate extension Currently netem is not in the ability to emulate channel bandwidth. Only static delay (and optional random jitter) can be configured. To emulate the channel rate the token bucket filter (sch_tbf) can be used. But TBF has some major emulation flaws. The buffer (token bucket depth/rate) cannot be 0. Also the idea behind TBF is that the credit (token in buckets) fills if no packet is transmitted. So that there is always a "positive" credit for new packets. In real life this behavior contradicts the law of nature where nothing can travel faster as speed of light. E.g.: on an emulated 1000 byte/s link a small IPv4/TCP SYN packet with ~50 byte require ~0.05 seconds - not 0 seconds. Netem is an excellent place to implement a rate limiting feature: static delay is already implemented, tfifo already has time information and the user can skip TBF configuration completely. This patch implement rate feature which can be configured via tc. e.g: tc qdisc add dev eth0 root netem rate 10kbit To emulate a link of 5000byte/s and add an additional static delay of 10ms: tc qdisc add dev eth0 root netem delay 10ms rate 5KBps Note: similar to TBF the rate extension is bounded to the kernel timing system. Depending on the architecture timer granularity, higher rates (e.g. 10mbit/s and higher) tend to transmission bursts. Also note: further queues living in network adaptors; see ethtool(8). Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net> Acked-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@drr.davemloft.net>
2011-11-30 20:20:26 +08:00
}
delay += packet_time_ns(qdisc_pkt_len(skb), q);
netem: rate extension Currently netem is not in the ability to emulate channel bandwidth. Only static delay (and optional random jitter) can be configured. To emulate the channel rate the token bucket filter (sch_tbf) can be used. But TBF has some major emulation flaws. The buffer (token bucket depth/rate) cannot be 0. Also the idea behind TBF is that the credit (token in buckets) fills if no packet is transmitted. So that there is always a "positive" credit for new packets. In real life this behavior contradicts the law of nature where nothing can travel faster as speed of light. E.g.: on an emulated 1000 byte/s link a small IPv4/TCP SYN packet with ~50 byte require ~0.05 seconds - not 0 seconds. Netem is an excellent place to implement a rate limiting feature: static delay is already implemented, tfifo already has time information and the user can skip TBF configuration completely. This patch implement rate feature which can be configured via tc. e.g: tc qdisc add dev eth0 root netem rate 10kbit To emulate a link of 5000byte/s and add an additional static delay of 10ms: tc qdisc add dev eth0 root netem delay 10ms rate 5KBps Note: similar to TBF the rate extension is bounded to the kernel timing system. Depending on the architecture timer granularity, higher rates (e.g. 10mbit/s and higher) tend to transmission bursts. Also note: further queues living in network adaptors; see ethtool(8). Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net> Acked-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@drr.davemloft.net>
2011-11-30 20:20:26 +08:00
}
cb->time_to_send = now + delay;
++q->counter;
tfifo_enqueue(skb, sch);
} else {
/*
* Do re-ordering by putting one out of N packets at the front
* of the queue.
*/
cb->time_to_send = ktime_get_ns();
q->counter = 0;
__qdisc_enqueue_head(skb, &sch->q);
sch->qstats.requeues++;
}
netem: Segment GSO packets on enqueue This was recently reported to me, and reproduced on the latest net kernel, when attempting to run netperf from a host that had a netem qdisc attached to the egress interface: [ 788.073771] ---------------------[ cut here ]--------------------------- [ 788.096716] WARNING: at net/core/dev.c:2253 skb_warn_bad_offload+0xcd/0xda() [ 788.129521] bnx2: caps=(0x00000001801949b3, 0x0000000000000000) len=2962 data_len=0 gso_size=1448 gso_type=1 ip_summed=3 [ 788.182150] Modules linked in: sch_netem kvm_amd kvm crc32_pclmul ipmi_ssif ghash_clmulni_intel sp5100_tco amd64_edac_mod aesni_intel lrw gf128mul glue_helper ablk_helper edac_mce_amd cryptd pcspkr sg edac_core hpilo ipmi_si i2c_piix4 k10temp fam15h_power hpwdt ipmi_msghandler shpchp acpi_power_meter pcc_cpufreq nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sd_mod crc_t10dif crct10dif_generic mgag200 syscopyarea sysfillrect sysimgblt i2c_algo_bit drm_kms_helper ahci ata_generic pata_acpi ttm libahci crct10dif_pclmul pata_atiixp tg3 libata crct10dif_common drm crc32c_intel ptp serio_raw bnx2 r8169 hpsa pps_core i2c_core mii dm_mirror dm_region_hash dm_log dm_mod [ 788.465294] CPU: 16 PID: 0 Comm: swapper/16 Tainted: G W ------------ 3.10.0-327.el7.x86_64 #1 [ 788.511521] Hardware name: HP ProLiant DL385p Gen8, BIOS A28 12/17/2012 [ 788.542260] ffff880437c036b8 f7afc56532a53db9 ffff880437c03670 ffffffff816351f1 [ 788.576332] ffff880437c036a8 ffffffff8107b200 ffff880633e74200 ffff880231674000 [ 788.611943] 0000000000000001 0000000000000003 0000000000000000 ffff880437c03710 [ 788.647241] Call Trace: [ 788.658817] <IRQ> [<ffffffff816351f1>] dump_stack+0x19/0x1b [ 788.686193] [<ffffffff8107b200>] warn_slowpath_common+0x70/0xb0 [ 788.713803] [<ffffffff8107b29c>] warn_slowpath_fmt+0x5c/0x80 [ 788.741314] [<ffffffff812f92f3>] ? ___ratelimit+0x93/0x100 [ 788.767018] [<ffffffff81637f49>] skb_warn_bad_offload+0xcd/0xda [ 788.796117] [<ffffffff8152950c>] skb_checksum_help+0x17c/0x190 [ 788.823392] [<ffffffffa01463a1>] netem_enqueue+0x741/0x7c0 [sch_netem] [ 788.854487] [<ffffffff8152cb58>] dev_queue_xmit+0x2a8/0x570 [ 788.880870] [<ffffffff8156ae1d>] ip_finish_output+0x53d/0x7d0 ... The problem occurs because netem is not prepared to handle GSO packets (as it uses skb_checksum_help in its enqueue path, which cannot manipulate these frames). The solution I think is to simply segment the skb in a simmilar fashion to the way we do in __dev_queue_xmit (via validate_xmit_skb), with some minor changes. When we decide to corrupt an skb, if the frame is GSO, we segment it, corrupt the first segment, and enqueue the remaining ones. tested successfully by myself on the latest net kernel, to which this applies Signed-off-by: Neil Horman <nhorman@tuxdriver.com> CC: Jamal Hadi Salim <jhs@mojatatu.com> CC: "David S. Miller" <davem@davemloft.net> CC: netem@lists.linux-foundation.org CC: eric.dumazet@gmail.com CC: stephen@networkplumber.org Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-05-03 00:20:15 +08:00
finish_segs:
if (segs) {
unsigned int len, last_len;
int nb;
len = skb ? skb->len : 0;
nb = skb ? 1 : 0;
netem: Segment GSO packets on enqueue This was recently reported to me, and reproduced on the latest net kernel, when attempting to run netperf from a host that had a netem qdisc attached to the egress interface: [ 788.073771] ---------------------[ cut here ]--------------------------- [ 788.096716] WARNING: at net/core/dev.c:2253 skb_warn_bad_offload+0xcd/0xda() [ 788.129521] bnx2: caps=(0x00000001801949b3, 0x0000000000000000) len=2962 data_len=0 gso_size=1448 gso_type=1 ip_summed=3 [ 788.182150] Modules linked in: sch_netem kvm_amd kvm crc32_pclmul ipmi_ssif ghash_clmulni_intel sp5100_tco amd64_edac_mod aesni_intel lrw gf128mul glue_helper ablk_helper edac_mce_amd cryptd pcspkr sg edac_core hpilo ipmi_si i2c_piix4 k10temp fam15h_power hpwdt ipmi_msghandler shpchp acpi_power_meter pcc_cpufreq nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sd_mod crc_t10dif crct10dif_generic mgag200 syscopyarea sysfillrect sysimgblt i2c_algo_bit drm_kms_helper ahci ata_generic pata_acpi ttm libahci crct10dif_pclmul pata_atiixp tg3 libata crct10dif_common drm crc32c_intel ptp serio_raw bnx2 r8169 hpsa pps_core i2c_core mii dm_mirror dm_region_hash dm_log dm_mod [ 788.465294] CPU: 16 PID: 0 Comm: swapper/16 Tainted: G W ------------ 3.10.0-327.el7.x86_64 #1 [ 788.511521] Hardware name: HP ProLiant DL385p Gen8, BIOS A28 12/17/2012 [ 788.542260] ffff880437c036b8 f7afc56532a53db9 ffff880437c03670 ffffffff816351f1 [ 788.576332] ffff880437c036a8 ffffffff8107b200 ffff880633e74200 ffff880231674000 [ 788.611943] 0000000000000001 0000000000000003 0000000000000000 ffff880437c03710 [ 788.647241] Call Trace: [ 788.658817] <IRQ> [<ffffffff816351f1>] dump_stack+0x19/0x1b [ 788.686193] [<ffffffff8107b200>] warn_slowpath_common+0x70/0xb0 [ 788.713803] [<ffffffff8107b29c>] warn_slowpath_fmt+0x5c/0x80 [ 788.741314] [<ffffffff812f92f3>] ? ___ratelimit+0x93/0x100 [ 788.767018] [<ffffffff81637f49>] skb_warn_bad_offload+0xcd/0xda [ 788.796117] [<ffffffff8152950c>] skb_checksum_help+0x17c/0x190 [ 788.823392] [<ffffffffa01463a1>] netem_enqueue+0x741/0x7c0 [sch_netem] [ 788.854487] [<ffffffff8152cb58>] dev_queue_xmit+0x2a8/0x570 [ 788.880870] [<ffffffff8156ae1d>] ip_finish_output+0x53d/0x7d0 ... The problem occurs because netem is not prepared to handle GSO packets (as it uses skb_checksum_help in its enqueue path, which cannot manipulate these frames). The solution I think is to simply segment the skb in a simmilar fashion to the way we do in __dev_queue_xmit (via validate_xmit_skb), with some minor changes. When we decide to corrupt an skb, if the frame is GSO, we segment it, corrupt the first segment, and enqueue the remaining ones. tested successfully by myself on the latest net kernel, to which this applies Signed-off-by: Neil Horman <nhorman@tuxdriver.com> CC: Jamal Hadi Salim <jhs@mojatatu.com> CC: "David S. Miller" <davem@davemloft.net> CC: netem@lists.linux-foundation.org CC: eric.dumazet@gmail.com CC: stephen@networkplumber.org Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-05-03 00:20:15 +08:00
while (segs) {
skb2 = segs->next;
skb_mark_not_on_list(segs);
netem: Segment GSO packets on enqueue This was recently reported to me, and reproduced on the latest net kernel, when attempting to run netperf from a host that had a netem qdisc attached to the egress interface: [ 788.073771] ---------------------[ cut here ]--------------------------- [ 788.096716] WARNING: at net/core/dev.c:2253 skb_warn_bad_offload+0xcd/0xda() [ 788.129521] bnx2: caps=(0x00000001801949b3, 0x0000000000000000) len=2962 data_len=0 gso_size=1448 gso_type=1 ip_summed=3 [ 788.182150] Modules linked in: sch_netem kvm_amd kvm crc32_pclmul ipmi_ssif ghash_clmulni_intel sp5100_tco amd64_edac_mod aesni_intel lrw gf128mul glue_helper ablk_helper edac_mce_amd cryptd pcspkr sg edac_core hpilo ipmi_si i2c_piix4 k10temp fam15h_power hpwdt ipmi_msghandler shpchp acpi_power_meter pcc_cpufreq nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sd_mod crc_t10dif crct10dif_generic mgag200 syscopyarea sysfillrect sysimgblt i2c_algo_bit drm_kms_helper ahci ata_generic pata_acpi ttm libahci crct10dif_pclmul pata_atiixp tg3 libata crct10dif_common drm crc32c_intel ptp serio_raw bnx2 r8169 hpsa pps_core i2c_core mii dm_mirror dm_region_hash dm_log dm_mod [ 788.465294] CPU: 16 PID: 0 Comm: swapper/16 Tainted: G W ------------ 3.10.0-327.el7.x86_64 #1 [ 788.511521] Hardware name: HP ProLiant DL385p Gen8, BIOS A28 12/17/2012 [ 788.542260] ffff880437c036b8 f7afc56532a53db9 ffff880437c03670 ffffffff816351f1 [ 788.576332] ffff880437c036a8 ffffffff8107b200 ffff880633e74200 ffff880231674000 [ 788.611943] 0000000000000001 0000000000000003 0000000000000000 ffff880437c03710 [ 788.647241] Call Trace: [ 788.658817] <IRQ> [<ffffffff816351f1>] dump_stack+0x19/0x1b [ 788.686193] [<ffffffff8107b200>] warn_slowpath_common+0x70/0xb0 [ 788.713803] [<ffffffff8107b29c>] warn_slowpath_fmt+0x5c/0x80 [ 788.741314] [<ffffffff812f92f3>] ? ___ratelimit+0x93/0x100 [ 788.767018] [<ffffffff81637f49>] skb_warn_bad_offload+0xcd/0xda [ 788.796117] [<ffffffff8152950c>] skb_checksum_help+0x17c/0x190 [ 788.823392] [<ffffffffa01463a1>] netem_enqueue+0x741/0x7c0 [sch_netem] [ 788.854487] [<ffffffff8152cb58>] dev_queue_xmit+0x2a8/0x570 [ 788.880870] [<ffffffff8156ae1d>] ip_finish_output+0x53d/0x7d0 ... The problem occurs because netem is not prepared to handle GSO packets (as it uses skb_checksum_help in its enqueue path, which cannot manipulate these frames). The solution I think is to simply segment the skb in a simmilar fashion to the way we do in __dev_queue_xmit (via validate_xmit_skb), with some minor changes. When we decide to corrupt an skb, if the frame is GSO, we segment it, corrupt the first segment, and enqueue the remaining ones. tested successfully by myself on the latest net kernel, to which this applies Signed-off-by: Neil Horman <nhorman@tuxdriver.com> CC: Jamal Hadi Salim <jhs@mojatatu.com> CC: "David S. Miller" <davem@davemloft.net> CC: netem@lists.linux-foundation.org CC: eric.dumazet@gmail.com CC: stephen@networkplumber.org Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-05-03 00:20:15 +08:00
qdisc_skb_cb(segs)->pkt_len = segs->len;
last_len = segs->len;
rc = qdisc_enqueue(segs, sch, to_free);
netem: Segment GSO packets on enqueue This was recently reported to me, and reproduced on the latest net kernel, when attempting to run netperf from a host that had a netem qdisc attached to the egress interface: [ 788.073771] ---------------------[ cut here ]--------------------------- [ 788.096716] WARNING: at net/core/dev.c:2253 skb_warn_bad_offload+0xcd/0xda() [ 788.129521] bnx2: caps=(0x00000001801949b3, 0x0000000000000000) len=2962 data_len=0 gso_size=1448 gso_type=1 ip_summed=3 [ 788.182150] Modules linked in: sch_netem kvm_amd kvm crc32_pclmul ipmi_ssif ghash_clmulni_intel sp5100_tco amd64_edac_mod aesni_intel lrw gf128mul glue_helper ablk_helper edac_mce_amd cryptd pcspkr sg edac_core hpilo ipmi_si i2c_piix4 k10temp fam15h_power hpwdt ipmi_msghandler shpchp acpi_power_meter pcc_cpufreq nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sd_mod crc_t10dif crct10dif_generic mgag200 syscopyarea sysfillrect sysimgblt i2c_algo_bit drm_kms_helper ahci ata_generic pata_acpi ttm libahci crct10dif_pclmul pata_atiixp tg3 libata crct10dif_common drm crc32c_intel ptp serio_raw bnx2 r8169 hpsa pps_core i2c_core mii dm_mirror dm_region_hash dm_log dm_mod [ 788.465294] CPU: 16 PID: 0 Comm: swapper/16 Tainted: G W ------------ 3.10.0-327.el7.x86_64 #1 [ 788.511521] Hardware name: HP ProLiant DL385p Gen8, BIOS A28 12/17/2012 [ 788.542260] ffff880437c036b8 f7afc56532a53db9 ffff880437c03670 ffffffff816351f1 [ 788.576332] ffff880437c036a8 ffffffff8107b200 ffff880633e74200 ffff880231674000 [ 788.611943] 0000000000000001 0000000000000003 0000000000000000 ffff880437c03710 [ 788.647241] Call Trace: [ 788.658817] <IRQ> [<ffffffff816351f1>] dump_stack+0x19/0x1b [ 788.686193] [<ffffffff8107b200>] warn_slowpath_common+0x70/0xb0 [ 788.713803] [<ffffffff8107b29c>] warn_slowpath_fmt+0x5c/0x80 [ 788.741314] [<ffffffff812f92f3>] ? ___ratelimit+0x93/0x100 [ 788.767018] [<ffffffff81637f49>] skb_warn_bad_offload+0xcd/0xda [ 788.796117] [<ffffffff8152950c>] skb_checksum_help+0x17c/0x190 [ 788.823392] [<ffffffffa01463a1>] netem_enqueue+0x741/0x7c0 [sch_netem] [ 788.854487] [<ffffffff8152cb58>] dev_queue_xmit+0x2a8/0x570 [ 788.880870] [<ffffffff8156ae1d>] ip_finish_output+0x53d/0x7d0 ... The problem occurs because netem is not prepared to handle GSO packets (as it uses skb_checksum_help in its enqueue path, which cannot manipulate these frames). The solution I think is to simply segment the skb in a simmilar fashion to the way we do in __dev_queue_xmit (via validate_xmit_skb), with some minor changes. When we decide to corrupt an skb, if the frame is GSO, we segment it, corrupt the first segment, and enqueue the remaining ones. tested successfully by myself on the latest net kernel, to which this applies Signed-off-by: Neil Horman <nhorman@tuxdriver.com> CC: Jamal Hadi Salim <jhs@mojatatu.com> CC: "David S. Miller" <davem@davemloft.net> CC: netem@lists.linux-foundation.org CC: eric.dumazet@gmail.com CC: stephen@networkplumber.org Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-05-03 00:20:15 +08:00
if (rc != NET_XMIT_SUCCESS) {
if (net_xmit_drop_count(rc))
qdisc_qstats_drop(sch);
} else {
nb++;
len += last_len;
}
segs = skb2;
}
/* Parent qdiscs accounted for 1 skb of size @prev_len */
qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len));
} else if (!skb) {
return NET_XMIT_DROP;
netem: Segment GSO packets on enqueue This was recently reported to me, and reproduced on the latest net kernel, when attempting to run netperf from a host that had a netem qdisc attached to the egress interface: [ 788.073771] ---------------------[ cut here ]--------------------------- [ 788.096716] WARNING: at net/core/dev.c:2253 skb_warn_bad_offload+0xcd/0xda() [ 788.129521] bnx2: caps=(0x00000001801949b3, 0x0000000000000000) len=2962 data_len=0 gso_size=1448 gso_type=1 ip_summed=3 [ 788.182150] Modules linked in: sch_netem kvm_amd kvm crc32_pclmul ipmi_ssif ghash_clmulni_intel sp5100_tco amd64_edac_mod aesni_intel lrw gf128mul glue_helper ablk_helper edac_mce_amd cryptd pcspkr sg edac_core hpilo ipmi_si i2c_piix4 k10temp fam15h_power hpwdt ipmi_msghandler shpchp acpi_power_meter pcc_cpufreq nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c sd_mod crc_t10dif crct10dif_generic mgag200 syscopyarea sysfillrect sysimgblt i2c_algo_bit drm_kms_helper ahci ata_generic pata_acpi ttm libahci crct10dif_pclmul pata_atiixp tg3 libata crct10dif_common drm crc32c_intel ptp serio_raw bnx2 r8169 hpsa pps_core i2c_core mii dm_mirror dm_region_hash dm_log dm_mod [ 788.465294] CPU: 16 PID: 0 Comm: swapper/16 Tainted: G W ------------ 3.10.0-327.el7.x86_64 #1 [ 788.511521] Hardware name: HP ProLiant DL385p Gen8, BIOS A28 12/17/2012 [ 788.542260] ffff880437c036b8 f7afc56532a53db9 ffff880437c03670 ffffffff816351f1 [ 788.576332] ffff880437c036a8 ffffffff8107b200 ffff880633e74200 ffff880231674000 [ 788.611943] 0000000000000001 0000000000000003 0000000000000000 ffff880437c03710 [ 788.647241] Call Trace: [ 788.658817] <IRQ> [<ffffffff816351f1>] dump_stack+0x19/0x1b [ 788.686193] [<ffffffff8107b200>] warn_slowpath_common+0x70/0xb0 [ 788.713803] [<ffffffff8107b29c>] warn_slowpath_fmt+0x5c/0x80 [ 788.741314] [<ffffffff812f92f3>] ? ___ratelimit+0x93/0x100 [ 788.767018] [<ffffffff81637f49>] skb_warn_bad_offload+0xcd/0xda [ 788.796117] [<ffffffff8152950c>] skb_checksum_help+0x17c/0x190 [ 788.823392] [<ffffffffa01463a1>] netem_enqueue+0x741/0x7c0 [sch_netem] [ 788.854487] [<ffffffff8152cb58>] dev_queue_xmit+0x2a8/0x570 [ 788.880870] [<ffffffff8156ae1d>] ip_finish_output+0x53d/0x7d0 ... The problem occurs because netem is not prepared to handle GSO packets (as it uses skb_checksum_help in its enqueue path, which cannot manipulate these frames). The solution I think is to simply segment the skb in a simmilar fashion to the way we do in __dev_queue_xmit (via validate_xmit_skb), with some minor changes. When we decide to corrupt an skb, if the frame is GSO, we segment it, corrupt the first segment, and enqueue the remaining ones. tested successfully by myself on the latest net kernel, to which this applies Signed-off-by: Neil Horman <nhorman@tuxdriver.com> CC: Jamal Hadi Salim <jhs@mojatatu.com> CC: "David S. Miller" <davem@davemloft.net> CC: netem@lists.linux-foundation.org CC: eric.dumazet@gmail.com CC: stephen@networkplumber.org Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-05-03 00:20:15 +08:00
}
return NET_XMIT_SUCCESS;
}
/* Delay the next round with a new future slot with a
* correct number of bytes and packets.
*/
static void get_slot_next(struct netem_sched_data *q, u64 now)
{
s64 next_delay;
if (!q->slot_dist)
next_delay = q->slot_config.min_delay +
(prandom_u32() *
(q->slot_config.max_delay -
q->slot_config.min_delay) >> 32);
else
next_delay = tabledist(q->slot_config.dist_delay,
(s32)(q->slot_config.dist_jitter),
NULL, q->slot_dist);
q->slot.slot_next = now + next_delay;
q->slot.packets_left = q->slot_config.max_packets;
q->slot.bytes_left = q->slot_config.max_bytes;
}
static struct sk_buff *netem_peek(struct netem_sched_data *q)
{
struct sk_buff *skb = skb_rb_first(&q->t_root);
u64 t1, t2;
if (!skb)
return q->t_head;
if (!q->t_head)
return skb;
t1 = netem_skb_cb(skb)->time_to_send;
t2 = netem_skb_cb(q->t_head)->time_to_send;
if (t1 < t2)
return skb;
return q->t_head;
}
static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb)
{
if (skb == q->t_head) {
q->t_head = skb->next;
if (!q->t_head)
q->t_tail = NULL;
} else {
rb_erase(&skb->rbnode, &q->t_root);
}
}
static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
struct netem_sched_data *q = qdisc_priv(sch);
struct sk_buff *skb;
tfifo_dequeue:
skb = __qdisc_dequeue_head(&sch->q);
if (skb) {
qdisc_qstats_backlog_dec(sch, skb);
netem: Fixes byte backlog accounting for the first of two chained netem instances Fixes byte backlog accounting for the first of two chained netem instances. Bytes backlog reported now corresponds to the number of queued packets. When two netem instances are chained, for instance to apply rate and queue limitation followed by packet delay, the number of backlogged bytes reported by the first netem instance is wrong. It reports the sum of bytes in the queues of the first and second netem. The first netem reports the correct number of backlogged packets but not bytes. This is shown in the example below. Consider a chain of two netem schedulers created using the following commands: $ tc -s qdisc replace dev veth2 root handle 1:0 netem rate 10000kbit limit 100 $ tc -s qdisc add dev veth2 parent 1:0 handle 2: netem delay 50ms Start an iperf session to send packets out on the specified interface and monitor the backlog using tc: $ tc -s qdisc show dev veth2 Output using unpatched netem: qdisc netem 1: root refcnt 2 limit 100 rate 10000Kbit Sent 98422639 bytes 65434 pkt (dropped 123, overlimits 0 requeues 0) backlog 172694b 73p requeues 0 qdisc netem 2: parent 1: limit 1000 delay 50.0ms Sent 98422639 bytes 65434 pkt (dropped 0, overlimits 0 requeues 0) backlog 63588b 42p requeues 0 The interface used to produce this output has an MTU of 1500. The output for backlogged bytes behind netem 1 is 172694b. This value is not correct. Consider the total number of sent bytes and packets. By dividing the number of sent bytes by the number of sent packets, we get an average packet size of ~=1504. If we divide the number of backlogged bytes by packets, we get ~=2365. This is due to the first netem incorrectly counting the 63588b which are in netem 2's queue as being in its own queue. To verify this is the case, we subtract them from the reported value and divide by the number of packets as follows: 172694 - 63588 = 109106 bytes actualled backlogged in netem 1 109106 / 73 packets ~= 1494 bytes (which matches our MTU) The root cause is that the byte accounting is not done at the same time with packet accounting. The solution is to update the backlog value every time the packet queue is updated. Signed-off-by: Joseph D Beshay <joseph.beshay@utdallas.edu> Acked-by: Hagen Paul Pfeifer <hagen@jauu.net> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-04-07 02:00:56 +08:00
deliver:
qdisc_bstats_update(sch, skb);
return skb;
}
skb = netem_peek(q);
if (skb) {
u64 time_to_send;
u64 now = ktime_get_ns();
/* if more time remaining? */
time_to_send = netem_skb_cb(skb)->time_to_send;
if (q->slot.slot_next && q->slot.slot_next < time_to_send)
get_slot_next(q, now);
if (time_to_send <= now && q->slot.slot_next <= now) {
netem_erase_head(q, skb);
sch->q.qlen--;
netem: Fixes byte backlog accounting for the first of two chained netem instances Fixes byte backlog accounting for the first of two chained netem instances. Bytes backlog reported now corresponds to the number of queued packets. When two netem instances are chained, for instance to apply rate and queue limitation followed by packet delay, the number of backlogged bytes reported by the first netem instance is wrong. It reports the sum of bytes in the queues of the first and second netem. The first netem reports the correct number of backlogged packets but not bytes. This is shown in the example below. Consider a chain of two netem schedulers created using the following commands: $ tc -s qdisc replace dev veth2 root handle 1:0 netem rate 10000kbit limit 100 $ tc -s qdisc add dev veth2 parent 1:0 handle 2: netem delay 50ms Start an iperf session to send packets out on the specified interface and monitor the backlog using tc: $ tc -s qdisc show dev veth2 Output using unpatched netem: qdisc netem 1: root refcnt 2 limit 100 rate 10000Kbit Sent 98422639 bytes 65434 pkt (dropped 123, overlimits 0 requeues 0) backlog 172694b 73p requeues 0 qdisc netem 2: parent 1: limit 1000 delay 50.0ms Sent 98422639 bytes 65434 pkt (dropped 0, overlimits 0 requeues 0) backlog 63588b 42p requeues 0 The interface used to produce this output has an MTU of 1500. The output for backlogged bytes behind netem 1 is 172694b. This value is not correct. Consider the total number of sent bytes and packets. By dividing the number of sent bytes by the number of sent packets, we get an average packet size of ~=1504. If we divide the number of backlogged bytes by packets, we get ~=2365. This is due to the first netem incorrectly counting the 63588b which are in netem 2's queue as being in its own queue. To verify this is the case, we subtract them from the reported value and divide by the number of packets as follows: 172694 - 63588 = 109106 bytes actualled backlogged in netem 1 109106 / 73 packets ~= 1494 bytes (which matches our MTU) The root cause is that the byte accounting is not done at the same time with packet accounting. The solution is to update the backlog value every time the packet queue is updated. Signed-off-by: Joseph D Beshay <joseph.beshay@utdallas.edu> Acked-by: Hagen Paul Pfeifer <hagen@jauu.net> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-04-07 02:00:56 +08:00
qdisc_qstats_backlog_dec(sch, skb);
skb->next = NULL;
skb->prev = NULL;
/* skb->dev shares skb->rbnode area,
* we need to restore its value.
*/
skb->dev = qdisc_dev(sch);
if (q->slot.slot_next) {
q->slot.packets_left--;
q->slot.bytes_left -= qdisc_pkt_len(skb);
if (q->slot.packets_left <= 0 ||
q->slot.bytes_left <= 0)
get_slot_next(q, now);
}
if (q->qdisc) {
unsigned int pkt_len = qdisc_pkt_len(skb);
struct sk_buff *to_free = NULL;
int err;
err = qdisc_enqueue(skb, q->qdisc, &to_free);
kfree_skb_list(to_free);
if (err != NET_XMIT_SUCCESS &&
net_xmit_drop_count(err)) {
qdisc_qstats_drop(sch);
qdisc_tree_reduce_backlog(sch, 1,
pkt_len);
}
goto tfifo_dequeue;
}
goto deliver;
}
if (q->qdisc) {
skb = q->qdisc->ops->dequeue(q->qdisc);
if (skb)
goto deliver;
}
qdisc_watchdog_schedule_ns(&q->watchdog,
max(time_to_send,
q->slot.slot_next));
}
if (q->qdisc) {
skb = q->qdisc->ops->dequeue(q->qdisc);
if (skb)
goto deliver;
}
return NULL;
}
static void netem_reset(struct Qdisc *sch)
{
struct netem_sched_data *q = qdisc_priv(sch);
qdisc_reset_queue(sch);
tfifo_reset(sch);
if (q->qdisc)
qdisc_reset(q->qdisc);
qdisc_watchdog_cancel(&q->watchdog);
}
static void dist_free(struct disttable *d)
{
kvfree(d);
}
/*
* Distribution data is a variable size payload containing
* signed 16 bit values.
*/
static int get_dist_table(struct Qdisc *sch, struct disttable **tbl,
const struct nlattr *attr)
{
size_t n = nla_len(attr)/sizeof(__s16);
const __s16 *data = nla_data(attr);
spinlock_t *root_lock;
struct disttable *d;
int i;
if (!n || n > NETEM_DIST_MAX)
return -EINVAL;
treewide: use kv[mz]alloc* rather than opencoded variants There are many code paths opencoding kvmalloc. Let's use the helper instead. The main difference to kvmalloc is that those users are usually not considering all the aspects of the memory allocator. E.g. allocation requests <= 32kB (with 4kB pages) are basically never failing and invoke OOM killer to satisfy the allocation. This sounds too disruptive for something that has a reasonable fallback - the vmalloc. On the other hand those requests might fallback to vmalloc even when the memory allocator would succeed after several more reclaim/compaction attempts previously. There is no guarantee something like that happens though. This patch converts many of those places to kv[mz]alloc* helpers because they are more conservative. Link: http://lkml.kernel.org/r/20170306103327.2766-2-mhocko@kernel.org Signed-off-by: Michal Hocko <mhocko@suse.com> Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com> # Xen bits Acked-by: Kees Cook <keescook@chromium.org> Acked-by: Vlastimil Babka <vbabka@suse.cz> Acked-by: Andreas Dilger <andreas.dilger@intel.com> # Lustre Acked-by: Christian Borntraeger <borntraeger@de.ibm.com> # KVM/s390 Acked-by: Dan Williams <dan.j.williams@intel.com> # nvdim Acked-by: David Sterba <dsterba@suse.com> # btrfs Acked-by: Ilya Dryomov <idryomov@gmail.com> # Ceph Acked-by: Tariq Toukan <tariqt@mellanox.com> # mlx4 Acked-by: Leon Romanovsky <leonro@mellanox.com> # mlx5 Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Herbert Xu <herbert@gondor.apana.org.au> Cc: Anton Vorontsov <anton@enomsg.org> Cc: Colin Cross <ccross@android.com> Cc: Tony Luck <tony.luck@intel.com> Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net> Cc: Ben Skeggs <bskeggs@redhat.com> Cc: Kent Overstreet <kent.overstreet@gmail.com> Cc: Santosh Raspatur <santosh@chelsio.com> Cc: Hariprasad S <hariprasad@chelsio.com> Cc: Yishai Hadas <yishaih@mellanox.com> Cc: Oleg Drokin <oleg.drokin@intel.com> Cc: "Yan, Zheng" <zyan@redhat.com> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Alexei Starovoitov <ast@kernel.org> Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: David Miller <davem@davemloft.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-05-09 06:57:27 +08:00
d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL);
if (!d)
return -ENOMEM;
d->size = n;
for (i = 0; i < n; i++)
d->table[i] = data[i];
root_lock = qdisc_root_sleeping_lock(sch);
spin_lock_bh(root_lock);
swap(*tbl, d);
spin_unlock_bh(root_lock);
dist_free(d);
return 0;
}
static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
{
const struct tc_netem_slot *c = nla_data(attr);
q->slot_config = *c;
if (q->slot_config.max_packets == 0)
q->slot_config.max_packets = INT_MAX;
if (q->slot_config.max_bytes == 0)
q->slot_config.max_bytes = INT_MAX;
/* capping dist_jitter to the range acceptable by tabledist() */
q->slot_config.dist_jitter = min_t(__s64, INT_MAX, abs(q->slot_config.dist_jitter));
q->slot.packets_left = q->slot_config.max_packets;
q->slot.bytes_left = q->slot_config.max_bytes;
if (q->slot_config.min_delay | q->slot_config.max_delay |
q->slot_config.dist_jitter)
q->slot.slot_next = ktime_get_ns();
else
q->slot.slot_next = 0;
}
static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
{
const struct tc_netem_corr *c = nla_data(attr);
init_crandom(&q->delay_cor, c->delay_corr);
init_crandom(&q->loss_cor, c->loss_corr);
init_crandom(&q->dup_cor, c->dup_corr);
}
static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
{
const struct tc_netem_reorder *r = nla_data(attr);
q->reorder = r->probability;
init_crandom(&q->reorder_cor, r->correlation);
}
static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
{
const struct tc_netem_corrupt *r = nla_data(attr);
q->corrupt = r->probability;
init_crandom(&q->corrupt_cor, r->correlation);
}
static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
netem: rate extension Currently netem is not in the ability to emulate channel bandwidth. Only static delay (and optional random jitter) can be configured. To emulate the channel rate the token bucket filter (sch_tbf) can be used. But TBF has some major emulation flaws. The buffer (token bucket depth/rate) cannot be 0. Also the idea behind TBF is that the credit (token in buckets) fills if no packet is transmitted. So that there is always a "positive" credit for new packets. In real life this behavior contradicts the law of nature where nothing can travel faster as speed of light. E.g.: on an emulated 1000 byte/s link a small IPv4/TCP SYN packet with ~50 byte require ~0.05 seconds - not 0 seconds. Netem is an excellent place to implement a rate limiting feature: static delay is already implemented, tfifo already has time information and the user can skip TBF configuration completely. This patch implement rate feature which can be configured via tc. e.g: tc qdisc add dev eth0 root netem rate 10kbit To emulate a link of 5000byte/s and add an additional static delay of 10ms: tc qdisc add dev eth0 root netem delay 10ms rate 5KBps Note: similar to TBF the rate extension is bounded to the kernel timing system. Depending on the architecture timer granularity, higher rates (e.g. 10mbit/s and higher) tend to transmission bursts. Also note: further queues living in network adaptors; see ethtool(8). Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net> Acked-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@drr.davemloft.net>
2011-11-30 20:20:26 +08:00
{
const struct tc_netem_rate *r = nla_data(attr);
q->rate = r->rate;
q->packet_overhead = r->packet_overhead;
q->cell_size = r->cell_size;
reciprocal_divide: update/correction of the algorithm Jakub Zawadzki noticed that some divisions by reciprocal_divide() were not correct [1][2], which he could also show with BPF code after divisions are transformed into reciprocal_value() for runtime invariance which can be passed to reciprocal_divide() later on; reverse in BPF dump ended up with a different, off-by-one K in some situations. This has been fixed by Eric Dumazet in commit aee636c4809fa5 ("bpf: do not use reciprocal divide"). This follow-up patch improves reciprocal_value() and reciprocal_divide() to work in all cases by using Granlund and Montgomery method, so that also future use is safe and without any non-obvious side-effects. Known problems with the old implementation were that division by 1 always returned 0 and some off-by-ones when the dividend and divisor where very large. This seemed to not be problematic with its current users, as far as we can tell. Eric Dumazet checked for the slab usage, we cannot surely say so in the case of flex_array. Still, in order to fix that, we propose an extension from the original implementation from commit 6a2d7a955d8d resp. [3][4], by using the algorithm proposed in "Division by Invariant Integers Using Multiplication" [5], Torbjörn Granlund and Peter L. Montgomery, that is, pseudocode for q = n/d where q, n, d is in u32 universe: 1) Initialization: int l = ceil(log_2 d) uword m' = floor((1<<32)*((1<<l)-d)/d)+1 int sh_1 = min(l,1) int sh_2 = max(l-1,0) 2) For q = n/d, all uword: uword t = (n*m')>>32 q = (t+((n-t)>>sh_1))>>sh_2 The assembler implementation from Agner Fog [6] also helped a lot while implementing. We have tested the implementation on x86_64, ppc64, i686, s390x; on x86_64/haswell we're still half the latency compared to normal divide. Joint work with Daniel Borkmann. [1] http://www.wireshark.org/~darkjames/reciprocal-buggy.c [2] http://www.wireshark.org/~darkjames/set-and-dump-filter-k-bug.c [3] https://gmplib.org/~tege/division-paper.pdf [4] http://homepage.cs.uiowa.edu/~jones/bcd/divide.html [5] http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556 [6] http://www.agner.org/optimize/asmlib.zip Reported-by: Jakub Zawadzki <darkjames-ws@darkjames.pl> Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: Austin S Hemmelgarn <ahferroin7@gmail.com> Cc: linux-kernel@vger.kernel.org Cc: Jesse Gross <jesse@nicira.com> Cc: Jamal Hadi Salim <jhs@mojatatu.com> Cc: Stephen Hemminger <stephen@networkplumber.org> Cc: Matt Mackall <mpm@selenic.com> Cc: Pekka Enberg <penberg@kernel.org> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Andy Gospodarek <andy@greyhouse.net> Cc: Veaceslav Falico <vfalico@redhat.com> Cc: Jay Vosburgh <fubar@us.ibm.com> Cc: Jakub Zawadzki <darkjames-ws@darkjames.pl> Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-22 09:29:41 +08:00
q->cell_overhead = r->cell_overhead;
if (q->cell_size)
q->cell_size_reciprocal = reciprocal_value(q->cell_size);
reciprocal_divide: update/correction of the algorithm Jakub Zawadzki noticed that some divisions by reciprocal_divide() were not correct [1][2], which he could also show with BPF code after divisions are transformed into reciprocal_value() for runtime invariance which can be passed to reciprocal_divide() later on; reverse in BPF dump ended up with a different, off-by-one K in some situations. This has been fixed by Eric Dumazet in commit aee636c4809fa5 ("bpf: do not use reciprocal divide"). This follow-up patch improves reciprocal_value() and reciprocal_divide() to work in all cases by using Granlund and Montgomery method, so that also future use is safe and without any non-obvious side-effects. Known problems with the old implementation were that division by 1 always returned 0 and some off-by-ones when the dividend and divisor where very large. This seemed to not be problematic with its current users, as far as we can tell. Eric Dumazet checked for the slab usage, we cannot surely say so in the case of flex_array. Still, in order to fix that, we propose an extension from the original implementation from commit 6a2d7a955d8d resp. [3][4], by using the algorithm proposed in "Division by Invariant Integers Using Multiplication" [5], Torbjörn Granlund and Peter L. Montgomery, that is, pseudocode for q = n/d where q, n, d is in u32 universe: 1) Initialization: int l = ceil(log_2 d) uword m' = floor((1<<32)*((1<<l)-d)/d)+1 int sh_1 = min(l,1) int sh_2 = max(l-1,0) 2) For q = n/d, all uword: uword t = (n*m')>>32 q = (t+((n-t)>>sh_1))>>sh_2 The assembler implementation from Agner Fog [6] also helped a lot while implementing. We have tested the implementation on x86_64, ppc64, i686, s390x; on x86_64/haswell we're still half the latency compared to normal divide. Joint work with Daniel Borkmann. [1] http://www.wireshark.org/~darkjames/reciprocal-buggy.c [2] http://www.wireshark.org/~darkjames/set-and-dump-filter-k-bug.c [3] https://gmplib.org/~tege/division-paper.pdf [4] http://homepage.cs.uiowa.edu/~jones/bcd/divide.html [5] http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556 [6] http://www.agner.org/optimize/asmlib.zip Reported-by: Jakub Zawadzki <darkjames-ws@darkjames.pl> Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: Austin S Hemmelgarn <ahferroin7@gmail.com> Cc: linux-kernel@vger.kernel.org Cc: Jesse Gross <jesse@nicira.com> Cc: Jamal Hadi Salim <jhs@mojatatu.com> Cc: Stephen Hemminger <stephen@networkplumber.org> Cc: Matt Mackall <mpm@selenic.com> Cc: Pekka Enberg <penberg@kernel.org> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Andy Gospodarek <andy@greyhouse.net> Cc: Veaceslav Falico <vfalico@redhat.com> Cc: Jay Vosburgh <fubar@us.ibm.com> Cc: Jakub Zawadzki <darkjames-ws@darkjames.pl> Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-01-22 09:29:41 +08:00
else
q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
netem: rate extension Currently netem is not in the ability to emulate channel bandwidth. Only static delay (and optional random jitter) can be configured. To emulate the channel rate the token bucket filter (sch_tbf) can be used. But TBF has some major emulation flaws. The buffer (token bucket depth/rate) cannot be 0. Also the idea behind TBF is that the credit (token in buckets) fills if no packet is transmitted. So that there is always a "positive" credit for new packets. In real life this behavior contradicts the law of nature where nothing can travel faster as speed of light. E.g.: on an emulated 1000 byte/s link a small IPv4/TCP SYN packet with ~50 byte require ~0.05 seconds - not 0 seconds. Netem is an excellent place to implement a rate limiting feature: static delay is already implemented, tfifo already has time information and the user can skip TBF configuration completely. This patch implement rate feature which can be configured via tc. e.g: tc qdisc add dev eth0 root netem rate 10kbit To emulate a link of 5000byte/s and add an additional static delay of 10ms: tc qdisc add dev eth0 root netem delay 10ms rate 5KBps Note: similar to TBF the rate extension is bounded to the kernel timing system. Depending on the architecture timer granularity, higher rates (e.g. 10mbit/s and higher) tend to transmission bursts. Also note: further queues living in network adaptors; see ethtool(8). Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net> Acked-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@drr.davemloft.net>
2011-11-30 20:20:26 +08:00
}
static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
{
const struct nlattr *la;
int rem;
nla_for_each_nested(la, attr, rem) {
u16 type = nla_type(la);
switch (type) {
case NETEM_LOSS_GI: {
const struct tc_netem_gimodel *gi = nla_data(la);
if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
pr_info("netem: incorrect gi model size\n");
return -EINVAL;
}
q->loss_model = CLG_4_STATES;
q->clg.state = TX_IN_GAP_PERIOD;
q->clg.a1 = gi->p13;
q->clg.a2 = gi->p31;
q->clg.a3 = gi->p32;
q->clg.a4 = gi->p14;
q->clg.a5 = gi->p23;
break;
}
case NETEM_LOSS_GE: {
const struct tc_netem_gemodel *ge = nla_data(la);
if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
pr_info("netem: incorrect ge model size\n");
return -EINVAL;
}
q->loss_model = CLG_GILB_ELL;
q->clg.state = GOOD_STATE;
q->clg.a1 = ge->p;
q->clg.a2 = ge->r;
q->clg.a3 = ge->h;
q->clg.a4 = ge->k1;
break;
}
default:
pr_info("netem: unknown loss type %u\n", type);
return -EINVAL;
}
}
return 0;
}
static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
[TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) },
[TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) },
[TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) },
netem: rate extension Currently netem is not in the ability to emulate channel bandwidth. Only static delay (and optional random jitter) can be configured. To emulate the channel rate the token bucket filter (sch_tbf) can be used. But TBF has some major emulation flaws. The buffer (token bucket depth/rate) cannot be 0. Also the idea behind TBF is that the credit (token in buckets) fills if no packet is transmitted. So that there is always a "positive" credit for new packets. In real life this behavior contradicts the law of nature where nothing can travel faster as speed of light. E.g.: on an emulated 1000 byte/s link a small IPv4/TCP SYN packet with ~50 byte require ~0.05 seconds - not 0 seconds. Netem is an excellent place to implement a rate limiting feature: static delay is already implemented, tfifo already has time information and the user can skip TBF configuration completely. This patch implement rate feature which can be configured via tc. e.g: tc qdisc add dev eth0 root netem rate 10kbit To emulate a link of 5000byte/s and add an additional static delay of 10ms: tc qdisc add dev eth0 root netem delay 10ms rate 5KBps Note: similar to TBF the rate extension is bounded to the kernel timing system. Depending on the architecture timer granularity, higher rates (e.g. 10mbit/s and higher) tend to transmission bursts. Also note: further queues living in network adaptors; see ethtool(8). Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net> Acked-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@drr.davemloft.net>
2011-11-30 20:20:26 +08:00
[TCA_NETEM_RATE] = { .len = sizeof(struct tc_netem_rate) },
[TCA_NETEM_LOSS] = { .type = NLA_NESTED },
[TCA_NETEM_ECN] = { .type = NLA_U32 },
[TCA_NETEM_RATE64] = { .type = NLA_U64 },
[TCA_NETEM_LATENCY64] = { .type = NLA_S64 },
[TCA_NETEM_JITTER64] = { .type = NLA_S64 },
[TCA_NETEM_SLOT] = { .len = sizeof(struct tc_netem_slot) },
};
static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
const struct nla_policy *policy, int len)
{
int nested_len = nla_len(nla) - NLA_ALIGN(len);
if (nested_len < 0) {
pr_info("netem: invalid attributes len %d\n", nested_len);
return -EINVAL;
}
if (nested_len >= nla_attr_size(0))
netlink: make validation more configurable for future strictness We currently have two levels of strict validation: 1) liberal (default) - undefined (type >= max) & NLA_UNSPEC attributes accepted - attribute length >= expected accepted - garbage at end of message accepted 2) strict (opt-in) - NLA_UNSPEC attributes accepted - attribute length >= expected accepted Split out parsing strictness into four different options: * TRAILING - check that there's no trailing data after parsing attributes (in message or nested) * MAXTYPE - reject attrs > max known type * UNSPEC - reject attributes with NLA_UNSPEC policy entries * STRICT_ATTRS - strictly validate attribute size The default for future things should be *everything*. The current *_strict() is a combination of TRAILING and MAXTYPE, and is renamed to _deprecated_strict(). The current regular parsing has none of this, and is renamed to *_parse_deprecated(). Additionally it allows us to selectively set one of the new flags even on old policies. Notably, the UNSPEC flag could be useful in this case, since it can be arranged (by filling in the policy) to not be an incompatible userspace ABI change, but would then going forward prevent forgetting attribute entries. Similar can apply to the POLICY flag. We end up with the following renames: * nla_parse -> nla_parse_deprecated * nla_parse_strict -> nla_parse_deprecated_strict * nlmsg_parse -> nlmsg_parse_deprecated * nlmsg_parse_strict -> nlmsg_parse_deprecated_strict * nla_parse_nested -> nla_parse_nested_deprecated * nla_validate_nested -> nla_validate_nested_deprecated Using spatch, of course: @@ expression TB, MAX, HEAD, LEN, POL, EXT; @@ -nla_parse(TB, MAX, HEAD, LEN, POL, EXT) +nla_parse_deprecated(TB, MAX, HEAD, LEN, POL, EXT) @@ expression NLH, HDRLEN, TB, MAX, POL, EXT; @@ -nlmsg_parse(NLH, HDRLEN, TB, MAX, POL, EXT) +nlmsg_parse_deprecated(NLH, HDRLEN, TB, MAX, POL, EXT) @@ expression NLH, HDRLEN, TB, MAX, POL, EXT; @@ -nlmsg_parse_strict(NLH, HDRLEN, TB, MAX, POL, EXT) +nlmsg_parse_deprecated_strict(NLH, HDRLEN, TB, MAX, POL, EXT) @@ expression TB, MAX, NLA, POL, EXT; @@ -nla_parse_nested(TB, MAX, NLA, POL, EXT) +nla_parse_nested_deprecated(TB, MAX, NLA, POL, EXT) @@ expression START, MAX, POL, EXT; @@ -nla_validate_nested(START, MAX, POL, EXT) +nla_validate_nested_deprecated(START, MAX, POL, EXT) @@ expression NLH, HDRLEN, MAX, POL, EXT; @@ -nlmsg_validate(NLH, HDRLEN, MAX, POL, EXT) +nlmsg_validate_deprecated(NLH, HDRLEN, MAX, POL, EXT) For this patch, don't actually add the strict, non-renamed versions yet so that it breaks compile if I get it wrong. Also, while at it, make nla_validate and nla_parse go down to a common __nla_validate_parse() function to avoid code duplication. Ultimately, this allows us to have very strict validation for every new caller of nla_parse()/nlmsg_parse() etc as re-introduced in the next patch, while existing things will continue to work as is. In effect then, this adds fully strict validation for any new command. Signed-off-by: Johannes Berg <johannes.berg@intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-26 20:07:28 +08:00
return nla_parse_deprecated(tb, maxtype,
nla_data(nla) + NLA_ALIGN(len),
nested_len, policy, NULL);
memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
return 0;
}
/* Parse netlink message to set options */
static int netem_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
struct netem_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_NETEM_MAX + 1];
struct tc_netem_qopt *qopt;
struct clgstate old_clg;
int old_loss_model = CLG_RANDOM;
int ret;
if (opt == NULL)
return -EINVAL;
qopt = nla_data(opt);
ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
if (ret < 0)
return ret;
/* backup q->clg and q->loss_model */
old_clg = q->clg;
old_loss_model = q->loss_model;
if (tb[TCA_NETEM_LOSS]) {
ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
if (ret) {
q->loss_model = old_loss_model;
return ret;
}
} else {
q->loss_model = CLG_RANDOM;
}
if (tb[TCA_NETEM_DELAY_DIST]) {
ret = get_dist_table(sch, &q->delay_dist,
tb[TCA_NETEM_DELAY_DIST]);
if (ret)
goto get_table_failure;
}
if (tb[TCA_NETEM_SLOT_DIST]) {
ret = get_dist_table(sch, &q->slot_dist,
tb[TCA_NETEM_SLOT_DIST]);
if (ret)
goto get_table_failure;
}
sch->limit = qopt->limit;
q->latency = PSCHED_TICKS2NS(qopt->latency);
q->jitter = PSCHED_TICKS2NS(qopt->jitter);
q->limit = qopt->limit;
q->gap = qopt->gap;
q->counter = 0;
q->loss = qopt->loss;
q->duplicate = qopt->duplicate;
/* for compatibility with earlier versions.
* if gap is set, need to assume 100% probability
*/
if (q->gap)
q->reorder = ~0;
if (tb[TCA_NETEM_CORR])
get_correlation(q, tb[TCA_NETEM_CORR]);
if (tb[TCA_NETEM_REORDER])
get_reorder(q, tb[TCA_NETEM_REORDER]);
if (tb[TCA_NETEM_CORRUPT])
get_corrupt(q, tb[TCA_NETEM_CORRUPT]);
netem: rate extension Currently netem is not in the ability to emulate channel bandwidth. Only static delay (and optional random jitter) can be configured. To emulate the channel rate the token bucket filter (sch_tbf) can be used. But TBF has some major emulation flaws. The buffer (token bucket depth/rate) cannot be 0. Also the idea behind TBF is that the credit (token in buckets) fills if no packet is transmitted. So that there is always a "positive" credit for new packets. In real life this behavior contradicts the law of nature where nothing can travel faster as speed of light. E.g.: on an emulated 1000 byte/s link a small IPv4/TCP SYN packet with ~50 byte require ~0.05 seconds - not 0 seconds. Netem is an excellent place to implement a rate limiting feature: static delay is already implemented, tfifo already has time information and the user can skip TBF configuration completely. This patch implement rate feature which can be configured via tc. e.g: tc qdisc add dev eth0 root netem rate 10kbit To emulate a link of 5000byte/s and add an additional static delay of 10ms: tc qdisc add dev eth0 root netem delay 10ms rate 5KBps Note: similar to TBF the rate extension is bounded to the kernel timing system. Depending on the architecture timer granularity, higher rates (e.g. 10mbit/s and higher) tend to transmission bursts. Also note: further queues living in network adaptors; see ethtool(8). Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net> Acked-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@drr.davemloft.net>
2011-11-30 20:20:26 +08:00
if (tb[TCA_NETEM_RATE])
get_rate(q, tb[TCA_NETEM_RATE]);
netem: rate extension Currently netem is not in the ability to emulate channel bandwidth. Only static delay (and optional random jitter) can be configured. To emulate the channel rate the token bucket filter (sch_tbf) can be used. But TBF has some major emulation flaws. The buffer (token bucket depth/rate) cannot be 0. Also the idea behind TBF is that the credit (token in buckets) fills if no packet is transmitted. So that there is always a "positive" credit for new packets. In real life this behavior contradicts the law of nature where nothing can travel faster as speed of light. E.g.: on an emulated 1000 byte/s link a small IPv4/TCP SYN packet with ~50 byte require ~0.05 seconds - not 0 seconds. Netem is an excellent place to implement a rate limiting feature: static delay is already implemented, tfifo already has time information and the user can skip TBF configuration completely. This patch implement rate feature which can be configured via tc. e.g: tc qdisc add dev eth0 root netem rate 10kbit To emulate a link of 5000byte/s and add an additional static delay of 10ms: tc qdisc add dev eth0 root netem delay 10ms rate 5KBps Note: similar to TBF the rate extension is bounded to the kernel timing system. Depending on the architecture timer granularity, higher rates (e.g. 10mbit/s and higher) tend to transmission bursts. Also note: further queues living in network adaptors; see ethtool(8). Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net> Acked-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@drr.davemloft.net>
2011-11-30 20:20:26 +08:00
if (tb[TCA_NETEM_RATE64])
q->rate = max_t(u64, q->rate,
nla_get_u64(tb[TCA_NETEM_RATE64]));
if (tb[TCA_NETEM_LATENCY64])
q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]);
if (tb[TCA_NETEM_JITTER64])
q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]);
if (tb[TCA_NETEM_ECN])
q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
if (tb[TCA_NETEM_SLOT])
get_slot(q, tb[TCA_NETEM_SLOT]);
/* capping jitter to the range acceptable by tabledist() */
q->jitter = min_t(s64, abs(q->jitter), INT_MAX);
return ret;
get_table_failure:
/* recover clg and loss_model, in case of
* q->clg and q->loss_model were modified
* in get_loss_clg()
*/
q->clg = old_clg;
q->loss_model = old_loss_model;
return ret;
}
static int netem_init(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
struct netem_sched_data *q = qdisc_priv(sch);
int ret;
sch_netem: avoid null pointer deref on init failure netem can fail in ->init due to missing options (either not supplied by user-space or used as a default qdisc) causing a timer->base null pointer deref in its ->destroy() and ->reset() callbacks. Reproduce: $ sysctl net.core.default_qdisc=netem $ ip l set ethX up Crash log: [ 1814.846943] BUG: unable to handle kernel NULL pointer dereference at (null) [ 1814.847181] IP: hrtimer_active+0x17/0x8a [ 1814.847270] PGD 59c34067 [ 1814.847271] P4D 59c34067 [ 1814.847337] PUD 37374067 [ 1814.847403] PMD 0 [ 1814.847468] [ 1814.847582] Oops: 0000 [#1] SMP [ 1814.847655] Modules linked in: sch_netem(O) sch_fq_codel(O) [ 1814.847761] CPU: 3 PID: 1573 Comm: ip Tainted: G O 4.13.0-rc6+ #62 [ 1814.847884] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014 [ 1814.848043] task: ffff88003723a700 task.stack: ffff88005adc8000 [ 1814.848235] RIP: 0010:hrtimer_active+0x17/0x8a [ 1814.848407] RSP: 0018:ffff88005adcb590 EFLAGS: 00010246 [ 1814.848590] RAX: 0000000000000000 RBX: ffff880058e359d8 RCX: 0000000000000000 [ 1814.848793] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff880058e359d8 [ 1814.848998] RBP: ffff88005adcb5b0 R08: 00000000014080c0 R09: 00000000ffffffff [ 1814.849204] R10: ffff88005adcb660 R11: 0000000000000020 R12: 0000000000000000 [ 1814.849410] R13: ffff880058e359d8 R14: 00000000ffffffff R15: 0000000000000001 [ 1814.849616] FS: 00007f733bbca740(0000) GS:ffff88005d980000(0000) knlGS:0000000000000000 [ 1814.849919] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1814.850107] CR2: 0000000000000000 CR3: 0000000059f0d000 CR4: 00000000000406e0 [ 1814.850313] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1814.850518] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1814.850723] Call Trace: [ 1814.850875] hrtimer_try_to_cancel+0x1a/0x93 [ 1814.851047] hrtimer_cancel+0x15/0x20 [ 1814.851211] qdisc_watchdog_cancel+0x12/0x14 [ 1814.851383] netem_reset+0xe6/0xed [sch_netem] [ 1814.851561] qdisc_destroy+0x8b/0xe5 [ 1814.851723] qdisc_create_dflt+0x86/0x94 [ 1814.851890] ? dev_activate+0x129/0x129 [ 1814.852057] attach_one_default_qdisc+0x36/0x63 [ 1814.852232] netdev_for_each_tx_queue+0x3d/0x48 [ 1814.852406] dev_activate+0x4b/0x129 [ 1814.852569] __dev_open+0xe7/0x104 [ 1814.852730] __dev_change_flags+0xc6/0x15c [ 1814.852899] dev_change_flags+0x25/0x59 [ 1814.853064] do_setlink+0x30c/0xb3f [ 1814.853228] ? check_chain_key+0xb0/0xfd [ 1814.853396] ? check_chain_key+0xb0/0xfd [ 1814.853565] rtnl_newlink+0x3a4/0x729 [ 1814.853728] ? rtnl_newlink+0x117/0x729 [ 1814.853905] ? ns_capable_common+0xd/0xb1 [ 1814.854072] ? ns_capable+0x13/0x15 [ 1814.854234] rtnetlink_rcv_msg+0x188/0x197 [ 1814.854404] ? rcu_read_unlock+0x3e/0x5f [ 1814.854572] ? rtnl_newlink+0x729/0x729 [ 1814.854737] netlink_rcv_skb+0x6c/0xce [ 1814.854902] rtnetlink_rcv+0x23/0x2a [ 1814.855064] netlink_unicast+0x103/0x181 [ 1814.855230] netlink_sendmsg+0x326/0x337 [ 1814.855398] sock_sendmsg_nosec+0x14/0x3f [ 1814.855584] sock_sendmsg+0x29/0x2e [ 1814.855747] ___sys_sendmsg+0x209/0x28b [ 1814.855912] ? do_raw_spin_unlock+0xcd/0xf8 [ 1814.856082] ? _raw_spin_unlock+0x27/0x31 [ 1814.856251] ? __handle_mm_fault+0x651/0xdb1 [ 1814.856421] ? check_chain_key+0xb0/0xfd [ 1814.856592] __sys_sendmsg+0x45/0x63 [ 1814.856755] ? __sys_sendmsg+0x45/0x63 [ 1814.856923] SyS_sendmsg+0x19/0x1b [ 1814.857083] entry_SYSCALL_64_fastpath+0x23/0xc2 [ 1814.857256] RIP: 0033:0x7f733b2dd690 [ 1814.857419] RSP: 002b:00007ffe1d3387d8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 1814.858238] RAX: ffffffffffffffda RBX: ffffffff810d278c RCX: 00007f733b2dd690 [ 1814.858445] RDX: 0000000000000000 RSI: 00007ffe1d338820 RDI: 0000000000000003 [ 1814.858651] RBP: ffff88005adcbf98 R08: 0000000000000001 R09: 0000000000000003 [ 1814.858856] R10: 00007ffe1d3385a0 R11: 0000000000000246 R12: 0000000000000002 [ 1814.859060] R13: 000000000066f1a0 R14: 00007ffe1d3408d0 R15: 0000000000000000 [ 1814.859267] ? trace_hardirqs_off_caller+0xa7/0xcf [ 1814.859446] Code: 10 55 48 89 c7 48 89 e5 e8 45 a1 fb ff 31 c0 5d c3 31 c0 c3 66 66 66 66 90 55 48 89 e5 41 56 41 55 41 54 53 49 89 fd 49 8b 45 30 <4c> 8b 20 41 8b 5c 24 38 31 c9 31 d2 48 c7 c7 50 8e 1d 82 41 89 [ 1814.860022] RIP: hrtimer_active+0x17/0x8a RSP: ffff88005adcb590 [ 1814.860214] CR2: 0000000000000000 Fixes: 87b60cfacf9f ("net_sched: fix error recovery at qdisc creation") Fixes: 0fbbeb1ba43b ("[PKT_SCHED]: Fix missing qdisc_destroy() in qdisc_create_dflt()") Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-08-30 17:49:03 +08:00
qdisc_watchdog_init(&q->watchdog, sch);
if (!opt)
return -EINVAL;
q->loss_model = CLG_RANDOM;
ret = netem_change(sch, opt, extack);
if (ret)
pr_info("netem: change failed\n");
return ret;
}
static void netem_destroy(struct Qdisc *sch)
{
struct netem_sched_data *q = qdisc_priv(sch);
qdisc_watchdog_cancel(&q->watchdog);
if (q->qdisc)
qdisc_put(q->qdisc);
dist_free(q->delay_dist);
dist_free(q->slot_dist);
}
static int dump_loss_model(const struct netem_sched_data *q,
struct sk_buff *skb)
{
struct nlattr *nest;
nest = nla_nest_start_noflag(skb, TCA_NETEM_LOSS);
if (nest == NULL)
goto nla_put_failure;
switch (q->loss_model) {
case CLG_RANDOM:
/* legacy loss model */
nla_nest_cancel(skb, nest);
return 0; /* no data */
case CLG_4_STATES: {
struct tc_netem_gimodel gi = {
.p13 = q->clg.a1,
.p31 = q->clg.a2,
.p32 = q->clg.a3,
.p14 = q->clg.a4,
.p23 = q->clg.a5,
};
if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
goto nla_put_failure;
break;
}
case CLG_GILB_ELL: {
struct tc_netem_gemodel ge = {
.p = q->clg.a1,
.r = q->clg.a2,
.h = q->clg.a3,
.k1 = q->clg.a4,
};
if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
goto nla_put_failure;
break;
}
}
nla_nest_end(skb, nest);
return 0;
nla_put_failure:
nla_nest_cancel(skb, nest);
return -1;
}
static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
const struct netem_sched_data *q = qdisc_priv(sch);
struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
struct tc_netem_qopt qopt;
struct tc_netem_corr cor;
struct tc_netem_reorder reorder;
struct tc_netem_corrupt corrupt;
netem: rate extension Currently netem is not in the ability to emulate channel bandwidth. Only static delay (and optional random jitter) can be configured. To emulate the channel rate the token bucket filter (sch_tbf) can be used. But TBF has some major emulation flaws. The buffer (token bucket depth/rate) cannot be 0. Also the idea behind TBF is that the credit (token in buckets) fills if no packet is transmitted. So that there is always a "positive" credit for new packets. In real life this behavior contradicts the law of nature where nothing can travel faster as speed of light. E.g.: on an emulated 1000 byte/s link a small IPv4/TCP SYN packet with ~50 byte require ~0.05 seconds - not 0 seconds. Netem is an excellent place to implement a rate limiting feature: static delay is already implemented, tfifo already has time information and the user can skip TBF configuration completely. This patch implement rate feature which can be configured via tc. e.g: tc qdisc add dev eth0 root netem rate 10kbit To emulate a link of 5000byte/s and add an additional static delay of 10ms: tc qdisc add dev eth0 root netem delay 10ms rate 5KBps Note: similar to TBF the rate extension is bounded to the kernel timing system. Depending on the architecture timer granularity, higher rates (e.g. 10mbit/s and higher) tend to transmission bursts. Also note: further queues living in network adaptors; see ethtool(8). Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net> Acked-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@drr.davemloft.net>
2011-11-30 20:20:26 +08:00
struct tc_netem_rate rate;
struct tc_netem_slot slot;
qopt.latency = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->latency),
UINT_MAX);
qopt.jitter = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->jitter),
UINT_MAX);
qopt.limit = q->limit;
qopt.loss = q->loss;
qopt.gap = q->gap;
qopt.duplicate = q->duplicate;
if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
goto nla_put_failure;
if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency))
goto nla_put_failure;
if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter))
goto nla_put_failure;
cor.delay_corr = q->delay_cor.rho;
cor.loss_corr = q->loss_cor.rho;
cor.dup_corr = q->dup_cor.rho;
if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
goto nla_put_failure;
reorder.probability = q->reorder;
reorder.correlation = q->reorder_cor.rho;
if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
goto nla_put_failure;
corrupt.probability = q->corrupt;
corrupt.correlation = q->corrupt_cor.rho;
if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
goto nla_put_failure;
if (q->rate >= (1ULL << 32)) {
if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
TCA_NETEM_PAD))
goto nla_put_failure;
rate.rate = ~0U;
} else {
rate.rate = q->rate;
}
rate.packet_overhead = q->packet_overhead;
rate.cell_size = q->cell_size;
rate.cell_overhead = q->cell_overhead;
if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
goto nla_put_failure;
netem: rate extension Currently netem is not in the ability to emulate channel bandwidth. Only static delay (and optional random jitter) can be configured. To emulate the channel rate the token bucket filter (sch_tbf) can be used. But TBF has some major emulation flaws. The buffer (token bucket depth/rate) cannot be 0. Also the idea behind TBF is that the credit (token in buckets) fills if no packet is transmitted. So that there is always a "positive" credit for new packets. In real life this behavior contradicts the law of nature where nothing can travel faster as speed of light. E.g.: on an emulated 1000 byte/s link a small IPv4/TCP SYN packet with ~50 byte require ~0.05 seconds - not 0 seconds. Netem is an excellent place to implement a rate limiting feature: static delay is already implemented, tfifo already has time information and the user can skip TBF configuration completely. This patch implement rate feature which can be configured via tc. e.g: tc qdisc add dev eth0 root netem rate 10kbit To emulate a link of 5000byte/s and add an additional static delay of 10ms: tc qdisc add dev eth0 root netem delay 10ms rate 5KBps Note: similar to TBF the rate extension is bounded to the kernel timing system. Depending on the architecture timer granularity, higher rates (e.g. 10mbit/s and higher) tend to transmission bursts. Also note: further queues living in network adaptors; see ethtool(8). Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net> Acked-by: Eric Dumazet <eric.dumazet@gmail.com> Signed-off-by: David S. Miller <davem@drr.davemloft.net>
2011-11-30 20:20:26 +08:00
if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
goto nla_put_failure;
if (dump_loss_model(q, skb) != 0)
goto nla_put_failure;
if (q->slot_config.min_delay | q->slot_config.max_delay |
q->slot_config.dist_jitter) {
slot = q->slot_config;
if (slot.max_packets == INT_MAX)
slot.max_packets = 0;
if (slot.max_bytes == INT_MAX)
slot.max_bytes = 0;
if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot))
goto nla_put_failure;
}
return nla_nest_end(skb, nla);
nla_put_failure:
nlmsg_trim(skb, nla);
return -1;
}
static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
struct sk_buff *skb, struct tcmsg *tcm)
{
struct netem_sched_data *q = qdisc_priv(sch);
if (cl != 1 || !q->qdisc) /* only one class */
return -ENOENT;
tcm->tcm_handle |= TC_H_MIN(1);
tcm->tcm_info = q->qdisc->handle;
return 0;
}
static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
struct Qdisc **old, struct netlink_ext_ack *extack)
{
struct netem_sched_data *q = qdisc_priv(sch);
*old = qdisc_replace(sch, new, &q->qdisc);
return 0;
}
static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
{
struct netem_sched_data *q = qdisc_priv(sch);
return q->qdisc;
}
static unsigned long netem_find(struct Qdisc *sch, u32 classid)
{
return 1;
}
static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
if (!walker->stop) {
if (walker->count >= walker->skip)
if (walker->fn(sch, 1, walker) < 0) {
walker->stop = 1;
return;
}
walker->count++;
}
}
static const struct Qdisc_class_ops netem_class_ops = {
.graft = netem_graft,
.leaf = netem_leaf,
.find = netem_find,
.walk = netem_walk,
.dump = netem_dump_class,
};
static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
.id = "netem",
.cl_ops = &netem_class_ops,
.priv_size = sizeof(struct netem_sched_data),
.enqueue = netem_enqueue,
.dequeue = netem_dequeue,
.peek = qdisc_peek_dequeued,
.init = netem_init,
.reset = netem_reset,
.destroy = netem_destroy,
.change = netem_change,
.dump = netem_dump,
.owner = THIS_MODULE,
};
static int __init netem_module_init(void)
{
pr_info("netem: version " VERSION "\n");
return register_qdisc(&netem_qdisc_ops);
}
static void __exit netem_module_exit(void)
{
unregister_qdisc(&netem_qdisc_ops);
}
module_init(netem_module_init)
module_exit(netem_module_exit)
MODULE_LICENSE("GPL");