2005-04-17 06:20:36 +08:00
|
|
|
/* -*- linux-c -*-
|
|
|
|
* sysctl_net_core.c: sysctl interface to net core subsystem.
|
|
|
|
*
|
|
|
|
* Begun April 1, 1996, Mike Shaver.
|
|
|
|
* Added /proc/sys/net/core directory entry (empty =) ). [MS]
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/sysctl.h>
|
|
|
|
#include <linux/module.h>
|
2005-08-16 13:18:02 +08:00
|
|
|
#include <linux/socket.h>
|
2007-10-24 12:13:53 +08:00
|
|
|
#include <linux/netdevice.h>
|
2009-09-22 22:18:09 +08:00
|
|
|
#include <linux/ratelimit.h>
|
rfs: Receive Flow Steering
This patch implements receive flow steering (RFS). RFS steers
received packets for layer 3 and 4 processing to the CPU where
the application for the corresponding flow is running. RFS is an
extension of Receive Packet Steering (RPS).
The basic idea of RFS is that when an application calls recvmsg
(or sendmsg) the application's running CPU is stored in a hash
table that is indexed by the connection's rxhash which is stored in
the socket structure. The rxhash is passed in skb's received on
the connection from netif_receive_skb. For each received packet,
the associated rxhash is used to look up the CPU in the hash table,
if a valid CPU is set then the packet is steered to that CPU using
the RPS mechanisms.
The complication of the simple approach is that it would potentially
allow OOO packets. If threads are thrashing around CPUs or multiple
threads are trying to read from the same sockets, a quickly changing
CPU value in the hash table could cause rampant OOO packets--
we consider this a non-starter.
To avoid OOO packets, this solution implements two types of hash
tables: rps_sock_flow_table and rps_dev_flow_table.
rps_sock_table is a global hash table. Each entry is just a CPU
number and it is populated in recvmsg and sendmsg as described above.
This table contains the "desired" CPUs for flows.
rps_dev_flow_table is specific to each device queue. Each entry
contains a CPU and a tail queue counter. The CPU is the "current"
CPU for a matching flow. The tail queue counter holds the value
of a tail queue counter for the associated CPU's backlog queue at
the time of last enqueue for a flow matching the entry.
Each backlog queue has a queue head counter which is incremented
on dequeue, and so a queue tail counter is computed as queue head
count + queue length. When a packet is enqueued on a backlog queue,
the current value of the queue tail counter is saved in the hash
entry of the rps_dev_flow_table.
And now the trick: when selecting the CPU for RPS (get_rps_cpu)
the rps_sock_flow table and the rps_dev_flow table for the RX queue
are consulted. When the desired CPU for the flow (found in the
rps_sock_flow table) does not match the current CPU (found in the
rps_dev_flow table), the current CPU is changed to the desired CPU
if one of the following is true:
- The current CPU is unset (equal to RPS_NO_CPU)
- Current CPU is offline
- The current CPU's queue head counter >= queue tail counter in the
rps_dev_flow table. This checks if the queue tail has advanced
beyond the last packet that was enqueued using this table entry.
This guarantees that all packets queued using this entry have been
dequeued, thus preserving in order delivery.
Making each queue have its own rps_dev_flow table has two advantages:
1) the tail queue counters will be written on each receive, so
keeping the table local to the interrupting CPU is good for locality. 2)
this allows lockless access to the table-- the CPU number and queue
tail counter need to be accessed together under mutual exclusion
from netif_receive_skb, we assume that this is only called from
device napi_poll which is non-reentrant.
This patch implements RFS for TCP and connected UDP sockets.
It should be usable for other flow oriented protocols.
There are two configuration parameters for RFS. The
"rps_flow_entries" kernel init parameter sets the number of
entries in the rps_sock_flow_table, the per rxqueue sysfs entry
"rps_flow_cnt" contains the number of entries in the rps_dev_flow
table for the rxqueue. Both are rounded to power of two.
The obvious benefit of RFS (over just RPS) is that it achieves
CPU locality between the receive processing for a flow and the
applications processing; this can result in increased performance
(higher pps, lower latency).
The benefits of RFS are dependent on cache hierarchy, application
load, and other factors. On simple benchmarks, we don't necessarily
see improvement and sometimes see degradation. However, for more
complex benchmarks and for applications where cache pressure is
much higher this technique seems to perform very well.
Below are some benchmark results which show the potential benefit of
this patch. The netperf test has 500 instances of netperf TCP_RR
test with 1 byte req. and resp. The RPC test is a request/response
test similar in structure to netperf RR test with 100 threads on
each host, but does more work in userspace than netperf.
e1000e on 8 core Intel
No RFS or RPS 104K tps at 30% CPU
No RFS (best RPS config): 290K tps at 63% CPU
RFS 303K tps at 61% CPU
RPC test tps CPU% 50/90/99% usec latency Latency StdDev
No RFS/RPS 103K 48% 757/900/3185 4472.35
RPS only: 174K 73% 415/993/2468 491.66
RFS 223K 73% 379/651/1382 315.61
Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-04-17 07:01:27 +08:00
|
|
|
#include <linux/vmalloc.h>
|
2007-12-05 17:37:34 +08:00
|
|
|
#include <linux/init.h>
|
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities to include those
headers directly instead of assuming availability. As this conversion
needs to touch a large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the following.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and tries to put the new include such that its order conforms
to its surrounding. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
widely available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build tests were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 16:04:11 +08:00
|
|
|
#include <linux/slab.h>
|
2012-04-19 02:05:46 +08:00
|
|
|
#include <linux/kmemleak.h>
|
2009-09-22 22:18:09 +08:00
|
|
|
|
2009-02-25 18:32:14 +08:00
|
|
|
#include <net/ip.h>
|
2005-08-16 13:18:02 +08:00
|
|
|
#include <net/sock.h>
|
2011-05-28 01:41:33 +08:00
|
|
|
#include <net/net_ratelimit.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2013-01-24 04:35:28 +08:00
|
|
|
/*
 * Constant 1, referenced through .extra1 by the wmem_max, rmem_max,
 * wmem_default and rmem_default entries of net_core_table, all of which
 * use proc_dointvec_minmax as their handler.
 * NOTE(review): presumably the minimum value those sysctls accept --
 * confirm against proc_dointvec_minmax().
 */
static int one = 1;
|
|
|
|
|
rfs: Receive Flow Steering
This patch implements receive flow steering (RFS). RFS steers
received packets for layer 3 and 4 processing to the CPU where
the application for the corresponding flow is running. RFS is an
extension of Receive Packet Steering (RPS).
The basic idea of RFS is that when an application calls recvmsg
(or sendmsg) the application's running CPU is stored in a hash
table that is indexed by the connection's rxhash which is stored in
the socket structure. The rxhash is passed in skb's received on
the connection from netif_receive_skb. For each received packet,
the associated rxhash is used to look up the CPU in the hash table,
if a valid CPU is set then the packet is steered to that CPU using
the RPS mechanisms.
The complication of the simple approach is that it would potentially
allow OOO packets. If threads are thrashing around CPUs or multiple
threads are trying to read from the same sockets, a quickly changing
CPU value in the hash table could cause rampant OOO packets--
we consider this a non-starter.
To avoid OOO packets, this solution implements two types of hash
tables: rps_sock_flow_table and rps_dev_flow_table.
rps_sock_table is a global hash table. Each entry is just a CPU
number and it is populated in recvmsg and sendmsg as described above.
This table contains the "desired" CPUs for flows.
rps_dev_flow_table is specific to each device queue. Each entry
contains a CPU and a tail queue counter. The CPU is the "current"
CPU for a matching flow. The tail queue counter holds the value
of a tail queue counter for the associated CPU's backlog queue at
the time of last enqueue for a flow matching the entry.
Each backlog queue has a queue head counter which is incremented
on dequeue, and so a queue tail counter is computed as queue head
count + queue length. When a packet is enqueued on a backlog queue,
the current value of the queue tail counter is saved in the hash
entry of the rps_dev_flow_table.
And now the trick: when selecting the CPU for RPS (get_rps_cpu)
the rps_sock_flow table and the rps_dev_flow table for the RX queue
are consulted. When the desired CPU for the flow (found in the
rps_sock_flow table) does not match the current CPU (found in the
rps_dev_flow table), the current CPU is changed to the desired CPU
if one of the following is true:
- The current CPU is unset (equal to RPS_NO_CPU)
- Current CPU is offline
- The current CPU's queue head counter >= queue tail counter in the
rps_dev_flow table. This checks if the queue tail has advanced
beyond the last packet that was enqueued using this table entry.
This guarantees that all packets queued using this entry have been
dequeued, thus preserving in order delivery.
Making each queue have its own rps_dev_flow table has two advantages:
1) the tail queue counters will be written on each receive, so
keeping the table local to the interrupting CPU is good for locality. 2)
this allows lockless access to the table-- the CPU number and queue
tail counter need to be accessed together under mutual exclusion
from netif_receive_skb, we assume that this is only called from
device napi_poll which is non-reentrant.
This patch implements RFS for TCP and connected UDP sockets.
It should be usable for other flow oriented protocols.
There are two configuration parameters for RFS. The
"rps_flow_entries" kernel init parameter sets the number of
entries in the rps_sock_flow_table, the per rxqueue sysfs entry
"rps_flow_cnt" contains the number of entries in the rps_dev_flow
table for the rxqueue. Both are rounded to power of two.
The obvious benefit of RFS (over just RPS) is that it achieves
CPU locality between the receive processing for a flow and the
applications processing; this can result in increased performance
(higher pps, lower latency).
The benefits of RFS are dependent on cache hierarchy, application
load, and other factors. On simple benchmarks, we don't necessarily
see improvement and sometimes see degradation. However, for more
complex benchmarks and for applications where cache pressure is
much higher this technique seems to perform very well.
Below are some benchmark results which show the potential benefit of
this patch. The netperf test has 500 instances of netperf TCP_RR
test with 1 byte req. and resp. The RPC test is a request/response
test similar in structure to netperf RR test with 100 threads on
each host, but does more work in userspace than netperf.
e1000e on 8 core Intel
No RFS or RPS 104K tps at 30% CPU
No RFS (best RPS config): 290K tps at 63% CPU
RFS 303K tps at 61% CPU
RPC test tps CPU% 50/90/99% usec latency Latency StdDev
No RFS/RPS 103K 48% 757/900/3185 4472.35
RPS only: 174K 73% 415/993/2468 491.66
RFS 223K 73% 379/651/1382 315.61
Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-04-17 07:01:27 +08:00
|
|
|
#ifdef CONFIG_RPS
|
|
|
|
static int rps_sock_flow_sysctl(ctl_table *table, int write,
|
|
|
|
void __user *buffer, size_t *lenp, loff_t *ppos)
|
|
|
|
{
|
|
|
|
unsigned int orig_size, size;
|
|
|
|
int ret, i;
|
|
|
|
ctl_table tmp = {
|
|
|
|
.data = &size,
|
|
|
|
.maxlen = sizeof(size),
|
|
|
|
.mode = table->mode
|
|
|
|
};
|
|
|
|
struct rps_sock_flow_table *orig_sock_table, *sock_table;
|
|
|
|
static DEFINE_MUTEX(sock_flow_mutex);
|
|
|
|
|
|
|
|
mutex_lock(&sock_flow_mutex);
|
|
|
|
|
2010-10-25 11:02:02 +08:00
|
|
|
orig_sock_table = rcu_dereference_protected(rps_sock_flow_table,
|
|
|
|
lockdep_is_held(&sock_flow_mutex));
|
rfs: Receive Flow Steering
This patch implements receive flow steering (RFS). RFS steers
received packets for layer 3 and 4 processing to the CPU where
the application for the corresponding flow is running. RFS is an
extension of Receive Packet Steering (RPS).
The basic idea of RFS is that when an application calls recvmsg
(or sendmsg) the application's running CPU is stored in a hash
table that is indexed by the connection's rxhash which is stored in
the socket structure. The rxhash is passed in skb's received on
the connection from netif_receive_skb. For each received packet,
the associated rxhash is used to look up the CPU in the hash table,
if a valid CPU is set then the packet is steered to that CPU using
the RPS mechanisms.
The convolution of the simple approach is that it would potentially
allow OOO packets. If threads are thrashing around CPUs or multiple
threads are trying to read from the same sockets, a quickly changing
CPU value in the hash table could cause rampant OOO packets--
we consider this a non-starter.
To avoid OOO packets, this solution implements two types of hash
tables: rps_sock_flow_table and rps_dev_flow_table.
rps_sock_table is a global hash table. Each entry is just a CPU
number and it is populated in recvmsg and sendmsg as described above.
This table contains the "desired" CPUs for flows.
rps_dev_flow_table is specific to each device queue. Each entry
contains a CPU and a tail queue counter. The CPU is the "current"
CPU for a matching flow. The tail queue counter holds the value
of a tail queue counter for the associated CPU's backlog queue at
the time of last enqueue for a flow matching the entry.
Each backlog queue has a queue head counter which is incremented
on dequeue, and so a queue tail counter is computed as queue head
count + queue length. When a packet is enqueued on a backlog queue,
the current value of the queue tail counter is saved in the hash
entry of the rps_dev_flow_table.
And now the trick: when selecting the CPU for RPS (get_rps_cpu)
the rps_sock_flow table and the rps_dev_flow table for the RX queue
are consulted. When the desired CPU for the flow (found in the
rps_sock_flow table) does not match the current CPU (found in the
rps_dev_flow table), the current CPU is changed to the desired CPU
if one of the following is true:
- The current CPU is unset (equal to RPS_NO_CPU)
- Current CPU is offline
- The current CPU's queue head counter >= queue tail counter in the
rps_dev_flow table. This checks if the queue tail has advanced
beyond the last packet that was enqueued using this table entry.
This guarantees that all packets queued using this entry have been
dequeued, thus preserving in order delivery.
Making each queue have its own rps_dev_flow table has two advantages:
1) the tail queue counters will be written on each receive, so
keeping the table local to interrupting CPU s good for locality. 2)
this allows lockless access to the table-- the CPU number and queue
tail counter need to be accessed together under mutual exclusion
from netif_receive_skb, we assume that this is only called from
device napi_poll which is non-reentrant.
This patch implements RFS for TCP and connected UDP sockets.
It should be usable for other flow oriented protocols.
There are two configuration parameters for RFS. The
"rps_flow_entries" kernel init parameter sets the number of
entries in the rps_sock_flow_table, the per rxqueue sysfs entry
"rps_flow_cnt" contains the number of entries in the rps_dev_flow
table for the rxqueue. Both are rounded to power of two.
The obvious benefit of RFS (over just RPS) is that it achieves
CPU locality between the receive processing for a flow and the
applications processing; this can result in increased performance
(higher pps, lower latency).
The benefits of RFS are dependent on cache hierarchy, application
load, and other factors. On simple benchmarks, we don't necessarily
see improvement and sometimes see degradation. However, for more
complex benchmarks and for applications where cache pressure is
much higher this technique seems to perform very well.
Below are some benchmark results which show the potential benfit of
this patch. The netperf test has 500 instances of netperf TCP_RR
test with 1 byte req. and resp. The RPC test is an request/response
test similar in structure to netperf RR test ith 100 threads on
each host, but does more work in userspace that netperf.
e1000e on 8 core Intel
No RFS or RPS 104K tps at 30% CPU
No RFS (best RPS config): 290K tps at 63% CPU
RFS 303K tps at 61% CPU
RPC test tps CPU% 50/90/99% usec latency Latency StdDev
No RFS/RPS 103K 48% 757/900/3185 4472.35
RPS only: 174K 73% 415/993/2468 491.66
RFS 223K 73% 379/651/1382 315.61
Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-04-17 07:01:27 +08:00
|
|
|
size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0;
|
|
|
|
|
|
|
|
ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
|
|
|
|
|
|
|
|
if (write) {
|
|
|
|
if (size) {
|
|
|
|
if (size > 1<<30) {
|
|
|
|
/* Enforce limit to prevent overflow */
|
|
|
|
mutex_unlock(&sock_flow_mutex);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
size = roundup_pow_of_two(size);
|
|
|
|
if (size != orig_size) {
|
|
|
|
sock_table =
|
|
|
|
vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size));
|
|
|
|
if (!sock_table) {
|
|
|
|
mutex_unlock(&sock_flow_mutex);
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
|
|
|
sock_table->mask = size - 1;
|
|
|
|
} else
|
|
|
|
sock_table = orig_sock_table;
|
|
|
|
|
|
|
|
for (i = 0; i < size; i++)
|
|
|
|
sock_table->ents[i] = RPS_NO_CPU;
|
|
|
|
} else
|
|
|
|
sock_table = NULL;
|
|
|
|
|
|
|
|
if (sock_table != orig_sock_table) {
|
|
|
|
rcu_assign_pointer(rps_sock_flow_table, sock_table);
|
2011-11-17 11:13:26 +08:00
|
|
|
if (sock_table)
|
2012-02-24 15:31:31 +08:00
|
|
|
static_key_slow_inc(&rps_needed);
|
2011-11-17 11:13:26 +08:00
|
|
|
if (orig_sock_table) {
|
2012-02-24 15:31:31 +08:00
|
|
|
static_key_slow_dec(&rps_needed);
|
2011-11-17 11:13:26 +08:00
|
|
|
synchronize_rcu();
|
|
|
|
vfree(orig_sock_table);
|
|
|
|
}
|
rfs: Receive Flow Steering
This patch implements receive flow steering (RFS). RFS steers
received packets for layer 3 and 4 processing to the CPU where
the application for the corresponding flow is running. RFS is an
extension of Receive Packet Steering (RPS).
The basic idea of RFS is that when an application calls recvmsg
(or sendmsg) the application's running CPU is stored in a hash
table that is indexed by the connection's rxhash which is stored in
the socket structure. The rxhash is passed in skb's received on
the connection from netif_receive_skb. For each received packet,
the associated rxhash is used to look up the CPU in the hash table,
if a valid CPU is set then the packet is steered to that CPU using
the RPS mechanisms.
The convolution of the simple approach is that it would potentially
allow OOO packets. If threads are thrashing around CPUs or multiple
threads are trying to read from the same sockets, a quickly changing
CPU value in the hash table could cause rampant OOO packets--
we consider this a non-starter.
To avoid OOO packets, this solution implements two types of hash
tables: rps_sock_flow_table and rps_dev_flow_table.
rps_sock_table is a global hash table. Each entry is just a CPU
number and it is populated in recvmsg and sendmsg as described above.
This table contains the "desired" CPUs for flows.
rps_dev_flow_table is specific to each device queue. Each entry
contains a CPU and a tail queue counter. The CPU is the "current"
CPU for a matching flow. The tail queue counter holds the value
of a tail queue counter for the associated CPU's backlog queue at
the time of last enqueue for a flow matching the entry.
Each backlog queue has a queue head counter which is incremented
on dequeue, and so a queue tail counter is computed as queue head
count + queue length. When a packet is enqueued on a backlog queue,
the current value of the queue tail counter is saved in the hash
entry of the rps_dev_flow_table.
And now the trick: when selecting the CPU for RPS (get_rps_cpu)
the rps_sock_flow table and the rps_dev_flow table for the RX queue
are consulted. When the desired CPU for the flow (found in the
rps_sock_flow table) does not match the current CPU (found in the
rps_dev_flow table), the current CPU is changed to the desired CPU
if one of the following is true:
- The current CPU is unset (equal to RPS_NO_CPU)
- Current CPU is offline
- The current CPU's queue head counter >= queue tail counter in the
rps_dev_flow table. This checks if the queue tail has advanced
beyond the last packet that was enqueued using this table entry.
This guarantees that all packets queued using this entry have been
dequeued, thus preserving in order delivery.
Making each queue have its own rps_dev_flow table has two advantages:
1) the tail queue counters will be written on each receive, so
keeping the table local to interrupting CPU s good for locality. 2)
this allows lockless access to the table-- the CPU number and queue
tail counter need to be accessed together under mutual exclusion
from netif_receive_skb, we assume that this is only called from
device napi_poll which is non-reentrant.
This patch implements RFS for TCP and connected UDP sockets.
It should be usable for other flow oriented protocols.
There are two configuration parameters for RFS. The
"rps_flow_entries" kernel init parameter sets the number of
entries in the rps_sock_flow_table, the per rxqueue sysfs entry
"rps_flow_cnt" contains the number of entries in the rps_dev_flow
table for the rxqueue. Both are rounded to power of two.
The obvious benefit of RFS (over just RPS) is that it achieves
CPU locality between the receive processing for a flow and the
applications processing; this can result in increased performance
(higher pps, lower latency).
The benefits of RFS are dependent on cache hierarchy, application
load, and other factors. On simple benchmarks, we don't necessarily
see improvement and sometimes see degradation. However, for more
complex benchmarks and for applications where cache pressure is
much higher this technique seems to perform very well.
Below are some benchmark results which show the potential benfit of
this patch. The netperf test has 500 instances of netperf TCP_RR
test with 1 byte req. and resp. The RPC test is an request/response
test similar in structure to netperf RR test ith 100 threads on
each host, but does more work in userspace that netperf.
e1000e on 8 core Intel
No RFS or RPS 104K tps at 30% CPU
No RFS (best RPS config): 290K tps at 63% CPU
RFS 303K tps at 61% CPU
RPC test tps CPU% 50/90/99% usec latency Latency StdDev
No RFS/RPS 103K 48% 757/900/3185 4472.35
RPS only: 174K 73% 415/993/2468 491.66
RFS 223K 73% 379/651/1382 315.61
Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-04-17 07:01:27 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_unlock(&sock_flow_mutex);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
#endif /* CONFIG_RPS */
|
|
|
|
|
2013-05-20 12:02:32 +08:00
|
|
|
#ifdef CONFIG_NET_FLOW_LIMIT
|
|
|
|
/*
 * Taken by flow_limit_cpu_sysctl() while it installs or removes per-CPU
 * sd->flow_limit tables, and by flow_limit_table_len_sysctl() while it
 * updates the table length, so the two writers cannot interleave.
 */
static DEFINE_MUTEX(flow_limit_update_mutex);
|
|
|
|
|
|
|
|
static int flow_limit_cpu_sysctl(ctl_table *table, int write,
|
|
|
|
void __user *buffer, size_t *lenp,
|
|
|
|
loff_t *ppos)
|
|
|
|
{
|
|
|
|
struct sd_flow_limit *cur;
|
|
|
|
struct softnet_data *sd;
|
|
|
|
cpumask_var_t mask;
|
|
|
|
int i, len, ret = 0;
|
|
|
|
|
|
|
|
if (!alloc_cpumask_var(&mask, GFP_KERNEL))
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
if (write) {
|
|
|
|
ret = cpumask_parse_user(buffer, *lenp, mask);
|
|
|
|
if (ret)
|
|
|
|
goto done;
|
|
|
|
|
|
|
|
mutex_lock(&flow_limit_update_mutex);
|
|
|
|
len = sizeof(*cur) + netdev_flow_limit_table_len;
|
|
|
|
for_each_possible_cpu(i) {
|
|
|
|
sd = &per_cpu(softnet_data, i);
|
|
|
|
cur = rcu_dereference_protected(sd->flow_limit,
|
|
|
|
lockdep_is_held(&flow_limit_update_mutex));
|
|
|
|
if (cur && !cpumask_test_cpu(i, mask)) {
|
|
|
|
RCU_INIT_POINTER(sd->flow_limit, NULL);
|
|
|
|
synchronize_rcu();
|
|
|
|
kfree(cur);
|
|
|
|
} else if (!cur && cpumask_test_cpu(i, mask)) {
|
|
|
|
cur = kzalloc(len, GFP_KERNEL);
|
|
|
|
if (!cur) {
|
|
|
|
/* not unwinding previous changes */
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto write_unlock;
|
|
|
|
}
|
|
|
|
cur->num_buckets = netdev_flow_limit_table_len;
|
|
|
|
rcu_assign_pointer(sd->flow_limit, cur);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
write_unlock:
|
|
|
|
mutex_unlock(&flow_limit_update_mutex);
|
|
|
|
} else {
|
|
|
|
if (*ppos || !*lenp) {
|
|
|
|
*lenp = 0;
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
|
|
|
|
cpumask_clear(mask);
|
|
|
|
rcu_read_lock();
|
|
|
|
for_each_possible_cpu(i) {
|
|
|
|
sd = &per_cpu(softnet_data, i);
|
|
|
|
if (rcu_dereference(sd->flow_limit))
|
|
|
|
cpumask_set_cpu(i, mask);
|
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
len = cpumask_scnprintf(buffer, *lenp, mask);
|
|
|
|
*lenp = len + 1;
|
|
|
|
*ppos += len + 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
done:
|
|
|
|
free_cpumask_var(mask);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int flow_limit_table_len_sysctl(ctl_table *table, int write,
|
|
|
|
void __user *buffer, size_t *lenp,
|
|
|
|
loff_t *ppos)
|
|
|
|
{
|
|
|
|
unsigned int old, *ptr;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
mutex_lock(&flow_limit_update_mutex);
|
|
|
|
|
|
|
|
ptr = table->data;
|
|
|
|
old = *ptr;
|
|
|
|
ret = proc_dointvec(table, write, buffer, lenp, ppos);
|
|
|
|
if (!ret && write && !is_power_of_2(*ptr)) {
|
|
|
|
*ptr = old;
|
|
|
|
ret = -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_unlock(&flow_limit_update_mutex);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
#endif /* CONFIG_NET_FLOW_LIMIT */
|
|
|
|
|
2007-12-05 17:37:34 +08:00
|
|
|
static struct ctl_table net_core_table[] = {
|
2005-04-17 06:20:36 +08:00
|
|
|
#ifdef CONFIG_NET
|
|
|
|
{
|
|
|
|
.procname = "wmem_max",
|
|
|
|
.data = &sysctl_wmem_max,
|
|
|
|
.maxlen = sizeof(int),
|
|
|
|
.mode = 0644,
|
2013-01-24 04:35:28 +08:00
|
|
|
.proc_handler = proc_dointvec_minmax,
|
|
|
|
.extra1 = &one,
|
2005-04-17 06:20:36 +08:00
|
|
|
},
|
|
|
|
{
|
|
|
|
.procname = "rmem_max",
|
|
|
|
.data = &sysctl_rmem_max,
|
|
|
|
.maxlen = sizeof(int),
|
|
|
|
.mode = 0644,
|
2013-01-24 04:35:28 +08:00
|
|
|
.proc_handler = proc_dointvec_minmax,
|
|
|
|
.extra1 = &one,
|
2005-04-17 06:20:36 +08:00
|
|
|
},
|
|
|
|
{
|
|
|
|
.procname = "wmem_default",
|
|
|
|
.data = &sysctl_wmem_default,
|
|
|
|
.maxlen = sizeof(int),
|
|
|
|
.mode = 0644,
|
2013-01-24 04:35:28 +08:00
|
|
|
.proc_handler = proc_dointvec_minmax,
|
|
|
|
.extra1 = &one,
|
2005-04-17 06:20:36 +08:00
|
|
|
},
|
|
|
|
{
|
|
|
|
.procname = "rmem_default",
|
|
|
|
.data = &sysctl_rmem_default,
|
|
|
|
.maxlen = sizeof(int),
|
|
|
|
.mode = 0644,
|
2013-01-24 04:35:28 +08:00
|
|
|
.proc_handler = proc_dointvec_minmax,
|
|
|
|
.extra1 = &one,
|
2005-04-17 06:20:36 +08:00
|
|
|
},
|
|
|
|
{
|
|
|
|
.procname = "dev_weight",
|
|
|
|
.data = &weight_p,
|
|
|
|
.maxlen = sizeof(int),
|
|
|
|
.mode = 0644,
|
2008-11-04 10:21:05 +08:00
|
|
|
.proc_handler = proc_dointvec
|
2005-04-17 06:20:36 +08:00
|
|
|
},
|
|
|
|
{
|
|
|
|
.procname = "netdev_max_backlog",
|
|
|
|
.data = &netdev_max_backlog,
|
|
|
|
.maxlen = sizeof(int),
|
|
|
|
.mode = 0644,
|
2008-11-04 10:21:05 +08:00
|
|
|
.proc_handler = proc_dointvec
|
2005-04-17 06:20:36 +08:00
|
|
|
},
|
2011-04-20 17:27:32 +08:00
|
|
|
#ifdef CONFIG_BPF_JIT
|
|
|
|
{
|
|
|
|
.procname = "bpf_jit_enable",
|
|
|
|
.data = &bpf_jit_enable,
|
|
|
|
.maxlen = sizeof(int),
|
|
|
|
.mode = 0644,
|
|
|
|
.proc_handler = proc_dointvec
|
|
|
|
},
|
|
|
|
#endif
|
net: Consistent skb timestamping
With RPS inclusion, skb timestamping is not consistent in RX path.
If netif_receive_skb() is used, its deferred after RPS dispatch.
If netif_rx() is used, its done before RPS dispatch.
This can give strange tcpdump timestamps results.
I think timestamping should be done as soon as possible in the receive
path, to get meaningful values (ie timestamps taken at the time packet
was delivered by NIC driver to our stack), even if NAPI already can
defer timestamping a bit (RPS can help to reduce the gap)
Tom Herbert prefer to sample timestamps after RPS dispatch. In case
sampling is expensive (HPET/acpi_pm on x86), this makes sense.
Let admins switch from one mode to another, using a new
sysctl, /proc/sys/net/core/netdev_tstamp_prequeue
Its default value (1), means timestamps are taken as soon as possible,
before backlog queueing, giving accurate timestamps.
Setting a 0 value permits to sample timestamps when processing backlog,
after RPS dispatch, to lower the load of the pre-RPS cpu.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-05-16 14:57:10 +08:00
|
|
|
{
|
|
|
|
.procname = "netdev_tstamp_prequeue",
|
|
|
|
.data = &netdev_tstamp_prequeue,
|
|
|
|
.maxlen = sizeof(int),
|
|
|
|
.mode = 0644,
|
|
|
|
.proc_handler = proc_dointvec
|
|
|
|
},
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
.procname = "message_cost",
|
2008-07-25 16:45:58 +08:00
|
|
|
.data = &net_ratelimit_state.interval,
|
2005-04-17 06:20:36 +08:00
|
|
|
.maxlen = sizeof(int),
|
|
|
|
.mode = 0644,
|
2008-11-04 10:21:05 +08:00
|
|
|
.proc_handler = proc_dointvec_jiffies,
|
2005-04-17 06:20:36 +08:00
|
|
|
},
|
|
|
|
{
|
|
|
|
.procname = "message_burst",
|
2008-07-25 16:45:58 +08:00
|
|
|
.data = &net_ratelimit_state.burst,
|
2005-04-17 06:20:36 +08:00
|
|
|
.maxlen = sizeof(int),
|
|
|
|
.mode = 0644,
|
2008-11-04 10:21:05 +08:00
|
|
|
.proc_handler = proc_dointvec,
|
2005-04-17 06:20:36 +08:00
|
|
|
},
|
|
|
|
{
|
|
|
|
.procname = "optmem_max",
|
|
|
|
.data = &sysctl_optmem_max,
|
|
|
|
.maxlen = sizeof(int),
|
|
|
|
.mode = 0644,
|
2008-11-04 10:21:05 +08:00
|
|
|
.proc_handler = proc_dointvec
|
2005-04-17 06:20:36 +08:00
|
|
|
},
|
rfs: Receive Flow Steering
This patch implements receive flow steering (RFS). RFS steers
received packets for layer 3 and 4 processing to the CPU where
the application for the corresponding flow is running. RFS is an
extension of Receive Packet Steering (RPS).
The basic idea of RFS is that when an application calls recvmsg
(or sendmsg) the application's running CPU is stored in a hash
table that is indexed by the connection's rxhash which is stored in
the socket structure. The rxhash is passed in skb's received on
the connection from netif_receive_skb. For each received packet,
the associated rxhash is used to look up the CPU in the hash table,
if a valid CPU is set then the packet is steered to that CPU using
the RPS mechanisms.
The convolution of the simple approach is that it would potentially
allow OOO packets. If threads are thrashing around CPUs or multiple
threads are trying to read from the same sockets, a quickly changing
CPU value in the hash table could cause rampant OOO packets--
we consider this a non-starter.
To avoid OOO packets, this solution implements two types of hash
tables: rps_sock_flow_table and rps_dev_flow_table.
rps_sock_table is a global hash table. Each entry is just a CPU
number and it is populated in recvmsg and sendmsg as described above.
This table contains the "desired" CPUs for flows.
rps_dev_flow_table is specific to each device queue. Each entry
contains a CPU and a tail queue counter. The CPU is the "current"
CPU for a matching flow. The tail queue counter holds the value
of a tail queue counter for the associated CPU's backlog queue at
the time of last enqueue for a flow matching the entry.
Each backlog queue has a queue head counter which is incremented
on dequeue, and so a queue tail counter is computed as queue head
count + queue length. When a packet is enqueued on a backlog queue,
the current value of the queue tail counter is saved in the hash
entry of the rps_dev_flow_table.
And now the trick: when selecting the CPU for RPS (get_rps_cpu)
the rps_sock_flow table and the rps_dev_flow table for the RX queue
are consulted. When the desired CPU for the flow (found in the
rps_sock_flow table) does not match the current CPU (found in the
rps_dev_flow table), the current CPU is changed to the desired CPU
if one of the following is true:
- The current CPU is unset (equal to RPS_NO_CPU)
- Current CPU is offline
- The current CPU's queue head counter >= queue tail counter in the
rps_dev_flow table. This checks if the queue tail has advanced
beyond the last packet that was enqueued using this table entry.
This guarantees that all packets queued using this entry have been
dequeued, thus preserving in order delivery.
Making each queue have its own rps_dev_flow table has two advantages:
1) the tail queue counters will be written on each receive, so
keeping the table local to interrupting CPU s good for locality. 2)
this allows lockless access to the table-- the CPU number and queue
tail counter need to be accessed together under mutual exclusion
from netif_receive_skb, we assume that this is only called from
device napi_poll which is non-reentrant.
This patch implements RFS for TCP and connected UDP sockets.
It should be usable for other flow oriented protocols.
There are two configuration parameters for RFS. The
"rps_flow_entries" kernel init parameter sets the number of
entries in the rps_sock_flow_table, the per rxqueue sysfs entry
"rps_flow_cnt" contains the number of entries in the rps_dev_flow
table for the rxqueue. Both are rounded to power of two.
The obvious benefit of RFS (over just RPS) is that it achieves
CPU locality between the receive processing for a flow and the
applications processing; this can result in increased performance
(higher pps, lower latency).
The benefits of RFS are dependent on cache hierarchy, application
load, and other factors. On simple benchmarks, we don't necessarily
see improvement and sometimes see degradation. However, for more
complex benchmarks and for applications where cache pressure is
much higher this technique seems to perform very well.
Below are some benchmark results which show the potential benfit of
this patch. The netperf test has 500 instances of netperf TCP_RR
test with 1 byte req. and resp. The RPC test is an request/response
test similar in structure to netperf RR test ith 100 threads on
each host, but does more work in userspace that netperf.
e1000e on 8 core Intel
No RFS or RPS 104K tps at 30% CPU
No RFS (best RPS config): 290K tps at 63% CPU
RFS 303K tps at 61% CPU
RPC test tps CPU% 50/90/99% usec latency Latency StdDev
No RFS/RPS 103K 48% 757/900/3185 4472.35
RPS only: 174K 73% 415/993/2468 491.66
RFS 223K 73% 379/651/1382 315.61
Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-04-17 07:01:27 +08:00
|
|
|
#ifdef CONFIG_RPS
|
|
|
|
{
|
|
|
|
.procname = "rps_sock_flow_entries",
|
|
|
|
.maxlen = sizeof(int),
|
|
|
|
.mode = 0644,
|
|
|
|
.proc_handler = rps_sock_flow_sysctl
|
|
|
|
},
|
|
|
|
#endif
|
2013-05-20 12:02:32 +08:00
|
|
|
#ifdef CONFIG_NET_FLOW_LIMIT
|
|
|
|
{
|
|
|
|
.procname = "flow_limit_cpu_bitmap",
|
|
|
|
.mode = 0644,
|
|
|
|
.proc_handler = flow_limit_cpu_sysctl
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.procname = "flow_limit_table_len",
|
|
|
|
.data = &netdev_flow_limit_table_len,
|
|
|
|
.maxlen = sizeof(int),
|
|
|
|
.mode = 0644,
|
|
|
|
.proc_handler = flow_limit_table_len_sysctl
|
|
|
|
},
|
|
|
|
#endif /* CONFIG_NET_FLOW_LIMIT */
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif /* CONFIG_NET */
|
2005-06-24 11:14:40 +08:00
|
|
|
{
|
|
|
|
.procname = "netdev_budget",
|
|
|
|
.data = &netdev_budget,
|
|
|
|
.maxlen = sizeof(int),
|
|
|
|
.mode = 0644,
|
2008-11-04 10:21:05 +08:00
|
|
|
.proc_handler = proc_dointvec
|
2005-06-24 11:14:40 +08:00
|
|
|
},
|
2007-03-09 12:41:08 +08:00
|
|
|
{
|
|
|
|
.procname = "warnings",
|
|
|
|
.data = &net_msg_warn,
|
|
|
|
.maxlen = sizeof(int),
|
|
|
|
.mode = 0644,
|
2008-11-04 10:21:05 +08:00
|
|
|
.proc_handler = proc_dointvec
|
2007-03-09 12:41:08 +08:00
|
|
|
},
|
2009-11-06 05:32:03 +08:00
|
|
|
{ }
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
2007-12-05 17:37:34 +08:00
|
|
|
|
2008-05-20 04:49:52 +08:00
|
|
|
static struct ctl_table netns_core_table[] = {
|
|
|
|
{
|
|
|
|
.procname = "somaxconn",
|
|
|
|
.data = &init_net.core.sysctl_somaxconn,
|
|
|
|
.maxlen = sizeof(int),
|
|
|
|
.mode = 0644,
|
2008-11-04 10:21:05 +08:00
|
|
|
.proc_handler = proc_dointvec
|
2008-05-20 04:49:52 +08:00
|
|
|
},
|
2009-11-06 05:32:03 +08:00
|
|
|
{ }
|
2008-05-20 04:49:52 +08:00
|
|
|
};
|
|
|
|
|
2007-12-08 16:09:24 +08:00
|
|
|
static __net_init int sysctl_core_net_init(struct net *net)
|
2007-12-05 17:37:34 +08:00
|
|
|
{
|
2008-05-20 04:49:52 +08:00
|
|
|
struct ctl_table *tbl;
|
2007-12-08 16:09:24 +08:00
|
|
|
|
2008-04-01 10:41:14 +08:00
|
|
|
net->core.sysctl_somaxconn = SOMAXCONN;
|
2007-12-08 16:12:33 +08:00
|
|
|
|
2008-05-20 04:49:52 +08:00
|
|
|
tbl = netns_core_table;
|
2009-11-26 07:14:13 +08:00
|
|
|
if (!net_eq(net, &init_net)) {
|
2008-05-20 04:49:52 +08:00
|
|
|
tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL);
|
2007-12-08 16:09:24 +08:00
|
|
|
if (tbl == NULL)
|
|
|
|
goto err_dup;
|
|
|
|
|
2008-05-20 04:49:52 +08:00
|
|
|
tbl[0].data = &net->core.sysctl_somaxconn;
|
2012-11-16 11:02:59 +08:00
|
|
|
|
|
|
|
/* Don't export any sysctls to unprivileged users */
|
|
|
|
if (net->user_ns != &init_user_ns) {
|
|
|
|
tbl[0].procname = NULL;
|
|
|
|
}
|
2007-12-08 16:09:24 +08:00
|
|
|
}
|
|
|
|
|
2012-04-19 21:44:49 +08:00
|
|
|
net->core.sysctl_hdr = register_net_sysctl(net, "net/core", tbl);
|
2008-04-01 10:41:14 +08:00
|
|
|
if (net->core.sysctl_hdr == NULL)
|
2007-12-08 16:09:24 +08:00
|
|
|
goto err_reg;
|
2007-12-05 17:37:34 +08:00
|
|
|
|
2007-12-08 16:09:24 +08:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
err_reg:
|
2008-05-20 04:49:52 +08:00
|
|
|
if (tbl != netns_core_table)
|
2007-12-08 16:09:24 +08:00
|
|
|
kfree(tbl);
|
|
|
|
err_dup:
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
|
|
|
static __net_exit void sysctl_core_net_exit(struct net *net)
|
|
|
|
{
|
|
|
|
struct ctl_table *tbl;
|
|
|
|
|
2008-04-01 10:41:14 +08:00
|
|
|
tbl = net->core.sysctl_hdr->ctl_table_arg;
|
|
|
|
unregister_net_sysctl_table(net->core.sysctl_hdr);
|
2008-05-20 04:49:52 +08:00
|
|
|
BUG_ON(tbl == netns_core_table);
|
2007-12-08 16:09:24 +08:00
|
|
|
kfree(tbl);
|
|
|
|
}
|
|
|
|
|
|
|
|
static __net_initdata struct pernet_operations sysctl_core_ops = {
|
|
|
|
.init = sysctl_core_net_init,
|
|
|
|
.exit = sysctl_core_net_exit,
|
|
|
|
};
|
|
|
|
|
|
|
|
static __init int sysctl_core_init(void)
|
|
|
|
{
|
2012-04-19 21:22:55 +08:00
|
|
|
register_net_sysctl(&init_net, "net/core", net_core_table);
|
2007-12-08 16:09:24 +08:00
|
|
|
return register_pernet_subsys(&sysctl_core_ops);
|
2007-12-05 17:37:34 +08:00
|
|
|
}
|
|
|
|
|
2008-11-26 10:00:48 +08:00
|
|
|
fs_initcall(sysctl_core_init);
|