net: implement threaded-able napi poll loop support

This patch allows running each napi poll loop inside its own
kernel thread.
The kthread is created during netif_napi_add() if dev->threaded
is set. And threaded mode is enabled in napi_enable(). We will
provide a way to set dev->threaded and enable threaded mode
without a device up/down in the following patch.

Once threaded mode is enabled and the kthread is started,
napi_schedule() will wake up that thread instead of
scheduling the softirq.

The threaded poll loop behaves much like net_rx_action(),
but it does not have to manipulate local irqs and uses
an explicit scheduling point based on netdev_budget.

Co-developed-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Co-developed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Co-developed-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Wei Wang <weiwan@google.com>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Wei Wang 2021-02-08 11:34:09 -08:00 committed by David S. Miller
parent 898f8015ff
commit 29863d41bb
2 changed files with 119 additions and 14 deletions

View File

@ -347,6 +347,7 @@ struct napi_struct {
struct list_head dev_list; struct list_head dev_list;
struct hlist_node napi_hash_node; struct hlist_node napi_hash_node;
unsigned int napi_id; unsigned int napi_id;
struct task_struct *thread;
}; };
enum { enum {
@ -358,6 +359,7 @@ enum {
NAPI_STATE_NO_BUSY_POLL, /* Do not add in napi_hash, no busy polling */ NAPI_STATE_NO_BUSY_POLL, /* Do not add in napi_hash, no busy polling */
NAPI_STATE_IN_BUSY_POLL, /* sk_busy_loop() owns this NAPI */ NAPI_STATE_IN_BUSY_POLL, /* sk_busy_loop() owns this NAPI */
NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over softirq processing*/ NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over softirq processing*/
NAPI_STATE_THREADED, /* The poll is performed inside its own thread*/
}; };
enum { enum {
@ -369,6 +371,7 @@ enum {
NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
NAPIF_STATE_PREFER_BUSY_POLL = BIT(NAPI_STATE_PREFER_BUSY_POLL), NAPIF_STATE_PREFER_BUSY_POLL = BIT(NAPI_STATE_PREFER_BUSY_POLL),
NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED),
}; };
enum gro_result { enum gro_result {
@ -503,20 +506,7 @@ static inline bool napi_complete(struct napi_struct *n)
*/ */
void napi_disable(struct napi_struct *n); void napi_disable(struct napi_struct *n);
/** void napi_enable(struct napi_struct *n);
* napi_enable - enable NAPI scheduling
* @n: NAPI context
*
* Resume NAPI from being scheduled on this context.
* Must be paired with napi_disable.
*/
static inline void napi_enable(struct napi_struct *n)
{
BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
smp_mb__before_atomic();
clear_bit(NAPI_STATE_SCHED, &n->state);
clear_bit(NAPI_STATE_NPSVC, &n->state);
}
/** /**
* napi_synchronize - wait until NAPI is not running * napi_synchronize - wait until NAPI is not running
@ -1827,6 +1817,8 @@ enum netdev_priv_flags {
* *
* @wol_enabled: Wake-on-LAN is enabled * @wol_enabled: Wake-on-LAN is enabled
* *
* @threaded: napi threaded mode is enabled
*
* @net_notifier_list: List of per-net netdev notifier block * @net_notifier_list: List of per-net netdev notifier block
* that follow this device when it is moved * that follow this device when it is moved
* to another network namespace. * to another network namespace.
@ -2145,6 +2137,7 @@ struct net_device {
struct lock_class_key *qdisc_running_key; struct lock_class_key *qdisc_running_key;
bool proto_down; bool proto_down;
unsigned wol_enabled:1; unsigned wol_enabled:1;
unsigned threaded:1;
struct list_head net_notifier_list; struct list_head net_notifier_list;

View File

@ -91,6 +91,7 @@
#include <linux/etherdevice.h> #include <linux/etherdevice.h>
#include <linux/ethtool.h> #include <linux/ethtool.h>
#include <linux/skbuff.h> #include <linux/skbuff.h>
#include <linux/kthread.h>
#include <linux/bpf.h> #include <linux/bpf.h>
#include <linux/bpf_trace.h> #include <linux/bpf_trace.h>
#include <net/net_namespace.h> #include <net/net_namespace.h>
@ -1494,6 +1495,27 @@ void netdev_notify_peers(struct net_device *dev)
} }
EXPORT_SYMBOL(netdev_notify_peers); EXPORT_SYMBOL(netdev_notify_peers);
static int napi_threaded_poll(void *data);
/* Spawn the poll kthread for @n and record it in @n->thread.
 *
 * Returns 0 on success. On failure the error is logged, @n->thread is
 * reset to NULL and the negative errno from kthread_run() is returned.
 */
static int napi_kthread_create(struct napi_struct *n)
{
	int ret;

	/* kthread_run() both creates and wakes the thread; the single
	 * wake-up lets it reach TASK_INTERRUPTIBLE, which avoids the
	 * blocked task warning and keeps loadavg accounting sane.
	 */
	n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
				n->dev->name, n->napi_id);
	if (!IS_ERR(n->thread))
		return 0;

	ret = PTR_ERR(n->thread);
	pr_err("kthread_run failed with err %d\n", ret);
	n->thread = NULL;

	return ret;
}
static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
{ {
const struct net_device_ops *ops = dev->netdev_ops; const struct net_device_ops *ops = dev->netdev_ops;
@ -4265,6 +4287,21 @@ int gro_normal_batch __read_mostly = 8;
/* Hand @napi over for polling: when NAPI_STATE_THREADED is set and a
 * thread pointer is present, wake that thread and return; otherwise append
 * @napi to @sd's poll_list and raise NET_RX_SOFTIRQ.
 */
static inline void ____napi_schedule(struct softnet_data *sd, static inline void ____napi_schedule(struct softnet_data *sd,
struct napi_struct *napi) struct napi_struct *napi)
{ {
struct task_struct *thread;
if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
/* Paired with smp_mb__before_atomic() in
* napi_enable(). Use READ_ONCE() to guarantee
* a complete read on napi->thread. Only call
* wake_up_process() when it's not NULL.
*/
thread = READ_ONCE(napi->thread);
if (thread) {
wake_up_process(thread);
return;
}
}
list_add_tail(&napi->poll_list, &sd->poll_list); list_add_tail(&napi->poll_list, &sd->poll_list);
__raise_softirq_irqoff(NET_RX_SOFTIRQ); __raise_softirq_irqoff(NET_RX_SOFTIRQ);
} }
@ -6728,6 +6765,12 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
set_bit(NAPI_STATE_NPSVC, &napi->state); set_bit(NAPI_STATE_NPSVC, &napi->state);
list_add_rcu(&napi->dev_list, &dev->napi_list); list_add_rcu(&napi->dev_list, &dev->napi_list);
napi_hash_add(napi); napi_hash_add(napi);
/* Create kthread for this napi if dev->threaded is set.
* Clear dev->threaded if kthread creation failed so that
* threaded mode will not be enabled in napi_enable().
*/
if (dev->threaded && napi_kthread_create(napi))
dev->threaded = 0;
} }
EXPORT_SYMBOL(netif_napi_add); EXPORT_SYMBOL(netif_napi_add);
@ -6745,9 +6788,28 @@ void napi_disable(struct napi_struct *n)
clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state); clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state);
clear_bit(NAPI_STATE_DISABLE, &n->state); clear_bit(NAPI_STATE_DISABLE, &n->state);
clear_bit(NAPI_STATE_THREADED, &n->state);
} }
EXPORT_SYMBOL(napi_disable); EXPORT_SYMBOL(napi_disable);
/**
* napi_enable - enable NAPI scheduling
* @n: NAPI context
*
* Resume NAPI from being scheduled on this context.
* Must be paired with napi_disable.
*/
void napi_enable(struct napi_struct *n)
{
BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
smp_mb__before_atomic();
clear_bit(NAPI_STATE_SCHED, &n->state);
clear_bit(NAPI_STATE_NPSVC, &n->state);
if (n->dev->threaded && n->thread)
set_bit(NAPI_STATE_THREADED, &n->state);
}
EXPORT_SYMBOL(napi_enable);
static void flush_gro_hash(struct napi_struct *napi) static void flush_gro_hash(struct napi_struct *napi)
{ {
int i; int i;
@ -6773,6 +6835,11 @@ void __netif_napi_del(struct napi_struct *napi)
flush_gro_hash(napi); flush_gro_hash(napi);
napi->gro_bitmask = 0; napi->gro_bitmask = 0;
if (napi->thread) {
kthread_stop(napi->thread);
napi->thread = NULL;
}
} }
EXPORT_SYMBOL(__netif_napi_del); EXPORT_SYMBOL(__netif_napi_del);
@ -6867,6 +6934,51 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
return work; return work;
} }
/* Block the calling kthread until @napi has been scheduled.
 *
 * Returns 0 as soon as NAPI_STATE_SCHED is observed set on @napi, or -1
 * when the loop is left because kthread_should_stop() or
 * napi_disable_pending() became true. The task is always put back in
 * TASK_RUNNING before returning.
 */
static int napi_thread_wait(struct napi_struct *napi)
{
/* Enter TASK_INTERRUPTIBLE before testing the conditions below, so the
 * checks and the following schedule() run in that state.
 */
set_current_state(TASK_INTERRUPTIBLE);
while (!kthread_should_stop() && !napi_disable_pending(napi)) {
if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
/* A napi handled by this thread is not expected on any poll_list. */
WARN_ON(!list_empty(&napi->poll_list));
__set_current_state(TASK_RUNNING);
return 0;
}
schedule();
set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);
/* NOTE(review): -1 makes napi_threaded_poll() return, i.e. the kthread
 * ends, also on the napi_disable_pending() path. Confirm how a SCHED bit
 * that is set at that point gets cleared, and that napi->thread is not
 * used after the thread function has returned.
 */
return -1;
}
/* Thread function of a per-napi poll kthread; @data is the napi_struct.
 *
 * Each time napi_thread_wait() returns 0, __napi_poll() is called in a
 * loop until it stops requesting a repoll. The function returns 0 once
 * napi_thread_wait() returns non-zero.
 */
static int napi_threaded_poll(void *data)
{
struct napi_struct *napi = data;
void *have;
while (!napi_thread_wait(napi)) {
for (;;) {
bool repoll = false;
/* Poll with bottom halves disabled and the netpoll poll lock held. */
local_bh_disable();
have = netpoll_poll_lock(napi);
__napi_poll(napi, &repoll);
netpoll_poll_unlock(have);
__kfree_skb_flush();
local_bh_enable();
if (!repoll)
break;
/* Explicit scheduling point between consecutive polls. */
cond_resched();
}
}
return 0;
}
static __latent_entropy void net_rx_action(struct softirq_action *h) static __latent_entropy void net_rx_action(struct softirq_action *h)
{ {
struct softnet_data *sd = this_cpu_ptr(&softnet_data); struct softnet_data *sd = this_cpu_ptr(&softnet_data);