dev: add per net_device packet type chains

When many pf_packet listeners are created on a lot of interfaces the
current implementation using global packet type lists scales poorly.
This patch adds per net_device packet type lists to fix this problem.

The patch was originally written by Eric Biederman for linux-2.6.29.
Tested on linux-3.16.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Salam Noureddine <noureddine@arista.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Authored by Salam Noureddine on 2015-01-27 11:35:48 -08:00; committed by David S. Miller.
parent 7b4ce694b2
commit 7866a62104
2 changed files with 89 additions and 51 deletions

View File

@ -1514,6 +1514,8 @@ struct net_device {
struct list_head napi_list; struct list_head napi_list;
struct list_head unreg_list; struct list_head unreg_list;
struct list_head close_list; struct list_head close_list;
struct list_head ptype_all;
struct list_head ptype_specific;
struct { struct {
struct list_head upper; struct list_head upper;

View File

@ -371,9 +371,10 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
static inline struct list_head *ptype_head(const struct packet_type *pt) static inline struct list_head *ptype_head(const struct packet_type *pt)
{ {
if (pt->type == htons(ETH_P_ALL)) if (pt->type == htons(ETH_P_ALL))
return &ptype_all; return pt->dev ? &pt->dev->ptype_all : &ptype_all;
else else
return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; return pt->dev ? &pt->dev->ptype_specific :
&ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
} }
/** /**
@ -1734,6 +1735,23 @@ static inline int deliver_skb(struct sk_buff *skb,
return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
} }
/* Deliver @skb to every handler on @ptype_list whose ->type matches @type,
 * chaining through *pt: on entry *pt is the previously matched handler (or
 * NULL); on exit it holds the last match, which the CALLER must still invoke.
 * Deferring the last handler this way lets the caller hand it the original
 * skb instead of a clone (deliver_skb clones/refcounts for every earlier
 * match).
 *
 * NOTE(review): traversal uses list_for_each_entry_rcu, so this presumably
 * must run inside an rcu_read_lock() section — confirm against callers.
 */
static inline void deliver_ptype_list_skb(struct sk_buff *skb,
struct packet_type **pt,
struct net_device *dev, __be16 type,
struct list_head *ptype_list)
{
struct packet_type *ptype, *pt_prev = *pt;
list_for_each_entry_rcu(ptype, ptype_list, list) {
if (ptype->type != type)
continue;
/* Deliver the PREVIOUS match now; keep the current one pending. */
if (pt_prev)
deliver_skb(skb, pt_prev, dev);
pt_prev = ptype;
}
/* Hand the still-undelivered last match back to the caller. */
*pt = pt_prev;
}
static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
{ {
if (!ptype->af_packet_priv || !skb->sk) if (!ptype->af_packet_priv || !skb->sk)
@ -1757,29 +1775,33 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
struct packet_type *ptype; struct packet_type *ptype;
struct sk_buff *skb2 = NULL; struct sk_buff *skb2 = NULL;
struct packet_type *pt_prev = NULL; struct packet_type *pt_prev = NULL;
struct list_head *ptype_list = &ptype_all;
rcu_read_lock(); rcu_read_lock();
list_for_each_entry_rcu(ptype, &ptype_all, list) { again:
list_for_each_entry_rcu(ptype, ptype_list, list) {
/* Never send packets back to the socket /* Never send packets back to the socket
* they originated from - MvS (miquels@drinkel.ow.org) * they originated from - MvS (miquels@drinkel.ow.org)
*/ */
if ((ptype->dev == dev || !ptype->dev) && if (skb_loop_sk(ptype, skb))
(!skb_loop_sk(ptype, skb))) { continue;
if (pt_prev) { if (pt_prev) {
deliver_skb(skb2, pt_prev, skb->dev); deliver_skb(skb2, pt_prev, skb->dev);
pt_prev = ptype; pt_prev = ptype;
continue; continue;
} }
/* need to clone skb, done only once */
skb2 = skb_clone(skb, GFP_ATOMIC); skb2 = skb_clone(skb, GFP_ATOMIC);
if (!skb2) if (!skb2)
break; goto out_unlock;
net_timestamp_set(skb2); net_timestamp_set(skb2);
/* skb->nh should be correctly /* skb->nh should be correctly
set by sender, so that the second statement is * set by sender, so that the second statement is
just protection against buggy protocols. * just protection against buggy protocols.
*/ */
skb_reset_mac_header(skb2); skb_reset_mac_header(skb2);
@ -1795,7 +1817,12 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
skb2->pkt_type = PACKET_OUTGOING; skb2->pkt_type = PACKET_OUTGOING;
pt_prev = ptype; pt_prev = ptype;
} }
if (ptype_list == &ptype_all) {
ptype_list = &dev->ptype_all;
goto again;
} }
out_unlock:
if (pt_prev) if (pt_prev)
pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
rcu_read_unlock(); rcu_read_unlock();
@ -2617,7 +2644,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev,
unsigned int len; unsigned int len;
int rc; int rc;
if (!list_empty(&ptype_all)) if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
dev_queue_xmit_nit(skb, dev); dev_queue_xmit_nit(skb, dev);
len = skb->len; len = skb->len;
@ -3615,7 +3642,6 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
struct packet_type *ptype, *pt_prev; struct packet_type *ptype, *pt_prev;
rx_handler_func_t *rx_handler; rx_handler_func_t *rx_handler;
struct net_device *orig_dev; struct net_device *orig_dev;
struct net_device *null_or_dev;
bool deliver_exact = false; bool deliver_exact = false;
int ret = NET_RX_DROP; int ret = NET_RX_DROP;
__be16 type; __be16 type;
@ -3658,11 +3684,15 @@ another_round:
goto skip_taps; goto skip_taps;
list_for_each_entry_rcu(ptype, &ptype_all, list) { list_for_each_entry_rcu(ptype, &ptype_all, list) {
if (!ptype->dev || ptype->dev == skb->dev) {
if (pt_prev) if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev); ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype; pt_prev = ptype;
} }
list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
} }
skip_taps: skip_taps:
@ -3718,19 +3748,21 @@ ncls:
skb->vlan_tci = 0; skb->vlan_tci = 0;
} }
/* deliver only exact match when indicated */
null_or_dev = deliver_exact ? skb->dev : NULL;
type = skb->protocol; type = skb->protocol;
list_for_each_entry_rcu(ptype,
&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { /* deliver only exact match when indicated */
if (ptype->type == type && if (likely(!deliver_exact)) {
(ptype->dev == null_or_dev || ptype->dev == skb->dev || deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
ptype->dev == orig_dev)) { &ptype_base[ntohs(type) &
if (pt_prev) PTYPE_HASH_MASK]);
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
} }
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
&orig_dev->ptype_specific);
if (unlikely(skb->dev != orig_dev)) {
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
&skb->dev->ptype_specific);
} }
if (pt_prev) { if (pt_prev) {
@ -6579,6 +6611,8 @@ void netdev_run_todo(void)
/* paranoia */ /* paranoia */
BUG_ON(netdev_refcnt_read(dev)); BUG_ON(netdev_refcnt_read(dev));
BUG_ON(!list_empty(&dev->ptype_all));
BUG_ON(!list_empty(&dev->ptype_specific));
WARN_ON(rcu_access_pointer(dev->ip_ptr)); WARN_ON(rcu_access_pointer(dev->ip_ptr));
WARN_ON(rcu_access_pointer(dev->ip6_ptr)); WARN_ON(rcu_access_pointer(dev->ip6_ptr));
WARN_ON(dev->dn_ptr); WARN_ON(dev->dn_ptr);
@ -6761,6 +6795,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
INIT_LIST_HEAD(&dev->adj_list.lower); INIT_LIST_HEAD(&dev->adj_list.lower);
INIT_LIST_HEAD(&dev->all_adj_list.upper); INIT_LIST_HEAD(&dev->all_adj_list.upper);
INIT_LIST_HEAD(&dev->all_adj_list.lower); INIT_LIST_HEAD(&dev->all_adj_list.lower);
INIT_LIST_HEAD(&dev->ptype_all);
INIT_LIST_HEAD(&dev->ptype_specific);
dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
setup(dev); setup(dev);