2
0
mirror of https://github.com/edk2-porting/linux-next.git synced 2024-12-19 18:53:52 +08:00

Merge branch 'nfp-XDP_TX-optimizations'

Jakub Kicinski says:

====================
nfp: optimize XDP TX and small fixes

This series optimizes the nfp XDP TX performance a little bit.
I run quick tests on an Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz.
Single core/queue performance for both touch and drop and touch and
forward is above 20Mpps @64B packets, drop being 2Mpps faster.
I think this is max for a single queue on the low power NFPs.

There are also a few minor fixes included for code in net-next.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2017-04-30 22:37:01 -04:00
commit ba1d82e681
6 changed files with 118 additions and 101 deletions

View File

@ -798,7 +798,7 @@ wrp_test_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
const struct bpf_insn *insn = &meta->insn;
if (insn->off < 0) /* TODO */
return -ENOTSUPP;
return -EOPNOTSUPP;
wrp_test_reg_one(nfp_prog, insn->dst_reg * 2, alu_op,
insn->src_reg * 2, br_mask, insn->off);
@ -818,7 +818,7 @@ wrp_cmp_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
u32 tmp_reg;
if (insn->off < 0) /* TODO */
return -ENOTSUPP;
return -EOPNOTSUPP;
tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
if (!swap)
@ -847,7 +847,7 @@ wrp_cmp_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
u8 areg = insn->src_reg * 2, breg = insn->dst_reg * 2;
if (insn->off < 0) /* TODO */
return -ENOTSUPP;
return -EOPNOTSUPP;
if (swap) {
areg ^= breg;
@ -1132,7 +1132,7 @@ static int mem_ldx4_skb(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
emit_alu(nfp_prog, reg_both(meta->insn.dst_reg * 2),
reg_none(), ALU_OP_NONE, NFP_BPF_ABI_LEN);
else
return -ENOTSUPP;
return -EOPNOTSUPP;
return 0;
}
@ -1143,7 +1143,7 @@ static int mem_ldx4_xdp(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
if (meta->insn.off != offsetof(struct xdp_md, data) &&
meta->insn.off != offsetof(struct xdp_md, data_end))
return -ENOTSUPP;
return -EOPNOTSUPP;
emit_alu(nfp_prog, dst, reg_none(), ALU_OP_NONE, NFP_BPF_ABI_PKT);
@ -1174,12 +1174,12 @@ static int mem_stx4_skb(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
if (meta->insn.off == offsetof(struct sk_buff, mark))
return wrp_set_mark(nfp_prog, meta->insn.src_reg * 2);
return -ENOTSUPP;
return -EOPNOTSUPP;
}
static int mem_stx4_xdp(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
return -ENOTSUPP;
return -EOPNOTSUPP;
}
static int mem_stx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
@ -1192,7 +1192,7 @@ static int mem_stx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
static int jump(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
if (meta->insn.off < 0) /* TODO */
return -ENOTSUPP;
return -EOPNOTSUPP;
emit_br(nfp_prog, BR_UNC, meta->insn.off, 0);
return 0;
@ -1206,7 +1206,7 @@ static int jeq_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
u32 tmp_reg;
if (insn->off < 0) /* TODO */
return -ENOTSUPP;
return -EOPNOTSUPP;
if (imm & ~0U) {
tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
@ -1245,7 +1245,7 @@ static int jset_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
u32 tmp_reg;
if (insn->off < 0) /* TODO */
return -ENOTSUPP;
return -EOPNOTSUPP;
if (!imm) {
meta->skip = true;
@ -1276,7 +1276,7 @@ static int jne_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
u32 tmp_reg;
if (insn->off < 0) /* TODO */
return -ENOTSUPP;
return -EOPNOTSUPP;
if (!imm) {
emit_alu(nfp_prog, reg_none(), reg_a(insn->dst_reg * 2),
@ -1302,7 +1302,7 @@ static int jeq_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
const struct bpf_insn *insn = &meta->insn;
if (insn->off < 0) /* TODO */
return -ENOTSUPP;
return -EOPNOTSUPP;
emit_alu(nfp_prog, imm_a(nfp_prog), reg_a(insn->dst_reg * 2),
ALU_OP_XOR, reg_b(insn->src_reg * 2));

View File

@ -201,6 +201,7 @@ struct nfp_net_tx_buf {
* @txds: Virtual address of TX ring in host memory
* @dma: DMA address of the TX ring
* @size: Size, in bytes, of the TX ring (needed to free)
* @is_xdp: Is this a XDP TX ring?
*/
struct nfp_net_tx_ring {
struct nfp_net_r_vector *r_vec;
@ -221,6 +222,7 @@ struct nfp_net_tx_ring {
dma_addr_t dma;
unsigned int size;
bool is_xdp;
} ____cacheline_aligned;
/* RX and freelist descriptor format */
@ -468,10 +470,10 @@ struct nfp_net_dp {
u8 chained_metadata_format:1;
u8 rx_dma_dir;
u8 rx_dma_off;
u8 rx_offset;
u32 rx_dma_off;
u32 ctrl;
u32 fl_bufsz;

View File

@ -478,15 +478,18 @@ static irqreturn_t nfp_net_irq_exn(int irq, void *data)
* @tx_ring: TX ring structure
* @r_vec: IRQ vector servicing this ring
* @idx: Ring index
* @is_xdp: Is this an XDP TX ring?
*/
static void
nfp_net_tx_ring_init(struct nfp_net_tx_ring *tx_ring,
struct nfp_net_r_vector *r_vec, unsigned int idx)
struct nfp_net_r_vector *r_vec, unsigned int idx,
bool is_xdp)
{
struct nfp_net *nn = r_vec->nfp_net;
tx_ring->idx = idx;
tx_ring->r_vec = r_vec;
tx_ring->is_xdp = is_xdp;
tx_ring->qcidx = tx_ring->idx * nn->stride_tx;
tx_ring->qcp_q = nn->tx_bar + NFP_QCP_QUEUE_OFF(tx_ring->qcidx);
@ -924,6 +927,9 @@ static void nfp_net_tx_complete(struct nfp_net_tx_ring *tx_ring)
int fidx;
int idx;
if (tx_ring->wr_p == tx_ring->rd_p)
return;
/* Work out how many descriptors have been transmitted */
qcp_rd_p = nfp_qcp_rd_ptr_read(tx_ring->qcp_q);
@ -994,11 +1000,13 @@ static void nfp_net_tx_complete(struct nfp_net_tx_ring *tx_ring)
static void nfp_net_xdp_complete(struct nfp_net_tx_ring *tx_ring)
{
struct nfp_net_r_vector *r_vec = tx_ring->r_vec;
struct nfp_net_dp *dp = &r_vec->nfp_net->dp;
u32 done_pkts = 0, done_bytes = 0;
int idx, todo;
u32 qcp_rd_p;
if (tx_ring->wr_p == tx_ring->rd_p)
return;
/* Work out how many descriptors have been transmitted */
qcp_rd_p = nfp_qcp_rd_ptr_read(tx_ring->qcp_q);
@ -1010,22 +1018,12 @@ static void nfp_net_xdp_complete(struct nfp_net_tx_ring *tx_ring)
else
todo = qcp_rd_p + tx_ring->cnt - tx_ring->qcp_rd_p;
done_pkts = todo;
while (todo--) {
idx = tx_ring->rd_p & (tx_ring->cnt - 1);
tx_ring->rd_p++;
if (!tx_ring->txbufs[idx].frag)
continue;
nfp_net_dma_unmap_rx(dp, tx_ring->txbufs[idx].dma_addr);
__free_page(virt_to_page(tx_ring->txbufs[idx].frag));
done_pkts++;
done_bytes += tx_ring->txbufs[idx].real_len;
tx_ring->txbufs[idx].dma_addr = 0;
tx_ring->txbufs[idx].frag = NULL;
tx_ring->txbufs[idx].fidx = -2;
}
tx_ring->qcp_rd_p = qcp_rd_p;
@ -1050,42 +1048,35 @@ static void nfp_net_xdp_complete(struct nfp_net_tx_ring *tx_ring)
static void
nfp_net_tx_ring_reset(struct nfp_net_dp *dp, struct nfp_net_tx_ring *tx_ring)
{
struct nfp_net_r_vector *r_vec = tx_ring->r_vec;
const struct skb_frag_struct *frag;
struct netdev_queue *nd_q;
while (tx_ring->rd_p != tx_ring->wr_p) {
while (!tx_ring->is_xdp && tx_ring->rd_p != tx_ring->wr_p) {
struct nfp_net_tx_buf *tx_buf;
int idx;
struct sk_buff *skb;
int idx, nr_frags;
idx = tx_ring->rd_p & (tx_ring->cnt - 1);
tx_buf = &tx_ring->txbufs[idx];
if (tx_ring == r_vec->xdp_ring) {
nfp_net_dma_unmap_rx(dp, tx_buf->dma_addr);
__free_page(virt_to_page(tx_ring->txbufs[idx].frag));
skb = tx_ring->txbufs[idx].skb;
nr_frags = skb_shinfo(skb)->nr_frags;
if (tx_buf->fidx == -1) {
/* unmap head */
dma_unmap_single(dp->dev, tx_buf->dma_addr,
skb_headlen(skb), DMA_TO_DEVICE);
} else {
struct sk_buff *skb = tx_ring->txbufs[idx].skb;
int nr_frags = skb_shinfo(skb)->nr_frags;
if (tx_buf->fidx == -1) {
/* unmap head */
dma_unmap_single(dp->dev, tx_buf->dma_addr,
skb_headlen(skb),
DMA_TO_DEVICE);
} else {
/* unmap fragment */
frag = &skb_shinfo(skb)->frags[tx_buf->fidx];
dma_unmap_page(dp->dev, tx_buf->dma_addr,
skb_frag_size(frag),
DMA_TO_DEVICE);
}
/* check for last gather fragment */
if (tx_buf->fidx == nr_frags - 1)
dev_kfree_skb_any(skb);
/* unmap fragment */
frag = &skb_shinfo(skb)->frags[tx_buf->fidx];
dma_unmap_page(dp->dev, tx_buf->dma_addr,
skb_frag_size(frag), DMA_TO_DEVICE);
}
/* check for last gather fragment */
if (tx_buf->fidx == nr_frags - 1)
dev_kfree_skb_any(skb);
tx_buf->dma_addr = 0;
tx_buf->skb = NULL;
tx_buf->fidx = -2;
@ -1100,7 +1091,7 @@ nfp_net_tx_ring_reset(struct nfp_net_dp *dp, struct nfp_net_tx_ring *tx_ring)
tx_ring->qcp_rd_p = 0;
tx_ring->wr_ptr_add = 0;
if (tx_ring == r_vec->xdp_ring)
if (tx_ring->is_xdp)
return;
nd_q = netdev_get_tx_queue(dp->netdev, tx_ring->idx);
@ -1153,16 +1144,13 @@ nfp_net_free_frag(void *frag, bool xdp)
/**
* nfp_net_rx_alloc_one() - Allocate and map page frag for RX
* @dp: NFP Net data path struct
* @rx_ring: RX ring structure of the skb
* @dma_addr: Pointer to storage for DMA address (output param)
*
* This function will allcate a new page frag, map it for DMA.
*
* Return: allocated page frag or NULL on failure.
*/
static void *
nfp_net_rx_alloc_one(struct nfp_net_dp *dp, struct nfp_net_rx_ring *rx_ring,
dma_addr_t *dma_addr)
static void *nfp_net_rx_alloc_one(struct nfp_net_dp *dp, dma_addr_t *dma_addr)
{
void *frag;
@ -1317,8 +1305,7 @@ nfp_net_rx_ring_bufs_alloc(struct nfp_net_dp *dp,
rxbufs = rx_ring->rxbufs;
for (i = 0; i < rx_ring->cnt - 1; i++) {
rxbufs[i].frag =
nfp_net_rx_alloc_one(dp, rx_ring, &rxbufs[i].dma_addr);
rxbufs[i].frag = nfp_net_rx_alloc_one(dp, &rxbufs[i].dma_addr);
if (!rxbufs[i].frag) {
nfp_net_rx_ring_bufs_free(dp, rx_ring);
return -ENOMEM;
@ -1496,8 +1483,6 @@ nfp_net_tx_xdp_buf(struct nfp_net_dp *dp, struct nfp_net_rx_ring *rx_ring,
{
struct nfp_net_tx_buf *txbuf;
struct nfp_net_tx_desc *txd;
dma_addr_t new_dma_addr;
void *new_frag;
int wr_idx;
if (unlikely(nfp_net_tx_full(tx_ring, 1))) {
@ -1505,17 +1490,13 @@ nfp_net_tx_xdp_buf(struct nfp_net_dp *dp, struct nfp_net_rx_ring *rx_ring,
return false;
}
new_frag = nfp_net_napi_alloc_one(dp, &new_dma_addr);
if (unlikely(!new_frag)) {
nfp_net_rx_drop(dp, rx_ring->r_vec, rx_ring, rxbuf, NULL);
return false;
}
nfp_net_rx_give_one(dp, rx_ring, new_frag, new_dma_addr);
wr_idx = tx_ring->wr_p & (tx_ring->cnt - 1);
/* Stash the soft descriptor of the head then initialize it */
txbuf = &tx_ring->txbufs[wr_idx];
nfp_net_rx_give_one(dp, rx_ring, txbuf->frag, txbuf->dma_addr);
txbuf->frag = rxbuf->frag;
txbuf->dma_addr = rxbuf->dma_addr;
txbuf->fidx = -1;
@ -1800,13 +1781,11 @@ static void nfp_net_tx_ring_free(struct nfp_net_tx_ring *tx_ring)
* nfp_net_tx_ring_alloc() - Allocate resource for a TX ring
* @dp: NFP Net data path struct
* @tx_ring: TX Ring structure to allocate
* @is_xdp: True if ring will be used for XDP
*
* Return: 0 on success, negative errno otherwise.
*/
static int
nfp_net_tx_ring_alloc(struct nfp_net_dp *dp, struct nfp_net_tx_ring *tx_ring,
bool is_xdp)
nfp_net_tx_ring_alloc(struct nfp_net_dp *dp, struct nfp_net_tx_ring *tx_ring)
{
struct nfp_net_r_vector *r_vec = tx_ring->r_vec;
int sz;
@ -1824,7 +1803,7 @@ nfp_net_tx_ring_alloc(struct nfp_net_dp *dp, struct nfp_net_tx_ring *tx_ring,
if (!tx_ring->txbufs)
goto err_alloc;
if (!is_xdp)
if (!tx_ring->is_xdp)
netif_set_xps_queue(dp->netdev, &r_vec->affinity_mask,
tx_ring->idx);
@ -1835,6 +1814,45 @@ err_alloc:
return -ENOMEM;
}
static void
nfp_net_tx_ring_bufs_free(struct nfp_net_dp *dp,
struct nfp_net_tx_ring *tx_ring)
{
unsigned int i;
if (!tx_ring->is_xdp)
return;
for (i = 0; i < tx_ring->cnt; i++) {
if (!tx_ring->txbufs[i].frag)
return;
nfp_net_dma_unmap_rx(dp, tx_ring->txbufs[i].dma_addr);
__free_page(virt_to_page(tx_ring->txbufs[i].frag));
}
}
static int
nfp_net_tx_ring_bufs_alloc(struct nfp_net_dp *dp,
struct nfp_net_tx_ring *tx_ring)
{
struct nfp_net_tx_buf *txbufs = tx_ring->txbufs;
unsigned int i;
if (!tx_ring->is_xdp)
return 0;
for (i = 0; i < tx_ring->cnt; i++) {
txbufs[i].frag = nfp_net_rx_alloc_one(dp, &txbufs[i].dma_addr);
if (!txbufs[i].frag) {
nfp_net_tx_ring_bufs_free(dp, tx_ring);
return -ENOMEM;
}
}
return 0;
}
static int nfp_net_tx_rings_prepare(struct nfp_net *nn, struct nfp_net_dp *dp)
{
unsigned int r;
@ -1851,17 +1869,23 @@ static int nfp_net_tx_rings_prepare(struct nfp_net *nn, struct nfp_net_dp *dp)
bias = dp->num_stack_tx_rings;
nfp_net_tx_ring_init(&dp->tx_rings[r], &nn->r_vecs[r - bias],
r);
r, bias);
if (nfp_net_tx_ring_alloc(dp, &dp->tx_rings[r], bias))
if (nfp_net_tx_ring_alloc(dp, &dp->tx_rings[r]))
goto err_free_prev;
if (nfp_net_tx_ring_bufs_alloc(dp, &dp->tx_rings[r]))
goto err_free_ring;
}
return 0;
err_free_prev:
while (r--)
while (r--) {
nfp_net_tx_ring_bufs_free(dp, &dp->tx_rings[r]);
err_free_ring:
nfp_net_tx_ring_free(&dp->tx_rings[r]);
}
kfree(dp->tx_rings);
return -ENOMEM;
}
@ -1870,8 +1894,10 @@ static void nfp_net_tx_rings_free(struct nfp_net_dp *dp)
{
unsigned int r;
for (r = 0; r < dp->num_tx_rings; r++)
for (r = 0; r < dp->num_tx_rings; r++) {
nfp_net_tx_ring_bufs_free(dp, &dp->tx_rings[r]);
nfp_net_tx_ring_free(&dp->tx_rings[r]);
}
kfree(dp->tx_rings);
}
@ -2369,8 +2395,10 @@ static void nfp_net_close_free_all(struct nfp_net *nn)
nfp_net_rx_ring_bufs_free(&nn->dp, &nn->dp.rx_rings[r]);
nfp_net_rx_ring_free(&nn->dp.rx_rings[r]);
}
for (r = 0; r < nn->dp.num_tx_rings; r++)
for (r = 0; r < nn->dp.num_tx_rings; r++) {
nfp_net_tx_ring_bufs_free(&nn->dp, &nn->dp.tx_rings[r]);
nfp_net_tx_ring_free(&nn->dp.tx_rings[r]);
}
for (r = 0; r < nn->dp.num_r_vecs; r++)
nfp_net_cleanup_vector(nn, &nn->r_vecs[r]);
@ -2651,9 +2679,9 @@ nfp_net_setup_tc(struct net_device *netdev, u32 handle, __be16 proto,
struct nfp_net *nn = netdev_priv(netdev);
if (TC_H_MAJ(handle) != TC_H_MAJ(TC_H_INGRESS))
return -ENOTSUPP;
return -EOPNOTSUPP;
if (proto != htons(ETH_P_ALL))
return -ENOTSUPP;
return -EOPNOTSUPP;
if (tc->type == TC_SETUP_CLSBPF && nfp_net_ebpf_capable(nn)) {
if (!nn->dp.bpf_offload_xdp)
@ -2938,11 +2966,7 @@ static int nfp_net_xdp_setup(struct nfp_net *nn, struct bpf_prog *prog)
dp->xdp_prog = prog;
dp->num_tx_rings += prog ? nn->dp.num_rx_rings : -nn->dp.num_rx_rings;
dp->rx_dma_dir = prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
if (prog)
dp->rx_dma_off = XDP_PACKET_HEADROOM -
(nn->dp.rx_offset ?: NFP_NET_MAX_PREPEND);
else
dp->rx_dma_off = 0;
dp->rx_dma_off = prog ? XDP_PACKET_HEADROOM - nn->dp.rx_offset : 0;
/* We need RX reconfig to remap the buffers (BIDIR vs FROM_DEV) */
err = nfp_net_ring_reconfig(nn, dp);
@ -3170,13 +3194,6 @@ int nfp_net_netdev_init(struct net_device *netdev)
struct nfp_net *nn = netdev_priv(netdev);
int err;
/* XDP calls for 256 byte packet headroom which wouldn't fit in a u8.
* We, however, reuse the metadata prepend space for XDP buffers which
* is at least 1 byte long and as long as XDP headroom doesn't increase
* above 256 the *extra* XDP headroom will fit on 8 bits.
*/
BUILD_BUG_ON(XDP_PACKET_HEADROOM > 256);
nn->dp.chained_metadata_format = nn->fw_ver.major > 3;
nn->dp.rx_dma_dir = DMA_FROM_DEVICE;

View File

@ -791,7 +791,7 @@ static int nfp_net_set_coalesce(struct net_device *netdev,
ec->tx_coalesce_usecs_high ||
ec->tx_max_coalesced_frames_high ||
ec->rate_sample_interval)
return -ENOTSUPP;
return -EOPNOTSUPP;
/* Compute factor used to convert coalesce '_usecs' parameters to
* ME timestamp ticks. There are 16 ME clock cycles for each timestamp

View File

@ -119,12 +119,12 @@ nfp_net_bpf_get_act(struct nfp_net *nn, struct tc_cls_bpf_offload *cls_bpf)
if (tc_no_actions(cls_bpf->exts))
return NN_ACT_DIRECT;
return -ENOTSUPP;
return -EOPNOTSUPP;
}
/* TC legacy mode */
if (!tc_single_action(cls_bpf->exts))
return -ENOTSUPP;
return -EOPNOTSUPP;
tcf_exts_to_list(cls_bpf->exts, &actions);
list_for_each_entry(a, &actions, list) {
@ -136,7 +136,7 @@ nfp_net_bpf_get_act(struct nfp_net *nn, struct tc_cls_bpf_offload *cls_bpf)
return NN_ACT_TC_REDIR;
}
return -ENOTSUPP;
return -EOPNOTSUPP;
}
static int
@ -152,7 +152,7 @@ nfp_net_bpf_offload_prepare(struct nfp_net *nn,
int ret;
if (!IS_ENABLED(CONFIG_BPF_SYSCALL))
return -ENOTSUPP;
return -EOPNOTSUPP;
ret = nfp_net_bpf_get_act(nn, cls_bpf);
if (ret < 0)
@ -162,7 +162,7 @@ nfp_net_bpf_offload_prepare(struct nfp_net *nn,
max_mtu = nn_readb(nn, NFP_NET_CFG_BPF_INL_MTU) * 64 - 32;
if (max_mtu < nn->dp.netdev->mtu) {
nn_info(nn, "BPF offload not supported with MTU larger than HW packet split boundary\n");
return -ENOTSUPP;
return -EOPNOTSUPP;
}
start_off = nn_readw(nn, NFP_NET_CFG_BPF_START);
@ -289,6 +289,6 @@ int nfp_net_bpf_offload(struct nfp_net *nn, struct tc_cls_bpf_offload *cls_bpf)
return nfp_net_bpf_stats_update(nn, cls_bpf);
default:
return -ENOTSUPP;
return -EOPNOTSUPP;
}
}

View File

@ -78,7 +78,7 @@
#define NSP_MAGIC 0xab10
#define NSP_MAJOR 0
#define NSP_MINOR (__MAX_SPCODE - 1)
#define NSP_MINOR 8
#define NSP_CODE_MAJOR GENMASK(15, 12)
#define NSP_CODE_MINOR GENMASK(11, 0)
@ -94,8 +94,6 @@ enum nfp_nsp_cmd {
SPCODE_ETH_RESCAN = 7, /* Rescan ETHs, write ETH_TABLE to buf */
SPCODE_ETH_CONTROL = 8, /* Update media config from buffer */
SPCODE_NSP_IDENTIFY = 13, /* Read NSP version */
__MAX_SPCODE,
};
static const struct {