Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Daniel Borkmann says:

====================
pull-request: bpf-next 2020-05-23

The following pull-request contains BPF updates for your *net-next* tree.

We've added 50 non-merge commits during the last 8 day(s) which contain
a total of 109 files changed, 2776 insertions(+), 2887 deletions(-).

The main changes are:

1) Add a new AF_XDP buffer allocation API to the core in order to help
   lower the bar for drivers adopting AF_XDP support. i40e, ice, ixgbe
   as well as mlx5 have been moved over to the new API and also gained a
   small improvement in performance (a minimal usage sketch follows after
   this message), from Björn Töpel and Magnus Karlsson.

2) Add getpeername()/getsockname() attach types for BPF sock_addr programs
   in order to allow, for example, reverse translation of a load-balancer
   backend into the service address/port tuple as seen by a connected peer
   (a program sketch follows after this message), from Daniel Borkmann.

3) Improve the BPF verifier's is_branch_taken() logic to evaluate pointers
   as being non-NULL: if an initial NULL test on a pointer is followed by
   another NULL test on the same pointer in a given path, that branch can
   now be pruned right away (see the illustration after this message),
   from John Fastabend.

4) Larger rework of the BPF sockmap selftests to make their output easier
   to understand and to reduce overall runtime, as well as to add new BPF
   kTLS selftests that run in combination with sockmap, also from John
   Fastabend.

5) Batch of misc updates to BPF selftests including fixing up test_align
   to match verifier output again and moving it under test_progs, allowing
   bpf_iter selftest to compile on machines with older vmlinux.h, and
   updating config options for lirc and v6 segment routing helpers, from
   Stanislav Fomichev, Andrii Nakryiko and Alan Maguire.

6) Conversion of the BPF tracing samples' outdated internal BPF loader to
   the libbpf API (see the sketch after this message), from Daniel T. Lee.

7) Follow-up to the BPF kernel test infrastructure in order to fix a flaky
   test in the XDP selftests, from Jesper Dangaard Brouer.

8) Minor improvements to libbpf's internal hashmap implementation, from
   Ian Rogers.
====================
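
To make (1) above concrete, here is a minimal, hedged sketch of the driver-side
Rx fill and completion path with the new buffer API, mirroring the
i40e/ice/ixgbe hunks further below. The example_* helpers, the flat
ring_bufs/ring_pkt_addrs layout and the omitted descriptor programming are
illustrative assumptions, not part of any driver.

#include <linux/kernel.h>
#include <net/xdp_sock_drv.h>

/* Hypothetical fill loop: allocate DMA-mapped buffers from the umem's pool
 * and program their DMA addresses into the (simplified) Rx descriptors.
 */
static bool example_fill_rx_ring_zc(struct xdp_umem *umem,
                                    struct xdp_buff **ring_bufs,
                                    __le64 *ring_pkt_addrs, u16 count)
{
    dma_addr_t dma;
    u16 i;

    for (i = 0; i < count; i++) {
        /* Replaces the old xsk_umem_peek_addr()/release_addr() dance and
         * the per-driver handle/headroom bookkeeping.
         */
        ring_bufs[i] = xsk_buff_alloc(umem);
        if (!ring_bufs[i])
            return false;

        dma = xsk_buff_xdp_get_dma(ring_bufs[i]);
        ring_pkt_addrs[i] = cpu_to_le64(dma);
    }
    return true;
}

/* On completion: sync for the CPU, run XDP (elided), then either forward
 * the buffer or hand it straight back to the pool.
 */
static void example_recycle_rx_buf_zc(struct xdp_buff *xdp)
{
    xsk_buff_dma_sync_for_cpu(xdp);
    /* ... bpf_prog_run_xdp() and the XDP verdict handling go here ... */
    xsk_buff_free(xdp);
}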
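
For (2), a hedged sketch of what such a sock_addr program could look like,
assuming the cgroup/getpeername4 ELF section name that libbpf gained alongside
this series; the backend (10.0.0.2:8080) and service (192.168.0.1:80)
addresses are made-up examples.

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

/* When the application calls getpeername() on a connected socket, report
 * the service address instead of the load-balancer backend it was
 * translated to. Returning 1 lets the syscall proceed.
 */
SEC("cgroup/getpeername4")
int rewrite_peer4(struct bpf_sock_addr *ctx)
{
    if (ctx->user_ip4 == bpf_htonl(0x0a000002) &&   /* 10.0.0.2 */
        ctx->user_port == bpf_htons(8080)) {
        ctx->user_ip4 = bpf_htonl(0xc0a80001);      /* 192.168.0.1 */
        ctx->user_port = bpf_htons(80);
    }
    return 1;
}

char _license[] SEC("license") = "GPL";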
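
For (3), an illustrative-only fragment of the pattern the improved
is_branch_taken() handling prunes; in real programs the redundant check
usually results from inlined helpers or macros rather than hand-written code,
and the map and section names below are arbitrary.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
    __uint(type, BPF_MAP_TYPE_ARRAY);
    __uint(max_entries, 1);
    __type(key, __u32);
    __type(value, __u64);
} counters SEC(".maps");

SEC("classifier")
int prune_example(struct __sk_buff *skb)
{
    __u32 key = 0;
    __u64 *val = bpf_map_lookup_elem(&counters, &key);

    if (!val)       /* first test: val may still be NULL here */
        return 0;
    if (!val)       /* same pointer is now known non-NULL, branch is dead */
        return 0;
    *val += skb->len;
    return 0;
}

char _license[] SEC("license") = "GPL";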
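
For (6), a hedged sketch of the libbpf calls the samples move to in place of
the removed internal loader; the object file name tracing_prog.o and the
program name trace_entry are placeholders.

#include <bpf/libbpf.h>

int main(void)
{
    struct bpf_object *obj;
    struct bpf_program *prog;
    struct bpf_link *link;
    int ret = 1;

    obj = bpf_object__open_file("tracing_prog.o", NULL);
    if (libbpf_get_error(obj))
        return ret;

    if (bpf_object__load(obj))
        goto out;

    prog = bpf_object__find_program_by_name(obj, "trace_entry");
    if (!prog)
        goto out;

    /* bpf_program__attach() derives the attach type from the ELF section */
    link = bpf_program__attach(prog);
    if (libbpf_get_error(link))
        goto out;

    /* ... consume events, then tear down ... */
    bpf_link__destroy(link);
    ret = 0;
out:
    bpf_object__close(obj);
    return ret;
}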

Signed-off-by: David S. Miller <davem@davemloft.net>
commit a152b85984
Committer: David S. Miller <davem@davemloft.net>
Date:      2020-05-22 18:30:34 -07:00
109 changed files with 2814 additions and 2925 deletions

View File

@ -437,6 +437,21 @@ needed::
See the kernels selftest `Documentation/dev-tools/kselftest.rst`_
document for further documentation.
To maximize the number of tests passing, the .config of the kernel
under test should match the config file fragment in
tools/testing/selftests/bpf as closely as possible.
Finally to ensure support for latest BPF Type Format features -
discussed in `Documentation/bpf/btf.rst`_ - pahole version 1.16
is required for kernels built with CONFIG_DEBUG_INFO_BTF=y.
pahole is delivered in the dwarves package or can be built
from source at
https://github.com/acmel/dwarves
Some distros have pahole version 1.16 packaged already, e.g.
Fedora, Gentoo.
Q: Which BPF kernel selftests version should I run my kernel against?
---------------------------------------------------------------------
A: If you run a kernel ``xyz``, then always run the BPF kernel selftests

View File

@ -18443,8 +18443,12 @@ R: Jonathan Lemon <jonathan.lemon@gmail.com>
L: netdev@vger.kernel.org
L: bpf@vger.kernel.org
S: Maintained
F: kernel/bpf/xskmap.c
F: include/net/xdp_sock*
F: include/net/xsk_buffer_pool.h
F: include/uapi/linux/if_xdp.h
F: net/xdp/
F: samples/bpf/xdpsock*
F: tools/lib/bpf/xsk*
XEN BLOCK SUBSYSTEM
M: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

View File

@ -11,7 +11,7 @@
#include "i40e_diag.h"
#include "i40e_xsk.h"
#include <net/udp_tunnel.h>
#include <net/xdp_sock.h>
#include <net/xdp_sock_drv.h>
/* All i40e tracepoints are defined by the include below, which
* must be included exactly once across the whole kernel with
* CREATE_TRACE_POINTS defined
@ -3260,26 +3260,31 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
if (ring->vsi->type == I40E_VSI_MAIN)
xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq);
kfree(ring->rx_bi);
ring->xsk_umem = i40e_xsk_umem(ring);
if (ring->xsk_umem) {
ring->rx_buf_len = ring->xsk_umem->chunk_size_nohr -
XDP_PACKET_HEADROOM;
ret = i40e_alloc_rx_bi_zc(ring);
if (ret)
return ret;
ring->rx_buf_len = xsk_umem_get_rx_frame_size(ring->xsk_umem);
/* For AF_XDP ZC, we disallow packets to span on
* multiple buffers, thus letting us skip that
* handling in the fast-path.
*/
chain_len = 1;
ring->zca.free = i40e_zca_free;
ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
MEM_TYPE_ZERO_COPY,
&ring->zca);
MEM_TYPE_XSK_BUFF_POOL,
NULL);
if (ret)
return ret;
dev_info(&vsi->back->pdev->dev,
"Registered XDP mem model MEM_TYPE_ZERO_COPY on Rx ring %d\n",
"Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n",
ring->queue_index);
} else {
ret = i40e_alloc_rx_bi(ring);
if (ret)
return ret;
ring->rx_buf_len = vsi->rx_buf_len;
if (ring->vsi->type == I40E_VSI_MAIN) {
ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
@ -3344,9 +3349,12 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
ring->tail = hw->hw_addr + I40E_QRX_TAIL(pf_q);
writel(0, ring->tail);
ok = ring->xsk_umem ?
i40e_alloc_rx_buffers_zc(ring, I40E_DESC_UNUSED(ring)) :
!i40e_alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring));
if (ring->xsk_umem) {
xsk_buff_set_rxq_info(ring->xsk_umem, &ring->xdp_rxq);
ok = i40e_alloc_rx_buffers_zc(ring, I40E_DESC_UNUSED(ring));
} else {
ok = !i40e_alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring));
}
if (!ok) {
/* Log this in case the user has forgotten to give the kernel
* any buffers, even later in the application.

View File

@ -521,28 +521,29 @@ int i40e_add_del_fdir(struct i40e_vsi *vsi,
/**
* i40e_fd_handle_status - check the Programming Status for FD
* @rx_ring: the Rx ring for this descriptor
* @rx_desc: the Rx descriptor for programming Status, not a packet descriptor.
* @qword0_raw: qword0
* @qword1: qword1 after le_to_cpu
* @prog_id: the id originally used for programming
*
* This is used to verify if the FD programming or invalidation
* requested by SW to the HW is successful or not and take actions accordingly.
**/
void i40e_fd_handle_status(struct i40e_ring *rx_ring,
union i40e_rx_desc *rx_desc, u8 prog_id)
static void i40e_fd_handle_status(struct i40e_ring *rx_ring, u64 qword0_raw,
u64 qword1, u8 prog_id)
{
struct i40e_pf *pf = rx_ring->vsi->back;
struct pci_dev *pdev = pf->pdev;
struct i40e_32b_rx_wb_qw0 *qw0;
u32 fcnt_prog, fcnt_avail;
u32 error;
u64 qw;
qw = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
error = (qw & I40E_RX_PROG_STATUS_DESC_QW1_ERROR_MASK) >>
qw0 = (struct i40e_32b_rx_wb_qw0 *)&qword0_raw;
error = (qword1 & I40E_RX_PROG_STATUS_DESC_QW1_ERROR_MASK) >>
I40E_RX_PROG_STATUS_DESC_QW1_ERROR_SHIFT;
if (error == BIT(I40E_RX_PROG_STATUS_DESC_FD_TBL_FULL_SHIFT)) {
pf->fd_inv = le32_to_cpu(rx_desc->wb.qword0.hi_dword.fd_id);
if ((rx_desc->wb.qword0.hi_dword.fd_id != 0) ||
pf->fd_inv = le32_to_cpu(qw0->hi_dword.fd_id);
if (qw0->hi_dword.fd_id != 0 ||
(I40E_DEBUG_FD & pf->hw.debug_mask))
dev_warn(&pdev->dev, "ntuple filter loc = %d, could not be added\n",
pf->fd_inv);
@ -560,7 +561,7 @@ void i40e_fd_handle_status(struct i40e_ring *rx_ring,
/* store the current atr filter count */
pf->fd_atr_cnt = i40e_get_current_atr_cnt(pf);
if ((rx_desc->wb.qword0.hi_dword.fd_id == 0) &&
if (qw0->hi_dword.fd_id == 0 &&
test_bit(__I40E_FD_SB_AUTO_DISABLED, pf->state)) {
/* These set_bit() calls aren't atomic with the
* test_bit() here, but worse case we potentially
@ -589,7 +590,7 @@ void i40e_fd_handle_status(struct i40e_ring *rx_ring,
} else if (error == BIT(I40E_RX_PROG_STATUS_DESC_NO_FD_ENTRY_SHIFT)) {
if (I40E_DEBUG_FD & pf->hw.debug_mask)
dev_info(&pdev->dev, "ntuple filter fd_id = %d, could not be removed\n",
rx_desc->wb.qword0.hi_dword.fd_id);
qw0->hi_dword.fd_id);
}
}
@ -1195,6 +1196,11 @@ clear_counts:
rc->total_packets = 0;
}
static struct i40e_rx_buffer *i40e_rx_bi(struct i40e_ring *rx_ring, u32 idx)
{
return &rx_ring->rx_bi[idx];
}
/**
* i40e_reuse_rx_page - page flip buffer and store it back on the ring
* @rx_ring: rx descriptor ring to store buffers on
@ -1208,7 +1214,7 @@ static void i40e_reuse_rx_page(struct i40e_ring *rx_ring,
struct i40e_rx_buffer *new_buff;
u16 nta = rx_ring->next_to_alloc;
new_buff = &rx_ring->rx_bi[nta];
new_buff = i40e_rx_bi(rx_ring, nta);
/* update, and store next to alloc */
nta++;
@ -1227,29 +1233,10 @@ static void i40e_reuse_rx_page(struct i40e_ring *rx_ring,
}
/**
* i40e_rx_is_programming_status - check for programming status descriptor
* @qw: qword representing status_error_len in CPU ordering
*
* The value of in the descriptor length field indicate if this
* is a programming status descriptor for flow director or FCoE
* by the value of I40E_RX_PROG_STATUS_DESC_LENGTH, otherwise
* it is a packet descriptor.
**/
static inline bool i40e_rx_is_programming_status(u64 qw)
{
/* The Rx filter programming status and SPH bit occupy the same
* spot in the descriptor. Since we don't support packet split we
* can just reuse the bit as an indication that this is a
* programming status descriptor.
*/
return qw & I40E_RXD_QW1_LENGTH_SPH_MASK;
}
/**
* i40e_clean_programming_status - try clean the programming status descriptor
* i40e_clean_programming_status - clean the programming status descriptor
* @rx_ring: the rx ring that has this descriptor
* @rx_desc: the rx descriptor written back by HW
* @qw: qword representing status_error_len in CPU ordering
* @qword0_raw: qword0
* @qword1: qword1 representing status_error_len in CPU ordering
*
* Flow director should handle FD_FILTER_STATUS to check its filter programming
* status being successful or not and take actions accordingly. FCoE should
@ -1257,34 +1244,16 @@ static inline bool i40e_rx_is_programming_status(u64 qw)
*
* Returns an i40e_rx_buffer to reuse if the cleanup occurred, otherwise NULL.
**/
struct i40e_rx_buffer *i40e_clean_programming_status(
struct i40e_ring *rx_ring,
union i40e_rx_desc *rx_desc,
u64 qw)
void i40e_clean_programming_status(struct i40e_ring *rx_ring, u64 qword0_raw,
u64 qword1)
{
struct i40e_rx_buffer *rx_buffer;
u32 ntc;
u8 id;
if (!i40e_rx_is_programming_status(qw))
return NULL;
ntc = rx_ring->next_to_clean;
/* fetch, update, and store next to clean */
rx_buffer = &rx_ring->rx_bi[ntc++];
ntc = (ntc < rx_ring->count) ? ntc : 0;
rx_ring->next_to_clean = ntc;
prefetch(I40E_RX_DESC(rx_ring, ntc));
id = (qw & I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK) >>
id = (qword1 & I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK) >>
I40E_RX_PROG_STATUS_DESC_QW1_PROGID_SHIFT;
if (id == I40E_RX_PROG_STATUS_DESC_FD_FILTER_STATUS)
i40e_fd_handle_status(rx_ring, rx_desc, id);
return rx_buffer;
i40e_fd_handle_status(rx_ring, qword0_raw, qword1, id);
}
/**
@ -1336,13 +1305,25 @@ err:
return -ENOMEM;
}
int i40e_alloc_rx_bi(struct i40e_ring *rx_ring)
{
unsigned long sz = sizeof(*rx_ring->rx_bi) * rx_ring->count;
rx_ring->rx_bi = kzalloc(sz, GFP_KERNEL);
return rx_ring->rx_bi ? 0 : -ENOMEM;
}
static void i40e_clear_rx_bi(struct i40e_ring *rx_ring)
{
memset(rx_ring->rx_bi, 0, sizeof(*rx_ring->rx_bi) * rx_ring->count);
}
/**
* i40e_clean_rx_ring - Free Rx buffers
* @rx_ring: ring to be cleaned
**/
void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
{
unsigned long bi_size;
u16 i;
/* ring already cleared, nothing to do */
@ -1361,7 +1342,7 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
/* Free all the Rx ring sk_buffs */
for (i = 0; i < rx_ring->count; i++) {
struct i40e_rx_buffer *rx_bi = &rx_ring->rx_bi[i];
struct i40e_rx_buffer *rx_bi = i40e_rx_bi(rx_ring, i);
if (!rx_bi->page)
continue;
@ -1388,8 +1369,10 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
}
skip_free:
bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count;
memset(rx_ring->rx_bi, 0, bi_size);
if (rx_ring->xsk_umem)
i40e_clear_rx_bi_zc(rx_ring);
else
i40e_clear_rx_bi(rx_ring);
/* Zero out the descriptor ring */
memset(rx_ring->desc, 0, rx_ring->size);
@ -1430,15 +1413,7 @@ void i40e_free_rx_resources(struct i40e_ring *rx_ring)
int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
{
struct device *dev = rx_ring->dev;
int err = -ENOMEM;
int bi_size;
/* warn if we are about to overwrite the pointer */
WARN_ON(rx_ring->rx_bi);
bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count;
rx_ring->rx_bi = kzalloc(bi_size, GFP_KERNEL);
if (!rx_ring->rx_bi)
goto err;
int err;
u64_stats_init(&rx_ring->syncp);
@ -1451,7 +1426,7 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
if (!rx_ring->desc) {
dev_info(dev, "Unable to allocate memory for the Rx descriptor ring, size=%d\n",
rx_ring->size);
goto err;
return -ENOMEM;
}
rx_ring->next_to_alloc = 0;
@ -1463,16 +1438,12 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
rx_ring->queue_index);
if (err < 0)
goto err;
return err;
}
rx_ring->xdp_prog = rx_ring->vsi->xdp_prog;
return 0;
err:
kfree(rx_ring->rx_bi);
rx_ring->rx_bi = NULL;
return err;
}
/**
@ -1592,7 +1563,7 @@ bool i40e_alloc_rx_buffers(struct i40e_ring *rx_ring, u16 cleaned_count)
return false;
rx_desc = I40E_RX_DESC(rx_ring, ntu);
bi = &rx_ring->rx_bi[ntu];
bi = i40e_rx_bi(rx_ring, ntu);
do {
if (!i40e_alloc_mapped_page(rx_ring, bi))
@ -1614,7 +1585,7 @@ bool i40e_alloc_rx_buffers(struct i40e_ring *rx_ring, u16 cleaned_count)
ntu++;
if (unlikely(ntu == rx_ring->count)) {
rx_desc = I40E_RX_DESC(rx_ring, 0);
bi = rx_ring->rx_bi;
bi = i40e_rx_bi(rx_ring, 0);
ntu = 0;
}
@ -1981,7 +1952,7 @@ static struct i40e_rx_buffer *i40e_get_rx_buffer(struct i40e_ring *rx_ring,
{
struct i40e_rx_buffer *rx_buffer;
rx_buffer = &rx_ring->rx_bi[rx_ring->next_to_clean];
rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
prefetchw(rx_buffer->page);
/* we are reusing so sync this buffer for CPU use */
@ -2382,9 +2353,12 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
*/
dma_rmb();
rx_buffer = i40e_clean_programming_status(rx_ring, rx_desc,
if (i40e_rx_is_programming_status(qword)) {
i40e_clean_programming_status(rx_ring,
rx_desc->raw.qword[0],
qword);
if (unlikely(rx_buffer)) {
rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
i40e_inc_ntc(rx_ring);
i40e_reuse_rx_page(rx_ring, rx_buffer);
cleaned_count++;
continue;

View File

@ -296,17 +296,9 @@ struct i40e_tx_buffer {
struct i40e_rx_buffer {
dma_addr_t dma;
union {
struct {
struct page *page;
__u32 page_offset;
__u16 pagecnt_bias;
};
struct {
void *addr;
u64 handle;
};
};
};
struct i40e_queue_stats {
@ -358,6 +350,7 @@ struct i40e_ring {
union {
struct i40e_tx_buffer *tx_bi;
struct i40e_rx_buffer *rx_bi;
struct xdp_buff **rx_bi_zc;
};
DECLARE_BITMAP(state, __I40E_RING_STATE_NBITS);
u16 queue_index; /* Queue number of ring */
@ -419,7 +412,6 @@ struct i40e_ring {
struct i40e_channel *ch;
struct xdp_rxq_info xdp_rxq;
struct xdp_umem *xsk_umem;
struct zero_copy_allocator zca; /* ZC allocator anchor */
} ____cacheline_internodealigned_in_smp;
static inline bool ring_uses_build_skb(struct i40e_ring *ring)
@ -495,6 +487,7 @@ int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
bool __i40e_chk_linearize(struct sk_buff *skb);
int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
u32 flags);
int i40e_alloc_rx_bi(struct i40e_ring *rx_ring);
/**
* i40e_get_head - Retrieve head from head writeback

View File

@ -4,13 +4,9 @@
#ifndef I40E_TXRX_COMMON_
#define I40E_TXRX_COMMON_
void i40e_fd_handle_status(struct i40e_ring *rx_ring,
union i40e_rx_desc *rx_desc, u8 prog_id);
int i40e_xmit_xdp_tx_ring(struct xdp_buff *xdp, struct i40e_ring *xdp_ring);
struct i40e_rx_buffer *i40e_clean_programming_status(
struct i40e_ring *rx_ring,
union i40e_rx_desc *rx_desc,
u64 qw);
void i40e_clean_programming_status(struct i40e_ring *rx_ring, u64 qword0_raw,
u64 qword1);
void i40e_process_skb_fields(struct i40e_ring *rx_ring,
union i40e_rx_desc *rx_desc, struct sk_buff *skb);
void i40e_xdp_ring_update_tail(struct i40e_ring *xdp_ring);
@ -84,6 +80,38 @@ static inline void i40e_arm_wb(struct i40e_ring *tx_ring,
}
}
/**
* i40e_rx_is_programming_status - check for programming status descriptor
* @qword1: qword1 representing status_error_len in CPU ordering
*
* The value of in the descriptor length field indicate if this
* is a programming status descriptor for flow director or FCoE
* by the value of I40E_RX_PROG_STATUS_DESC_LENGTH, otherwise
* it is a packet descriptor.
**/
static inline bool i40e_rx_is_programming_status(u64 qword1)
{
/* The Rx filter programming status and SPH bit occupy the same
* spot in the descriptor. Since we don't support packet split we
* can just reuse the bit as an indication that this is a
* programming status descriptor.
*/
return qword1 & I40E_RXD_QW1_LENGTH_SPH_MASK;
}
/**
* i40e_inc_ntc: Advance the next_to_clean index
* @rx_ring: Rx ring
**/
static inline void i40e_inc_ntc(struct i40e_ring *rx_ring)
{
u32 ntc = rx_ring->next_to_clean + 1;
ntc = (ntc < rx_ring->count) ? ntc : 0;
rx_ring->next_to_clean = ntc;
prefetch(I40E_RX_DESC(rx_ring, ntc));
}
void i40e_xsk_clean_rx_ring(struct i40e_ring *rx_ring);
void i40e_xsk_clean_tx_ring(struct i40e_ring *tx_ring);
bool i40e_xsk_any_rx_ring_enabled(struct i40e_vsi *vsi);

View File

@ -689,7 +689,7 @@ union i40e_32byte_rx_desc {
__le64 rsvd2;
} read;
struct {
struct {
struct i40e_32b_rx_wb_qw0 {
struct {
union {
__le16 mirroring_status;
@ -727,6 +727,9 @@ union i40e_32byte_rx_desc {
} hi_dword;
} qword3;
} wb; /* writeback */
struct {
u64 qword[4];
} raw;
};
enum i40e_rx_desc_status_bits {

View File

@ -2,68 +2,30 @@
/* Copyright(c) 2018 Intel Corporation. */
#include <linux/bpf_trace.h>
#include <net/xdp_sock.h>
#include <net/xdp_sock_drv.h>
#include <net/xdp.h>
#include "i40e.h"
#include "i40e_txrx_common.h"
#include "i40e_xsk.h"
/**
* i40e_xsk_umem_dma_map - DMA maps all UMEM memory for the netdev
* @vsi: Current VSI
* @umem: UMEM to DMA map
*
* Returns 0 on success, <0 on failure
**/
static int i40e_xsk_umem_dma_map(struct i40e_vsi *vsi, struct xdp_umem *umem)
int i40e_alloc_rx_bi_zc(struct i40e_ring *rx_ring)
{
struct i40e_pf *pf = vsi->back;
struct device *dev;
unsigned int i, j;
dma_addr_t dma;
unsigned long sz = sizeof(*rx_ring->rx_bi_zc) * rx_ring->count;
dev = &pf->pdev->dev;
for (i = 0; i < umem->npgs; i++) {
dma = dma_map_page_attrs(dev, umem->pgs[i], 0, PAGE_SIZE,
DMA_BIDIRECTIONAL, I40E_RX_DMA_ATTR);
if (dma_mapping_error(dev, dma))
goto out_unmap;
umem->pages[i].dma = dma;
}
return 0;
out_unmap:
for (j = 0; j < i; j++) {
dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE,
DMA_BIDIRECTIONAL, I40E_RX_DMA_ATTR);
umem->pages[i].dma = 0;
}
return -1;
rx_ring->rx_bi_zc = kzalloc(sz, GFP_KERNEL);
return rx_ring->rx_bi_zc ? 0 : -ENOMEM;
}
/**
* i40e_xsk_umem_dma_unmap - DMA unmaps all UMEM memory for the netdev
* @vsi: Current VSI
* @umem: UMEM to DMA map
**/
static void i40e_xsk_umem_dma_unmap(struct i40e_vsi *vsi, struct xdp_umem *umem)
void i40e_clear_rx_bi_zc(struct i40e_ring *rx_ring)
{
struct i40e_pf *pf = vsi->back;
struct device *dev;
unsigned int i;
memset(rx_ring->rx_bi_zc, 0,
sizeof(*rx_ring->rx_bi_zc) * rx_ring->count);
}
dev = &pf->pdev->dev;
for (i = 0; i < umem->npgs; i++) {
dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE,
DMA_BIDIRECTIONAL, I40E_RX_DMA_ATTR);
umem->pages[i].dma = 0;
}
static struct xdp_buff **i40e_rx_bi(struct i40e_ring *rx_ring, u32 idx)
{
return &rx_ring->rx_bi_zc[idx];
}
/**
@ -78,7 +40,6 @@ static int i40e_xsk_umem_enable(struct i40e_vsi *vsi, struct xdp_umem *umem,
u16 qid)
{
struct net_device *netdev = vsi->netdev;
struct xdp_umem_fq_reuse *reuseq;
bool if_running;
int err;
@ -92,13 +53,7 @@ static int i40e_xsk_umem_enable(struct i40e_vsi *vsi, struct xdp_umem *umem,
qid >= netdev->real_num_tx_queues)
return -EINVAL;
reuseq = xsk_reuseq_prepare(vsi->rx_rings[0]->count);
if (!reuseq)
return -ENOMEM;
xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq));
err = i40e_xsk_umem_dma_map(vsi, umem);
err = xsk_buff_dma_map(umem, &vsi->back->pdev->dev, I40E_RX_DMA_ATTR);
if (err)
return err;
@ -151,7 +106,7 @@ static int i40e_xsk_umem_disable(struct i40e_vsi *vsi, u16 qid)
}
clear_bit(qid, vsi->af_xdp_zc_qps);
i40e_xsk_umem_dma_unmap(vsi, umem);
xsk_buff_dma_unmap(umem, I40E_RX_DMA_ATTR);
if (if_running) {
err = i40e_queue_pair_enable(vsi, qid);
@ -190,11 +145,9 @@ int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem,
**/
static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
{
struct xdp_umem *umem = rx_ring->xsk_umem;
int err, result = I40E_XDP_PASS;
struct i40e_ring *xdp_ring;
struct bpf_prog *xdp_prog;
u64 offset;
u32 act;
rcu_read_lock();
@ -203,9 +156,6 @@ static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
*/
xdp_prog = READ_ONCE(rx_ring->xdp_prog);
act = bpf_prog_run_xdp(xdp_prog, xdp);
offset = xdp->data - xdp->data_hard_start;
xdp->handle = xsk_umem_adjust_offset(umem, xdp->handle, offset);
switch (act) {
case XDP_PASS:
@ -232,107 +182,26 @@ static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp)
return result;
}
/**
* i40e_alloc_buffer_zc - Allocates an i40e_rx_buffer
* @rx_ring: Rx ring
* @bi: Rx buffer to populate
*
* This function allocates an Rx buffer. The buffer can come from fill
* queue, or via the recycle queue (next_to_alloc).
*
* Returns true for a successful allocation, false otherwise
**/
static bool i40e_alloc_buffer_zc(struct i40e_ring *rx_ring,
struct i40e_rx_buffer *bi)
{
struct xdp_umem *umem = rx_ring->xsk_umem;
void *addr = bi->addr;
u64 handle, hr;
if (addr) {
rx_ring->rx_stats.page_reuse_count++;
return true;
}
if (!xsk_umem_peek_addr(umem, &handle)) {
rx_ring->rx_stats.alloc_page_failed++;
return false;
}
hr = umem->headroom + XDP_PACKET_HEADROOM;
bi->dma = xdp_umem_get_dma(umem, handle);
bi->dma += hr;
bi->addr = xdp_umem_get_data(umem, handle);
bi->addr += hr;
bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
xsk_umem_release_addr(umem);
return true;
}
/**
* i40e_alloc_buffer_slow_zc - Allocates an i40e_rx_buffer
* @rx_ring: Rx ring
* @bi: Rx buffer to populate
*
* This function allocates an Rx buffer. The buffer can come from fill
* queue, or via the reuse queue.
*
* Returns true for a successful allocation, false otherwise
**/
static bool i40e_alloc_buffer_slow_zc(struct i40e_ring *rx_ring,
struct i40e_rx_buffer *bi)
{
struct xdp_umem *umem = rx_ring->xsk_umem;
u64 handle, hr;
if (!xsk_umem_peek_addr_rq(umem, &handle)) {
rx_ring->rx_stats.alloc_page_failed++;
return false;
}
handle &= rx_ring->xsk_umem->chunk_mask;
hr = umem->headroom + XDP_PACKET_HEADROOM;
bi->dma = xdp_umem_get_dma(umem, handle);
bi->dma += hr;
bi->addr = xdp_umem_get_data(umem, handle);
bi->addr += hr;
bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
xsk_umem_release_addr_rq(umem);
return true;
}
static __always_inline bool
__i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count,
bool alloc(struct i40e_ring *rx_ring,
struct i40e_rx_buffer *bi))
bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count)
{
u16 ntu = rx_ring->next_to_use;
union i40e_rx_desc *rx_desc;
struct i40e_rx_buffer *bi;
struct xdp_buff **bi, *xdp;
dma_addr_t dma;
bool ok = true;
rx_desc = I40E_RX_DESC(rx_ring, ntu);
bi = &rx_ring->rx_bi[ntu];
bi = i40e_rx_bi(rx_ring, ntu);
do {
if (!alloc(rx_ring, bi)) {
xdp = xsk_buff_alloc(rx_ring->xsk_umem);
if (!xdp) {
ok = false;
goto no_buffers;
}
dma_sync_single_range_for_device(rx_ring->dev, bi->dma, 0,
rx_ring->rx_buf_len,
DMA_BIDIRECTIONAL);
rx_desc->read.pkt_addr = cpu_to_le64(bi->dma);
*bi = xdp;
dma = xsk_buff_xdp_get_dma(xdp);
rx_desc->read.pkt_addr = cpu_to_le64(dma);
rx_desc->read.hdr_addr = 0;
rx_desc++;
bi++;
@ -340,11 +209,10 @@ __i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count,
if (unlikely(ntu == rx_ring->count)) {
rx_desc = I40E_RX_DESC(rx_ring, 0);
bi = rx_ring->rx_bi;
bi = i40e_rx_bi(rx_ring, 0);
ntu = 0;
}
rx_desc->wb.qword1.status_error_len = 0;
count--;
} while (count);
@ -355,128 +223,9 @@ no_buffers:
return ok;
}
/**
* i40e_alloc_rx_buffers_zc - Allocates a number of Rx buffers
* @rx_ring: Rx ring
* @count: The number of buffers to allocate
*
* This function allocates a number of Rx buffers from the reuse queue
* or fill ring and places them on the Rx ring.
*
* Returns true for a successful allocation, false otherwise
**/
bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count)
{
return __i40e_alloc_rx_buffers_zc(rx_ring, count,
i40e_alloc_buffer_slow_zc);
}
/**
* i40e_alloc_rx_buffers_fast_zc - Allocates a number of Rx buffers
* @rx_ring: Rx ring
* @count: The number of buffers to allocate
*
* This function allocates a number of Rx buffers from the fill ring
* or the internal recycle mechanism and places them on the Rx ring.
*
* Returns true for a successful allocation, false otherwise
**/
static bool i40e_alloc_rx_buffers_fast_zc(struct i40e_ring *rx_ring, u16 count)
{
return __i40e_alloc_rx_buffers_zc(rx_ring, count,
i40e_alloc_buffer_zc);
}
/**
* i40e_get_rx_buffer_zc - Return the current Rx buffer
* @rx_ring: Rx ring
* @size: The size of the rx buffer (read from descriptor)
*
* This function returns the current, received Rx buffer, and also
* does DMA synchronization. the Rx ring.
*
* Returns the received Rx buffer
**/
static struct i40e_rx_buffer *i40e_get_rx_buffer_zc(struct i40e_ring *rx_ring,
const unsigned int size)
{
struct i40e_rx_buffer *bi;
bi = &rx_ring->rx_bi[rx_ring->next_to_clean];
/* we are reusing so sync this buffer for CPU use */
dma_sync_single_range_for_cpu(rx_ring->dev,
bi->dma, 0,
size,
DMA_BIDIRECTIONAL);
return bi;
}
/**
* i40e_reuse_rx_buffer_zc - Recycle an Rx buffer
* @rx_ring: Rx ring
* @old_bi: The Rx buffer to recycle
*
* This function recycles a finished Rx buffer, and places it on the
* recycle queue (next_to_alloc).
**/
static void i40e_reuse_rx_buffer_zc(struct i40e_ring *rx_ring,
struct i40e_rx_buffer *old_bi)
{
struct i40e_rx_buffer *new_bi = &rx_ring->rx_bi[rx_ring->next_to_alloc];
u16 nta = rx_ring->next_to_alloc;
/* update, and store next to alloc */
nta++;
rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
/* transfer page from old buffer to new buffer */
new_bi->dma = old_bi->dma;
new_bi->addr = old_bi->addr;
new_bi->handle = old_bi->handle;
old_bi->addr = NULL;
}
/**
* i40e_zca_free - Free callback for MEM_TYPE_ZERO_COPY allocations
* @alloc: Zero-copy allocator
* @handle: Buffer handle
**/
void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle)
{
struct i40e_rx_buffer *bi;
struct i40e_ring *rx_ring;
u64 hr, mask;
u16 nta;
rx_ring = container_of(alloc, struct i40e_ring, zca);
hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM;
mask = rx_ring->xsk_umem->chunk_mask;
nta = rx_ring->next_to_alloc;
bi = &rx_ring->rx_bi[nta];
nta++;
rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
handle &= mask;
bi->dma = xdp_umem_get_dma(rx_ring->xsk_umem, handle);
bi->dma += hr;
bi->addr = xdp_umem_get_data(rx_ring->xsk_umem, handle);
bi->addr += hr;
bi->handle = xsk_umem_adjust_offset(rx_ring->xsk_umem, (u64)handle,
rx_ring->xsk_umem->headroom);
}
/**
* i40e_construct_skb_zc - Create skbufff from zero-copy Rx buffer
* @rx_ring: Rx ring
* @bi: Rx buffer
* @xdp: xdp_buff
*
* This functions allocates a new skb from a zero-copy Rx buffer.
@ -484,7 +233,6 @@ void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle)
* Returns the skb, or NULL on failure.
**/
static struct sk_buff *i40e_construct_skb_zc(struct i40e_ring *rx_ring,
struct i40e_rx_buffer *bi,
struct xdp_buff *xdp)
{
unsigned int metasize = xdp->data - xdp->data_meta;
@ -503,23 +251,10 @@ static struct sk_buff *i40e_construct_skb_zc(struct i40e_ring *rx_ring,
if (metasize)
skb_metadata_set(skb, metasize);
i40e_reuse_rx_buffer_zc(rx_ring, bi);
xsk_buff_free(xdp);
return skb;
}
/**
* i40e_inc_ntc: Advance the next_to_clean index
* @rx_ring: Rx ring
**/
static void i40e_inc_ntc(struct i40e_ring *rx_ring)
{
u32 ntc = rx_ring->next_to_clean + 1;
ntc = (ntc < rx_ring->count) ? ntc : 0;
rx_ring->next_to_clean = ntc;
prefetch(I40E_RX_DESC(rx_ring, ntc));
}
/**
* i40e_clean_rx_irq_zc - Consumes Rx packets from the hardware ring
* @rx_ring: Rx ring
@ -531,24 +266,19 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
{
unsigned int total_rx_bytes = 0, total_rx_packets = 0;
u16 cleaned_count = I40E_DESC_UNUSED(rx_ring);
struct xdp_umem *umem = rx_ring->xsk_umem;
unsigned int xdp_res, xdp_xmit = 0;
bool failure = false;
struct sk_buff *skb;
struct xdp_buff xdp;
xdp.rxq = &rx_ring->xdp_rxq;
xdp.frame_sz = xsk_umem_xdp_frame_sz(umem);
while (likely(total_rx_packets < (unsigned int)budget)) {
struct i40e_rx_buffer *bi;
union i40e_rx_desc *rx_desc;
struct xdp_buff **bi;
unsigned int size;
u64 qword;
if (cleaned_count >= I40E_RX_BUFFER_WRITE) {
failure = failure ||
!i40e_alloc_rx_buffers_fast_zc(rx_ring,
!i40e_alloc_rx_buffers_zc(rx_ring,
cleaned_count);
cleaned_count = 0;
}
@ -562,35 +292,36 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
*/
dma_rmb();
bi = i40e_clean_programming_status(rx_ring, rx_desc,
if (i40e_rx_is_programming_status(qword)) {
i40e_clean_programming_status(rx_ring,
rx_desc->raw.qword[0],
qword);
if (unlikely(bi)) {
i40e_reuse_rx_buffer_zc(rx_ring, bi);
bi = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
xsk_buff_free(*bi);
*bi = NULL;
cleaned_count++;
i40e_inc_ntc(rx_ring);
continue;
}
bi = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
size = (qword & I40E_RXD_QW1_LENGTH_PBUF_MASK) >>
I40E_RXD_QW1_LENGTH_PBUF_SHIFT;
if (!size)
break;
bi = i40e_get_rx_buffer_zc(rx_ring, size);
xdp.data = bi->addr;
xdp.data_meta = xdp.data;
xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM;
xdp.data_end = xdp.data + size;
xdp.handle = bi->handle;
bi = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
(*bi)->data_end = (*bi)->data + size;
xsk_buff_dma_sync_for_cpu(*bi);
xdp_res = i40e_run_xdp_zc(rx_ring, &xdp);
xdp_res = i40e_run_xdp_zc(rx_ring, *bi);
if (xdp_res) {
if (xdp_res & (I40E_XDP_TX | I40E_XDP_REDIR)) {
if (xdp_res & (I40E_XDP_TX | I40E_XDP_REDIR))
xdp_xmit |= xdp_res;
bi->addr = NULL;
} else {
i40e_reuse_rx_buffer_zc(rx_ring, bi);
}
else
xsk_buff_free(*bi);
*bi = NULL;
total_rx_bytes += size;
total_rx_packets++;
@ -606,7 +337,8 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
* BIT(I40E_RXD_QW1_ERROR_SHIFT). This is due to that
* SBP is *not* set in PRT_SBPVSI (default not set).
*/
skb = i40e_construct_skb_zc(rx_ring, bi, &xdp);
skb = i40e_construct_skb_zc(rx_ring, *bi);
*bi = NULL;
if (!skb) {
rx_ring->rx_stats.alloc_buff_failed++;
break;
@ -664,10 +396,9 @@ static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget)
if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &desc))
break;
dma = xdp_umem_get_dma(xdp_ring->xsk_umem, desc.addr);
dma_sync_single_for_device(xdp_ring->dev, dma, desc.len,
DMA_BIDIRECTIONAL);
dma = xsk_buff_raw_get_dma(xdp_ring->xsk_umem, desc.addr);
xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_umem, dma,
desc.len);
tx_bi = &xdp_ring->tx_bi[xdp_ring->next_to_use];
tx_bi->bytecount = desc.len;
@ -826,13 +557,13 @@ void i40e_xsk_clean_rx_ring(struct i40e_ring *rx_ring)
u16 i;
for (i = 0; i < rx_ring->count; i++) {
struct i40e_rx_buffer *rx_bi = &rx_ring->rx_bi[i];
struct xdp_buff *rx_bi = *i40e_rx_bi(rx_ring, i);
if (!rx_bi->addr)
if (!rx_bi)
continue;
xsk_umem_fq_reuse(rx_ring->xsk_umem, rx_bi->handle);
rx_bi->addr = NULL;
xsk_buff_free(rx_bi);
rx_bi = NULL;
}
}

View File

@ -12,12 +12,13 @@ int i40e_queue_pair_disable(struct i40e_vsi *vsi, int queue_pair);
int i40e_queue_pair_enable(struct i40e_vsi *vsi, int queue_pair);
int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem,
u16 qid);
void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle);
bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 cleaned_count);
int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget);
bool i40e_clean_xdp_tx_irq(struct i40e_vsi *vsi,
struct i40e_ring *tx_ring, int napi_budget);
int i40e_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags);
int i40e_alloc_rx_bi_zc(struct i40e_ring *rx_ring);
void i40e_clear_rx_bi_zc(struct i40e_ring *rx_ring);
#endif /* _I40E_XSK_H_ */

View File

@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019, Intel Corporation. */
#include <net/xdp_sock_drv.h>
#include "ice_base.h"
#include "ice_dcb_lib.h"
@ -308,24 +309,23 @@ int ice_setup_rx_ctx(struct ice_ring *ring)
if (ring->xsk_umem) {
xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq);
ring->rx_buf_len = ring->xsk_umem->chunk_size_nohr -
XDP_PACKET_HEADROOM;
ring->rx_buf_len =
xsk_umem_get_rx_frame_size(ring->xsk_umem);
/* For AF_XDP ZC, we disallow packets to span on
* multiple buffers, thus letting us skip that
* handling in the fast-path.
*/
chain_len = 1;
ring->zca.free = ice_zca_free;
err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
MEM_TYPE_ZERO_COPY,
&ring->zca);
MEM_TYPE_XSK_BUFF_POOL,
NULL);
if (err)
return err;
xsk_buff_set_rxq_info(ring->xsk_umem, &ring->xdp_rxq);
dev_info(ice_pf_to_dev(vsi->back), "Registered XDP mem model MEM_TYPE_ZERO_COPY on Rx ring %d\n",
dev_info(ice_pf_to_dev(vsi->back), "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n",
ring->q_index);
} else {
ring->zca.free = NULL;
if (!xdp_rxq_info_is_reg(&ring->xdp_rxq))
/* coverity[check_return] */
xdp_rxq_info_reg(&ring->xdp_rxq,
@ -426,7 +426,7 @@ int ice_setup_rx_ctx(struct ice_ring *ring)
writel(0, ring->tail);
err = ring->xsk_umem ?
ice_alloc_rx_bufs_slow_zc(ring, ICE_DESC_UNUSED(ring)) :
ice_alloc_rx_bufs_zc(ring, ICE_DESC_UNUSED(ring)) :
ice_alloc_rx_bufs(ring, ICE_DESC_UNUSED(ring));
if (err)
dev_info(ice_pf_to_dev(vsi->back), "Failed allocate some buffers on %sRx ring %d (pf_q %d)\n",

View File

@ -158,17 +158,16 @@ struct ice_tx_offload_params {
};
struct ice_rx_buf {
struct sk_buff *skb;
dma_addr_t dma;
union {
struct {
struct sk_buff *skb;
dma_addr_t dma;
struct page *page;
unsigned int page_offset;
u16 pagecnt_bias;
};
struct {
void *addr;
u64 handle;
struct xdp_buff *xdp;
};
};
};
@ -292,7 +291,6 @@ struct ice_ring {
struct rcu_head rcu; /* to avoid race on free */
struct bpf_prog *xdp_prog;
struct xdp_umem *xsk_umem;
struct zero_copy_allocator zca;
/* CL3 - 3rd cacheline starts here */
struct xdp_rxq_info xdp_rxq;
/* CLX - the below items are only accessed infrequently and should be

View File

@ -2,7 +2,7 @@
/* Copyright (c) 2019, Intel Corporation. */
#include <linux/bpf_trace.h>
#include <net/xdp_sock.h>
#include <net/xdp_sock_drv.h>
#include <net/xdp.h>
#include "ice.h"
#include "ice_base.h"
@ -279,28 +279,6 @@ static int ice_xsk_alloc_umems(struct ice_vsi *vsi)
return 0;
}
/**
* ice_xsk_add_umem - add a UMEM region for XDP sockets
* @vsi: VSI to which the UMEM will be added
* @umem: pointer to a requested UMEM region
* @qid: queue ID
*
* Returns 0 on success, negative on error
*/
static int ice_xsk_add_umem(struct ice_vsi *vsi, struct xdp_umem *umem, u16 qid)
{
int err;
err = ice_xsk_alloc_umems(vsi);
if (err)
return err;
vsi->xsk_umems[qid] = umem;
vsi->num_xsk_umems_used++;
return 0;
}
/**
* ice_xsk_remove_umem - Remove an UMEM for a certain ring/qid
* @vsi: VSI from which the VSI will be removed
@ -318,65 +296,6 @@ static void ice_xsk_remove_umem(struct ice_vsi *vsi, u16 qid)
}
}
/**
* ice_xsk_umem_dma_map - DMA map UMEM region for XDP sockets
* @vsi: VSI to map the UMEM region
* @umem: UMEM to map
*
* Returns 0 on success, negative on error
*/
static int ice_xsk_umem_dma_map(struct ice_vsi *vsi, struct xdp_umem *umem)
{
struct ice_pf *pf = vsi->back;
struct device *dev;
unsigned int i;
dev = ice_pf_to_dev(pf);
for (i = 0; i < umem->npgs; i++) {
dma_addr_t dma = dma_map_page_attrs(dev, umem->pgs[i], 0,
PAGE_SIZE,
DMA_BIDIRECTIONAL,
ICE_RX_DMA_ATTR);
if (dma_mapping_error(dev, dma)) {
dev_dbg(dev, "XSK UMEM DMA mapping error on page num %d\n",
i);
goto out_unmap;
}
umem->pages[i].dma = dma;
}
return 0;
out_unmap:
for (; i > 0; i--) {
dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE,
DMA_BIDIRECTIONAL, ICE_RX_DMA_ATTR);
umem->pages[i].dma = 0;
}
return -EFAULT;
}
/**
* ice_xsk_umem_dma_unmap - DMA unmap UMEM region for XDP sockets
* @vsi: VSI from which the UMEM will be unmapped
* @umem: UMEM to unmap
*/
static void ice_xsk_umem_dma_unmap(struct ice_vsi *vsi, struct xdp_umem *umem)
{
struct ice_pf *pf = vsi->back;
struct device *dev;
unsigned int i;
dev = ice_pf_to_dev(pf);
for (i = 0; i < umem->npgs; i++) {
dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE,
DMA_BIDIRECTIONAL, ICE_RX_DMA_ATTR);
umem->pages[i].dma = 0;
}
}
/**
* ice_xsk_umem_disable - disable a UMEM region
@ -391,7 +310,7 @@ static int ice_xsk_umem_disable(struct ice_vsi *vsi, u16 qid)
!vsi->xsk_umems[qid])
return -EINVAL;
ice_xsk_umem_dma_unmap(vsi, vsi->xsk_umems[qid]);
xsk_buff_dma_unmap(vsi->xsk_umems[qid], ICE_RX_DMA_ATTR);
ice_xsk_remove_umem(vsi, qid);
return 0;
@ -408,7 +327,6 @@ static int ice_xsk_umem_disable(struct ice_vsi *vsi, u16 qid)
static int
ice_xsk_umem_enable(struct ice_vsi *vsi, struct xdp_umem *umem, u16 qid)
{
struct xdp_umem_fq_reuse *reuseq;
int err;
if (vsi->type != ICE_VSI_PF)
@ -419,20 +337,18 @@ ice_xsk_umem_enable(struct ice_vsi *vsi, struct xdp_umem *umem, u16 qid)
if (qid >= vsi->num_xsk_umems)
return -EINVAL;
if (vsi->xsk_umems && vsi->xsk_umems[qid])
return -EBUSY;
reuseq = xsk_reuseq_prepare(vsi->rx_rings[0]->count);
if (!reuseq)
return -ENOMEM;
xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq));
err = ice_xsk_umem_dma_map(vsi, umem);
err = ice_xsk_alloc_umems(vsi);
if (err)
return err;
err = ice_xsk_add_umem(vsi, umem, qid);
if (vsi->xsk_umems && vsi->xsk_umems[qid])
return -EBUSY;
vsi->xsk_umems[qid] = umem;
vsi->num_xsk_umems_used++;
err = xsk_buff_dma_map(vsi->xsk_umems[qid], ice_pf_to_dev(vsi->back),
ICE_RX_DMA_ATTR);
if (err)
return err;
@ -483,138 +399,23 @@ xsk_umem_if_up:
return ret;
}
/**
* ice_zca_free - Callback for MEM_TYPE_ZERO_COPY allocations
* @zca: zero-cpoy allocator
* @handle: Buffer handle
*/
void ice_zca_free(struct zero_copy_allocator *zca, unsigned long handle)
{
struct ice_rx_buf *rx_buf;
struct ice_ring *rx_ring;
struct xdp_umem *umem;
u64 hr, mask;
u16 nta;
rx_ring = container_of(zca, struct ice_ring, zca);
umem = rx_ring->xsk_umem;
hr = umem->headroom + XDP_PACKET_HEADROOM;
mask = umem->chunk_mask;
nta = rx_ring->next_to_alloc;
rx_buf = &rx_ring->rx_buf[nta];
nta++;
rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
handle &= mask;
rx_buf->dma = xdp_umem_get_dma(umem, handle);
rx_buf->dma += hr;
rx_buf->addr = xdp_umem_get_data(umem, handle);
rx_buf->addr += hr;
rx_buf->handle = (u64)handle + umem->headroom;
}
/**
* ice_alloc_buf_fast_zc - Retrieve buffer address from XDP umem
* @rx_ring: ring with an xdp_umem bound to it
* @rx_buf: buffer to which xsk page address will be assigned
*
* This function allocates an Rx buffer in the hot path.
* The buffer can come from fill queue or recycle queue.
*
* Returns true if an assignment was successful, false if not.
*/
static __always_inline bool
ice_alloc_buf_fast_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf)
{
struct xdp_umem *umem = rx_ring->xsk_umem;
void *addr = rx_buf->addr;
u64 handle, hr;
if (addr) {
rx_ring->rx_stats.page_reuse_count++;
return true;
}
if (!xsk_umem_peek_addr(umem, &handle)) {
rx_ring->rx_stats.alloc_page_failed++;
return false;
}
hr = umem->headroom + XDP_PACKET_HEADROOM;
rx_buf->dma = xdp_umem_get_dma(umem, handle);
rx_buf->dma += hr;
rx_buf->addr = xdp_umem_get_data(umem, handle);
rx_buf->addr += hr;
rx_buf->handle = handle + umem->headroom;
xsk_umem_release_addr(umem);
return true;
}
/**
* ice_alloc_buf_slow_zc - Retrieve buffer address from XDP umem
* @rx_ring: ring with an xdp_umem bound to it
* @rx_buf: buffer to which xsk page address will be assigned
*
* This function allocates an Rx buffer in the slow path.
* The buffer can come from fill queue or recycle queue.
*
* Returns true if an assignment was successful, false if not.
*/
static __always_inline bool
ice_alloc_buf_slow_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf)
{
struct xdp_umem *umem = rx_ring->xsk_umem;
u64 handle, headroom;
if (!xsk_umem_peek_addr_rq(umem, &handle)) {
rx_ring->rx_stats.alloc_page_failed++;
return false;
}
handle &= umem->chunk_mask;
headroom = umem->headroom + XDP_PACKET_HEADROOM;
rx_buf->dma = xdp_umem_get_dma(umem, handle);
rx_buf->dma += headroom;
rx_buf->addr = xdp_umem_get_data(umem, handle);
rx_buf->addr += headroom;
rx_buf->handle = handle + umem->headroom;
xsk_umem_release_addr_rq(umem);
return true;
}
/**
* ice_alloc_rx_bufs_zc - allocate a number of Rx buffers
* @rx_ring: Rx ring
* @count: The number of buffers to allocate
* @alloc: the function pointer to call for allocation
*
* This function allocates a number of Rx buffers from the fill ring
* or the internal recycle mechanism and places them on the Rx ring.
*
* Returns false if all allocations were successful, true if any fail.
*/
static bool
ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, int count,
bool (*alloc)(struct ice_ring *, struct ice_rx_buf *))
bool ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, u16 count)
{
union ice_32b_rx_flex_desc *rx_desc;
u16 ntu = rx_ring->next_to_use;
struct ice_rx_buf *rx_buf;
bool ret = false;
dma_addr_t dma;
if (!count)
return false;
@ -623,16 +424,14 @@ ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, int count,
rx_buf = &rx_ring->rx_buf[ntu];
do {
if (!alloc(rx_ring, rx_buf)) {
rx_buf->xdp = xsk_buff_alloc(rx_ring->xsk_umem);
if (!rx_buf->xdp) {
ret = true;
break;
}
dma_sync_single_range_for_device(rx_ring->dev, rx_buf->dma, 0,
rx_ring->rx_buf_len,
DMA_BIDIRECTIONAL);
rx_desc->read.pkt_addr = cpu_to_le64(rx_buf->dma);
dma = xsk_buff_xdp_get_dma(rx_buf->xdp);
rx_desc->read.pkt_addr = cpu_to_le64(dma);
rx_desc->wb.status_error0 = 0;
rx_desc++;
@ -652,32 +451,6 @@ ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, int count,
return ret;
}
/**
* ice_alloc_rx_bufs_fast_zc - allocate zero copy bufs in the hot path
* @rx_ring: Rx ring
* @count: number of bufs to allocate
*
* Returns false on success, true on failure.
*/
static bool ice_alloc_rx_bufs_fast_zc(struct ice_ring *rx_ring, u16 count)
{
return ice_alloc_rx_bufs_zc(rx_ring, count,
ice_alloc_buf_fast_zc);
}
/**
* ice_alloc_rx_bufs_slow_zc - allocate zero copy bufs in the slow path
* @rx_ring: Rx ring
* @count: number of bufs to allocate
*
* Returns false on success, true on failure.
*/
bool ice_alloc_rx_bufs_slow_zc(struct ice_ring *rx_ring, u16 count)
{
return ice_alloc_rx_bufs_zc(rx_ring, count,
ice_alloc_buf_slow_zc);
}
/**
* ice_bump_ntc - Bump the next_to_clean counter of an Rx ring
* @rx_ring: Rx ring
@ -691,77 +464,22 @@ static void ice_bump_ntc(struct ice_ring *rx_ring)
prefetch(ICE_RX_DESC(rx_ring, ntc));
}
/**
* ice_get_rx_buf_zc - Fetch the current Rx buffer
* @rx_ring: Rx ring
* @size: size of a buffer
*
* This function returns the current, received Rx buffer and does
* DMA synchronization.
*
* Returns a pointer to the received Rx buffer.
*/
static struct ice_rx_buf *ice_get_rx_buf_zc(struct ice_ring *rx_ring, int size)
{
struct ice_rx_buf *rx_buf;
rx_buf = &rx_ring->rx_buf[rx_ring->next_to_clean];
dma_sync_single_range_for_cpu(rx_ring->dev, rx_buf->dma, 0,
size, DMA_BIDIRECTIONAL);
return rx_buf;
}
/**
* ice_reuse_rx_buf_zc - reuse an Rx buffer
* @rx_ring: Rx ring
* @old_buf: The buffer to recycle
*
* This function recycles a finished Rx buffer, and places it on the recycle
* queue (next_to_alloc).
*/
static void
ice_reuse_rx_buf_zc(struct ice_ring *rx_ring, struct ice_rx_buf *old_buf)
{
unsigned long mask = (unsigned long)rx_ring->xsk_umem->chunk_mask;
u64 hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM;
u16 nta = rx_ring->next_to_alloc;
struct ice_rx_buf *new_buf;
new_buf = &rx_ring->rx_buf[nta++];
rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
new_buf->dma = old_buf->dma & mask;
new_buf->dma += hr;
new_buf->addr = (void *)((unsigned long)old_buf->addr & mask);
new_buf->addr += hr;
new_buf->handle = old_buf->handle & mask;
new_buf->handle += rx_ring->xsk_umem->headroom;
old_buf->addr = NULL;
}
/**
* ice_construct_skb_zc - Create an sk_buff from zero-copy buffer
* @rx_ring: Rx ring
* @rx_buf: zero-copy Rx buffer
* @xdp: XDP buffer
*
* This function allocates a new skb from a zero-copy Rx buffer.
*
* Returns the skb on success, NULL on failure.
*/
static struct sk_buff *
ice_construct_skb_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf,
struct xdp_buff *xdp)
ice_construct_skb_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf)
{
unsigned int metasize = xdp->data - xdp->data_meta;
unsigned int datasize = xdp->data_end - xdp->data;
unsigned int datasize_hard = xdp->data_end -
xdp->data_hard_start;
unsigned int metasize = rx_buf->xdp->data - rx_buf->xdp->data_meta;
unsigned int datasize = rx_buf->xdp->data_end - rx_buf->xdp->data;
unsigned int datasize_hard = rx_buf->xdp->data_end -
rx_buf->xdp->data_hard_start;
struct sk_buff *skb;
skb = __napi_alloc_skb(&rx_ring->q_vector->napi, datasize_hard,
@ -769,13 +487,13 @@ ice_construct_skb_zc(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf,
if (unlikely(!skb))
return NULL;
skb_reserve(skb, xdp->data - xdp->data_hard_start);
memcpy(__skb_put(skb, datasize), xdp->data, datasize);
skb_reserve(skb, rx_buf->xdp->data - rx_buf->xdp->data_hard_start);
memcpy(__skb_put(skb, datasize), rx_buf->xdp->data, datasize);
if (metasize)
skb_metadata_set(skb, metasize);
ice_reuse_rx_buf_zc(rx_ring, rx_buf);
xsk_buff_free(rx_buf->xdp);
rx_buf->xdp = NULL;
return skb;
}
@ -802,7 +520,6 @@ ice_run_xdp_zc(struct ice_ring *rx_ring, struct xdp_buff *xdp)
}
act = bpf_prog_run_xdp(xdp_prog, xdp);
xdp->handle += xdp->data - xdp->data_hard_start;
switch (act) {
case XDP_PASS:
break;
@ -840,13 +557,8 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget)
{
unsigned int total_rx_bytes = 0, total_rx_packets = 0;
u16 cleaned_count = ICE_DESC_UNUSED(rx_ring);
struct xdp_umem *umem = rx_ring->xsk_umem;
unsigned int xdp_xmit = 0;
bool failure = false;
struct xdp_buff xdp;
xdp.rxq = &rx_ring->xdp_rxq;
xdp.frame_sz = xsk_umem_xdp_frame_sz(umem);
while (likely(total_rx_packets < (unsigned int)budget)) {
union ice_32b_rx_flex_desc *rx_desc;
@ -858,7 +570,7 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget)
u8 rx_ptype;
if (cleaned_count >= ICE_RX_BUF_WRITE) {
failure |= ice_alloc_rx_bufs_fast_zc(rx_ring,
failure |= ice_alloc_rx_bufs_zc(rx_ring,
cleaned_count);
cleaned_count = 0;
}
@ -880,25 +592,19 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget)
if (!size)
break;
rx_buf = ice_get_rx_buf_zc(rx_ring, size);
if (!rx_buf->addr)
break;
xdp.data = rx_buf->addr;
xdp.data_meta = xdp.data;
xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM;
xdp.data_end = xdp.data + size;
xdp.handle = rx_buf->handle;
rx_buf = &rx_ring->rx_buf[rx_ring->next_to_clean];
rx_buf->xdp->data_end = rx_buf->xdp->data + size;
xsk_buff_dma_sync_for_cpu(rx_buf->xdp);
xdp_res = ice_run_xdp_zc(rx_ring, &xdp);
xdp_res = ice_run_xdp_zc(rx_ring, rx_buf->xdp);
if (xdp_res) {
if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR)) {
if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR))
xdp_xmit |= xdp_res;
rx_buf->addr = NULL;
} else {
ice_reuse_rx_buf_zc(rx_ring, rx_buf);
}
else
xsk_buff_free(rx_buf->xdp);
rx_buf->xdp = NULL;
total_rx_bytes += size;
total_rx_packets++;
cleaned_count++;
@ -908,7 +614,7 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget)
}
/* XDP_PASS path */
skb = ice_construct_skb_zc(rx_ring, rx_buf, &xdp);
skb = ice_construct_skb_zc(rx_ring, rx_buf);
if (!skb) {
rx_ring->rx_stats.alloc_buf_failed++;
break;
@ -979,10 +685,9 @@ static bool ice_xmit_zc(struct ice_ring *xdp_ring, int budget)
if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &desc))
break;
dma = xdp_umem_get_dma(xdp_ring->xsk_umem, desc.addr);
dma_sync_single_for_device(xdp_ring->dev, dma, desc.len,
DMA_BIDIRECTIONAL);
dma = xsk_buff_raw_get_dma(xdp_ring->xsk_umem, desc.addr);
xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_umem, dma,
desc.len);
tx_buf->bytecount = desc.len;
@ -1165,11 +870,10 @@ void ice_xsk_clean_rx_ring(struct ice_ring *rx_ring)
for (i = 0; i < rx_ring->count; i++) {
struct ice_rx_buf *rx_buf = &rx_ring->rx_buf[i];
if (!rx_buf->addr)
if (!rx_buf->xdp)
continue;
xsk_umem_fq_reuse(rx_ring->xsk_umem, rx_buf->handle);
rx_buf->addr = NULL;
rx_buf->xdp = NULL;
}
}

View File

@ -10,11 +10,10 @@ struct ice_vsi;
#ifdef CONFIG_XDP_SOCKETS
int ice_xsk_umem_setup(struct ice_vsi *vsi, struct xdp_umem *umem, u16 qid);
void ice_zca_free(struct zero_copy_allocator *zca, unsigned long handle);
int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget);
bool ice_clean_tx_irq_zc(struct ice_ring *xdp_ring, int budget);
int ice_xsk_wakeup(struct net_device *netdev, u32 queue_id, u32 flags);
bool ice_alloc_rx_bufs_slow_zc(struct ice_ring *rx_ring, u16 count);
bool ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, u16 count);
bool ice_xsk_any_rx_ring_ena(struct ice_vsi *vsi);
void ice_xsk_clean_rx_ring(struct ice_ring *rx_ring);
void ice_xsk_clean_xdp_ring(struct ice_ring *xdp_ring);
@ -27,12 +26,6 @@ ice_xsk_umem_setup(struct ice_vsi __always_unused *vsi,
return -EOPNOTSUPP;
}
static inline void
ice_zca_free(struct zero_copy_allocator __always_unused *zca,
unsigned long __always_unused handle)
{
}
static inline int
ice_clean_rx_irq_zc(struct ice_ring __always_unused *rx_ring,
int __always_unused budget)
@ -48,7 +41,7 @@ ice_clean_tx_irq_zc(struct ice_ring __always_unused *xdp_ring,
}
static inline bool
ice_alloc_rx_bufs_slow_zc(struct ice_ring __always_unused *rx_ring,
ice_alloc_rx_bufs_zc(struct ice_ring __always_unused *rx_ring,
u16 __always_unused count)
{
return false;

View File

@ -224,17 +224,17 @@ struct ixgbe_tx_buffer {
};
struct ixgbe_rx_buffer {
struct sk_buff *skb;
dma_addr_t dma;
union {
struct {
struct sk_buff *skb;
dma_addr_t dma;
struct page *page;
__u32 page_offset;
__u16 pagecnt_bias;
};
struct {
void *addr;
u64 handle;
bool discard;
struct xdp_buff *xdp;
};
};
};
@ -351,7 +351,6 @@ struct ixgbe_ring {
};
struct xdp_rxq_info xdp_rxq;
struct xdp_umem *xsk_umem;
struct zero_copy_allocator zca; /* ZC allocator anchor */
u16 ring_idx; /* {rx,tx,xdp}_ring back reference idx */
u16 rx_buf_len;
} ____cacheline_internodealigned_in_smp;

View File

@ -35,7 +35,7 @@
#include <net/tc_act/tc_mirred.h>
#include <net/vxlan.h>
#include <net/mpls.h>
#include <net/xdp_sock.h>
#include <net/xdp_sock_drv.h>
#include <net/xfrm.h>
#include "ixgbe.h"
@ -3745,8 +3745,7 @@ static void ixgbe_configure_srrctl(struct ixgbe_adapter *adapter,
/* configure the packet buffer length */
if (rx_ring->xsk_umem) {
u32 xsk_buf_len = rx_ring->xsk_umem->chunk_size_nohr -
XDP_PACKET_HEADROOM;
u32 xsk_buf_len = xsk_umem_get_rx_frame_size(rx_ring->xsk_umem);
/* If the MAC support setting RXDCTL.RLPML, the
* SRRCTL[n].BSIZEPKT is set to PAGE_SIZE and
@ -4093,11 +4092,10 @@ void ixgbe_configure_rx_ring(struct ixgbe_adapter *adapter,
xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq);
ring->xsk_umem = ixgbe_xsk_umem(adapter, ring);
if (ring->xsk_umem) {
ring->zca.free = ixgbe_zca_free;
WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
MEM_TYPE_ZERO_COPY,
&ring->zca));
MEM_TYPE_XSK_BUFF_POOL,
NULL));
xsk_buff_set_rxq_info(ring->xsk_umem, &ring->xdp_rxq);
} else {
WARN_ON(xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
MEM_TYPE_PAGE_SHARED, NULL));
@ -4153,8 +4151,7 @@ void ixgbe_configure_rx_ring(struct ixgbe_adapter *adapter,
}
if (ring->xsk_umem && hw->mac.type != ixgbe_mac_82599EB) {
u32 xsk_buf_len = ring->xsk_umem->chunk_size_nohr -
XDP_PACKET_HEADROOM;
u32 xsk_buf_len = xsk_umem_get_rx_frame_size(ring->xsk_umem);
rxdctl &= ~(IXGBE_RXDCTL_RLPMLMASK |
IXGBE_RXDCTL_RLPML_EN);

View File

@ -35,7 +35,7 @@ int ixgbe_xsk_umem_setup(struct ixgbe_adapter *adapter, struct xdp_umem *umem,
void ixgbe_zca_free(struct zero_copy_allocator *alloc, unsigned long handle);
void ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count);
bool ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count);
int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
struct ixgbe_ring *rx_ring,
const int budget);

View File

@ -2,7 +2,7 @@
/* Copyright(c) 2018 Intel Corporation. */
#include <linux/bpf_trace.h>
#include <net/xdp_sock.h>
#include <net/xdp_sock_drv.h>
#include <net/xdp.h>
#include "ixgbe.h"
@ -20,54 +20,11 @@ struct xdp_umem *ixgbe_xsk_umem(struct ixgbe_adapter *adapter,
return xdp_get_umem_from_qid(adapter->netdev, qid);
}
static int ixgbe_xsk_umem_dma_map(struct ixgbe_adapter *adapter,
struct xdp_umem *umem)
{
struct device *dev = &adapter->pdev->dev;
unsigned int i, j;
dma_addr_t dma;
for (i = 0; i < umem->npgs; i++) {
dma = dma_map_page_attrs(dev, umem->pgs[i], 0, PAGE_SIZE,
DMA_BIDIRECTIONAL, IXGBE_RX_DMA_ATTR);
if (dma_mapping_error(dev, dma))
goto out_unmap;
umem->pages[i].dma = dma;
}
return 0;
out_unmap:
for (j = 0; j < i; j++) {
dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE,
DMA_BIDIRECTIONAL, IXGBE_RX_DMA_ATTR);
umem->pages[i].dma = 0;
}
return -1;
}
static void ixgbe_xsk_umem_dma_unmap(struct ixgbe_adapter *adapter,
struct xdp_umem *umem)
{
struct device *dev = &adapter->pdev->dev;
unsigned int i;
for (i = 0; i < umem->npgs; i++) {
dma_unmap_page_attrs(dev, umem->pages[i].dma, PAGE_SIZE,
DMA_BIDIRECTIONAL, IXGBE_RX_DMA_ATTR);
umem->pages[i].dma = 0;
}
}
static int ixgbe_xsk_umem_enable(struct ixgbe_adapter *adapter,
struct xdp_umem *umem,
u16 qid)
{
struct net_device *netdev = adapter->netdev;
struct xdp_umem_fq_reuse *reuseq;
bool if_running;
int err;
@ -78,13 +35,7 @@ static int ixgbe_xsk_umem_enable(struct ixgbe_adapter *adapter,
qid >= netdev->real_num_tx_queues)
return -EINVAL;
reuseq = xsk_reuseq_prepare(adapter->rx_ring[0]->count);
if (!reuseq)
return -ENOMEM;
xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq));
err = ixgbe_xsk_umem_dma_map(adapter, umem);
err = xsk_buff_dma_map(umem, &adapter->pdev->dev, IXGBE_RX_DMA_ATTR);
if (err)
return err;
@ -124,7 +75,7 @@ static int ixgbe_xsk_umem_disable(struct ixgbe_adapter *adapter, u16 qid)
ixgbe_txrx_ring_disable(adapter, qid);
clear_bit(qid, adapter->af_xdp_zc_qps);
ixgbe_xsk_umem_dma_unmap(adapter, umem);
xsk_buff_dma_unmap(umem, IXGBE_RX_DMA_ATTR);
if (if_running)
ixgbe_txrx_ring_enable(adapter, qid);
@ -143,19 +94,14 @@ static int ixgbe_run_xdp_zc(struct ixgbe_adapter *adapter,
struct ixgbe_ring *rx_ring,
struct xdp_buff *xdp)
{
struct xdp_umem *umem = rx_ring->xsk_umem;
int err, result = IXGBE_XDP_PASS;
struct bpf_prog *xdp_prog;
struct xdp_frame *xdpf;
u64 offset;
u32 act;
rcu_read_lock();
xdp_prog = READ_ONCE(rx_ring->xdp_prog);
act = bpf_prog_run_xdp(xdp_prog, xdp);
offset = xdp->data - xdp->data_hard_start;
xdp->handle = xsk_umem_adjust_offset(umem, xdp->handle, offset);
switch (act) {
case XDP_PASS:
@ -186,140 +132,16 @@ static int ixgbe_run_xdp_zc(struct ixgbe_adapter *adapter,
return result;
}
static struct
ixgbe_rx_buffer *ixgbe_get_rx_buffer_zc(struct ixgbe_ring *rx_ring,
unsigned int size)
{
struct ixgbe_rx_buffer *bi;
bi = &rx_ring->rx_buffer_info[rx_ring->next_to_clean];
/* we are reusing so sync this buffer for CPU use */
dma_sync_single_range_for_cpu(rx_ring->dev,
bi->dma, 0,
size,
DMA_BIDIRECTIONAL);
return bi;
}
static void ixgbe_reuse_rx_buffer_zc(struct ixgbe_ring *rx_ring,
struct ixgbe_rx_buffer *obi)
{
u16 nta = rx_ring->next_to_alloc;
struct ixgbe_rx_buffer *nbi;
nbi = &rx_ring->rx_buffer_info[rx_ring->next_to_alloc];
/* update, and store next to alloc */
nta++;
rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
/* transfer page from old buffer to new buffer */
nbi->dma = obi->dma;
nbi->addr = obi->addr;
nbi->handle = obi->handle;
obi->addr = NULL;
obi->skb = NULL;
}
void ixgbe_zca_free(struct zero_copy_allocator *alloc, unsigned long handle)
{
struct ixgbe_rx_buffer *bi;
struct ixgbe_ring *rx_ring;
u64 hr, mask;
u16 nta;
rx_ring = container_of(alloc, struct ixgbe_ring, zca);
hr = rx_ring->xsk_umem->headroom + XDP_PACKET_HEADROOM;
mask = rx_ring->xsk_umem->chunk_mask;
nta = rx_ring->next_to_alloc;
bi = rx_ring->rx_buffer_info;
nta++;
rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
handle &= mask;
bi->dma = xdp_umem_get_dma(rx_ring->xsk_umem, handle);
bi->dma += hr;
bi->addr = xdp_umem_get_data(rx_ring->xsk_umem, handle);
bi->addr += hr;
bi->handle = xsk_umem_adjust_offset(rx_ring->xsk_umem, (u64)handle,
rx_ring->xsk_umem->headroom);
}
static bool ixgbe_alloc_buffer_zc(struct ixgbe_ring *rx_ring,
struct ixgbe_rx_buffer *bi)
{
struct xdp_umem *umem = rx_ring->xsk_umem;
void *addr = bi->addr;
u64 handle, hr;
if (addr)
return true;
if (!xsk_umem_peek_addr(umem, &handle)) {
rx_ring->rx_stats.alloc_rx_page_failed++;
return false;
}
hr = umem->headroom + XDP_PACKET_HEADROOM;
bi->dma = xdp_umem_get_dma(umem, handle);
bi->dma += hr;
bi->addr = xdp_umem_get_data(umem, handle);
bi->addr += hr;
bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
xsk_umem_release_addr(umem);
return true;
}
static bool ixgbe_alloc_buffer_slow_zc(struct ixgbe_ring *rx_ring,
struct ixgbe_rx_buffer *bi)
{
struct xdp_umem *umem = rx_ring->xsk_umem;
u64 handle, hr;
if (!xsk_umem_peek_addr_rq(umem, &handle)) {
rx_ring->rx_stats.alloc_rx_page_failed++;
return false;
}
handle &= rx_ring->xsk_umem->chunk_mask;
hr = umem->headroom + XDP_PACKET_HEADROOM;
bi->dma = xdp_umem_get_dma(umem, handle);
bi->dma += hr;
bi->addr = xdp_umem_get_data(umem, handle);
bi->addr += hr;
bi->handle = xsk_umem_adjust_offset(umem, handle, umem->headroom);
xsk_umem_release_addr_rq(umem);
return true;
}
static __always_inline bool
__ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count,
bool alloc(struct ixgbe_ring *rx_ring,
struct ixgbe_rx_buffer *bi))
bool ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 count)
{
union ixgbe_adv_rx_desc *rx_desc;
struct ixgbe_rx_buffer *bi;
u16 i = rx_ring->next_to_use;
dma_addr_t dma;
bool ok = true;
/* nothing to do */
if (!cleaned_count)
if (!count)
return true;
rx_desc = IXGBE_RX_DESC(rx_ring, i);
@ -327,21 +149,18 @@ __ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count,
i -= rx_ring->count;
do {
if (!alloc(rx_ring, bi)) {
bi->xdp = xsk_buff_alloc(rx_ring->xsk_umem);
if (!bi->xdp) {
ok = false;
break;
}
/* sync the buffer for use by the device */
dma_sync_single_range_for_device(rx_ring->dev, bi->dma,
bi->page_offset,
rx_ring->rx_buf_len,
DMA_BIDIRECTIONAL);
dma = xsk_buff_xdp_get_dma(bi->xdp);
/* Refresh the desc even if buffer_addrs didn't change
* because each write-back erases this info.
*/
rx_desc->read.pkt_addr = cpu_to_le64(bi->dma);
rx_desc->read.pkt_addr = cpu_to_le64(dma);
rx_desc++;
bi++;
@ -355,17 +174,14 @@ __ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count,
/* clear the length for the next_to_use descriptor */
rx_desc->wb.upper.length = 0;
cleaned_count--;
} while (cleaned_count);
count--;
} while (count);
i += rx_ring->count;
if (rx_ring->next_to_use != i) {
rx_ring->next_to_use = i;
/* update next to alloc since we have filled the ring */
rx_ring->next_to_alloc = i;
/* Force memory writes to complete before letting h/w
* know there are new descriptors to fetch. (Only
* applicable for weak-ordered memory model archs,
@ -378,40 +194,27 @@ __ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 cleaned_count,
return ok;
}
void ixgbe_alloc_rx_buffers_zc(struct ixgbe_ring *rx_ring, u16 count)
{
__ixgbe_alloc_rx_buffers_zc(rx_ring, count,
ixgbe_alloc_buffer_slow_zc);
}
static bool ixgbe_alloc_rx_buffers_fast_zc(struct ixgbe_ring *rx_ring,
u16 count)
{
return __ixgbe_alloc_rx_buffers_zc(rx_ring, count,
ixgbe_alloc_buffer_zc);
}
static struct sk_buff *ixgbe_construct_skb_zc(struct ixgbe_ring *rx_ring,
struct ixgbe_rx_buffer *bi,
struct xdp_buff *xdp)
struct ixgbe_rx_buffer *bi)
{
unsigned int metasize = xdp->data - xdp->data_meta;
unsigned int datasize = xdp->data_end - xdp->data;
unsigned int metasize = bi->xdp->data - bi->xdp->data_meta;
unsigned int datasize = bi->xdp->data_end - bi->xdp->data;
struct sk_buff *skb;
/* allocate a skb to store the frags */
skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
xdp->data_end - xdp->data_hard_start,
bi->xdp->data_end - bi->xdp->data_hard_start,
GFP_ATOMIC | __GFP_NOWARN);
if (unlikely(!skb))
return NULL;
skb_reserve(skb, xdp->data - xdp->data_hard_start);
memcpy(__skb_put(skb, datasize), xdp->data, datasize);
skb_reserve(skb, bi->xdp->data - bi->xdp->data_hard_start);
memcpy(__skb_put(skb, datasize), bi->xdp->data, datasize);
if (metasize)
skb_metadata_set(skb, metasize);
ixgbe_reuse_rx_buffer_zc(rx_ring, bi);
xsk_buff_free(bi->xdp);
bi->xdp = NULL;
return skb;
}
@ -431,14 +234,9 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
unsigned int total_rx_bytes = 0, total_rx_packets = 0;
struct ixgbe_adapter *adapter = q_vector->adapter;
u16 cleaned_count = ixgbe_desc_unused(rx_ring);
struct xdp_umem *umem = rx_ring->xsk_umem;
unsigned int xdp_res, xdp_xmit = 0;
bool failure = false;
struct sk_buff *skb;
struct xdp_buff xdp;
xdp.rxq = &rx_ring->xdp_rxq;
xdp.frame_sz = xsk_umem_xdp_frame_sz(umem);
while (likely(total_rx_packets < budget)) {
union ixgbe_adv_rx_desc *rx_desc;
@ -448,7 +246,7 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
/* return some buffers to hardware, one at a time is too slow */
if (cleaned_count >= IXGBE_RX_BUFFER_WRITE) {
failure = failure ||
!ixgbe_alloc_rx_buffers_fast_zc(rx_ring,
!ixgbe_alloc_rx_buffers_zc(rx_ring,
cleaned_count);
cleaned_count = 0;
}
@ -464,42 +262,40 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
*/
dma_rmb();
bi = ixgbe_get_rx_buffer_zc(rx_ring, size);
bi = &rx_ring->rx_buffer_info[rx_ring->next_to_clean];
if (unlikely(!ixgbe_test_staterr(rx_desc,
IXGBE_RXD_STAT_EOP))) {
struct ixgbe_rx_buffer *next_bi;
ixgbe_reuse_rx_buffer_zc(rx_ring, bi);
xsk_buff_free(bi->xdp);
bi->xdp = NULL;
ixgbe_inc_ntc(rx_ring);
next_bi =
&rx_ring->rx_buffer_info[rx_ring->next_to_clean];
next_bi->skb = ERR_PTR(-EINVAL);
next_bi->discard = true;
continue;
}
if (unlikely(bi->skb)) {
ixgbe_reuse_rx_buffer_zc(rx_ring, bi);
if (unlikely(bi->discard)) {
xsk_buff_free(bi->xdp);
bi->xdp = NULL;
bi->discard = false;
ixgbe_inc_ntc(rx_ring);
continue;
}
xdp.data = bi->addr;
xdp.data_meta = xdp.data;
xdp.data_hard_start = xdp.data - XDP_PACKET_HEADROOM;
xdp.data_end = xdp.data + size;
xdp.handle = bi->handle;
xdp_res = ixgbe_run_xdp_zc(adapter, rx_ring, &xdp);
bi->xdp->data_end = bi->xdp->data + size;
xsk_buff_dma_sync_for_cpu(bi->xdp);
xdp_res = ixgbe_run_xdp_zc(adapter, rx_ring, bi->xdp);
if (xdp_res) {
if (xdp_res & (IXGBE_XDP_TX | IXGBE_XDP_REDIR)) {
if (xdp_res & (IXGBE_XDP_TX | IXGBE_XDP_REDIR))
xdp_xmit |= xdp_res;
bi->addr = NULL;
bi->skb = NULL;
} else {
ixgbe_reuse_rx_buffer_zc(rx_ring, bi);
}
else
xsk_buff_free(bi->xdp);
bi->xdp = NULL;
total_rx_packets++;
total_rx_bytes += size;
@ -509,7 +305,7 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
}
/* XDP_PASS path */
skb = ixgbe_construct_skb_zc(rx_ring, bi, &xdp);
skb = ixgbe_construct_skb_zc(rx_ring, bi);
if (!skb) {
rx_ring->rx_stats.alloc_rx_buff_failed++;
break;
@ -561,17 +357,17 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
void ixgbe_xsk_clean_rx_ring(struct ixgbe_ring *rx_ring)
{
u16 i = rx_ring->next_to_clean;
struct ixgbe_rx_buffer *bi = &rx_ring->rx_buffer_info[i];
struct ixgbe_rx_buffer *bi;
u16 i;
while (i != rx_ring->next_to_alloc) {
xsk_umem_fq_reuse(rx_ring->xsk_umem, bi->handle);
i++;
bi++;
if (i == rx_ring->count) {
i = 0;
bi = rx_ring->rx_buffer_info;
}
for (i = 0; i < rx_ring->count; i++) {
bi = &rx_ring->rx_buffer_info[i];
if (!bi->xdp)
continue;
xsk_buff_free(bi->xdp);
bi->xdp = NULL;
}
}
@ -594,10 +390,9 @@ static bool ixgbe_xmit_zc(struct ixgbe_ring *xdp_ring, unsigned int budget)
if (!xsk_umem_consume_tx(xdp_ring->xsk_umem, &desc))
break;
dma = xdp_umem_get_dma(xdp_ring->xsk_umem, desc.addr);
dma_sync_single_for_device(xdp_ring->dev, dma, desc.len,
DMA_BIDIRECTIONAL);
dma = xsk_buff_raw_get_dma(xdp_ring->xsk_umem, desc.addr);
xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_umem, dma,
desc.len);
tx_bi = &xdp_ring->tx_buffer_info[xdp_ring->next_to_use];
tx_bi->bytecount = desc.len;
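
To make the conversion above easier to follow, here is a condensed, hedged sketch of the new per-descriptor fill step. The function name is hypothetical; the structures (rx_ring->xsk_umem, bi->xdp, the advanced RX descriptor) follow the ixgbe code in this hunk, and the xsk_buff_* helpers come from the new <net/xdp_sock_drv.h> introduced later in this pull request.

/* Hypothetical helper, sketching what ixgbe_alloc_rx_buffers_zc() now does
 * per descriptor. The old code popped a raw address with
 * xsk_umem_peek_addr_rq(), added headroom by hand via xdp_umem_get_dma()/
 * xdp_umem_get_data() and maintained a reuse ring; all of that collapses
 * into one xsk_buff_alloc()/xsk_buff_free() pair managed by the pool.
 */
static bool ixgbe_fill_one_zc(struct ixgbe_ring *rx_ring,
			      struct ixgbe_rx_buffer *bi,
			      union ixgbe_adv_rx_desc *rx_desc)
{
	bi->xdp = xsk_buff_alloc(rx_ring->xsk_umem);
	if (!bi->xdp)
		return false;	/* fill queue empty, retry on the next pass */

	/* The pool tracks the DMA mapping; the CPU-side sync happens later,
	 * right before the XDP program runs (xsk_buff_dma_sync_for_cpu()).
	 */
	rx_desc->read.pkt_addr = cpu_to_le64(xsk_buff_xdp_get_dma(bi->xdp));
	return true;
}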

View File

@ -365,10 +365,7 @@ struct mlx5e_dma_info {
dma_addr_t addr;
union {
struct page *page;
struct {
u64 handle;
void *data;
} xsk;
struct xdp_buff *xsk;
};
};
@ -581,7 +578,6 @@ struct mlx5e_rq {
} mpwqe;
};
struct {
u16 umem_headroom;
u16 headroom;
u32 frame0_sz;
u8 map_dir; /* dma map direction */
@ -614,7 +610,6 @@ struct mlx5e_rq {
struct page_pool *page_pool;
/* AF_XDP zero-copy */
struct zero_copy_allocator zca;
struct xdp_umem *umem;
struct work_struct recover_work;

View File

@ -12,15 +12,16 @@ static inline bool mlx5e_rx_is_xdp(struct mlx5e_params *params,
u16 mlx5e_get_linear_rq_headroom(struct mlx5e_params *params,
struct mlx5e_xsk_param *xsk)
{
u16 headroom = NET_IP_ALIGN;
u16 headroom;
if (mlx5e_rx_is_xdp(params, xsk)) {
headroom += XDP_PACKET_HEADROOM;
if (xsk)
headroom += xsk->headroom;
} else {
return xsk->headroom;
headroom = NET_IP_ALIGN;
if (mlx5e_rx_is_xdp(params, xsk))
headroom += XDP_PACKET_HEADROOM;
else
headroom += MLX5_RX_HEADROOM;
}
return headroom;
}

View File

@ -31,7 +31,7 @@
*/
#include <linux/bpf_trace.h>
#include <net/xdp_sock.h>
#include <net/xdp_sock_drv.h>
#include "en/xdp.h"
#include "en/params.h"
@ -71,7 +71,7 @@ mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq,
xdptxd.data = xdpf->data;
xdptxd.len = xdpf->len;
if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) {
if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) {
/* The xdp_buff was in the UMEM and was copied into a newly
* allocated page. The UMEM page was returned via the ZCA, and
* this new page has to be mapped at this point and has to be
@ -119,50 +119,33 @@ mlx5e_xmit_xdp_buff(struct mlx5e_xdpsq *sq, struct mlx5e_rq *rq,
/* returns true if packet was consumed by xdp */
bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di,
void *va, u16 *rx_headroom, u32 *len, bool xsk)
u32 *len, struct xdp_buff *xdp)
{
struct bpf_prog *prog = READ_ONCE(rq->xdp_prog);
struct xdp_umem *umem = rq->umem;
struct xdp_buff xdp;
u32 act;
int err;
if (!prog)
return false;
xdp.data = va + *rx_headroom;
xdp_set_data_meta_invalid(&xdp);
xdp.data_end = xdp.data + *len;
xdp.data_hard_start = va;
if (xsk)
xdp.handle = di->xsk.handle;
xdp.rxq = &rq->xdp_rxq;
xdp.frame_sz = rq->buff.frame0_sz;
act = bpf_prog_run_xdp(prog, &xdp);
if (xsk) {
u64 off = xdp.data - xdp.data_hard_start;
xdp.handle = xsk_umem_adjust_offset(umem, xdp.handle, off);
}
act = bpf_prog_run_xdp(prog, xdp);
switch (act) {
case XDP_PASS:
*rx_headroom = xdp.data - xdp.data_hard_start;
*len = xdp.data_end - xdp.data;
*len = xdp->data_end - xdp->data;
return false;
case XDP_TX:
if (unlikely(!mlx5e_xmit_xdp_buff(rq->xdpsq, rq, di, &xdp)))
if (unlikely(!mlx5e_xmit_xdp_buff(rq->xdpsq, rq, di, xdp)))
goto xdp_abort;
__set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); /* non-atomic */
return true;
case XDP_REDIRECT:
/* When XDP enabled then page-refcnt==1 here */
err = xdp_do_redirect(rq->netdev, &xdp, prog);
err = xdp_do_redirect(rq->netdev, xdp, prog);
if (unlikely(err))
goto xdp_abort;
__set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags);
__set_bit(MLX5E_RQ_FLAG_XDP_REDIRECT, rq->flags);
if (!xsk)
if (xdp->rxq->mem.type != MEM_TYPE_XSK_BUFF_POOL)
mlx5e_page_dma_unmap(rq, di);
rq->stats->xdp_redirect++;
return true;

View File

@ -61,7 +61,7 @@
struct mlx5e_xsk_param;
int mlx5e_xdp_max_mtu(struct mlx5e_params *params, struct mlx5e_xsk_param *xsk);
bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di,
void *va, u16 *rx_headroom, u32 *len, bool xsk);
u32 *len, struct xdp_buff *xdp);
void mlx5e_xdp_mpwqe_complete(struct mlx5e_xdpsq *sq);
bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq);
void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq);

View File

@ -3,71 +3,10 @@
#include "rx.h"
#include "en/xdp.h"
#include <net/xdp_sock.h>
#include <net/xdp_sock_drv.h>
/* RX data path */
bool mlx5e_xsk_pages_enough_umem(struct mlx5e_rq *rq, int count)
{
/* Check in advance that we have enough frames, instead of allocating
* one-by-one, failing and moving frames to the Reuse Ring.
*/
return xsk_umem_has_addrs_rq(rq->umem, count);
}
int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq,
struct mlx5e_dma_info *dma_info)
{
struct xdp_umem *umem = rq->umem;
u64 handle;
if (!xsk_umem_peek_addr_rq(umem, &handle))
return -ENOMEM;
dma_info->xsk.handle = xsk_umem_adjust_offset(umem, handle,
rq->buff.umem_headroom);
dma_info->xsk.data = xdp_umem_get_data(umem, dma_info->xsk.handle);
/* No need to add headroom to the DMA address. In striding RQ case, we
* just provide pages for UMR, and headroom is counted at the setup
* stage when creating a WQE. In non-striding RQ case, headroom is
* accounted in mlx5e_alloc_rx_wqe.
*/
dma_info->addr = xdp_umem_get_dma(umem, handle);
xsk_umem_release_addr_rq(umem);
dma_sync_single_for_device(rq->pdev, dma_info->addr, PAGE_SIZE,
DMA_BIDIRECTIONAL);
return 0;
}
static inline void mlx5e_xsk_recycle_frame(struct mlx5e_rq *rq, u64 handle)
{
xsk_umem_fq_reuse(rq->umem, handle & rq->umem->chunk_mask);
}
/* XSKRQ uses pages from UMEM, they must not be released. They are returned to
* the userspace if possible, and if not, this function is called to reuse them
* in the driver.
*/
void mlx5e_xsk_page_release(struct mlx5e_rq *rq,
struct mlx5e_dma_info *dma_info)
{
mlx5e_xsk_recycle_frame(rq, dma_info->xsk.handle);
}
/* Return a frame back to the hardware to fill in again. It is used by XDP when
* the XDP program returns XDP_TX or XDP_REDIRECT not to an XSKMAP.
*/
void mlx5e_xsk_zca_free(struct zero_copy_allocator *zca, unsigned long handle)
{
struct mlx5e_rq *rq = container_of(zca, struct mlx5e_rq, zca);
mlx5e_xsk_recycle_frame(rq, handle);
}
static struct sk_buff *mlx5e_xsk_construct_skb(struct mlx5e_rq *rq, void *data,
u32 cqe_bcnt)
{
@ -90,11 +29,8 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
u32 head_offset,
u32 page_idx)
{
struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx];
u16 rx_headroom = rq->buff.headroom - rq->buff.umem_headroom;
struct xdp_buff *xdp = wi->umr.dma_info[page_idx].xsk;
u32 cqe_bcnt32 = cqe_bcnt;
void *va, *data;
u32 frag_size;
bool consumed;
/* Check packet size. Note LRO doesn't use linear SKB */
@ -103,22 +39,20 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
return NULL;
}
/* head_offset is not used in this function, because di->xsk.data and
* di->addr point directly to the necessary place. Furthermore, in the
* current implementation, UMR pages are mapped to XSK frames, so
/* head_offset is not used in this function, because xdp->data and the
* DMA address point directly to the necessary place. Furthermore, in
* the current implementation, UMR pages are mapped to XSK frames, so
* head_offset should always be 0.
*/
WARN_ON_ONCE(head_offset);
va = di->xsk.data;
data = va + rx_headroom;
frag_size = rq->buff.headroom + cqe_bcnt32;
dma_sync_single_for_cpu(rq->pdev, di->addr, frag_size, DMA_BIDIRECTIONAL);
prefetch(data);
xdp->data_end = xdp->data + cqe_bcnt32;
xdp_set_data_meta_invalid(xdp);
xsk_buff_dma_sync_for_cpu(xdp);
prefetch(xdp->data);
rcu_read_lock();
consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt32, true);
consumed = mlx5e_xdp_handle(rq, NULL, &cqe_bcnt32, xdp);
rcu_read_unlock();
/* Possible flows:
@ -145,7 +79,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
/* XDP_PASS: copy the data from the UMEM to a new SKB and reuse the
* frame. On SKB allocation failure, NULL is returned.
*/
return mlx5e_xsk_construct_skb(rq, data, cqe_bcnt32);
return mlx5e_xsk_construct_skb(rq, xdp->data, cqe_bcnt32);
}
struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
@ -153,25 +87,20 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
struct mlx5e_wqe_frag_info *wi,
u32 cqe_bcnt)
{
struct mlx5e_dma_info *di = wi->di;
u16 rx_headroom = rq->buff.headroom - rq->buff.umem_headroom;
void *va, *data;
struct xdp_buff *xdp = wi->di->xsk;
bool consumed;
u32 frag_size;
/* wi->offset is not used in this function, because di->xsk.data and
* di->addr point directly to the necessary place. Furthermore, in the
* current implementation, one page = one packet = one frame, so
/* wi->offset is not used in this function, because xdp->data and the
* DMA address point directly to the necessary place. Furthermore, the
* XSK allocator allocates frames per packet, instead of pages, so
* wi->offset should always be 0.
*/
WARN_ON_ONCE(wi->offset);
va = di->xsk.data;
data = va + rx_headroom;
frag_size = rq->buff.headroom + cqe_bcnt;
dma_sync_single_for_cpu(rq->pdev, di->addr, frag_size, DMA_BIDIRECTIONAL);
prefetch(data);
xdp->data_end = xdp->data + cqe_bcnt;
xdp_set_data_meta_invalid(xdp);
xsk_buff_dma_sync_for_cpu(xdp);
prefetch(xdp->data);
if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_RESP_SEND)) {
rq->stats->wqe_err++;
@ -179,7 +108,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
}
rcu_read_lock();
consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt, true);
consumed = mlx5e_xdp_handle(rq, NULL, &cqe_bcnt, xdp);
rcu_read_unlock();
if (likely(consumed))
@ -189,5 +118,5 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
* will be handled by mlx5e_put_rx_frag.
* On SKB allocation failure, NULL is returned.
*/
return mlx5e_xsk_construct_skb(rq, data, cqe_bcnt);
return mlx5e_xsk_construct_skb(rq, xdp->data, cqe_bcnt);
}

View File

@ -5,16 +5,10 @@
#define __MLX5_EN_XSK_RX_H__
#include "en.h"
#include <net/xdp_sock.h>
#include <net/xdp_sock_drv.h>
/* RX data path */
bool mlx5e_xsk_pages_enough_umem(struct mlx5e_rq *rq, int count);
int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq,
struct mlx5e_dma_info *dma_info);
void mlx5e_xsk_page_release(struct mlx5e_rq *rq,
struct mlx5e_dma_info *dma_info);
void mlx5e_xsk_zca_free(struct zero_copy_allocator *zca, unsigned long handle);
struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
struct mlx5e_mpw_info *wi,
u16 cqe_bcnt,
@ -25,6 +19,23 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
struct mlx5e_wqe_frag_info *wi,
u32 cqe_bcnt);
static inline int mlx5e_xsk_page_alloc_umem(struct mlx5e_rq *rq,
struct mlx5e_dma_info *dma_info)
{
dma_info->xsk = xsk_buff_alloc(rq->umem);
if (!dma_info->xsk)
return -ENOMEM;
/* Store the DMA address without headroom. In striding RQ case, we just
* provide pages for UMR, and headroom is counted at the setup stage
* when creating a WQE. In non-striding RQ case, headroom is accounted
* in mlx5e_alloc_rx_wqe.
*/
dma_info->addr = xsk_buff_xdp_get_frame_dma(dma_info->xsk);
return 0;
}
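
For context, a hedged sketch of how a caller might branch between the XSK allocation above and the regular page path; the wrapper name and its non-XSK counterpart are illustrative, not the exact mlx5 call site.

static int my_page_alloc(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info)
{
	/* XSK queues allocate from the buffer pool; everything else keeps
	 * using the driver's own page allocation (hypothetical helper below).
	 * The release side mirrors this: the mlx5e_page_release() hunk later
	 * in this diff frees XSK buffers with xsk_buff_free().
	 */
	if (rq->umem)
		return mlx5e_xsk_page_alloc_umem(rq, dma_info);
	return my_page_alloc_pool(rq, dma_info);
}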
static inline bool mlx5e_xsk_update_rx_wakeup(struct mlx5e_rq *rq, bool alloc_err)
{
if (!xsk_umem_uses_need_wakeup(rq->umem))

View File

@ -5,7 +5,7 @@
#include "umem.h"
#include "en/xdp.h"
#include "en/params.h"
#include <net/xdp_sock.h>
#include <net/xdp_sock_drv.h>
int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
{
@ -92,12 +92,11 @@ bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget)
break;
}
xdptxd.dma_addr = xdp_umem_get_dma(umem, desc.addr);
xdptxd.data = xdp_umem_get_data(umem, desc.addr);
xdptxd.dma_addr = xsk_buff_raw_get_dma(umem, desc.addr);
xdptxd.data = xsk_buff_raw_get_data(umem, desc.addr);
xdptxd.len = desc.len;
dma_sync_single_for_device(sq->pdev, xdptxd.dma_addr,
xdptxd.len, DMA_BIDIRECTIONAL);
xsk_buff_raw_dma_sync_for_device(umem, xdptxd.dma_addr, xdptxd.len);
if (unlikely(!sq->xmit_xdp_frame(sq, &xdptxd, &xdpi, check_result))) {
if (sq->mpwqe.wqe)

View File

@ -5,7 +5,7 @@
#define __MLX5_EN_XSK_TX_H__
#include "en.h"
#include <net/xdp_sock.h>
#include <net/xdp_sock_drv.h>
/* TX data path */

View File

@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* Copyright (c) 2019 Mellanox Technologies. */
#include <net/xdp_sock.h>
#include <net/xdp_sock_drv.h>
#include "umem.h"
#include "setup.h"
#include "en/params.h"
@ -10,40 +10,14 @@ static int mlx5e_xsk_map_umem(struct mlx5e_priv *priv,
struct xdp_umem *umem)
{
struct device *dev = priv->mdev->device;
u32 i;
for (i = 0; i < umem->npgs; i++) {
dma_addr_t dma = dma_map_page(dev, umem->pgs[i], 0, PAGE_SIZE,
DMA_BIDIRECTIONAL);
if (unlikely(dma_mapping_error(dev, dma)))
goto err_unmap;
umem->pages[i].dma = dma;
}
return 0;
err_unmap:
while (i--) {
dma_unmap_page(dev, umem->pages[i].dma, PAGE_SIZE,
DMA_BIDIRECTIONAL);
umem->pages[i].dma = 0;
}
return -ENOMEM;
return xsk_buff_dma_map(umem, dev, 0);
}
static void mlx5e_xsk_unmap_umem(struct mlx5e_priv *priv,
struct xdp_umem *umem)
{
struct device *dev = priv->mdev->device;
u32 i;
for (i = 0; i < umem->npgs; i++) {
dma_unmap_page(dev, umem->pages[i].dma, PAGE_SIZE,
DMA_BIDIRECTIONAL);
umem->pages[i].dma = 0;
}
return xsk_buff_dma_unmap(umem, 0);
}
static int mlx5e_xsk_get_umems(struct mlx5e_xsk *xsk)
@ -90,13 +64,14 @@ static void mlx5e_xsk_remove_umem(struct mlx5e_xsk *xsk, u16 ix)
static bool mlx5e_xsk_is_umem_sane(struct xdp_umem *umem)
{
return umem->headroom <= 0xffff && umem->chunk_size_nohr <= 0xffff;
return xsk_umem_get_headroom(umem) <= 0xffff &&
xsk_umem_get_chunk_size(umem) <= 0xffff;
}
void mlx5e_build_xsk_param(struct xdp_umem *umem, struct mlx5e_xsk_param *xsk)
{
xsk->headroom = umem->headroom;
xsk->chunk_size = umem->chunk_size_nohr + umem->headroom;
xsk->headroom = xsk_umem_get_headroom(umem);
xsk->chunk_size = xsk_umem_get_chunk_size(umem);
}
static int mlx5e_xsk_enable_locked(struct mlx5e_priv *priv,
@ -241,18 +216,6 @@ int mlx5e_xsk_setup_umem(struct net_device *dev, struct xdp_umem *umem, u16 qid)
mlx5e_xsk_disable_umem(priv, ix);
}
int mlx5e_xsk_resize_reuseq(struct xdp_umem *umem, u32 nentries)
{
struct xdp_umem_fq_reuse *reuseq;
reuseq = xsk_reuseq_prepare(nentries);
if (unlikely(!reuseq))
return -ENOMEM;
xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq));
return 0;
}
u16 mlx5e_xsk_first_unused_channel(struct mlx5e_params *params, struct mlx5e_xsk *xsk)
{
u16 res = xsk->refcnt ? params->num_channels : 0;

View File

@ -38,7 +38,7 @@
#include <linux/bpf.h>
#include <linux/if_bridge.h>
#include <net/page_pool.h>
#include <net/xdp_sock.h>
#include <net/xdp_sock_drv.h>
#include "eswitch.h"
#include "en.h"
#include "en/txrx.h"
@ -373,7 +373,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
struct mlx5_core_dev *mdev = c->mdev;
void *rqc = rqp->rqc;
void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq);
u32 num_xsk_frames = 0;
u32 rq_xdp_ix;
u32 pool_size;
int wq_sz;
@ -413,7 +412,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
rq->buff.map_dir = rq->xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
rq->buff.headroom = mlx5e_get_rq_headroom(mdev, params, xsk);
rq->buff.umem_headroom = xsk ? xsk->headroom : 0;
pool_size = 1 << params->log_rq_mtu_frames;
switch (rq->wq_type) {
@ -427,10 +425,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
wq_sz = mlx5_wq_ll_get_size(&rq->mpwqe.wq);
if (xsk)
num_xsk_frames = wq_sz <<
mlx5e_mpwqe_get_log_num_strides(mdev, params, xsk);
pool_size = MLX5_MPWRQ_PAGES_PER_WQE <<
mlx5e_mpwqe_get_log_rq_size(params, xsk);
@ -482,9 +476,6 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
wq_sz = mlx5_wq_cyc_get_size(&rq->wqe.wq);
if (xsk)
num_xsk_frames = wq_sz << rq->wqe.info.log_num_frags;
rq->wqe.info = rqp->frags_info;
rq->buff.frame0_sz = rq->wqe.info.arr[0].frag_stride;
@ -525,19 +516,9 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
}
if (xsk) {
rq->buff.frame0_sz = xsk_umem_xdp_frame_sz(umem);
err = mlx5e_xsk_resize_reuseq(umem, num_xsk_frames);
if (unlikely(err)) {
mlx5_core_err(mdev, "Unable to allocate the Reuse Ring for %u frames\n",
num_xsk_frames);
goto err_free;
}
rq->zca.free = mlx5e_xsk_zca_free;
err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
MEM_TYPE_ZERO_COPY,
&rq->zca);
MEM_TYPE_XSK_BUFF_POOL, NULL);
xsk_buff_set_rxq_info(rq->umem, &rq->xdp_rxq);
} else {
/* Create a page_pool and register it with rxq */
pp_params.order = 0;

View File

@ -300,7 +300,7 @@ static inline void mlx5e_page_release(struct mlx5e_rq *rq,
* put into the Reuse Ring, because there is no way to return
* the page to the userspace when the interface goes down.
*/
mlx5e_xsk_page_release(rq, dma_info);
xsk_buff_free(dma_info->xsk);
else
mlx5e_page_release_dynamic(rq, dma_info, recycle);
}
@ -385,7 +385,11 @@ static int mlx5e_alloc_rx_wqes(struct mlx5e_rq *rq, u16 ix, u8 wqe_bulk)
if (rq->umem) {
int pages_desired = wqe_bulk << rq->wqe.info.log_num_frags;
if (unlikely(!mlx5e_xsk_pages_enough_umem(rq, pages_desired)))
/* Check in advance that we have enough frames, instead of
* allocating one-by-one, failing and moving frames to the
* Reuse Ring.
*/
if (unlikely(!xsk_buff_can_alloc(rq->umem, pages_desired)))
return -ENOMEM;
}
@ -480,8 +484,11 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
int err;
int i;
/* Check in advance that we have enough frames, instead of allocating
* one-by-one, failing and moving frames to the Reuse Ring.
*/
if (rq->umem &&
unlikely(!mlx5e_xsk_pages_enough_umem(rq, MLX5_MPWRQ_PAGES_PER_WQE))) {
unlikely(!xsk_buff_can_alloc(rq->umem, MLX5_MPWRQ_PAGES_PER_WQE))) {
err = -ENOMEM;
goto err;
}
@ -1044,12 +1051,24 @@ struct sk_buff *mlx5e_build_linear_skb(struct mlx5e_rq *rq, void *va,
return skb;
}
static void mlx5e_fill_xdp_buff(struct mlx5e_rq *rq, void *va, u16 headroom,
u32 len, struct xdp_buff *xdp)
{
xdp->data_hard_start = va;
xdp_set_data_meta_invalid(xdp);
xdp->data = va + headroom;
xdp->data_end = xdp->data + len;
xdp->rxq = &rq->xdp_rxq;
xdp->frame_sz = rq->buff.frame0_sz;
}
struct sk_buff *
mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
struct mlx5e_wqe_frag_info *wi, u32 cqe_bcnt)
{
struct mlx5e_dma_info *di = wi->di;
u16 rx_headroom = rq->buff.headroom;
struct xdp_buff xdp;
struct sk_buff *skb;
void *va, *data;
bool consumed;
@ -1065,11 +1084,13 @@ mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
prefetch(data);
rcu_read_lock();
consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt, false);
mlx5e_fill_xdp_buff(rq, va, rx_headroom, cqe_bcnt, &xdp);
consumed = mlx5e_xdp_handle(rq, di, &cqe_bcnt, &xdp);
rcu_read_unlock();
if (consumed)
return NULL; /* page/packet was consumed by XDP */
rx_headroom = xdp.data - xdp.data_hard_start;
frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt);
skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt);
if (unlikely(!skb))
@ -1343,6 +1364,7 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx];
u16 rx_headroom = rq->buff.headroom;
u32 cqe_bcnt32 = cqe_bcnt;
struct xdp_buff xdp;
struct sk_buff *skb;
void *va, *data;
u32 frag_size;
@ -1364,7 +1386,8 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
prefetch(data);
rcu_read_lock();
consumed = mlx5e_xdp_handle(rq, di, va, &rx_headroom, &cqe_bcnt32, false);
mlx5e_fill_xdp_buff(rq, va, rx_headroom, cqe_bcnt32, &xdp);
consumed = mlx5e_xdp_handle(rq, di, &cqe_bcnt32, &xdp);
rcu_read_unlock();
if (consumed) {
if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags))
@ -1372,6 +1395,7 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
return NULL; /* page/packet was consumed by XDP */
}
rx_headroom = xdp.data - xdp.data_hard_start;
frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt32);
skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt32);
if (unlikely(!skb))

View File

@ -50,7 +50,6 @@ u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan,
xdp->data_end = xdp->data + len;
xdp->rxq = &nvchan->xdp_rxq;
xdp->frame_sz = PAGE_SIZE;
xdp->handle = 0;
memcpy(xdp->data, data, len);

View File

@ -396,6 +396,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
}
#define cgroup_bpf_enabled (0)
#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) ({ 0; })
#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })

View File

@ -39,7 +39,7 @@ enum xdp_mem_type {
MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */
MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */
MEM_TYPE_PAGE_POOL,
MEM_TYPE_ZERO_COPY,
MEM_TYPE_XSK_BUFF_POOL,
MEM_TYPE_MAX,
};
@ -54,10 +54,6 @@ struct xdp_mem_info {
struct page_pool;
struct zero_copy_allocator {
void (*free)(struct zero_copy_allocator *zca, unsigned long handle);
};
struct xdp_rxq_info {
struct net_device *dev;
u32 queue_index;
@ -70,7 +66,6 @@ struct xdp_buff {
void *data_end;
void *data_meta;
void *data_hard_start;
unsigned long handle;
struct xdp_rxq_info *rxq;
u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/
};
@ -119,7 +114,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
int metasize;
int headroom;
if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY)
if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL)
return xdp_convert_zc_to_xdp_frame(xdp);
/* Assure headroom is available for storing info */

View File

@ -15,40 +15,15 @@
struct net_device;
struct xsk_queue;
/* Masks for xdp_umem_page flags.
* The low 12-bits of the addr will be 0 since this is the page address, so we
* can use them for flags.
*/
#define XSK_NEXT_PG_CONTIG_SHIFT 0
#define XSK_NEXT_PG_CONTIG_MASK (1ULL << XSK_NEXT_PG_CONTIG_SHIFT)
struct xdp_umem_page {
void *addr;
dma_addr_t dma;
};
struct xdp_umem_fq_reuse {
u32 nentries;
u32 length;
u64 handles[];
};
/* Flags for the umem flags field.
*
* The NEED_WAKEUP flag is 1 due to the reuse of the flags field for public
* flags. See include/uapi/linux/if_xdp.h.
*/
#define XDP_UMEM_USES_NEED_WAKEUP (1 << 1)
struct xdp_buff;
struct xdp_umem {
struct xsk_queue *fq;
struct xsk_queue *cq;
struct xdp_umem_page *pages;
u64 chunk_mask;
struct xsk_buff_pool *pool;
u64 size;
u32 headroom;
u32 chunk_size_nohr;
u32 chunk_size;
struct user_struct *user;
refcount_t users;
struct work_struct work;
@ -59,28 +34,17 @@ struct xdp_umem {
u8 flags;
int id;
struct net_device *dev;
struct xdp_umem_fq_reuse *fq_reuse;
bool zc;
spinlock_t xsk_tx_list_lock;
struct list_head xsk_tx_list;
};
/* Nodes are linked in the struct xdp_sock map_list field, and used to
* track which maps a certain socket reside in.
*/
struct xsk_map {
struct bpf_map map;
spinlock_t lock; /* Synchronize map updates */
struct xdp_sock *xsk_map[];
};
struct xsk_map_node {
struct list_head node;
struct xsk_map *map;
struct xdp_sock **map_entry;
};
struct xdp_sock {
/* struct sock must be the first member of struct xdp_sock */
struct sock sk;
@ -111,32 +75,9 @@ struct xdp_sock {
spinlock_t map_list_lock;
};
struct xdp_buff;
#ifdef CONFIG_XDP_SOCKETS
int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs);
/* Used from netdev driver */
bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt);
bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr);
void xsk_umem_release_addr(struct xdp_umem *umem);
void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries);
bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc);
void xsk_umem_consume_tx_done(struct xdp_umem *umem);
struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries);
struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem,
struct xdp_umem_fq_reuse *newq);
void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq);
struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id);
void xsk_set_rx_need_wakeup(struct xdp_umem *umem);
void xsk_set_tx_need_wakeup(struct xdp_umem *umem);
void xsk_clear_rx_need_wakeup(struct xdp_umem *umem);
void xsk_clear_tx_need_wakeup(struct xdp_umem *umem);
bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem);
void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
struct xdp_sock **map_entry);
int xsk_map_inc(struct xsk_map *map);
void xsk_map_put(struct xsk_map *map);
int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp);
void __xsk_map_flush(void);
@ -153,230 +94,13 @@ static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map,
return xs;
}
static inline u64 xsk_umem_extract_addr(u64 addr)
{
return addr & XSK_UNALIGNED_BUF_ADDR_MASK;
}
static inline u64 xsk_umem_extract_offset(u64 addr)
{
return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT;
}
static inline u64 xsk_umem_add_offset_to_addr(u64 addr)
{
return xsk_umem_extract_addr(addr) + xsk_umem_extract_offset(addr);
}
static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
{
unsigned long page_addr;
addr = xsk_umem_add_offset_to_addr(addr);
page_addr = (unsigned long)umem->pages[addr >> PAGE_SHIFT].addr;
return (char *)(page_addr & PAGE_MASK) + (addr & ~PAGE_MASK);
}
static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
{
addr = xsk_umem_add_offset_to_addr(addr);
return umem->pages[addr >> PAGE_SHIFT].dma + (addr & ~PAGE_MASK);
}
/* Reuse-queue aware version of FILL queue helpers */
static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt)
{
struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
if (rq->length >= cnt)
return true;
return xsk_umem_has_addrs(umem, cnt - rq->length);
}
static inline bool xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr)
{
struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
if (!rq->length)
return xsk_umem_peek_addr(umem, addr);
*addr = rq->handles[rq->length - 1];
return addr;
}
static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem)
{
struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
if (!rq->length)
xsk_umem_release_addr(umem);
else
rq->length--;
}
static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr)
{
struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
rq->handles[rq->length++] = addr;
}
/* Handle the offset appropriately depending on aligned or unaligned mode.
* For unaligned mode, we store the offset in the upper 16-bits of the address.
* For aligned mode, we simply add the offset to the address.
*/
static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 address,
u64 offset)
{
if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG)
return address + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
else
return address + offset;
}
static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem)
{
return umem->chunk_size_nohr + umem->headroom;
}
#else
static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
return -ENOTSUPP;
}
static inline bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
{
return false;
}
static inline bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt)
{
return false;
}
static inline u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
{
return NULL;
}
static inline void xsk_umem_release_addr(struct xdp_umem *umem)
{
}
static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
{
}
static inline bool xsk_umem_consume_tx(struct xdp_umem *umem,
struct xdp_desc *desc)
{
return false;
}
static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem)
{
}
static inline struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries)
{
return NULL;
}
static inline struct xdp_umem_fq_reuse *xsk_reuseq_swap(
struct xdp_umem *umem,
struct xdp_umem_fq_reuse *newq)
{
return NULL;
}
static inline void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq)
{
}
static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
u16 queue_id)
{
return NULL;
}
static inline u64 xsk_umem_extract_addr(u64 addr)
{
return 0;
}
static inline u64 xsk_umem_extract_offset(u64 addr)
{
return 0;
}
static inline u64 xsk_umem_add_offset_to_addr(u64 addr)
{
return 0;
}
static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
{
return NULL;
}
static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
{
return 0;
}
static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt)
{
return false;
}
static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr)
{
return NULL;
}
static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem)
{
}
static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr)
{
}
static inline void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
{
}
static inline void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
{
}
static inline void xsk_clear_rx_need_wakeup(struct xdp_umem *umem)
{
}
static inline void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
{
}
static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
{
return false;
}
static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle,
u64 offset)
{
return 0;
}
static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem)
{
return 0;
}
static inline int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
{
return -EOPNOTSUPP;
@ -391,6 +115,7 @@ static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map,
{
return NULL;
}
#endif /* CONFIG_XDP_SOCKETS */
#endif /* _LINUX_XDP_SOCK_H */

232
include/net/xdp_sock_drv.h Normal file
View File

@ -0,0 +1,232 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Interface for implementing AF_XDP zero-copy support in drivers.
* Copyright(c) 2020 Intel Corporation.
*/
#ifndef _LINUX_XDP_SOCK_DRV_H
#define _LINUX_XDP_SOCK_DRV_H
#include <net/xdp_sock.h>
#include <net/xsk_buff_pool.h>
#ifdef CONFIG_XDP_SOCKETS
void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries);
bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc);
void xsk_umem_consume_tx_done(struct xdp_umem *umem);
struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id);
void xsk_set_rx_need_wakeup(struct xdp_umem *umem);
void xsk_set_tx_need_wakeup(struct xdp_umem *umem);
void xsk_clear_rx_need_wakeup(struct xdp_umem *umem);
void xsk_clear_tx_need_wakeup(struct xdp_umem *umem);
bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem);
static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem)
{
return XDP_PACKET_HEADROOM + umem->headroom;
}
static inline u32 xsk_umem_get_chunk_size(struct xdp_umem *umem)
{
return umem->chunk_size;
}
static inline u32 xsk_umem_get_rx_frame_size(struct xdp_umem *umem)
{
return xsk_umem_get_chunk_size(umem) - xsk_umem_get_headroom(umem);
}
static inline void xsk_buff_set_rxq_info(struct xdp_umem *umem,
struct xdp_rxq_info *rxq)
{
xp_set_rxq_info(umem->pool, rxq);
}
static inline void xsk_buff_dma_unmap(struct xdp_umem *umem,
unsigned long attrs)
{
xp_dma_unmap(umem->pool, attrs);
}
static inline int xsk_buff_dma_map(struct xdp_umem *umem, struct device *dev,
unsigned long attrs)
{
return xp_dma_map(umem->pool, dev, attrs, umem->pgs, umem->npgs);
}
static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp)
{
struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
return xp_get_dma(xskb);
}
static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp)
{
struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
return xp_get_frame_dma(xskb);
}
static inline struct xdp_buff *xsk_buff_alloc(struct xdp_umem *umem)
{
return xp_alloc(umem->pool);
}
static inline bool xsk_buff_can_alloc(struct xdp_umem *umem, u32 count)
{
return xp_can_alloc(umem->pool, count);
}
static inline void xsk_buff_free(struct xdp_buff *xdp)
{
struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
xp_free(xskb);
}
static inline dma_addr_t xsk_buff_raw_get_dma(struct xdp_umem *umem, u64 addr)
{
return xp_raw_get_dma(umem->pool, addr);
}
static inline void *xsk_buff_raw_get_data(struct xdp_umem *umem, u64 addr)
{
return xp_raw_get_data(umem->pool, addr);
}
static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp)
{
struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
xp_dma_sync_for_cpu(xskb);
}
static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem,
dma_addr_t dma,
size_t size)
{
xp_dma_sync_for_device(umem->pool, dma, size);
}
#else
static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
{
}
static inline bool xsk_umem_consume_tx(struct xdp_umem *umem,
struct xdp_desc *desc)
{
return false;
}
static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem)
{
}
static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
u16 queue_id)
{
return NULL;
}
static inline void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
{
}
static inline void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
{
}
static inline void xsk_clear_rx_need_wakeup(struct xdp_umem *umem)
{
}
static inline void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
{
}
static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
{
return false;
}
static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem)
{
return 0;
}
static inline u32 xsk_umem_get_chunk_size(struct xdp_umem *umem)
{
return 0;
}
static inline u32 xsk_umem_get_rx_frame_size(struct xdp_umem *umem)
{
return 0;
}
static inline void xsk_buff_set_rxq_info(struct xdp_umem *umem,
struct xdp_rxq_info *rxq)
{
}
static inline void xsk_buff_dma_unmap(struct xdp_umem *umem,
unsigned long attrs)
{
}
static inline int xsk_buff_dma_map(struct xdp_umem *umem, struct device *dev,
unsigned long attrs)
{
return 0;
}
static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp)
{
return 0;
}
static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp)
{
return 0;
}
static inline struct xdp_buff *xsk_buff_alloc(struct xdp_umem *umem)
{
return NULL;
}
static inline bool xsk_buff_can_alloc(struct xdp_umem *umem, u32 count)
{
return false;
}
static inline void xsk_buff_free(struct xdp_buff *xdp)
{
}
static inline dma_addr_t xsk_buff_raw_get_dma(struct xdp_umem *umem, u64 addr)
{
return 0;
}
static inline void *xsk_buff_raw_get_data(struct xdp_umem *umem, u64 addr)
{
return NULL;
}
static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp)
{
}
static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem,
dma_addr_t dma,
size_t size)
{
}
#endif /* CONFIG_XDP_SOCKETS */
#endif /* _LINUX_XDP_SOCK_DRV_H */
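
Putting the driver-facing helpers above together, here is a hedged sketch of the setup and TX sides of a zero-copy driver. The ring structure and the descriptor-posting helper are illustrative; the xsk_* calls are the ones declared in this header and mirror the ixgbe and mlx5 TX conversions earlier in the diff.

/* Map the umem once when zero-copy is enabled for a queue; this replaces
 * the per-page dma_map_page() loops drivers used to carry themselves. */
static int my_xsk_enable(struct device *dev, struct xdp_umem *umem)
{
	return xsk_buff_dma_map(umem, dev, 0 /* DMA attrs */);
}

/* TX: pull descriptors from the socket, let the pool resolve the DMA
 * address and do the (possibly no-op) sync before handing to hardware. */
static void my_xmit_zc(struct my_tx_ring *ring, unsigned int budget)
{
	struct xdp_desc desc;

	while (budget-- && xsk_umem_consume_tx(ring->xsk_umem, &desc)) {
		dma_addr_t dma = xsk_buff_raw_get_dma(ring->xsk_umem, desc.addr);

		xsk_buff_raw_dma_sync_for_device(ring->xsk_umem, dma, desc.len);
		my_post_tx_descriptor(ring, dma, desc.len); /* hypothetical */
	}
	xsk_umem_consume_tx_done(ring->xsk_umem);
}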

140
include/net/xsk_buff_pool.h Normal file
View File

@ -0,0 +1,140 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright(c) 2020 Intel Corporation. */
#ifndef XSK_BUFF_POOL_H_
#define XSK_BUFF_POOL_H_
#include <linux/if_xdp.h>
#include <linux/types.h>
#include <linux/dma-mapping.h>
#include <net/xdp.h>
struct xsk_buff_pool;
struct xdp_rxq_info;
struct xsk_queue;
struct xdp_desc;
struct device;
struct page;
struct xdp_buff_xsk {
struct xdp_buff xdp;
dma_addr_t dma;
dma_addr_t frame_dma;
struct xsk_buff_pool *pool;
bool unaligned;
u64 orig_addr;
struct list_head free_list_node;
};
struct xsk_buff_pool {
struct xsk_queue *fq;
struct list_head free_list;
dma_addr_t *dma_pages;
struct xdp_buff_xsk *heads;
u64 chunk_mask;
u64 addrs_cnt;
u32 free_list_cnt;
u32 dma_pages_cnt;
u32 heads_cnt;
u32 free_heads_cnt;
u32 headroom;
u32 chunk_size;
u32 frame_len;
bool cheap_dma;
bool unaligned;
void *addrs;
struct device *dev;
struct xdp_buff_xsk *free_heads[];
};
/* AF_XDP core. */
struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks,
u32 chunk_size, u32 headroom, u64 size,
bool unaligned);
void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq);
void xp_destroy(struct xsk_buff_pool *pool);
void xp_release(struct xdp_buff_xsk *xskb);
/* AF_XDP, and XDP core. */
void xp_free(struct xdp_buff_xsk *xskb);
/* AF_XDP ZC drivers, via xdp_sock_buff.h */
void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq);
int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
unsigned long attrs, struct page **pages, u32 nr_pages);
void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs);
struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool);
bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count);
void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr);
dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr);
static inline dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb)
{
return xskb->dma;
}
static inline dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb)
{
return xskb->frame_dma;
}
void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb);
static inline void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb)
{
if (xskb->pool->cheap_dma)
return;
xp_dma_sync_for_cpu_slow(xskb);
}
void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma,
size_t size);
static inline void xp_dma_sync_for_device(struct xsk_buff_pool *pool,
dma_addr_t dma, size_t size)
{
if (pool->cheap_dma)
return;
xp_dma_sync_for_device_slow(pool, dma, size);
}
/* Masks for xdp_umem_page flags.
* The low 12-bits of the addr will be 0 since this is the page address, so we
* can use them for flags.
*/
#define XSK_NEXT_PG_CONTIG_SHIFT 0
#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT)
static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool,
u64 addr, u32 len)
{
bool cross_pg = (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE;
if (pool->dma_pages_cnt && cross_pg) {
return !(pool->dma_pages[addr >> PAGE_SHIFT] &
XSK_NEXT_PG_CONTIG_MASK);
}
return false;
}
static inline u64 xp_aligned_extract_addr(struct xsk_buff_pool *pool, u64 addr)
{
return addr & pool->chunk_mask;
}
static inline u64 xp_unaligned_extract_addr(u64 addr)
{
return addr & XSK_UNALIGNED_BUF_ADDR_MASK;
}
static inline u64 xp_unaligned_extract_offset(u64 addr)
{
return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT;
}
static inline u64 xp_unaligned_add_offset_to_addr(u64 addr)
{
return xp_unaligned_extract_addr(addr) +
xp_unaligned_extract_offset(addr);
}
#endif /* XSK_BUFF_POOL_H_ */
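
The extract/add helpers at the end of this header undo the handle encoding used in unaligned mode, where the data offset is carried in the upper bits of the 64-bit address (compare xp_get_handle() in net/xdp/xsk.c at the end of this diff). A small, hedged round-trip check using only the macros from the uapi header; the values are examples.

static void xp_unaligned_handle_example(void)
{
	u64 base = 0x10000;	/* chunk start within the umem (example) */
	u64 off  = 256;		/* headroom-adjusted data offset (example) */
	u64 handle = base | (off << XSK_UNALIGNED_BUF_OFFSET_SHIFT);

	WARN_ON(xp_unaligned_extract_addr(handle)   != base);
	WARN_ON(xp_unaligned_extract_offset(handle) != off);
	WARN_ON(xp_unaligned_add_offset_to_addr(handle) != base + off);
}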

View File

@ -287,7 +287,7 @@ TRACE_EVENT(xdp_devmap_xmit,
FN(PAGE_SHARED) \
FN(PAGE_ORDER0) \
FN(PAGE_POOL) \
FN(ZERO_COPY)
FN(XSK_BUFF_POOL)
#define __MEM_TYPE_TP_FN(x) \
TRACE_DEFINE_ENUM(MEM_TYPE_##x);

View File

@ -220,6 +220,10 @@ enum bpf_attach_type {
BPF_MODIFY_RETURN,
BPF_LSM_MAC,
BPF_TRACE_ITER,
BPF_CGROUP_INET4_GETPEERNAME,
BPF_CGROUP_INET6_GETPEERNAME,
BPF_CGROUP_INET4_GETSOCKNAME,
BPF_CGROUP_INET6_GETSOCKNAME,
__MAX_BPF_ATTACH_TYPE
};
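
A hedged sketch of a program for one of the new attach types. The bpf_sock_addr context and the mandatory return value of 1 follow from the filter.c and verifier hunks later in this pull request; the ELF section name is an assumption about how libbpf maps sections to BPF_CGROUP_INET4_GETPEERNAME, and the addresses are placeholders.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

/* Rewrites the peer address a connected socket reports back to userspace;
 * the concrete addresses and port below are made-up examples. */
SEC("cgroup/getpeername4")
int rewrite_getpeername4(struct bpf_sock_addr *ctx)
{
	if (ctx->user_ip4 == bpf_htonl(0x0a000002)) {	/* 10.0.0.2 */
		ctx->user_ip4  = bpf_htonl(0x0a000001);	/* 10.0.0.1 */
		ctx->user_port = bpf_htons(443);
	}
	return 1;	/* sock_addr get{peer,sock}name hooks must return 1 */
}

char _license[] SEC("license") = "GPL";

Attaching goes through the usual cgroup path, e.g. bpf_prog_attach(prog_fd, cgroup_fd, BPF_CGROUP_INET4_GETPEERNAME, 0), which the syscall.c hunk later in this diff wires up.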

View File

@ -12,9 +12,6 @@ obj-$(CONFIG_BPF_JIT) += dispatcher.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
ifeq ($(CONFIG_XDP_SOCKETS),y)
obj-$(CONFIG_BPF_SYSCALL) += xskmap.o
endif
obj-$(CONFIG_BPF_SYSCALL) += offload.o
endif
ifeq ($(CONFIG_PERF_EVENTS),y)

View File

@ -1978,6 +1978,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
case BPF_CGROUP_INET4_GETPEERNAME:
case BPF_CGROUP_INET6_GETPEERNAME:
case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_INET6_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
@ -2767,6 +2771,10 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
case BPF_CGROUP_INET4_GETPEERNAME:
case BPF_CGROUP_INET6_GETPEERNAME:
case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_INET6_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
@ -2912,6 +2920,10 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_CGROUP_INET6_POST_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET6_CONNECT:
case BPF_CGROUP_INET4_GETPEERNAME:
case BPF_CGROUP_INET6_GETPEERNAME:
case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_INET6_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:

View File

@ -393,6 +393,15 @@ static bool type_is_sk_pointer(enum bpf_reg_type type)
type == PTR_TO_XDP_SOCK;
}
static bool reg_type_not_null(enum bpf_reg_type type)
{
return type == PTR_TO_SOCKET ||
type == PTR_TO_TCP_SOCK ||
type == PTR_TO_MAP_VALUE ||
type == PTR_TO_SOCK_COMMON ||
type == PTR_TO_BTF_ID;
}
static bool reg_type_may_be_null(enum bpf_reg_type type)
{
return type == PTR_TO_MAP_VALUE_OR_NULL ||
@ -6308,9 +6317,26 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode,
bool is_jmp32)
{
if (__is_pointer_value(false, reg))
if (__is_pointer_value(false, reg)) {
if (!reg_type_not_null(reg->type))
return -1;
/* If the pointer is valid, tests against zero will fail, so we can
* use this to decide whether the branch is taken.
*/
if (val != 0)
return -1;
switch (opcode) {
case BPF_JEQ:
return 0;
case BPF_JNE:
return 1;
default:
return -1;
}
}
if (is_jmp32)
return is_branch32_taken(reg, val, opcode);
return is_branch64_taken(reg, val, opcode);
@ -6808,6 +6834,10 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
}
if (pred >= 0) {
/* If we get here with a dst_reg pointer type it is because
* above is_branch_taken() special cased the 0 comparison.
*/
if (!__is_pointer_value(false, dst_reg))
err = mark_chain_precision(env, insn->dst_reg);
if (BPF_SRC(insn->code) == BPF_X && !err)
err = mark_chain_precision(env, insn->src_reg);
@ -7094,7 +7124,11 @@ static int check_return_code(struct bpf_verifier_env *env)
switch (env->prog->type) {
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG)
env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG ||
env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME ||
env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME ||
env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME ||
env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME)
range = tnum_range(1, 1);
break;
case BPF_PROG_TYPE_CGROUP_SKB:
@ -7120,10 +7154,11 @@ static int check_return_code(struct bpf_verifier_env *env)
case BPF_TRACE_FEXIT:
range = tnum_const(0);
break;
case BPF_TRACE_ITER:
case BPF_TRACE_RAW_TP:
case BPF_MODIFY_RETURN:
return 0;
case BPF_TRACE_ITER:
break;
default:
return -ENOTSUPP;
}
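
The reg_type_not_null()/is_branch_taken() changes above let the verifier resolve redundant NULL tests on pointers it has already proven non-NULL, pruning the dead branch instead of exploring it. A hedged BPF C illustration of the pattern (map and struct names are made up; in real programs the duplicate test typically comes from inlined helpers):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct data {
	__u64 counter;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, struct data);
} my_map SEC(".maps");

SEC("xdp")
int prune_example(struct xdp_md *ctx)
{
	__u32 key = 0;
	struct data *p = bpf_map_lookup_elem(&my_map, &key);

	if (!p)
		return XDP_PASS;
	/* p is now PTR_TO_MAP_VALUE (non-NULL), so this second test is
	 * resolved by is_branch_taken() at verification time and the
	 * impossible else-branch is pruned rather than explored. */
	if (p)
		p->counter++;
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";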

View File

@ -160,16 +160,20 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 size,
u32 headroom, u32 tailroom)
{
void __user *data_in = u64_to_user_ptr(kattr->test.data_in);
u32 user_size = kattr->test.data_size_in;
void *data;
if (size < ETH_HLEN || size > PAGE_SIZE - headroom - tailroom)
return ERR_PTR(-EINVAL);
if (user_size > size)
return ERR_PTR(-EMSGSIZE);
data = kzalloc(size + headroom + tailroom, GFP_USER);
if (!data)
return ERR_PTR(-ENOMEM);
if (copy_from_user(data + headroom, data_in, size)) {
if (copy_from_user(data + headroom, data_in, user_size)) {
kfree(data);
return ERR_PTR(-EFAULT);
}
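
From userspace, the user_size handling added above means BPF_PROG_TEST_RUN copies exactly data_size_in bytes and rejects inputs larger than the kernel-side test buffer with -EMSGSIZE. A hedged libbpf usage sketch; the struct and function are libbpf's bpf_prog_test_run_xattr() interface as I recall it, so double-check against your libbpf version.

#include <bpf/bpf.h>

/* Run an already-loaded program once on a caller-supplied input buffer. */
static int test_run_once(int prog_fd, void *in, __u32 in_len,
			 void *out, __u32 out_len)
{
	struct bpf_prog_test_run_attr attr = {
		.prog_fd       = prog_fd,
		.repeat        = 1,
		.data_in       = in,
		.data_size_in  = in_len,	/* only this much is copied in */
		.data_out      = out,
		.data_size_out = out_len,
	};

	/* Fails with errno == EMSGSIZE when in_len exceeds the kernel's
	 * test buffer (page-sized minus head/tailroom for XDP). */
	return bpf_prog_test_run_xattr(&attr);
}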
@ -486,8 +490,6 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
/* XDP have extra tailroom as (most) drivers use full page */
max_data_sz = 4096 - headroom - tailroom;
if (size > max_data_sz)
return -EINVAL;
data = bpf_test_init(kattr, max_data_sz, headroom, tailroom);
if (IS_ERR(data))

View File

@ -7049,6 +7049,8 @@ static bool sock_addr_is_valid_access(int off, int size,
switch (prog->expected_attach_type) {
case BPF_CGROUP_INET4_BIND:
case BPF_CGROUP_INET4_CONNECT:
case BPF_CGROUP_INET4_GETPEERNAME:
case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP4_RECVMSG:
break;
@ -7060,6 +7062,8 @@ static bool sock_addr_is_valid_access(int off, int size,
switch (prog->expected_attach_type) {
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET6_CONNECT:
case BPF_CGROUP_INET6_GETPEERNAME:
case BPF_CGROUP_INET6_GETSOCKNAME:
case BPF_CGROUP_UDP6_SENDMSG:
case BPF_CGROUP_UDP6_RECVMSG:
break;

View File

@ -17,6 +17,7 @@
#include <net/xdp.h>
#include <net/xdp_priv.h> /* struct xdp_mem_allocator */
#include <trace/events/xdp.h>
#include <net/xdp_sock_drv.h>
#define REG_STATE_NEW 0x0
#define REG_STATE_REGISTERED 0x1
@ -109,27 +110,6 @@ static void mem_allocator_disconnect(void *allocator)
mutex_unlock(&mem_id_lock);
}
static void mem_id_disconnect(int id)
{
struct xdp_mem_allocator *xa;
mutex_lock(&mem_id_lock);
xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
if (!xa) {
mutex_unlock(&mem_id_lock);
WARN(1, "Request remove non-existing id(%d), driver bug?", id);
return;
}
trace_mem_disconnect(xa);
if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
mutex_unlock(&mem_id_lock);
}
void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
{
struct xdp_mem_allocator *xa;
@ -143,9 +123,6 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
if (id == 0)
return;
if (xdp_rxq->mem.type == MEM_TYPE_ZERO_COPY)
return mem_id_disconnect(id);
if (xdp_rxq->mem.type == MEM_TYPE_PAGE_POOL) {
rcu_read_lock();
xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params);
@ -301,7 +278,7 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
xdp_rxq->mem.type = type;
if (!allocator) {
if (type == MEM_TYPE_PAGE_POOL || type == MEM_TYPE_ZERO_COPY)
if (type == MEM_TYPE_PAGE_POOL)
return -EINVAL; /* Setup time check page_pool req */
return 0;
}
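
With MEM_TYPE_XSK_BUFF_POOL a zero-copy driver registers the memory model with a NULL allocator and lets the buffer pool own the buffers. A hedged setup sketch mirroring the mlx5 change earlier in this diff; the ring structure is illustrative.

static int my_ring_reg_xsk_mem(struct my_ring *ring)
{
	int err;

	err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
					 MEM_TYPE_XSK_BUFF_POOL, NULL);
	if (err)
		return err;

	/* Let the pool stamp its xdp_buffs with this ring's rxq_info. */
	xsk_buff_set_rxq_info(ring->xsk_umem, &ring->xdp_rxq);
	return 0;
}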
@ -358,10 +335,11 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
* scenarios (e.g. queue full), it is possible to return the xdp_frame
* while still leveraging this protection. The @napi_direct boolean
* is used for those call sites, thus allowing for faster recycling
* of xdp_frames/pages in those cases.
* of xdp_frames/pages in those cases. This path is never used by the
* MEM_TYPE_XSK_BUFF_POOL memory type, so it's explicitly not part of
* the switch-statement.
*/
static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
unsigned long handle)
static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct)
{
struct xdp_mem_allocator *xa;
struct page *page;
@ -383,36 +361,29 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
page = virt_to_page(data); /* Assumes order0 page*/
put_page(page);
break;
case MEM_TYPE_ZERO_COPY:
/* NB! Only valid from an xdp_buff! */
rcu_read_lock();
/* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
xa->zc_alloc->free(xa->zc_alloc, handle);
rcu_read_unlock();
default:
/* Not possible, checked in xdp_rxq_info_reg_mem_model() */
WARN(1, "Incorrect XDP memory type (%d) usage", mem->type);
break;
}
}
void xdp_return_frame(struct xdp_frame *xdpf)
{
__xdp_return(xdpf->data, &xdpf->mem, false, 0);
__xdp_return(xdpf->data, &xdpf->mem, false);
}
EXPORT_SYMBOL_GPL(xdp_return_frame);
void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
{
__xdp_return(xdpf->data, &xdpf->mem, true, 0);
__xdp_return(xdpf->data, &xdpf->mem, true);
}
EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
void xdp_return_buff(struct xdp_buff *xdp)
{
__xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle);
__xdp_return(xdp->data, &xdp->rxq->mem, true);
}
EXPORT_SYMBOL_GPL(xdp_return_buff);
/* Only called for MEM_TYPE_PAGE_POOL see xdp.h */
void __xdp_release_frame(void *data, struct xdp_mem_info *mem)
@ -493,7 +464,7 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp)
xdpf->metasize = metasize;
xdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
xdp_return_buff(xdp);
xsk_buff_free(xdp);
return xdpf;
}
EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame);

View File

@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <net/xdp_sock.h>
#include <net/xdp_sock_drv.h>
#include "netlink.h"
#include "common.h"

View File

@ -24,7 +24,7 @@
#include <linux/sched/signal.h>
#include <linux/net.h>
#include <net/devlink.h>
#include <net/xdp_sock.h>
#include <net/xdp_sock_drv.h>
#include <net/flow_offload.h>
#include <linux/ethtool_netlink.h>
#include <generated/utsrelease.h>

View File

@ -756,7 +756,6 @@ do_err:
}
EXPORT_SYMBOL(inet_accept);
/*
* This does both peername and sockname.
*/
@ -782,6 +781,11 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
sin->sin_port = inet->inet_sport;
sin->sin_addr.s_addr = addr;
}
if (cgroup_bpf_enabled)
BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
peer ? BPF_CGROUP_INET4_GETPEERNAME :
BPF_CGROUP_INET4_GETSOCKNAME,
NULL);
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
return sizeof(*sin);
}
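
From the application side nothing changes in the API, only in the values read back: with a BPF_CGROUP_INET4_GETPEERNAME program attached to the socket's cgroup, the sockaddr filled in above can be rewritten before inet_getname() returns. A plain userspace sketch:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

/* Print the peer of a connected socket; the BPF hook (if any) runs inside
 * this getpeername() call, after the kernel has filled in "peer". */
static void print_peer(int fd)
{
	struct sockaddr_in peer;
	socklen_t len = sizeof(peer);
	char buf[INET_ADDRSTRLEN];

	if (getpeername(fd, (struct sockaddr *)&peer, &len))
		return;
	printf("peer %s:%u\n",
	       inet_ntop(AF_INET, &peer.sin_addr, buf, sizeof(buf)),
	       ntohs(peer.sin_port));
}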

View File

@ -505,7 +505,6 @@ EXPORT_SYMBOL_GPL(inet6_destroy_sock);
/*
* This does both peername and sockname.
*/
int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
int peer)
{
@ -532,9 +531,13 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
sin->sin6_addr = np->saddr;
else
sin->sin6_addr = sk->sk_v6_rcv_saddr;
sin->sin6_port = inet->inet_sport;
}
if (cgroup_bpf_enabled)
BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin,
peer ? BPF_CGROUP_INET6_GETPEERNAME :
BPF_CGROUP_INET6_GETSOCKNAME,
NULL);
sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr,
sk->sk_bound_dev_if);
return sizeof(*sin);

View File

@ -1,3 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o
obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o xskmap.o
obj-$(CONFIG_XDP_SOCKETS) += xsk_buff_pool.o
obj-$(CONFIG_XDP_SOCKETS_DIAG) += xsk_diag.o

View File

@ -179,37 +179,6 @@ void xdp_umem_clear_dev(struct xdp_umem *umem)
umem->zc = false;
}
static void xdp_umem_unmap_pages(struct xdp_umem *umem)
{
unsigned int i;
for (i = 0; i < umem->npgs; i++)
if (PageHighMem(umem->pgs[i]))
vunmap(umem->pages[i].addr);
}
static int xdp_umem_map_pages(struct xdp_umem *umem)
{
unsigned int i;
void *addr;
for (i = 0; i < umem->npgs; i++) {
if (PageHighMem(umem->pgs[i]))
addr = vmap(&umem->pgs[i], 1, VM_MAP, PAGE_KERNEL);
else
addr = page_address(umem->pgs[i]);
if (!addr) {
xdp_umem_unmap_pages(umem);
return -ENOMEM;
}
umem->pages[i].addr = addr;
}
return 0;
}
static void xdp_umem_unpin_pages(struct xdp_umem *umem)
{
unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true);
@ -244,14 +213,9 @@ static void xdp_umem_release(struct xdp_umem *umem)
umem->cq = NULL;
}
xsk_reuseq_destroy(umem);
xdp_umem_unmap_pages(umem);
xp_destroy(umem->pool);
xdp_umem_unpin_pages(umem);
kvfree(umem->pages);
umem->pages = NULL;
xdp_umem_unaccount_pages(umem);
kfree(umem);
}
@ -385,11 +349,9 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
if (headroom >= chunk_size - XDP_PACKET_HEADROOM)
return -EINVAL;
umem->chunk_mask = unaligned_chunks ? XSK_UNALIGNED_BUF_ADDR_MASK
: ~((u64)chunk_size - 1);
umem->size = size;
umem->headroom = headroom;
umem->chunk_size_nohr = chunk_size - headroom;
umem->chunk_size = chunk_size;
umem->npgs = size / PAGE_SIZE;
umem->pgs = NULL;
umem->user = NULL;
@ -407,19 +369,14 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
if (err)
goto out_account;
umem->pages = kvcalloc(umem->npgs, sizeof(*umem->pages),
GFP_KERNEL_ACCOUNT);
if (!umem->pages) {
umem->pool = xp_create(umem->pgs, umem->npgs, chunks, chunk_size,
headroom, size, unaligned_chunks);
if (!umem->pool) {
err = -ENOMEM;
goto out_pin;
}
err = xdp_umem_map_pages(umem);
if (!err)
return 0;
kvfree(umem->pages);
out_pin:
xdp_umem_unpin_pages(umem);
out_account:


@ -6,7 +6,7 @@
#ifndef XDP_UMEM_H_
#define XDP_UMEM_H_
#include <net/xdp_sock.h>
#include <net/xdp_sock_drv.h>
int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
u16 queue_id, u16 flags);


@ -22,7 +22,7 @@
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/rculist.h>
#include <net/xdp_sock.h>
#include <net/xdp_sock_drv.h>
#include <net/xdp.h>
#include "xsk_queue.h"
@ -39,24 +39,6 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
READ_ONCE(xs->umem->fq);
}
bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt)
{
return xskq_cons_has_entries(umem->fq, cnt);
}
EXPORT_SYMBOL(xsk_umem_has_addrs);
bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
{
return xskq_cons_peek_addr(umem->fq, addr, umem);
}
EXPORT_SYMBOL(xsk_umem_peek_addr);
void xsk_umem_release_addr(struct xdp_umem *umem)
{
xskq_cons_release(umem->fq);
}
EXPORT_SYMBOL(xsk_umem_release_addr);
void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
{
if (umem->need_wakeup & XDP_WAKEUP_RX)
@ -117,76 +99,82 @@ bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
}
EXPORT_SYMBOL(xsk_umem_uses_need_wakeup);
/* If a buffer crosses a page boundary, we need to do 2 memcpy's, one for
* each page. This is only required in copy mode.
*/
static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf,
u32 len, u32 metalen)
void xp_release(struct xdp_buff_xsk *xskb)
{
void *to_buf = xdp_umem_get_data(umem, addr);
xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb;
}
addr = xsk_umem_add_offset_to_addr(addr);
if (xskq_cons_crosses_non_contig_pg(umem, addr, len + metalen)) {
void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr;
u64 page_start = addr & ~(PAGE_SIZE - 1);
u64 first_len = PAGE_SIZE - (addr - page_start);
static u64 xp_get_handle(struct xdp_buff_xsk *xskb)
{
u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start;
memcpy(to_buf, from_buf, first_len);
memcpy(next_pg_addr, from_buf + first_len,
len + metalen - first_len);
offset += xskb->pool->headroom;
if (!xskb->pool->unaligned)
return xskb->orig_addr + offset;
return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
}
return;
static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
u64 addr;
int err;
addr = xp_get_handle(xskb);
err = xskq_prod_reserve_desc(xs->rx, addr, len);
if (err) {
xs->rx_dropped++;
return err;
}
xp_release(xskb);
return 0;
}
static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len)
{
void *from_buf, *to_buf;
u32 metalen;
if (unlikely(xdp_data_meta_unsupported(from))) {
from_buf = from->data;
to_buf = to->data;
metalen = 0;
} else {
from_buf = from->data_meta;
metalen = from->data - from->data_meta;
to_buf = to->data - metalen;
}
memcpy(to_buf, from_buf, len + metalen);
}
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len,
bool explicit_free)
{
u64 offset = xs->umem->headroom;
u64 addr, memcpy_addr;
void *from_buf;
u32 metalen;
struct xdp_buff *xsk_xdp;
int err;
if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) ||
len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
if (len > xsk_umem_get_rx_frame_size(xs->umem)) {
xs->rx_dropped++;
return -ENOSPC;
}
if (unlikely(xdp_data_meta_unsupported(xdp))) {
from_buf = xdp->data;
metalen = 0;
} else {
from_buf = xdp->data_meta;
metalen = xdp->data - xdp->data_meta;
xsk_xdp = xsk_buff_alloc(xs->umem);
if (!xsk_xdp) {
xs->rx_dropped++;
return -ENOSPC;
}
memcpy_addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
__xsk_rcv_memcpy(xs->umem, memcpy_addr, from_buf, len, metalen);
offset += metalen;
addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
err = xskq_prod_reserve_desc(xs->rx, addr, len);
if (!err) {
xskq_cons_release(xs->umem->fq);
xsk_copy_xdp(xsk_xdp, xdp, len);
err = __xsk_rcv_zc(xs, xsk_xdp, len);
if (err) {
xsk_buff_free(xsk_xdp);
return err;
}
if (explicit_free)
xdp_return_buff(xdp);
return 0;
}
xs->rx_dropped++;
return err;
}
static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
int err = xskq_prod_reserve_desc(xs->rx, xdp->handle, len);
if (err)
xs->rx_dropped++;
return err;
}
static bool xsk_is_bound(struct xdp_sock *xs)
@ -199,7 +187,8 @@ static bool xsk_is_bound(struct xdp_sock *xs)
return false;
}
static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp,
bool explicit_free)
{
u32 len;
@ -211,8 +200,9 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
len = xdp->data_end - xdp->data;
return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ?
__xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len);
return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ?
__xsk_rcv_zc(xs, xdp, len) :
__xsk_rcv(xs, xdp, len, explicit_free);
}
static void xsk_flush(struct xdp_sock *xs)
@ -224,46 +214,11 @@ static void xsk_flush(struct xdp_sock *xs)
int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
u32 metalen = xdp->data - xdp->data_meta;
u32 len = xdp->data_end - xdp->data;
u64 offset = xs->umem->headroom;
void *buffer;
u64 addr;
int err;
spin_lock_bh(&xs->rx_lock);
if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) {
err = -EINVAL;
goto out_unlock;
}
if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) ||
len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
err = -ENOSPC;
goto out_drop;
}
addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
buffer = xdp_umem_get_data(xs->umem, addr);
memcpy(buffer, xdp->data_meta, len + metalen);
addr = xsk_umem_adjust_offset(xs->umem, addr, metalen);
err = xskq_prod_reserve_desc(xs->rx, addr, len);
if (err)
goto out_drop;
xskq_cons_release(xs->umem->fq);
xskq_prod_submit(xs->rx);
spin_unlock_bh(&xs->rx_lock);
xs->sk.sk_data_ready(&xs->sk);
return 0;
out_drop:
xs->rx_dropped++;
out_unlock:
err = xsk_rcv(xs, xdp, false);
xsk_flush(xs);
spin_unlock_bh(&xs->rx_lock);
return err;
}
@ -273,7 +228,7 @@ int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
int err;
err = xsk_rcv(xs, xdp);
err = xsk_rcv(xs, xdp, true);
if (err)
return err;
@ -404,7 +359,7 @@ static int xsk_generic_xmit(struct sock *sk)
skb_put(skb, len);
addr = desc.addr;
buffer = xdp_umem_get_data(xs->umem, addr);
buffer = xsk_buff_raw_get_data(xs->umem, addr);
err = skb_store_bits(skb, 0, buffer, len);
/* This is the backpressure mechanism for the Tx path.
* Reserve space in the completion queue and only proceed
@ -629,24 +584,6 @@ static struct socket *xsk_lookup_xsk_from_fd(int fd)
return sock;
}
/* Check if umem pages are contiguous.
* If zero-copy mode, use the DMA address to do the page contiguity check
* For all other modes we use addr (kernel virtual address)
* Store the result in the low bits of addr.
*/
static void xsk_check_page_contiguity(struct xdp_umem *umem, u32 flags)
{
struct xdp_umem_page *pgs = umem->pages;
int i, is_contig;
for (i = 0; i < umem->npgs - 1; i++) {
is_contig = (flags & XDP_ZEROCOPY) ?
(pgs[i].dma + PAGE_SIZE == pgs[i + 1].dma) :
(pgs[i].addr + PAGE_SIZE == pgs[i + 1].addr);
pgs[i].addr += is_contig << XSK_NEXT_PG_CONTIG_SHIFT;
}
}
static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
@ -729,23 +666,14 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
goto out_unlock;
} else {
/* This xsk has its own umem. */
xskq_set_umem(xs->umem->fq, xs->umem->size,
xs->umem->chunk_mask);
xskq_set_umem(xs->umem->cq, xs->umem->size,
xs->umem->chunk_mask);
err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
if (err)
goto out_unlock;
xsk_check_page_contiguity(xs->umem, flags);
}
xs->dev = dev;
xs->zc = xs->umem->zc;
xs->queue_id = qid;
xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask);
xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask);
xdp_add_sk_umem(xs->umem, xs);
out_unlock:
@ -860,6 +788,8 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname,
q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
&xs->umem->cq;
err = xsk_init_queue(entries, q, true);
if (optname == XDP_UMEM_FILL_RING)
xp_set_fq(xs->umem->pool, *q);
mutex_unlock(&xs->mutex);
return err;
}


@ -4,6 +4,20 @@
#ifndef XSK_H_
#define XSK_H_
/* Masks for xdp_umem_page flags.
* The low 12-bits of the addr will be 0 since this is the page address, so we
* can use them for flags.
*/
#define XSK_NEXT_PG_CONTIG_SHIFT 0
#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT)
/* Flags for the umem flags field.
*
* The NEED_WAKEUP flag is 1 due to the reuse of the flags field for public
* flags. See include/uapi/linux/if_xdp.h.
*/
#define XDP_UMEM_USES_NEED_WAKEUP BIT(1)
struct xdp_ring_offset_v1 {
__u64 producer;
__u64 consumer;
@ -17,9 +31,25 @@ struct xdp_mmap_offsets_v1 {
struct xdp_ring_offset_v1 cr;
};
/* Nodes are linked in the struct xdp_sock map_list field, and used to
* track which maps a certain socket resides in.
*/
struct xsk_map_node {
struct list_head node;
struct xsk_map *map;
struct xdp_sock **map_entry;
};
static inline struct xdp_sock *xdp_sk(struct sock *sk)
{
return (struct xdp_sock *)sk;
}
bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs);
void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
struct xdp_sock **map_entry);
int xsk_map_inc(struct xsk_map *map);
void xsk_map_put(struct xsk_map *map);
#endif /* XSK_H_ */

net/xdp/xsk_buff_pool.c (new file, 336 lines)

@ -0,0 +1,336 @@
// SPDX-License-Identifier: GPL-2.0
#include <net/xsk_buff_pool.h>
#include <net/xdp_sock.h>
#include <linux/dma-direct.h>
#include <linux/dma-noncoherent.h>
#include <linux/swiotlb.h>
#include "xsk_queue.h"
static void xp_addr_unmap(struct xsk_buff_pool *pool)
{
vunmap(pool->addrs);
}
static int xp_addr_map(struct xsk_buff_pool *pool,
struct page **pages, u32 nr_pages)
{
pool->addrs = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
if (!pool->addrs)
return -ENOMEM;
return 0;
}
void xp_destroy(struct xsk_buff_pool *pool)
{
if (!pool)
return;
xp_addr_unmap(pool);
kvfree(pool->heads);
kvfree(pool);
}
struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks,
u32 chunk_size, u32 headroom, u64 size,
bool unaligned)
{
struct xsk_buff_pool *pool;
struct xdp_buff_xsk *xskb;
int err;
u32 i;
pool = kvzalloc(struct_size(pool, free_heads, chunks), GFP_KERNEL);
if (!pool)
goto out;
pool->heads = kvcalloc(chunks, sizeof(*pool->heads), GFP_KERNEL);
if (!pool->heads)
goto out;
pool->chunk_mask = ~((u64)chunk_size - 1);
pool->addrs_cnt = size;
pool->heads_cnt = chunks;
pool->free_heads_cnt = chunks;
pool->headroom = headroom;
pool->chunk_size = chunk_size;
pool->cheap_dma = true;
pool->unaligned = unaligned;
pool->frame_len = chunk_size - headroom - XDP_PACKET_HEADROOM;
INIT_LIST_HEAD(&pool->free_list);
for (i = 0; i < pool->free_heads_cnt; i++) {
xskb = &pool->heads[i];
xskb->pool = pool;
xskb->xdp.frame_sz = chunk_size - headroom;
pool->free_heads[i] = xskb;
}
err = xp_addr_map(pool, pages, nr_pages);
if (!err)
return pool;
out:
xp_destroy(pool);
return NULL;
}
void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq)
{
pool->fq = fq;
}
void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq)
{
u32 i;
for (i = 0; i < pool->heads_cnt; i++)
pool->heads[i].xdp.rxq = rxq;
}
EXPORT_SYMBOL(xp_set_rxq_info);
void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs)
{
dma_addr_t *dma;
u32 i;
if (pool->dma_pages_cnt == 0)
return;
for (i = 0; i < pool->dma_pages_cnt; i++) {
dma = &pool->dma_pages[i];
if (*dma) {
dma_unmap_page_attrs(pool->dev, *dma, PAGE_SIZE,
DMA_BIDIRECTIONAL, attrs);
*dma = 0;
}
}
kvfree(pool->dma_pages);
pool->dma_pages_cnt = 0;
pool->dev = NULL;
}
EXPORT_SYMBOL(xp_dma_unmap);
static void xp_check_dma_contiguity(struct xsk_buff_pool *pool)
{
u32 i;
for (i = 0; i < pool->dma_pages_cnt - 1; i++) {
if (pool->dma_pages[i] + PAGE_SIZE == pool->dma_pages[i + 1])
pool->dma_pages[i] |= XSK_NEXT_PG_CONTIG_MASK;
else
pool->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK;
}
}
static bool __maybe_unused xp_check_swiotlb_dma(struct xsk_buff_pool *pool)
{
#if defined(CONFIG_SWIOTLB)
phys_addr_t paddr;
u32 i;
for (i = 0; i < pool->dma_pages_cnt; i++) {
paddr = dma_to_phys(pool->dev, pool->dma_pages[i]);
if (is_swiotlb_buffer(paddr))
return false;
}
#endif
return true;
}
static bool xp_check_cheap_dma(struct xsk_buff_pool *pool)
{
#if defined(CONFIG_HAS_DMA)
const struct dma_map_ops *ops = get_dma_ops(pool->dev);
if (ops) {
return !ops->sync_single_for_cpu &&
!ops->sync_single_for_device;
}
if (!dma_is_direct(ops))
return false;
if (!xp_check_swiotlb_dma(pool))
return false;
if (!dev_is_dma_coherent(pool->dev)) {
#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \
defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE)
return false;
#endif
}
#endif
return true;
}
int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
unsigned long attrs, struct page **pages, u32 nr_pages)
{
dma_addr_t dma;
u32 i;
pool->dma_pages = kvcalloc(nr_pages, sizeof(*pool->dma_pages),
GFP_KERNEL);
if (!pool->dma_pages)
return -ENOMEM;
pool->dev = dev;
pool->dma_pages_cnt = nr_pages;
for (i = 0; i < pool->dma_pages_cnt; i++) {
dma = dma_map_page_attrs(dev, pages[i], 0, PAGE_SIZE,
DMA_BIDIRECTIONAL, attrs);
if (dma_mapping_error(dev, dma)) {
xp_dma_unmap(pool, attrs);
return -ENOMEM;
}
pool->dma_pages[i] = dma;
}
if (pool->unaligned)
xp_check_dma_contiguity(pool);
pool->dev = dev;
pool->cheap_dma = xp_check_cheap_dma(pool);
return 0;
}
EXPORT_SYMBOL(xp_dma_map);
static bool xp_addr_crosses_non_contig_pg(struct xsk_buff_pool *pool,
u64 addr)
{
return xp_desc_crosses_non_contig_pg(pool, addr, pool->chunk_size);
}
static bool xp_check_unaligned(struct xsk_buff_pool *pool, u64 *addr)
{
*addr = xp_unaligned_extract_addr(*addr);
if (*addr >= pool->addrs_cnt ||
*addr + pool->chunk_size > pool->addrs_cnt ||
xp_addr_crosses_non_contig_pg(pool, *addr))
return false;
return true;
}
static bool xp_check_aligned(struct xsk_buff_pool *pool, u64 *addr)
{
*addr = xp_aligned_extract_addr(pool, *addr);
return *addr < pool->addrs_cnt;
}
static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool)
{
struct xdp_buff_xsk *xskb;
u64 addr;
bool ok;
if (pool->free_heads_cnt == 0)
return NULL;
xskb = pool->free_heads[--pool->free_heads_cnt];
for (;;) {
if (!xskq_cons_peek_addr_unchecked(pool->fq, &addr)) {
xp_release(xskb);
return NULL;
}
ok = pool->unaligned ? xp_check_unaligned(pool, &addr) :
xp_check_aligned(pool, &addr);
if (!ok) {
pool->fq->invalid_descs++;
xskq_cons_release(pool->fq);
continue;
}
break;
}
xskq_cons_release(pool->fq);
xskb->orig_addr = addr;
xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom;
if (pool->dma_pages_cnt) {
xskb->frame_dma = (pool->dma_pages[addr >> PAGE_SHIFT] &
~XSK_NEXT_PG_CONTIG_MASK) +
(addr & ~PAGE_MASK);
xskb->dma = xskb->frame_dma + pool->headroom +
XDP_PACKET_HEADROOM;
}
return xskb;
}
struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool)
{
struct xdp_buff_xsk *xskb;
if (!pool->free_list_cnt) {
xskb = __xp_alloc(pool);
if (!xskb)
return NULL;
} else {
pool->free_list_cnt--;
xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk,
free_list_node);
list_del(&xskb->free_list_node);
}
xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM;
xskb->xdp.data_meta = xskb->xdp.data;
if (!pool->cheap_dma) {
dma_sync_single_range_for_device(pool->dev, xskb->dma, 0,
pool->frame_len,
DMA_BIDIRECTIONAL);
}
return &xskb->xdp;
}
EXPORT_SYMBOL(xp_alloc);
bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count)
{
if (pool->free_list_cnt >= count)
return true;
return xskq_cons_has_entries(pool->fq, count - pool->free_list_cnt);
}
EXPORT_SYMBOL(xp_can_alloc);
void xp_free(struct xdp_buff_xsk *xskb)
{
xskb->pool->free_list_cnt++;
list_add(&xskb->free_list_node, &xskb->pool->free_list);
}
EXPORT_SYMBOL(xp_free);
void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
{
addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr;
return pool->addrs + addr;
}
EXPORT_SYMBOL(xp_raw_get_data);
dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr)
{
addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr;
return (pool->dma_pages[addr >> PAGE_SHIFT] &
~XSK_NEXT_PG_CONTIG_MASK) +
(addr & ~PAGE_MASK);
}
EXPORT_SYMBOL(xp_raw_get_dma);
void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb)
{
dma_sync_single_range_for_cpu(xskb->pool->dev, xskb->dma, 0,
xskb->pool->frame_len, DMA_BIDIRECTIONAL);
}
EXPORT_SYMBOL(xp_dma_sync_for_cpu_slow);
void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma,
size_t size)
{
dma_sync_single_range_for_device(pool->dev, dma, 0,
size, DMA_BIDIRECTIONAL);
}
EXPORT_SYMBOL(xp_dma_sync_for_device_slow);
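The exported pool API above is what the reworked drivers consume, usually through the thin xsk_buff_* wrappers in include/net/xdp_sock_drv.h. A minimal, illustrative Rx-refill sketch, assuming the driver has already created the pool and called xp_dma_map() and xp_set_rxq_info():

/* Illustrative sketch, not part of this patch: allocate up to 'count'
 * receive buffers from the pool. xp_alloc() consumes fill-queue entries
 * and hands back initialized xdp_buff structures whose DMA addresses are
 * already synced for the device where required; buffers are returned to
 * the pool later via xsk_buff_free()/xp_free().
 */
static u32 example_rx_refill(struct xsk_buff_pool *pool,
                             struct xdp_buff **bufs, u32 count)
{
        u32 i;

        if (!xp_can_alloc(pool, count))
                return 0;

        for (i = 0; i < count; i++) {
                bufs[i] = xp_alloc(pool);
                if (!bufs[i])
                        break;
        }
        return i;       /* number of buffers actually allocated */
}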


@ -56,7 +56,7 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb)
du.id = umem->id;
du.size = umem->size;
du.num_pages = umem->npgs;
du.chunk_size = umem->chunk_size_nohr + umem->headroom;
du.chunk_size = umem->chunk_size;
du.headroom = umem->headroom;
du.ifindex = umem->dev ? umem->dev->ifindex : 0;
du.queue_id = umem->queue_id;


@ -6,18 +6,10 @@
#include <linux/log2.h>
#include <linux/slab.h>
#include <linux/overflow.h>
#include <net/xdp_sock_drv.h>
#include "xsk_queue.h"
void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask)
{
if (!q)
return;
q->umem_size = umem_size;
q->chunk_mask = chunk_mask;
}
static size_t xskq_get_ring_size(struct xsk_queue *q, bool umem_queue)
{
struct xdp_umem_ring *umem_ring;
@ -63,56 +55,3 @@ void xskq_destroy(struct xsk_queue *q)
page_frag_free(q->ring);
kfree(q);
}
struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries)
{
struct xdp_umem_fq_reuse *newq;
/* Check for overflow */
if (nentries > (u32)roundup_pow_of_two(nentries))
return NULL;
nentries = roundup_pow_of_two(nentries);
newq = kvmalloc(struct_size(newq, handles, nentries), GFP_KERNEL);
if (!newq)
return NULL;
memset(newq, 0, offsetof(typeof(*newq), handles));
newq->nentries = nentries;
return newq;
}
EXPORT_SYMBOL_GPL(xsk_reuseq_prepare);
struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem,
struct xdp_umem_fq_reuse *newq)
{
struct xdp_umem_fq_reuse *oldq = umem->fq_reuse;
if (!oldq) {
umem->fq_reuse = newq;
return NULL;
}
if (newq->nentries < oldq->length)
return newq;
memcpy(newq->handles, oldq->handles,
array_size(oldq->length, sizeof(u64)));
newq->length = oldq->length;
umem->fq_reuse = newq;
return oldq;
}
EXPORT_SYMBOL_GPL(xsk_reuseq_swap);
void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq)
{
kvfree(rq);
}
EXPORT_SYMBOL_GPL(xsk_reuseq_free);
void xsk_reuseq_destroy(struct xdp_umem *umem)
{
xsk_reuseq_free(umem->fq_reuse);
umem->fq_reuse = NULL;
}


@ -9,6 +9,9 @@
#include <linux/types.h>
#include <linux/if_xdp.h>
#include <net/xdp_sock.h>
#include <net/xsk_buff_pool.h>
#include "xsk.h"
struct xdp_ring {
u32 producer ____cacheline_aligned_in_smp;
@ -29,8 +32,6 @@ struct xdp_umem_ring {
};
struct xsk_queue {
u64 chunk_mask;
u64 umem_size;
u32 ring_mask;
u32 nentries;
u32 cached_prod;
@ -103,98 +104,73 @@ struct xsk_queue {
/* Functions that read and validate content from consumer rings. */
static inline bool xskq_cons_crosses_non_contig_pg(struct xdp_umem *umem,
u64 addr,
u64 length)
{
bool cross_pg = (addr & (PAGE_SIZE - 1)) + length > PAGE_SIZE;
bool next_pg_contig =
(unsigned long)umem->pages[(addr >> PAGE_SHIFT)].addr &
XSK_NEXT_PG_CONTIG_MASK;
return cross_pg && !next_pg_contig;
}
static inline bool xskq_cons_is_valid_unaligned(struct xsk_queue *q,
u64 addr,
u64 length,
struct xdp_umem *umem)
{
u64 base_addr = xsk_umem_extract_addr(addr);
addr = xsk_umem_add_offset_to_addr(addr);
if (base_addr >= q->umem_size || addr >= q->umem_size ||
xskq_cons_crosses_non_contig_pg(umem, addr, length)) {
q->invalid_descs++;
return false;
}
return true;
}
static inline bool xskq_cons_is_valid_addr(struct xsk_queue *q, u64 addr)
{
if (addr >= q->umem_size) {
q->invalid_descs++;
return false;
}
return true;
}
static inline bool xskq_cons_read_addr(struct xsk_queue *q, u64 *addr,
struct xdp_umem *umem)
static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr)
{
struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
while (q->cached_cons != q->cached_prod) {
if (q->cached_cons != q->cached_prod) {
u32 idx = q->cached_cons & q->ring_mask;
*addr = ring->desc[idx] & q->chunk_mask;
if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) {
if (xskq_cons_is_valid_unaligned(q, *addr,
umem->chunk_size_nohr,
umem))
*addr = ring->desc[idx];
return true;
goto out;
}
if (xskq_cons_is_valid_addr(q, *addr))
return true;
out:
q->cached_cons++;
}
return false;
}
static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool,
struct xdp_desc *desc)
{
u64 chunk, chunk_end;
chunk = xp_aligned_extract_addr(pool, desc->addr);
chunk_end = xp_aligned_extract_addr(pool, desc->addr + desc->len);
if (chunk != chunk_end)
return false;
if (chunk >= pool->addrs_cnt)
return false;
if (desc->options)
return false;
return true;
}
static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool,
struct xdp_desc *desc)
{
u64 addr, base_addr;
base_addr = xp_unaligned_extract_addr(desc->addr);
addr = xp_unaligned_add_offset_to_addr(desc->addr);
if (desc->len > pool->chunk_size)
return false;
if (base_addr >= pool->addrs_cnt || addr >= pool->addrs_cnt ||
xp_desc_crosses_non_contig_pg(pool, addr, desc->len))
return false;
if (desc->options)
return false;
return true;
}
static inline bool xp_validate_desc(struct xsk_buff_pool *pool,
struct xdp_desc *desc)
{
return pool->unaligned ? xp_unaligned_validate_desc(pool, desc) :
xp_aligned_validate_desc(pool, desc);
}
static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q,
struct xdp_desc *d,
struct xdp_umem *umem)
{
if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) {
if (!xskq_cons_is_valid_unaligned(q, d->addr, d->len, umem))
return false;
if (d->len > umem->chunk_size_nohr || d->options) {
if (!xp_validate_desc(umem->pool, d)) {
q->invalid_descs++;
return false;
}
return true;
}
if (!xskq_cons_is_valid_addr(q, d->addr))
return false;
if (((d->addr + d->len) & q->chunk_mask) != (d->addr & q->chunk_mask) ||
d->options) {
q->invalid_descs++;
return false;
}
return true;
}
@ -250,12 +226,11 @@ static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt)
return entries >= cnt;
}
static inline bool xskq_cons_peek_addr(struct xsk_queue *q, u64 *addr,
struct xdp_umem *umem)
static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr)
{
if (q->cached_prod == q->cached_cons)
xskq_cons_get_entries(q);
return xskq_cons_read_addr(q, addr, umem);
return xskq_cons_read_addr_unchecked(q, addr);
}
static inline bool xskq_cons_peek_desc(struct xsk_queue *q,
@ -379,11 +354,7 @@ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q)
return q ? q->invalid_descs : 0;
}
void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask);
struct xsk_queue *xskq_create(u32 nentries, bool umem_queue);
void xskq_destroy(struct xsk_queue *q_ops);
/* Executed by the core when the entire UMEM gets freed */
void xsk_reuseq_destroy(struct xdp_umem *umem);
#endif /* _LINUX_XSK_QUEUE_H */


@ -9,6 +9,8 @@
#include <linux/slab.h>
#include <linux/sched.h>
#include "xsk.h"
int xsk_map_inc(struct xsk_map *map)
{
bpf_map_inc(&map->map);


@ -50,3 +50,4 @@ xdp_rxq_info
xdp_sample_pkts
xdp_tx_iptunnel
xdpsock
testfile.img


@ -63,14 +63,14 @@ TRACE_HELPERS := ../../tools/testing/selftests/bpf/trace_helpers.o
fds_example-objs := fds_example.o
sockex1-objs := sockex1_user.o
sockex2-objs := sockex2_user.o
sockex3-objs := bpf_load.o sockex3_user.o
tracex1-objs := bpf_load.o tracex1_user.o $(TRACE_HELPERS)
tracex2-objs := bpf_load.o tracex2_user.o
tracex3-objs := bpf_load.o tracex3_user.o
tracex4-objs := bpf_load.o tracex4_user.o
tracex5-objs := bpf_load.o tracex5_user.o $(TRACE_HELPERS)
tracex6-objs := bpf_load.o tracex6_user.o
tracex7-objs := bpf_load.o tracex7_user.o
sockex3-objs := sockex3_user.o
tracex1-objs := tracex1_user.o $(TRACE_HELPERS)
tracex2-objs := tracex2_user.o
tracex3-objs := tracex3_user.o
tracex4-objs := tracex4_user.o
tracex5-objs := tracex5_user.o $(TRACE_HELPERS)
tracex6-objs := tracex6_user.o
tracex7-objs := tracex7_user.o
test_probe_write_user-objs := bpf_load.o test_probe_write_user_user.o
trace_output-objs := bpf_load.o trace_output_user.o $(TRACE_HELPERS)
lathist-objs := bpf_load.o lathist_user.o


@ -13,12 +13,12 @@
#define MAX_IPS 8192
struct bpf_map_def SEC("maps") ip_map = {
.type = BPF_MAP_TYPE_HASH,
.key_size = sizeof(u64),
.value_size = sizeof(u32),
.max_entries = MAX_IPS,
};
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__type(key, u64);
__type(value, u32);
__uint(max_entries, MAX_IPS);
} ip_map SEC(".maps");
SEC("perf_event")
int do_sample(struct bpf_perf_event_data *ctx)


@ -18,9 +18,6 @@
#include "perf-sys.h"
#include "trace_helpers.h"
#define __must_check
#include <linux/err.h>
#define DEFAULT_FREQ 99
#define DEFAULT_SECS 5
#define MAX_IPS 8192
@ -57,7 +54,7 @@ static int sampling_start(int freq, struct bpf_program *prog,
return 1;
}
links[i] = bpf_program__attach_perf_event(prog, pmu_fd);
if (IS_ERR(links[i])) {
if (libbpf_get_error(links[i])) {
fprintf(stderr, "ERROR: Attach perf event\n");
links[i] = NULL;
close(pmu_fd);
@ -182,7 +179,7 @@ int main(int argc, char **argv)
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
obj = bpf_object__open_file(filename, NULL);
if (IS_ERR(obj)) {
if (libbpf_get_error(obj)) {
fprintf(stderr, "ERROR: opening BPF object file failed\n");
obj = NULL;
goto cleanup;


@ -19,12 +19,12 @@
#define PROG(F) SEC("socket/"__stringify(F)) int bpf_func_##F
struct bpf_map_def SEC("maps") jmp_table = {
.type = BPF_MAP_TYPE_PROG_ARRAY,
.key_size = sizeof(u32),
.value_size = sizeof(u32),
.max_entries = 8,
};
struct {
__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
__uint(key_size, sizeof(u32));
__uint(value_size, sizeof(u32));
__uint(max_entries, 8);
} jmp_table SEC(".maps");
#define PARSE_VLAN 1
#define PARSE_MPLS 2
@ -92,12 +92,12 @@ struct globals {
struct flow_key_record flow;
};
struct bpf_map_def SEC("maps") percpu_map = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(__u32),
.value_size = sizeof(struct globals),
.max_entries = 32,
};
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, __u32);
__type(value, struct globals);
__uint(max_entries, 32);
} percpu_map SEC(".maps");
/* user poor man's per_cpu until native support is ready */
static struct globals *this_cpu_globals(void)
@ -113,12 +113,12 @@ struct pair {
__u64 bytes;
};
struct bpf_map_def SEC("maps") hash_map = {
.type = BPF_MAP_TYPE_HASH,
.key_size = sizeof(struct flow_key_record),
.value_size = sizeof(struct pair),
.max_entries = 1024,
};
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__type(key, struct flow_key_record);
__type(value, struct pair);
__uint(max_entries, 1024);
} hash_map SEC(".maps");
static void update_stats(struct __sk_buff *skb, struct globals *g)
{


@ -1,18 +1,13 @@
// SPDX-License-Identifier: GPL-2.0
#include <stdio.h>
#include <assert.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>
#include "bpf_load.h"
#include <bpf/libbpf.h>
#include "sock_example.h"
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/resource.h>
#define PARSE_IP 3
#define PARSE_IP_PROG_FD (prog_fd[0])
#define PROG_ARRAY_FD (map_fd[0])
struct flow_key_record {
__be32 src;
__be32 dst;
@ -30,31 +25,55 @@ struct pair {
int main(int argc, char **argv)
{
int i, sock, key, fd, main_prog_fd, jmp_table_fd, hash_map_fd;
struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
struct bpf_program *prog;
struct bpf_object *obj;
char filename[256];
const char *title;
FILE *f;
int i, sock, err, id, key = PARSE_IP;
struct bpf_prog_info info = {};
uint32_t info_len = sizeof(info);
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
setrlimit(RLIMIT_MEMLOCK, &r);
if (load_bpf_file(filename)) {
printf("%s", bpf_log_buf);
return 1;
obj = bpf_object__open_file(filename, NULL);
if (libbpf_get_error(obj)) {
fprintf(stderr, "ERROR: opening BPF object file failed\n");
return 0;
}
/* Test fd array lookup which returns the id of the bpf_prog */
err = bpf_obj_get_info_by_fd(PARSE_IP_PROG_FD, &info, &info_len);
assert(!err);
err = bpf_map_lookup_elem(PROG_ARRAY_FD, &key, &id);
assert(!err);
assert(id == info.id);
/* load BPF program */
if (bpf_object__load(obj)) {
fprintf(stderr, "ERROR: loading BPF object file failed\n");
goto cleanup;
}
jmp_table_fd = bpf_object__find_map_fd_by_name(obj, "jmp_table");
hash_map_fd = bpf_object__find_map_fd_by_name(obj, "hash_map");
if (jmp_table_fd < 0 || hash_map_fd < 0) {
fprintf(stderr, "ERROR: finding a map in obj file failed\n");
goto cleanup;
}
bpf_object__for_each_program(prog, obj) {
fd = bpf_program__fd(prog);
title = bpf_program__title(prog, false);
if (sscanf(title, "socket/%d", &key) != 1) {
fprintf(stderr, "ERROR: finding prog failed\n");
goto cleanup;
}
if (key == 0)
main_prog_fd = fd;
else
bpf_map_update_elem(jmp_table_fd, &key, &fd, BPF_ANY);
}
sock = open_raw_sock("lo");
assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd[4],
/* attach BPF program to socket */
assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &main_prog_fd,
sizeof(__u32)) == 0);
if (argc > 1)
@ -69,8 +88,8 @@ int main(int argc, char **argv)
sleep(1);
printf("IP src.port -> dst.port bytes packets\n");
while (bpf_map_get_next_key(map_fd[2], &key, &next_key) == 0) {
bpf_map_lookup_elem(map_fd[2], &next_key, &value);
while (bpf_map_get_next_key(hash_map_fd, &key, &next_key) == 0) {
bpf_map_lookup_elem(hash_map_fd, &next_key, &value);
printf("%s.%05d -> %s.%05d %12lld %12lld\n",
inet_ntoa((struct in_addr){htonl(next_key.src)}),
next_key.port16[0],
@ -80,5 +99,8 @@ int main(int argc, char **argv)
key = next_key;
}
}
cleanup:
bpf_object__close(obj);
return 0;
}


@ -0,0 +1,13 @@
// SPDX-License-Identifier: GPL-2.0
#ifndef __TRACE_COMMON_H
#define __TRACE_COMMON_H
#ifdef __x86_64__
#define SYSCALL(SYS) "__x64_" __stringify(SYS)
#elif defined(__s390x__)
#define SYSCALL(SYS) "__s390x_" __stringify(SYS)
#else
#define SYSCALL(SYS) __stringify(SYS)
#endif
#endif
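The macro above builds the architecture-specific syscall wrapper symbol at compile time, so the samples can attach kprobes portably; on x86_64, SEC("kprobe/" SYSCALL(sys_write)) yields the section name "kprobe/__x64_sys_write". A hedged usage sketch (mirroring the converted tracex2 sample below; the program name is made up):

/* Illustrative only, not part of this patch: attach a kprobe to the
 * write(2) syscall wrapper using the arch-aware SYSCALL() helper.
 */
#include <linux/ptrace.h>
#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "trace_common.h"

SEC("kprobe/" SYSCALL(sys_write))
int example_sys_write_entry(struct pt_regs *ctx)
{
        return 0;       /* no-op body; the real samples collect stats here */
}

char _license[] SEC("license") = "GPL";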


@ -18,19 +18,19 @@ struct key_t {
u32 userstack;
};
struct bpf_map_def SEC("maps") counts = {
.type = BPF_MAP_TYPE_HASH,
.key_size = sizeof(struct key_t),
.value_size = sizeof(u64),
.max_entries = 10000,
};
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__type(key, struct key_t);
__type(value, u64);
__uint(max_entries, 10000);
} counts SEC(".maps");
struct bpf_map_def SEC("maps") stackmap = {
.type = BPF_MAP_TYPE_STACK_TRACE,
.key_size = sizeof(u32),
.value_size = PERF_MAX_STACK_DEPTH * sizeof(u64),
.max_entries = 10000,
};
struct {
__uint(type, BPF_MAP_TYPE_STACK_TRACE);
__uint(key_size, sizeof(u32));
__uint(value_size, PERF_MAX_STACK_DEPTH * sizeof(u64));
__uint(max_entries, 10000);
} stackmap SEC(".maps");
#define KERN_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP)
#define USER_STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK)


@ -16,9 +16,6 @@
#include "perf-sys.h"
#include "trace_helpers.h"
#define __must_check
#include <linux/err.h>
#define SAMPLE_FREQ 50
static int pid;
@ -159,7 +156,7 @@ static void test_perf_event_all_cpu(struct perf_event_attr *attr)
goto all_cpu_err;
}
links[i] = bpf_program__attach_perf_event(prog, pmu_fd);
if (IS_ERR(links[i])) {
if (libbpf_get_error(links[i])) {
printf("bpf_program__attach_perf_event failed\n");
links[i] = NULL;
close(pmu_fd);
@ -198,7 +195,7 @@ static void test_perf_event_task(struct perf_event_attr *attr)
goto err;
}
link = bpf_program__attach_perf_event(prog, pmu_fd);
if (IS_ERR(link)) {
if (libbpf_get_error(link)) {
printf("bpf_program__attach_perf_event failed\n");
link = NULL;
close(pmu_fd);
@ -314,7 +311,7 @@ int main(int argc, char **argv)
}
obj = bpf_object__open_file(filename, NULL);
if (IS_ERR(obj)) {
if (libbpf_get_error(obj)) {
printf("opening BPF object file failed\n");
obj = NULL;
goto cleanup;


@ -1,21 +1,41 @@
// SPDX-License-Identifier: GPL-2.0
#include <stdio.h>
#include <linux/bpf.h>
#include <unistd.h>
#include <bpf/bpf.h>
#include "bpf_load.h"
#include <bpf/libbpf.h>
#include "trace_helpers.h"
int main(int ac, char **argv)
{
FILE *f;
struct bpf_link *link = NULL;
struct bpf_program *prog;
struct bpf_object *obj;
char filename[256];
FILE *f;
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
obj = bpf_object__open_file(filename, NULL);
if (libbpf_get_error(obj)) {
fprintf(stderr, "ERROR: opening BPF object file failed\n");
return 0;
}
if (load_bpf_file(filename)) {
printf("%s", bpf_log_buf);
return 1;
prog = bpf_object__find_program_by_name(obj, "bpf_prog1");
if (!prog) {
fprintf(stderr, "ERROR: finding a prog in obj file failed\n");
goto cleanup;
}
/* load BPF program */
if (bpf_object__load(obj)) {
fprintf(stderr, "ERROR: loading BPF object file failed\n");
goto cleanup;
}
link = bpf_program__attach(prog);
if (libbpf_get_error(link)) {
fprintf(stderr, "ERROR: bpf_program__attach failed\n");
link = NULL;
goto cleanup;
}
f = popen("taskset 1 ping -c5 localhost", "r");
@ -23,5 +43,8 @@ int main(int ac, char **argv)
read_trace_pipe();
cleanup:
bpf_link__destroy(link);
bpf_object__close(obj);
return 0;
}


@ -10,13 +10,14 @@
#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "trace_common.h"
struct bpf_map_def SEC("maps") my_map = {
.type = BPF_MAP_TYPE_HASH,
.key_size = sizeof(long),
.value_size = sizeof(long),
.max_entries = 1024,
};
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__type(key, long);
__type(value, long);
__uint(max_entries, 1024);
} my_map SEC(".maps");
/* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe
* example will no longer be meaningful
@ -70,14 +71,14 @@ struct hist_key {
u64 index;
};
struct bpf_map_def SEC("maps") my_hist_map = {
.type = BPF_MAP_TYPE_PERCPU_HASH,
.key_size = sizeof(struct hist_key),
.value_size = sizeof(long),
.max_entries = 1024,
};
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
__uint(key_size, sizeof(struct hist_key));
__uint(value_size, sizeof(long));
__uint(max_entries, 1024);
} my_hist_map SEC(".maps");
SEC("kprobe/sys_write")
SEC("kprobe/" SYSCALL(sys_write))
int bpf_prog3(struct pt_regs *ctx)
{
long write_size = PT_REGS_PARM3(ctx);


@ -3,17 +3,19 @@
#include <unistd.h>
#include <stdlib.h>
#include <signal.h>
#include <linux/bpf.h>
#include <string.h>
#include <sys/resource.h>
#include <bpf/bpf.h>
#include "bpf_load.h"
#include <bpf/libbpf.h>
#include "bpf_util.h"
#define MAX_INDEX 64
#define MAX_STARS 38
/* my_map, my_hist_map */
static int map_fd[2];
static void stars(char *str, long val, long max, int width)
{
int i;
@ -115,18 +117,39 @@ static void int_exit(int sig)
int main(int ac, char **argv)
{
struct rlimit r = {1024*1024, RLIM_INFINITY};
char filename[256];
long key, next_key, value;
struct bpf_link *links[2];
struct bpf_program *prog;
struct bpf_object *obj;
char filename[256];
int i, j = 0;
FILE *f;
int i;
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
if (setrlimit(RLIMIT_MEMLOCK, &r)) {
perror("setrlimit(RLIMIT_MEMLOCK)");
return 1;
}
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
obj = bpf_object__open_file(filename, NULL);
if (libbpf_get_error(obj)) {
fprintf(stderr, "ERROR: opening BPF object file failed\n");
return 0;
}
/* load BPF program */
if (bpf_object__load(obj)) {
fprintf(stderr, "ERROR: loading BPF object file failed\n");
goto cleanup;
}
map_fd[0] = bpf_object__find_map_fd_by_name(obj, "my_map");
map_fd[1] = bpf_object__find_map_fd_by_name(obj, "my_hist_map");
if (map_fd[0] < 0 || map_fd[1] < 0) {
fprintf(stderr, "ERROR: finding a map in obj file failed\n");
goto cleanup;
}
signal(SIGINT, int_exit);
signal(SIGTERM, int_exit);
@ -138,9 +161,14 @@ int main(int ac, char **argv)
f = popen("dd if=/dev/zero of=/dev/null count=5000000", "r");
(void) f;
if (load_bpf_file(filename)) {
printf("%s", bpf_log_buf);
return 1;
bpf_object__for_each_program(prog, obj) {
links[j] = bpf_program__attach(prog);
if (libbpf_get_error(links[j])) {
fprintf(stderr, "ERROR: bpf_program__attach failed\n");
links[j] = NULL;
goto cleanup;
}
j++;
}
for (i = 0; i < 5; i++) {
@ -156,5 +184,10 @@ int main(int ac, char **argv)
}
print_hist(map_fd[1]);
cleanup:
for (j--; j >= 0; j--)
bpf_link__destroy(links[j]);
bpf_object__close(obj);
return 0;
}


@ -11,12 +11,12 @@
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
struct bpf_map_def SEC("maps") my_map = {
.type = BPF_MAP_TYPE_HASH,
.key_size = sizeof(long),
.value_size = sizeof(u64),
.max_entries = 4096,
};
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__type(key, long);
__type(value, u64);
__uint(max_entries, 4096);
} my_map SEC(".maps");
/* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe
* example will no longer be meaningful
@ -42,12 +42,12 @@ static unsigned int log2l(unsigned long long n)
#define SLOTS 100
struct bpf_map_def SEC("maps") lat_map = {
.type = BPF_MAP_TYPE_PERCPU_ARRAY,
.key_size = sizeof(u32),
.value_size = sizeof(u64),
.max_entries = SLOTS,
};
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(key_size, sizeof(u32));
__uint(value_size, sizeof(u64));
__uint(max_entries, SLOTS);
} lat_map SEC(".maps");
SEC("kprobe/blk_account_io_completion")
int bpf_prog2(struct pt_regs *ctx)


@ -7,11 +7,10 @@
#include <unistd.h>
#include <stdbool.h>
#include <string.h>
#include <linux/bpf.h>
#include <sys/resource.h>
#include <bpf/bpf.h>
#include "bpf_load.h"
#include <bpf/libbpf.h>
#include "bpf_util.h"
#define SLOTS 100
@ -109,20 +108,11 @@ static void print_hist(int fd)
int main(int ac, char **argv)
{
struct rlimit r = {1024*1024, RLIM_INFINITY};
struct bpf_link *links[2];
struct bpf_program *prog;
struct bpf_object *obj;
char filename[256];
int i;
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
if (setrlimit(RLIMIT_MEMLOCK, &r)) {
perror("setrlimit(RLIMIT_MEMLOCK)");
return 1;
}
if (load_bpf_file(filename)) {
printf("%s", bpf_log_buf);
return 1;
}
int map_fd, i, j = 0;
for (i = 1; i < ac; i++) {
if (strcmp(argv[i], "-a") == 0) {
@ -137,6 +127,40 @@ int main(int ac, char **argv)
}
}
if (setrlimit(RLIMIT_MEMLOCK, &r)) {
perror("setrlimit(RLIMIT_MEMLOCK)");
return 1;
}
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
obj = bpf_object__open_file(filename, NULL);
if (libbpf_get_error(obj)) {
fprintf(stderr, "ERROR: opening BPF object file failed\n");
return 0;
}
/* load BPF program */
if (bpf_object__load(obj)) {
fprintf(stderr, "ERROR: loading BPF object file failed\n");
goto cleanup;
}
map_fd = bpf_object__find_map_fd_by_name(obj, "lat_map");
if (map_fd < 0) {
fprintf(stderr, "ERROR: finding a map in obj file failed\n");
goto cleanup;
}
bpf_object__for_each_program(prog, obj) {
links[j] = bpf_program__attach(prog);
if (libbpf_get_error(links[j])) {
fprintf(stderr, "ERROR: bpf_program__attach failed\n");
links[j] = NULL;
goto cleanup;
}
j++;
}
printf(" heatmap of IO latency\n");
if (text_only)
printf(" %s", sym[num_colors - 1]);
@ -153,9 +177,14 @@ int main(int ac, char **argv)
for (i = 0; ; i++) {
if (i % 20 == 0)
print_banner();
print_hist(map_fd[1]);
print_hist(map_fd);
sleep(2);
}
cleanup:
for (j--; j >= 0; j--)
bpf_link__destroy(links[j]);
bpf_object__close(obj);
return 0;
}


@ -15,12 +15,12 @@ struct pair {
u64 ip;
};
struct bpf_map_def SEC("maps") my_map = {
.type = BPF_MAP_TYPE_HASH,
.key_size = sizeof(long),
.value_size = sizeof(struct pair),
.max_entries = 1000000,
};
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__type(key, long);
__type(value, struct pair);
__uint(max_entries, 1000000);
} my_map SEC(".maps");
/* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe
* example will no longer be meaningful


@ -8,11 +8,10 @@
#include <stdbool.h>
#include <string.h>
#include <time.h>
#include <linux/bpf.h>
#include <sys/resource.h>
#include <bpf/bpf.h>
#include "bpf_load.h"
#include <bpf/libbpf.h>
struct pair {
long long val;
@ -36,8 +35,8 @@ static void print_old_objects(int fd)
key = write(1, "\e[1;1H\e[2J", 12); /* clear screen */
key = -1;
while (bpf_map_get_next_key(map_fd[0], &key, &next_key) == 0) {
bpf_map_lookup_elem(map_fd[0], &next_key, &v);
while (bpf_map_get_next_key(fd, &key, &next_key) == 0) {
bpf_map_lookup_elem(fd, &next_key, &v);
key = next_key;
if (val - v.val < 1000000000ll)
/* object was allocated more than 1 sec ago */
@ -50,25 +49,55 @@ static void print_old_objects(int fd)
int main(int ac, char **argv)
{
struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
struct bpf_link *links[2];
struct bpf_program *prog;
struct bpf_object *obj;
char filename[256];
int i;
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
int map_fd, i, j = 0;
if (setrlimit(RLIMIT_MEMLOCK, &r)) {
perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)");
return 1;
}
if (load_bpf_file(filename)) {
printf("%s", bpf_log_buf);
return 1;
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
obj = bpf_object__open_file(filename, NULL);
if (libbpf_get_error(obj)) {
fprintf(stderr, "ERROR: opening BPF object file failed\n");
return 0;
}
/* load BPF program */
if (bpf_object__load(obj)) {
fprintf(stderr, "ERROR: loading BPF object file failed\n");
goto cleanup;
}
map_fd = bpf_object__find_map_fd_by_name(obj, "my_map");
if (map_fd < 0) {
fprintf(stderr, "ERROR: finding a map in obj file failed\n");
goto cleanup;
}
bpf_object__for_each_program(prog, obj) {
links[j] = bpf_program__attach(prog);
if (libbpf_get_error(links[j])) {
fprintf(stderr, "ERROR: bpf_program__attach failed\n");
links[j] = NULL;
goto cleanup;
}
j++;
}
for (i = 0; ; i++) {
print_old_objects(map_fd[1]);
print_old_objects(map_fd);
sleep(1);
}
cleanup:
for (j--; j >= 0; j--)
bpf_link__destroy(links[j]);
bpf_object__close(obj);
return 0;
}


@ -15,16 +15,16 @@
#define PROG(F) SEC("kprobe/"__stringify(F)) int bpf_func_##F
struct bpf_map_def SEC("maps") progs = {
.type = BPF_MAP_TYPE_PROG_ARRAY,
.key_size = sizeof(u32),
.value_size = sizeof(u32),
struct {
__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
__uint(key_size, sizeof(u32));
__uint(value_size, sizeof(u32));
#ifdef __mips__
.max_entries = 6000, /* MIPS n64 syscalls start at 5000 */
__uint(max_entries, 6000); /* MIPS n64 syscalls start at 5000 */
#else
.max_entries = 1024,
__uint(max_entries, 1024);
#endif
};
} progs SEC(".maps");
SEC("kprobe/__seccomp_filter")
int bpf_prog1(struct pt_regs *ctx)


@ -1,15 +1,21 @@
// SPDX-License-Identifier: GPL-2.0
#include <stdio.h>
#include <linux/bpf.h>
#include <stdlib.h>
#include <unistd.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <sys/prctl.h>
#include <bpf/bpf.h>
#include "bpf_load.h"
#include <bpf/libbpf.h>
#include <sys/resource.h>
#include "trace_helpers.h"
#ifdef __mips__
#define MAX_ENTRIES 6000 /* MIPS n64 syscalls start at 5000 */
#else
#define MAX_ENTRIES 1024
#endif
/* install fake seccomp program to enable seccomp code path inside the kernel,
* so that our kprobe attached to seccomp_phase1() can be triggered
*/
@ -28,16 +34,57 @@ static void install_accept_all_seccomp(void)
int main(int ac, char **argv)
{
FILE *f;
char filename[256];
struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
struct bpf_link *link = NULL;
struct bpf_program *prog;
struct bpf_object *obj;
int key, fd, progs_fd;
char filename[256];
const char *title;
FILE *f;
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
setrlimit(RLIMIT_MEMLOCK, &r);
if (load_bpf_file(filename)) {
printf("%s", bpf_log_buf);
return 1;
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
obj = bpf_object__open_file(filename, NULL);
if (libbpf_get_error(obj)) {
fprintf(stderr, "ERROR: opening BPF object file failed\n");
return 0;
}
prog = bpf_object__find_program_by_name(obj, "bpf_prog1");
if (!prog) {
printf("finding a prog in obj file failed\n");
goto cleanup;
}
/* load BPF program */
if (bpf_object__load(obj)) {
fprintf(stderr, "ERROR: loading BPF object file failed\n");
goto cleanup;
}
link = bpf_program__attach(prog);
if (libbpf_get_error(link)) {
fprintf(stderr, "ERROR: bpf_program__attach failed\n");
link = NULL;
goto cleanup;
}
progs_fd = bpf_object__find_map_fd_by_name(obj, "progs");
if (progs_fd < 0) {
fprintf(stderr, "ERROR: finding a map in obj file failed\n");
goto cleanup;
}
bpf_object__for_each_program(prog, obj) {
title = bpf_program__title(prog, false);
/* register only syscalls to PROG_ARRAY */
if (sscanf(title, "kprobe/%d", &key) != 1)
continue;
fd = bpf_program__fd(prog);
bpf_map_update_elem(progs_fd, &key, &fd, BPF_ANY);
}
install_accept_all_seccomp();
@ -47,5 +94,8 @@ int main(int ac, char **argv)
read_trace_pipe();
cleanup:
bpf_link__destroy(link);
bpf_object__close(obj);
return 0;
}


@ -3,24 +3,26 @@
#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
struct bpf_map_def SEC("maps") counters = {
.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
.key_size = sizeof(int),
.value_size = sizeof(u32),
.max_entries = 64,
};
struct bpf_map_def SEC("maps") values = {
.type = BPF_MAP_TYPE_HASH,
.key_size = sizeof(int),
.value_size = sizeof(u64),
.max_entries = 64,
};
struct bpf_map_def SEC("maps") values2 = {
.type = BPF_MAP_TYPE_HASH,
.key_size = sizeof(int),
.value_size = sizeof(struct bpf_perf_event_value),
.max_entries = 64,
};
struct {
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
__uint(key_size, sizeof(int));
__uint(value_size, sizeof(u32));
__uint(max_entries, 64);
} counters SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__type(key, int);
__type(value, u64);
__uint(max_entries, 64);
} values SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__type(key, int);
__type(value, struct bpf_perf_event_value);
__uint(max_entries, 64);
} values2 SEC(".maps");
SEC("kprobe/htab_map_get_next_key")
int bpf_prog1(struct pt_regs *ctx)


@ -4,7 +4,6 @@
#include <assert.h>
#include <fcntl.h>
#include <linux/perf_event.h>
#include <linux/bpf.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
@ -15,12 +14,15 @@
#include <sys/wait.h>
#include <unistd.h>
#include "bpf_load.h"
#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include "perf-sys.h"
#define SAMPLE_PERIOD 0x7fffffffffffffffULL
/* counters, values, values2 */
static int map_fd[3];
static void check_on_cpu(int cpu, struct perf_event_attr *attr)
{
struct bpf_perf_event_value value2;
@ -174,16 +176,51 @@ static void test_bpf_perf_event(void)
int main(int argc, char **argv)
{
struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
struct bpf_link *links[2];
struct bpf_program *prog;
struct bpf_object *obj;
char filename[256];
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
int i = 0;
setrlimit(RLIMIT_MEMLOCK, &r);
if (load_bpf_file(filename)) {
printf("%s", bpf_log_buf);
return 1;
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
obj = bpf_object__open_file(filename, NULL);
if (libbpf_get_error(obj)) {
fprintf(stderr, "ERROR: opening BPF object file failed\n");
return 0;
}
/* load BPF program */
if (bpf_object__load(obj)) {
fprintf(stderr, "ERROR: loading BPF object file failed\n");
goto cleanup;
}
map_fd[0] = bpf_object__find_map_fd_by_name(obj, "counters");
map_fd[1] = bpf_object__find_map_fd_by_name(obj, "values");
map_fd[2] = bpf_object__find_map_fd_by_name(obj, "values2");
if (map_fd[0] < 0 || map_fd[1] < 0 || map_fd[2] < 0) {
fprintf(stderr, "ERROR: finding a map in obj file failed\n");
goto cleanup;
}
bpf_object__for_each_program(prog, obj) {
links[i] = bpf_program__attach(prog);
if (libbpf_get_error(links[i])) {
fprintf(stderr, "ERROR: bpf_program__attach failed\n");
links[i] = NULL;
goto cleanup;
}
i++;
}
test_bpf_perf_event();
cleanup:
for (i--; i >= 0; i--)
bpf_link__destroy(links[i]);
bpf_object__close(obj);
return 0;
}


@ -1,28 +1,51 @@
#define _GNU_SOURCE
#include <stdio.h>
#include <linux/bpf.h>
#include <unistd.h>
#include <bpf/bpf.h>
#include "bpf_load.h"
#include <bpf/libbpf.h>
int main(int argc, char **argv)
{
FILE *f;
struct bpf_link *link = NULL;
struct bpf_program *prog;
struct bpf_object *obj;
char filename[256];
char command[256];
int ret;
int ret = 0;
FILE *f;
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
obj = bpf_object__open_file(filename, NULL);
if (libbpf_get_error(obj)) {
fprintf(stderr, "ERROR: opening BPF object file failed\n");
return 0;
}
if (load_bpf_file(filename)) {
printf("%s", bpf_log_buf);
return 1;
prog = bpf_object__find_program_by_name(obj, "bpf_prog1");
if (!prog) {
fprintf(stderr, "ERROR: finding a prog in obj file failed\n");
goto cleanup;
}
/* load BPF program */
if (bpf_object__load(obj)) {
fprintf(stderr, "ERROR: loading BPF object file failed\n");
goto cleanup;
}
link = bpf_program__attach(prog);
if (libbpf_get_error(link)) {
fprintf(stderr, "ERROR: bpf_program__attach failed\n");
link = NULL;
goto cleanup;
}
snprintf(command, 256, "mount %s tmpmnt/", argv[1]);
f = popen(command, "r");
ret = pclose(f);
cleanup:
bpf_link__destroy(link);
bpf_object__close(obj);
return ret ? 0 : 1;
}


@ -19,9 +19,6 @@ static const char *__doc__ =
#include <time.h>
#include <linux/limits.h>
#define __must_check
#include <linux/err.h>
#include <arpa/inet.h>
#include <linux/if_link.h>
@ -622,7 +619,7 @@ static struct bpf_link * attach_tp(struct bpf_object *obj,
}
link = bpf_program__attach_tracepoint(prog, tp_category, tp_name);
if (IS_ERR(link))
if (libbpf_get_error(link))
exit(EXIT_FAIL_BPF);
return link;


@ -29,8 +29,8 @@ CGROUP COMMANDS
| *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
| *ATTACH_TYPE* := { **ingress** | **egress** | **sock_create** | **sock_ops** | **device** |
| **bind4** | **bind6** | **post_bind4** | **post_bind6** | **connect4** | **connect6** |
| **sendmsg4** | **sendmsg6** | **recvmsg4** | **recvmsg6** | **sysctl** |
| **getsockopt** | **setsockopt** }
| **getpeername4** | **getpeername6** | **getsockname4** | **getsockname6** | **sendmsg4** |
| **sendmsg6** | **recvmsg4** | **recvmsg6** | **sysctl** | **getsockopt** | **setsockopt** }
| *ATTACH_FLAGS* := { **multi** | **override** }
DESCRIPTION
@ -101,7 +101,11 @@ DESCRIPTION
an unconnected udp6 socket (since 5.2);
**sysctl** sysctl access (since 5.2);
**getsockopt** call to getsockopt (since 5.3);
**setsockopt** call to setsockopt (since 5.3).
**setsockopt** call to setsockopt (since 5.3);
**getpeername4** call to getpeername(2) for an inet4 socket (since 5.8);
**getpeername6** call to getpeername(2) for an inet6 socket (since 5.8);
**getsockname4** call to getsockname(2) for an inet4 socket (since 5.8);
**getsockname6** call to getsockname(2) for an inet6 socket (since 5.8).
**bpftool cgroup detach** *CGROUP* *ATTACH_TYPE* *PROG*
Detach *PROG* from the cgroup *CGROUP* and attach type


@ -41,7 +41,8 @@ PROG COMMANDS
| **cgroup/sock** | **cgroup/dev** | **lwt_in** | **lwt_out** | **lwt_xmit** |
| **lwt_seg6local** | **sockops** | **sk_skb** | **sk_msg** | **lirc_mode2** |
| **cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** |
| **cgroup/connect4** | **cgroup/connect6** | **cgroup/sendmsg4** | **cgroup/sendmsg6** |
| **cgroup/connect4** | **cgroup/connect6** | **cgroup/getpeername4** | **cgroup/getpeername6** |
| **cgroup/getsockname4** | **cgroup/getsockname6** | **cgroup/sendmsg4** | **cgroup/sendmsg6** |
| **cgroup/recvmsg4** | **cgroup/recvmsg6** | **cgroup/sysctl** |
| **cgroup/getsockopt** | **cgroup/setsockopt** |
| **struct_ops** | **fentry** | **fexit** | **freplace**


@ -472,6 +472,8 @@ _bpftool()
lwt_seg6local sockops sk_skb sk_msg \
lirc_mode2 cgroup/bind4 cgroup/bind6 \
cgroup/connect4 cgroup/connect6 \
cgroup/getpeername4 cgroup/getpeername6 \
cgroup/getsockname4 cgroup/getsockname6 \
cgroup/sendmsg4 cgroup/sendmsg6 \
cgroup/recvmsg4 cgroup/recvmsg6 \
cgroup/post_bind4 cgroup/post_bind6 \
@ -966,9 +968,10 @@ _bpftool()
;;
attach|detach)
local ATTACH_TYPES='ingress egress sock_create sock_ops \
device bind4 bind6 post_bind4 post_bind6 connect4 \
connect6 sendmsg4 sendmsg6 recvmsg4 recvmsg6 sysctl \
getsockopt setsockopt'
device bind4 bind6 post_bind4 post_bind6 connect4 connect6 \
getpeername4 getpeername6 getsockname4 getsockname6 \
sendmsg4 sendmsg6 recvmsg4 recvmsg6 sysctl getsockopt \
setsockopt'
local ATTACH_FLAGS='multi override'
local PROG_TYPE='id pinned tag name'
case $prev in
@ -977,9 +980,9 @@ _bpftool()
return 0
;;
ingress|egress|sock_create|sock_ops|device|bind4|bind6|\
post_bind4|post_bind6|connect4|connect6|sendmsg4|\
sendmsg6|recvmsg4|recvmsg6|sysctl|getsockopt|\
setsockopt)
post_bind4|post_bind6|connect4|connect6|getpeername4|\
getpeername6|getsockname4|getsockname6|sendmsg4|sendmsg6|\
recvmsg4|recvmsg6|sysctl|getsockopt|setsockopt)
COMPREPLY=( $( compgen -W "$PROG_TYPE" -- \
"$cur" ) )
return 0


@ -25,9 +25,10 @@
" ATTACH_TYPE := { ingress | egress | sock_create |\n" \
" sock_ops | device | bind4 | bind6 |\n" \
" post_bind4 | post_bind6 | connect4 |\n" \
" connect6 | sendmsg4 | sendmsg6 |\n" \
" recvmsg4 | recvmsg6 | sysctl |\n" \
" getsockopt | setsockopt }"
" connect6 | getpeername4 | getpeername6 |\n" \
" getsockname4 | getsockname6 | sendmsg4 |\n" \
" sendmsg6 | recvmsg4 | recvmsg6 |\n" \
" sysctl | getsockopt | setsockopt }"
static unsigned int query_flags;


@ -100,6 +100,10 @@ static const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE] = {
[BPF_CGROUP_INET6_CONNECT] = "connect6",
[BPF_CGROUP_INET4_POST_BIND] = "post_bind4",
[BPF_CGROUP_INET6_POST_BIND] = "post_bind6",
[BPF_CGROUP_INET4_GETPEERNAME] = "getpeername4",
[BPF_CGROUP_INET6_GETPEERNAME] = "getpeername6",
[BPF_CGROUP_INET4_GETSOCKNAME] = "getsockname4",
[BPF_CGROUP_INET6_GETSOCKNAME] = "getsockname6",
[BPF_CGROUP_UDP4_SENDMSG] = "sendmsg4",
[BPF_CGROUP_UDP6_SENDMSG] = "sendmsg6",
[BPF_CGROUP_SYSCTL] = "sysctl",


@ -2012,8 +2012,10 @@ static int do_help(int argc, char **argv)
" sk_reuseport | flow_dissector | cgroup/sysctl |\n"
" cgroup/bind4 | cgroup/bind6 | cgroup/post_bind4 |\n"
" cgroup/post_bind6 | cgroup/connect4 | cgroup/connect6 |\n"
" cgroup/sendmsg4 | cgroup/sendmsg6 | cgroup/recvmsg4 |\n"
" cgroup/recvmsg6 | cgroup/getsockopt | cgroup/setsockopt |\n"
" cgroup/getpeername4 | cgroup/getpeername6 |\n"
" cgroup/getsockname4 | cgroup/getsockname6 | cgroup/sendmsg4 |\n"
" cgroup/sendmsg6 | cgroup/recvmsg4 | cgroup/recvmsg6 |\n"
" cgroup/getsockopt | cgroup/setsockopt |\n"
" struct_ops | fentry | fexit | freplace }\n"
" ATTACH_TYPE := { msg_verdict | stream_verdict | stream_parser |\n"
" flow_dissector }\n"


@ -73,7 +73,7 @@ struct bpf_insn {
/* Key of a BPF_MAP_TYPE_LPM_TRIE entry */
struct bpf_lpm_trie_key {
__u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */
__u8 data[]; /* Arbitrary size */
__u8 data[0]; /* Arbitrary size */
};
struct bpf_cgroup_storage_key {
@ -220,6 +220,10 @@ enum bpf_attach_type {
BPF_MODIFY_RETURN,
BPF_LSM_MAC,
BPF_TRACE_ITER,
BPF_CGROUP_INET4_GETPEERNAME,
BPF_CGROUP_INET6_GETPEERNAME,
BPF_CGROUP_INET4_GETSOCKNAME,
BPF_CGROUP_INET6_GETSOCKNAME,
__MAX_BPF_ATTACH_TYPE
};
@ -2015,8 +2019,8 @@ union bpf_attr {
* int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta)
* Description
* Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is
* only possible to shrink the packet as of this writing,
* therefore *delta* must be a negative integer.
* possible to both shrink and grow the packet tail.
* Shrinking is done by passing a negative *delta*.
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers

View File

@ -60,7 +60,7 @@ struct hashmap *hashmap__new(hashmap_hash_fn hash_fn,
void hashmap__clear(struct hashmap *map)
{
struct hashmap_entry *cur, *tmp;
int bkt;
size_t bkt;
hashmap__for_each_entry_safe(map, cur, tmp, bkt) {
free(cur);
@ -100,8 +100,7 @@ static int hashmap_grow(struct hashmap *map)
struct hashmap_entry **new_buckets;
struct hashmap_entry *cur, *tmp;
size_t new_cap_bits, new_cap;
size_t h;
int bkt;
size_t h, bkt;
new_cap_bits = map->cap_bits + 1;
if (new_cap_bits < HASHMAP_MIN_CAP_BITS)
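The bucket index handed to the hashmap iteration macros is widened from int to size_t here, matching the map's capacity type. A small usage sketch under that assumption (the dump function itself is illustrative):

#include <stdio.h>
#include "hashmap.h"	/* libbpf's internal tools/lib/bpf/hashmap.h */

/* Walk every entry in the table; bkt must now be size_t. */
static void dump_entries(struct hashmap *map)
{
	struct hashmap_entry *cur;
	size_t bkt;	/* was int before this change */

	hashmap__for_each_entry(map, cur, bkt)
		printf("key=%p value=%p\n", (void *)cur->key, cur->value);
}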

View File

@ -15,7 +15,6 @@
#else
#include <bits/reg.h>
#endif
#include "libbpf_internal.h"
static inline size_t hash_bits(size_t h, int bits)
{

View File

@ -6705,6 +6705,14 @@ static const struct bpf_sec_def section_defs[] = {
BPF_CGROUP_UDP4_RECVMSG),
BPF_EAPROG_SEC("cgroup/recvmsg6", BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
BPF_CGROUP_UDP6_RECVMSG),
BPF_EAPROG_SEC("cgroup/getpeername4", BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
BPF_CGROUP_INET4_GETPEERNAME),
BPF_EAPROG_SEC("cgroup/getpeername6", BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
BPF_CGROUP_INET6_GETPEERNAME),
BPF_EAPROG_SEC("cgroup/getsockname4", BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
BPF_CGROUP_INET4_GETSOCKNAME),
BPF_EAPROG_SEC("cgroup/getsockname6", BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
BPF_CGROUP_INET6_GETSOCKNAME),
BPF_EAPROG_SEC("cgroup/sysctl", BPF_PROG_TYPE_CGROUP_SYSCTL,
BPF_CGROUP_SYSCTL),
BPF_EAPROG_SEC("cgroup/getsockopt", BPF_PROG_TYPE_CGROUP_SOCKOPT,

View File

@ -1,6 +1,8 @@
==================
BPF Selftest Notes
==================
General instructions on running selftests can be found in
`Documentation/bpf/bpf_devel_QA.rst`_.
Additional information about selftest failures is
documented here.

View File

@ -25,6 +25,7 @@ CONFIG_XDP_SOCKETS=y
CONFIG_FTRACE_SYSCALLS=y
CONFIG_IPV6_TUNNEL=y
CONFIG_IPV6_GRE=y
CONFIG_IPV6_SEG6_BPF=y
CONFIG_NET_FOU=m
CONFIG_NET_FOU_IP_TUNNELS=y
CONFIG_IPV6_FOU=m
@ -37,3 +38,4 @@ CONFIG_IPV6_SIT=m
CONFIG_BPF_JIT=y
CONFIG_BPF_LSM=y
CONFIG_SECURITY=y
CONFIG_LIRC=y

View File

@ -5,6 +5,8 @@
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/epoll.h>
#include <linux/err.h>
@ -35,7 +37,7 @@ struct ipv6_packet pkt_v6 = {
.tcp.doff = 5,
};
int start_server(int family, int type)
int start_server_with_port(int family, int type, __u16 port)
{
struct sockaddr_storage addr = {};
socklen_t len;
@ -45,11 +47,13 @@ int start_server(int family, int type)
struct sockaddr_in *sin = (void *)&addr;
sin->sin_family = AF_INET;
sin->sin_port = htons(port);
len = sizeof(*sin);
} else {
struct sockaddr_in6 *sin6 = (void *)&addr;
sin6->sin6_family = AF_INET6;
sin6->sin6_port = htons(port);
len = sizeof(*sin6);
}
@ -76,6 +80,11 @@ int start_server(int family, int type)
return fd;
}
int start_server(int family, int type)
{
return start_server_with_port(family, type, 0);
}
static const struct timeval timeo_sec = { .tv_sec = 3 };
static const size_t timeo_optlen = sizeof(timeo_sec);

View File

@ -34,6 +34,7 @@ struct ipv6_packet {
extern struct ipv6_packet pkt_v6;
int start_server(int family, int type);
int start_server_with_port(int family, int type, __u16 port);
int connect_to_fd(int family, int type, int server_fd);
int connect_fd_to_fd(int client_fd, int server_fd);
int connect_wait(int client_fd);
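start_server_with_port() lets a test pin the listening port rather than take an ephemeral one, which the reworked connect_force_port test later in this diff depends on. A rough usage sketch built on the helpers declared here (the port number and error handling are illustrative):

#include <sys/socket.h>
#include <unistd.h>

#include "network_helpers.h"

static int ping_fixed_port(void)
{
	int server_fd, client_fd;

	server_fd = start_server_with_port(AF_INET, SOCK_STREAM, 60123);
	if (server_fd < 0)
		return -1;

	client_fd = connect_to_fd(AF_INET, SOCK_STREAM, server_fd);
	if (client_fd < 0) {
		close(server_fd);
		return -1;
	}

	close(client_fd);
	close(server_fd);
	return 0;
}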

View File

@ -1,24 +1,5 @@
#include <asm/types.h>
#include <linux/types.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <stddef.h>
#include <stdbool.h>
#include <linux/unistd.h>
#include <linux/filter.h>
#include <linux/bpf_perf_event.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>
#include "../../../include/linux/filter.h"
#include "bpf_rlimit.h"
#include "bpf_util.h"
// SPDX-License-Identifier: GPL-2.0
#include <test_progs.h>
#define MAX_INSNS 512
#define MAX_MATCHES 16
@ -359,15 +340,15 @@ static struct bpf_align_test tests[] = {
* is still (4n), fixed offset is not changed.
* Also, we create a new reg->id.
*/
{29, "R5_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc))"},
{29, "R5_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc)"},
/* At the time the word size load is performed from R5,
* its total fixed offset is NET_IP_ALIGN + reg->off (18)
* which is 20. Then the variable offset is (4n), so
* the total offset is 4-byte aligned and meets the
* load's requirements.
*/
{33, "R4=pkt(id=4,off=22,r=22,umax_value=2040,var_off=(0x0; 0x7fc))"},
{33, "R5=pkt(id=4,off=18,r=22,umax_value=2040,var_off=(0x0; 0x7fc))"},
{33, "R4=pkt(id=4,off=22,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"},
{33, "R5=pkt(id=4,off=18,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"},
},
},
{
@ -410,15 +391,15 @@ static struct bpf_align_test tests[] = {
/* Adding 14 makes R6 be (4n+2) */
{9, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"},
/* Packet pointer has (4n+2) offset */
{11, "R5_w=pkt(id=1,off=0,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"},
{13, "R4=pkt(id=1,off=4,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"},
{11, "R5_w=pkt(id=1,off=0,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"},
{13, "R4=pkt(id=1,off=4,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"},
/* At the time the word size load is performed from R5,
* its total fixed offset is NET_IP_ALIGN + reg->off (0)
* which is 2. Then the variable offset is (4n+2), so
* the total offset is 4-byte aligned and meets the
* load's requirements.
*/
{15, "R5=pkt(id=1,off=0,r=4,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"},
{15, "R5=pkt(id=1,off=0,r=4,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"},
/* Newly read value in R6 was shifted left by 2, so has
* known alignment of 4.
*/
@ -426,15 +407,15 @@ static struct bpf_align_test tests[] = {
/* Added (4n) to packet pointer's (4n+2) var_off, giving
* another (4n+2).
*/
{19, "R5_w=pkt(id=2,off=0,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc))"},
{21, "R4=pkt(id=2,off=4,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc))"},
{19, "R5_w=pkt(id=2,off=0,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"},
{21, "R4=pkt(id=2,off=4,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"},
/* At the time the word size load is performed from R5,
* its total fixed offset is NET_IP_ALIGN + reg->off (0)
* which is 2. Then the variable offset is (4n+2), so
* the total offset is 4-byte aligned and meets the
* load's requirements.
*/
{23, "R5=pkt(id=2,off=0,r=4,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc))"},
{23, "R5=pkt(id=2,off=0,r=4,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"},
},
},
{
@ -469,16 +450,16 @@ static struct bpf_align_test tests[] = {
.matches = {
{4, "R5_w=pkt_end(id=0,off=0,imm=0)"},
/* (ptr - ptr) << 2 == unknown, (4n) */
{6, "R5_w=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc))"},
{6, "R5_w=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc)"},
/* (4n) + 14 == (4n+2). We blow our bounds, because
* the add could overflow.
*/
{7, "R5_w=inv(id=0,var_off=(0x2; 0xfffffffffffffffc))"},
{7, "R5_w=inv(id=0,smin_value=-9223372036854775806,smax_value=9223372036854775806,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"},
/* Checked s>=0 */
{9, "R5=inv(id=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"},
{9, "R5=inv(id=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"},
/* packet pointer + nonnegative (4n+2) */
{11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"},
{13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"},
{11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"},
{13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"},
/* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine.
* We checked the bounds, but it might have been able
* to overflow if the packet pointer started in the
@ -486,7 +467,7 @@ static struct bpf_align_test tests[] = {
* So we did not get a 'range' on R6, and the access
* attempt will fail.
*/
{15, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc))"},
{15, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"},
}
},
{
@ -528,7 +509,7 @@ static struct bpf_align_test tests[] = {
/* New unknown value in R7 is (4n) */
{11, "R7_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
/* Subtracting it from R6 blows our unsigned bounds */
{12, "R6=inv(id=0,smin_value=-1006,smax_value=1034,var_off=(0x2; 0xfffffffffffffffc))"},
{12, "R6=inv(id=0,smin_value=-1006,smax_value=1034,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"},
/* Checked s>= 0 */
{14, "R6=inv(id=0,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc))"},
/* At the time the word size load is performed from R5,
@ -537,7 +518,8 @@ static struct bpf_align_test tests[] = {
* the total offset is 4-byte aligned and meets the
* load's requirements.
*/
{20, "R5=pkt(id=1,off=0,r=4,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc))"},
{20, "R5=pkt(id=1,off=0,r=4,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc)"},
},
},
{
@ -579,18 +561,18 @@ static struct bpf_align_test tests[] = {
/* Adding 14 makes R6 be (4n+2) */
{11, "R6_w=inv(id=0,umin_value=14,umax_value=74,var_off=(0x2; 0x7c))"},
/* Subtracting from packet pointer overflows ubounds */
{13, "R5_w=pkt(id=1,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c))"},
{13, "R5_w=pkt(id=1,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c)"},
/* New unknown value in R7 is (4n), >= 76 */
{15, "R7_w=inv(id=0,umin_value=76,umax_value=1096,var_off=(0x0; 0x7fc))"},
/* Adding it to packet pointer gives nice bounds again */
{16, "R5_w=pkt(id=2,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0x7fc))"},
{16, "R5_w=pkt(id=2,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"},
/* At the time the word size load is performed from R5,
* its total fixed offset is NET_IP_ALIGN + reg->off (0)
* which is 2. Then the variable offset is (4n+2), so
* the total offset is 4-byte aligned and meets the
* load's requirements.
*/
{20, "R5=pkt(id=2,off=0,r=4,umin_value=2,umax_value=1082,var_off=(0x2; 0x7fc))"},
{20, "R5=pkt(id=2,off=0,r=4,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"},
},
},
};
@ -669,51 +651,16 @@ static int do_test_single(struct bpf_align_test *test)
return ret;
}
static int do_test(unsigned int from, unsigned int to)
void test_align(void)
{
int all_pass = 0;
int all_fail = 0;
unsigned int i;
for (i = from; i < to; i++) {
for (i = 0; i < ARRAY_SIZE(tests); i++) {
struct bpf_align_test *test = &tests[i];
int fail;
printf("Test %3d: %s ... ",
i, test->descr);
fail = do_test_single(test);
if (fail) {
all_fail++;
printf("FAIL\n");
} else {
all_pass++;
printf("PASS\n");
if (!test__start_subtest(test->descr))
continue;
CHECK_FAIL(do_test_single(test));
}
}
printf("Results: %d pass %d fail\n",
all_pass, all_fail);
return all_fail ? EXIT_FAILURE : EXIT_SUCCESS;
}
int main(int argc, char **argv)
{
unsigned int from = 0, to = ARRAY_SIZE(tests);
if (argc == 3) {
unsigned int l = atoi(argv[argc - 2]);
unsigned int u = atoi(argv[argc - 1]);
if (l < to && u < to) {
from = l;
to = u + 1;
}
} else if (argc == 2) {
unsigned int t = atoi(argv[argc - 1]);
if (t < to) {
from = t;
to = t + 1;
}
}
return do_test(from, to);
}
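With test_align folded into test_progs, each table entry becomes a subtest selectable from the command line, and the hand-rolled main() with its argument parsing goes away. The same shape applies to any test converted this way; a minimal skeleton under that convention (names and checks are illustrative):

#include <test_progs.h>

/* test_progs discovers this by its test_ prefix; a single test can be
 * selected with e.g. ./test_progs -t example.
 */
void test_example(void)
{
	if (test__start_subtest("first_case"))
		CHECK_FAIL(1 + 1 != 2);	/* replace with a real check */

	if (test__start_subtest("second_case"))
		CHECK_FAIL(0);
}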

View File

@ -4,7 +4,8 @@
#include "cgroup_helpers.h"
#include "network_helpers.h"
static int verify_port(int family, int fd, int expected)
static int verify_ports(int family, int fd,
__u16 expected_local, __u16 expected_peer)
{
struct sockaddr_storage addr;
socklen_t len = sizeof(addr);
@ -20,9 +21,25 @@ static int verify_port(int family, int fd, int expected)
else
port = ((struct sockaddr_in6 *)&addr)->sin6_port;
if (ntohs(port) != expected) {
log_err("Unexpected port %d, expected %d", ntohs(port),
expected);
if (ntohs(port) != expected_local) {
log_err("Unexpected local port %d, expected %d", ntohs(port),
expected_local);
return -1;
}
if (getpeername(fd, (struct sockaddr *)&addr, &len)) {
log_err("Failed to get peer addr");
return -1;
}
if (family == AF_INET)
port = ((struct sockaddr_in *)&addr)->sin_port;
else
port = ((struct sockaddr_in6 *)&addr)->sin6_port;
if (ntohs(port) != expected_peer) {
log_err("Unexpected peer port %d, expected %d", ntohs(port),
expected_peer);
return -1;
}
@ -31,33 +48,67 @@ static int verify_port(int family, int fd, int expected)
static int run_test(int cgroup_fd, int server_fd, int family, int type)
{
bool v4 = family == AF_INET;
__u16 expected_local_port = v4 ? 22222 : 22223;
__u16 expected_peer_port = 60000;
struct bpf_prog_load_attr attr = {
.prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
.file = v4 ? "./connect_force_port4.o" :
"./connect_force_port6.o",
};
struct bpf_program *prog;
struct bpf_object *obj;
int expected_port;
int prog_fd;
int err;
int fd;
int xlate_fd, fd, err;
__u32 duration = 0;
if (family == AF_INET) {
attr.file = "./connect_force_port4.o";
attr.expected_attach_type = BPF_CGROUP_INET4_CONNECT;
expected_port = 22222;
} else {
attr.file = "./connect_force_port6.o";
attr.expected_attach_type = BPF_CGROUP_INET6_CONNECT;
expected_port = 22223;
}
err = bpf_prog_load_xattr(&attr, &obj, &prog_fd);
err = bpf_prog_load_xattr(&attr, &obj, &xlate_fd);
if (err) {
log_err("Failed to load BPF object");
return -1;
}
err = bpf_prog_attach(prog_fd, cgroup_fd, attr.expected_attach_type,
0);
prog = bpf_object__find_program_by_title(obj, v4 ?
"cgroup/connect4" :
"cgroup/connect6");
if (CHECK(!prog, "find_prog", "connect prog not found\n")) {
err = -EIO;
goto close_bpf_object;
}
err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, v4 ?
BPF_CGROUP_INET4_CONNECT :
BPF_CGROUP_INET6_CONNECT, 0);
if (err) {
log_err("Failed to attach BPF program");
goto close_bpf_object;
}
prog = bpf_object__find_program_by_title(obj, v4 ?
"cgroup/getpeername4" :
"cgroup/getpeername6");
if (CHECK(!prog, "find_prog", "getpeername prog not found\n")) {
err = -EIO;
goto close_bpf_object;
}
err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, v4 ?
BPF_CGROUP_INET4_GETPEERNAME :
BPF_CGROUP_INET6_GETPEERNAME, 0);
if (err) {
log_err("Failed to attach BPF program");
goto close_bpf_object;
}
prog = bpf_object__find_program_by_title(obj, v4 ?
"cgroup/getsockname4" :
"cgroup/getsockname6");
if (CHECK(!prog, "find_prog", "getsockname prog not found\n")) {
err = -EIO;
goto close_bpf_object;
}
err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, v4 ?
BPF_CGROUP_INET4_GETSOCKNAME :
BPF_CGROUP_INET6_GETSOCKNAME, 0);
if (err) {
log_err("Failed to attach BPF program");
goto close_bpf_object;
@ -69,8 +120,8 @@ static int run_test(int cgroup_fd, int server_fd, int family, int type)
goto close_bpf_object;
}
err = verify_port(family, fd, expected_port);
err = verify_ports(family, fd, expected_local_port,
expected_peer_port);
close(fd);
close_bpf_object:
@ -86,25 +137,25 @@ void test_connect_force_port(void)
if (CHECK_FAIL(cgroup_fd < 0))
return;
server_fd = start_server(AF_INET, SOCK_STREAM);
server_fd = start_server_with_port(AF_INET, SOCK_STREAM, 60123);
if (CHECK_FAIL(server_fd < 0))
goto close_cgroup_fd;
CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_STREAM));
close(server_fd);
server_fd = start_server(AF_INET6, SOCK_STREAM);
server_fd = start_server_with_port(AF_INET6, SOCK_STREAM, 60124);
if (CHECK_FAIL(server_fd < 0))
goto close_cgroup_fd;
CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_STREAM));
close(server_fd);
server_fd = start_server(AF_INET, SOCK_DGRAM);
server_fd = start_server_with_port(AF_INET, SOCK_DGRAM, 60123);
if (CHECK_FAIL(server_fd < 0))
goto close_cgroup_fd;
CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_DGRAM));
close(server_fd);
server_fd = start_server(AF_INET6, SOCK_DGRAM);
server_fd = start_server_with_port(AF_INET6, SOCK_DGRAM, 60124);
if (CHECK_FAIL(server_fd < 0))
goto close_cgroup_fd;
CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_DGRAM));
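The userspace changes above expect getpeername() to report port 60000 even though the server now listens on 60123/60124, so the translation has to happen in the BPF objects the test loads (connect_force_port4.o / connect_force_port6.o, not shown in this excerpt); those programs restore an address saved at connect() time rather than hard-coding one. A heavily simplified sketch of the peer-side half, just enough for verify_ports() to see port 60000:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("cgroup/getpeername4")
int getpeername_4(struct bpf_sock_addr *ctx)
{
	/* Pretend the peer is the "service" the client thinks it
	 * connected to, not the backend it was redirected to.
	 */
	ctx->user_ip4 = bpf_htonl(0x7f000001);	/* 127.0.0.1 */
	ctx->user_port = bpf_htons(60000);
	return 1;
}

char _license[] SEC("license") = "GPL";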

View File

@ -1,11 +1,27 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
/* "undefine" structs in vmlinux.h, because we "override" them below */
#define bpf_iter_meta bpf_iter_meta___not_used
#define bpf_iter__bpf_map bpf_iter__bpf_map___not_used
#include "vmlinux.h"
#undef bpf_iter_meta
#undef bpf_iter__bpf_map
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
char _license[] SEC("license") = "GPL";
struct bpf_iter_meta {
struct seq_file *seq;
__u64 session_id;
__u64 seq_num;
} __attribute__((preserve_access_index));
struct bpf_iter__bpf_map {
struct bpf_iter_meta *meta;
struct bpf_map *map;
} __attribute__((preserve_access_index));
SEC("iter/bpf_map")
int dump_bpf_map(struct bpf_iter__bpf_map *ctx)
{

View File

@ -1,9 +1,25 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
/* "undefine" structs in vmlinux.h, because we "override" them below */
#define bpf_iter_meta bpf_iter_meta___not_used
#define bpf_iter__ipv6_route bpf_iter__ipv6_route___not_used
#include "vmlinux.h"
#undef bpf_iter_meta
#undef bpf_iter__ipv6_route
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
struct bpf_iter_meta {
struct seq_file *seq;
__u64 session_id;
__u64 seq_num;
} __attribute__((preserve_access_index));
struct bpf_iter__ipv6_route {
struct bpf_iter_meta *meta;
struct fib6_info *rt;
} __attribute__((preserve_access_index));
char _license[] SEC("license") = "GPL";
extern bool CONFIG_IPV6_SUBTREES __kconfig __weak;

View File

@ -1,6 +1,11 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
/* "undefine" structs in vmlinux.h, because we "override" them below */
#define bpf_iter_meta bpf_iter_meta___not_used
#define bpf_iter__netlink bpf_iter__netlink___not_used
#include "vmlinux.h"
#undef bpf_iter_meta
#undef bpf_iter__netlink
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
@ -9,6 +14,17 @@ char _license[] SEC("license") = "GPL";
#define sk_rmem_alloc sk_backlog.rmem_alloc
#define sk_refcnt __sk_common.skc_refcnt
struct bpf_iter_meta {
struct seq_file *seq;
__u64 session_id;
__u64 seq_num;
} __attribute__((preserve_access_index));
struct bpf_iter__netlink {
struct bpf_iter_meta *meta;
struct netlink_sock *sk;
} __attribute__((preserve_access_index));
static inline struct inode *SOCK_INODE(struct socket *socket)
{
return &container_of(socket, struct socket_alloc, socket)->vfs_inode;

View File

@ -1,11 +1,27 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
/* "undefine" structs in vmlinux.h, because we "override" them below */
#define bpf_iter_meta bpf_iter_meta___not_used
#define bpf_iter__task bpf_iter__task___not_used
#include "vmlinux.h"
#undef bpf_iter_meta
#undef bpf_iter__task
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
char _license[] SEC("license") = "GPL";
struct bpf_iter_meta {
struct seq_file *seq;
__u64 session_id;
__u64 seq_num;
} __attribute__((preserve_access_index));
struct bpf_iter__task {
struct bpf_iter_meta *meta;
struct task_struct *task;
} __attribute__((preserve_access_index));
SEC("iter/task")
int dump_task(struct bpf_iter__task *ctx)
{

View File

@ -1,11 +1,29 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
/* "undefine" structs in vmlinux.h, because we "override" them below */
#define bpf_iter_meta bpf_iter_meta___not_used
#define bpf_iter__task_file bpf_iter__task_file___not_used
#include "vmlinux.h"
#undef bpf_iter_meta
#undef bpf_iter__task_file
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
char _license[] SEC("license") = "GPL";
struct bpf_iter_meta {
struct seq_file *seq;
__u64 session_id;
__u64 seq_num;
} __attribute__((preserve_access_index));
struct bpf_iter__task_file {
struct bpf_iter_meta *meta;
struct task_struct *task;
__u32 fd;
struct file *file;
} __attribute__((preserve_access_index));
SEC("iter/task_file")
int dump_task_file(struct bpf_iter__task_file *ctx)
{

View File

@ -1,10 +1,25 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
#define bpf_iter_meta bpf_iter_meta___not_used
#define bpf_iter__task bpf_iter__task___not_used
#include "vmlinux.h"
#undef bpf_iter_meta
#undef bpf_iter__task
#include <bpf/bpf_helpers.h>
char _license[] SEC("license") = "GPL";
struct bpf_iter_meta {
struct seq_file *seq;
__u64 session_id;
__u64 seq_num;
} __attribute__((preserve_access_index));
struct bpf_iter__task {
struct bpf_iter_meta *meta;
struct task_struct *task;
} __attribute__((preserve_access_index));
SEC("iter/task")
int dump_task(struct bpf_iter__task *ctx)
{

Some files were not shown because too many files have changed in this diff.