RDMA/mlx5: Use IB set_netdev and get_netdev functions

The IB layer provides a common interface to store and get net
devices associated with an IB device port (ib_device_set_netdev()
and ib_device_get_netdev()).
Previously, mlx5_ib stored and managed the associated net devices
internally.

Replace internal net device management in mlx5_ib with
ib_device_set_netdev() when attaching/detaching a net device and
ib_device_get_netdev() when retrieving the net device.

Export ib_device_get_netdev().

For mlx5 representors/PFs/VFs and lag creation we replace the netdev
assignments with the IB set/get netdev functions.

In active-backup mode lag the active slave net device is stored in the
lag itself. To ensure the net device stored in a lag bond IB device is
the active slave we implement the following:
- mlx5_core: when modifying the slave of a bond we send the internal driver event
  MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE.
- mlx5_ib: when catching the event, call ib_device_set_netdev().

This patch also ensures the correct IB events are sent in switchdev lag.

While at it, when in multiport eswitch mode, only a single IB device is
created for all ports. The said IB device will receive all netdev events
of its VFs once loaded, thus to avoid overwriting the mapping of PF IB
device to PF netdev, ignore NETDEV_REGISTER events if the ib device has
already been mapped to a netdev.

Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com>
Signed-off-by: Michael Guralnik <michaelgur@nvidia.com>
Link: https://patch.msgid.link/20240909173025.30422-6-michaelgur@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
This commit is contained in:
Chiara Meiohas 2024-09-09 20:30:23 +03:00 committed by Leon Romanovsky
parent 5f8ca04fdd
commit 8d159eb211
8 changed files with 191 additions and 103 deletions

View File

@ -2236,6 +2236,9 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
if (!rdma_is_port_valid(ib_dev, port))
return NULL;
if (!ib_dev->port_data)
return NULL;
pdata = &ib_dev->port_data[port];
/*
@ -2254,6 +2257,7 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
return res;
}
EXPORT_SYMBOL(ib_device_get_netdev);
/**
* ib_device_get_by_netdev - Find an IB device associated with a netdev

View File

@ -13,6 +13,7 @@ mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev,
int vport_index)
{
struct mlx5_ib_dev *ibdev;
struct net_device *ndev;
ibdev = mlx5_eswitch_uplink_get_proto_dev(dev->priv.eswitch, REP_IB);
if (!ibdev)
@ -20,12 +21,9 @@ mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev,
ibdev->port[vport_index].rep = rep;
rep->rep_data[REP_IB].priv = ibdev;
write_lock(&ibdev->port[vport_index].roce.netdev_lock);
ibdev->port[vport_index].roce.netdev =
mlx5_ib_get_rep_netdev(rep->esw, rep->vport);
write_unlock(&ibdev->port[vport_index].roce.netdev_lock);
ndev = mlx5_ib_get_rep_netdev(rep->esw, rep->vport);
return 0;
return ib_device_set_netdev(&ibdev->ib_dev, ndev, vport_index + 1);
}
static void mlx5_ib_register_peer_vport_reps(struct mlx5_core_dev *mdev);
@ -104,11 +102,15 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
ibdev->is_rep = true;
vport_index = rep->vport_index;
ibdev->port[vport_index].rep = rep;
ibdev->ib_dev.phys_port_cnt = num_ports;
ibdev->port[vport_index].roce.netdev =
mlx5_ib_get_rep_netdev(lag_master->priv.eswitch, rep->vport);
ibdev->mdev = lag_master;
ibdev->num_ports = num_ports;
ibdev->ib_dev.phys_port_cnt = num_ports;
ret = ib_device_set_netdev(&ibdev->ib_dev,
mlx5_ib_get_rep_netdev(lag_master->priv.eswitch,
rep->vport),
vport_index + 1);
if (ret)
goto fail_add;
ret = __mlx5_ib_add(ibdev, profile);
if (ret)
@ -161,9 +163,8 @@ mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep)
}
port = &dev->port[vport_index];
write_lock(&port->roce.netdev_lock);
port->roce.netdev = NULL;
write_unlock(&port->roce.netdev_lock);
ib_device_set_netdev(&dev->ib_dev, NULL, vport_index + 1);
rep->rep_data[REP_IB].priv = NULL;
port->rep = NULL;

View File

@ -147,16 +147,52 @@ static struct mlx5_roce *mlx5_get_rep_roce(struct mlx5_ib_dev *dev,
if (upper && port->rep->vport == MLX5_VPORT_UPLINK)
continue;
read_lock(&port->roce.netdev_lock);
rep_ndev = mlx5_ib_get_rep_netdev(port->rep->esw,
port->rep->vport);
if (rep_ndev == ndev) {
read_unlock(&port->roce.netdev_lock);
rep_ndev = ib_device_get_netdev(&dev->ib_dev, i + 1);
if (rep_ndev && rep_ndev == ndev) {
dev_put(rep_ndev);
*port_num = i + 1;
return &port->roce;
}
read_unlock(&port->roce.netdev_lock);
dev_put(rep_ndev);
}
return NULL;
}
/*
 * Decide whether a netdev notification about @ndev is relevant for @dev.
 *
 * @dev:     the mlx5 IB device being evaluated
 * @ndev:    the net device the notification is about
 * @upper:   upper (master) net device found by the caller, or NULL
 * @ib_ndev: net device currently associated with the IB port, or NULL
 *
 * Returns false whenever the IB device is not active. Otherwise returns
 * true when the notification concerns the upper device itself, or when it
 * concerns the port's own net device and the IB device is either a
 * representor (switchdev) or has no upper device (no lag).
 */
static bool mlx5_netdev_send_event(struct mlx5_ib_dev *dev,
				   struct net_device *ndev,
				   struct net_device *upper,
				   struct net_device *ib_ndev)
{
	if (!dev->ib_active)
		return false;

	/* The notification is about our upper device */
	if (ndev == upper)
		return true;

	/* Everything below only applies to the port's own net device */
	if (ndev != ib_ndev)
		return false;

	/*
	 * RDMA device is in switchdev mode, or it is neither in lag nor in
	 * switchdev mode.
	 */
	return dev->is_rep || !upper;
}
static struct net_device *mlx5_ib_get_rep_uplink_netdev(struct mlx5_ib_dev *ibdev)
{
struct mlx5_ib_port *port;
int i;
for (i = 0; i < ibdev->num_ports; i++) {
port = &ibdev->port[i];
if (port->rep && port->rep->vport == MLX5_VPORT_UPLINK) {
return ib_device_get_netdev(&ibdev->ib_dev, i + 1);
}
}
return NULL;
@ -168,6 +204,7 @@ static int mlx5_netdev_event(struct notifier_block *this,
struct mlx5_roce *roce = container_of(this, struct mlx5_roce, nb);
struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
u32 port_num = roce->native_port_num;
struct net_device *ib_ndev = NULL;
struct mlx5_core_dev *mdev;
struct mlx5_ib_dev *ibdev;
@ -181,29 +218,38 @@ static int mlx5_netdev_event(struct notifier_block *this,
/* Should already be registered during the load */
if (ibdev->is_rep)
break;
write_lock(&roce->netdev_lock);
ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num);
/* Exit if already registered */
if (ib_ndev)
goto put_ndev;
if (ndev->dev.parent == mdev->device)
roce->netdev = ndev;
write_unlock(&roce->netdev_lock);
ib_device_set_netdev(&ibdev->ib_dev, ndev, port_num);
break;
case NETDEV_UNREGISTER:
/* In case of reps, ib device goes away before the netdevs */
write_lock(&roce->netdev_lock);
if (roce->netdev == ndev)
roce->netdev = NULL;
write_unlock(&roce->netdev_lock);
break;
if (ibdev->is_rep)
break;
ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num);
if (ib_ndev == ndev)
ib_device_set_netdev(&ibdev->ib_dev, NULL, port_num);
goto put_ndev;
case NETDEV_CHANGE:
case NETDEV_UP:
case NETDEV_DOWN: {
struct net_device *upper = NULL;
if (mlx5_lag_is_roce(mdev)) {
if (mlx5_lag_is_roce(mdev) || mlx5_lag_is_sriov(mdev)) {
struct net_device *lag_ndev;
lag_ndev = mlx5_lag_get_roce_netdev(mdev);
if(mlx5_lag_is_roce(mdev))
lag_ndev = ib_device_get_netdev(&ibdev->ib_dev, 1);
else /* sriov lag */
lag_ndev = mlx5_ib_get_rep_uplink_netdev(ibdev);
if (lag_ndev) {
upper = netdev_master_upper_dev_get(lag_ndev);
dev_put(lag_ndev);
@ -216,18 +262,19 @@ static int mlx5_netdev_event(struct notifier_block *this,
roce = mlx5_get_rep_roce(ibdev, ndev, upper, &port_num);
if (!roce)
return NOTIFY_DONE;
if ((upper == ndev ||
((!upper || ibdev->is_rep) && ndev == roce->netdev)) &&
ibdev->ib_active) {
ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num);
if (mlx5_netdev_send_event(ibdev, ndev, upper, ib_ndev)) {
struct ib_event ibev = { };
enum ib_port_state port_state;
if (get_port_state(&ibdev->ib_dev, port_num,
&port_state))
goto done;
goto put_ndev;
if (roce->last_port_state == port_state)
goto done;
goto put_ndev;
roce->last_port_state = port_state;
ibev.device = &ibdev->ib_dev;
@ -236,7 +283,7 @@ static int mlx5_netdev_event(struct notifier_block *this,
else if (port_state == IB_PORT_ACTIVE)
ibev.event = IB_EVENT_PORT_ACTIVE;
else
goto done;
goto put_ndev;
ibev.element.port_num = port_num;
ib_dispatch_event(&ibev);
@ -247,39 +294,13 @@ static int mlx5_netdev_event(struct notifier_block *this,
default:
break;
}
put_ndev:
dev_put(ib_ndev);
done:
mlx5_ib_put_native_port_mdev(ibdev, port_num);
return NOTIFY_DONE;
}
/*
 * Return the net device backing @port_num of @device, or NULL when the
 * native port's core device cannot be obtained.
 *
 * In RoCE lag mode the net device comes from mlx5_lag_get_roce_netdev();
 * otherwise it is the per-port roce.netdev, on which dev_hold() is called
 * before it is returned.
 */
static struct net_device *mlx5_ib_get_netdev(struct ib_device *device,
					     u32 port_num)
{
	struct mlx5_ib_dev *ibdev = to_mdev(device);
	struct mlx5_core_dev *mdev;
	struct net_device *ndev;

	mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL);
	if (!mdev)
		return NULL;

	if (mlx5_lag_is_roce(mdev)) {
		ndev = mlx5_lag_get_roce_netdev(mdev);
	} else {
		/*
		 * Hold the read lock so that ndev does not disappear before
		 * dev_hold() has been invoked on it.
		 */
		read_lock(&ibdev->port[port_num - 1].roce.netdev_lock);
		ndev = ibdev->port[port_num - 1].roce.netdev;
		dev_hold(ndev);
		read_unlock(&ibdev->port[port_num - 1].roce.netdev_lock);
	}

	/* Balance the mlx5_ib_get_native_port_mdev() call above */
	mlx5_ib_put_native_port_mdev(ibdev, port_num);
	return ndev;
}
struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev,
u32 ib_port_num,
u32 *native_port_num)
@ -554,7 +575,7 @@ static int mlx5_query_port_roce(struct ib_device *device, u32 port_num,
if (!put_mdev)
goto out;
ndev = mlx5_ib_get_netdev(device, port_num);
ndev = ib_device_get_netdev(device, port_num);
if (!ndev)
goto out;
@ -3185,6 +3206,60 @@ static void get_dev_fw_str(struct ib_device *ibdev, char *str)
fw_rev_sub(dev->mdev));
}
/*
 * Notifier callback embedded in struct mlx5_ib_dev (lag_events).
 *
 * Only MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE is handled;
 * every other event yields NOTIFY_DONE. For the handled event, @data is
 * treated as a net device pointer: when it is non-NULL it is installed as
 * the net device of an IB port via ib_device_set_netdev() and the RoCE
 * GIDs of the IB device are rescanned.
 *
 * The IB port is port 1 by default. When the core device is not in RoCE
 * lag (i.e. sriov lag), the port whose representor is the uplink vport is
 * used instead, if there is one.
 *
 * NOTE(review): dev_put() is called on the net device passed in @data, so
 * the sender is presumably expected to have taken a reference for this
 * handler - confirm against the code raising the event.
 * NOTE(review): on ib_device_set_netdev() failure the raw error code is
 * returned rather than a NOTIFY_* value - confirm this is intended.
 */
static int lag_event(struct notifier_block *nb, unsigned long event, void *data)
{
	struct mlx5_ib_dev *dev = container_of(nb, struct mlx5_ib_dev,
					       lag_events);
	struct net_device *ndev = data;
	int portnum = 0;
	int err;
	int i;

	if (event != MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE)
		return NOTIFY_DONE;

	/* Nothing to assign */
	if (!ndev)
		return NOTIFY_OK;

	if (!mlx5_lag_is_roce(dev->mdev)) {
		/* sriov lag: look for the port backed by the uplink rep */
		for (i = 0; i < dev->num_ports; i++) {
			struct mlx5_ib_port *port = &dev->port[i];

			if (!port->rep)
				continue;
			if (port->rep->vport == MLX5_VPORT_UPLINK) {
				portnum = i;
				break;
			}
		}
	}

	/* IB port numbers are 1-based, dev->port[] indices are 0-based */
	err = ib_device_set_netdev(&dev->ib_dev, ndev, portnum + 1);
	dev_put(ndev);
	if (err)
		return err;

	/* Rescan gids after new netdev assignment */
	rdma_roce_rescan_device(&dev->ib_dev);

	return NOTIFY_OK;
}
static void mlx5e_lag_event_register(struct mlx5_ib_dev *dev)
{
dev->lag_events.notifier_call = lag_event;
blocking_notifier_chain_register(&dev->mdev->priv.lag_nh,
&dev->lag_events);
}
/*
 * Remove the device's lag_events notifier block from the lag_nh blocking
 * notifier chain of the device's core device.
 */
static void mlx5e_lag_event_unregister(struct mlx5_ib_dev *dev)
{
	struct notifier_block *lag_nb = &dev->lag_events;

	blocking_notifier_chain_unregister(&dev->mdev->priv.lag_nh, lag_nb);
}
static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev)
{
struct mlx5_core_dev *mdev = dev->mdev;
@ -3206,6 +3281,7 @@ static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev)
goto err_destroy_vport_lag;
}
mlx5e_lag_event_register(dev);
dev->flow_db->lag_demux_ft = ft;
dev->lag_ports = mlx5_lag_get_num_ports(mdev);
dev->lag_active = true;
@ -3223,6 +3299,7 @@ static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev)
if (dev->lag_active) {
dev->lag_active = false;
mlx5e_lag_event_unregister(dev);
mlx5_destroy_flow_table(dev->flow_db->lag_demux_ft);
dev->flow_db->lag_demux_ft = NULL;
@ -3939,7 +4016,6 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
for (i = 0; i < dev->num_ports; i++) {
spin_lock_init(&dev->port[i].mp.mpi_lock);
rwlock_init(&dev->port[i].roce.netdev_lock);
dev->port[i].roce.dev = dev;
dev->port[i].roce.native_port_num = i + 1;
dev->port[i].roce.last_port_state = IB_PORT_DOWN;
@ -4204,7 +4280,6 @@ static const struct ib_device_ops mlx5_ib_dev_common_roce_ops = {
.create_wq = mlx5_ib_create_wq,
.destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table,
.destroy_wq = mlx5_ib_destroy_wq,
.get_netdev = mlx5_ib_get_netdev,
.modify_wq = mlx5_ib_modify_wq,
INIT_RDMA_OBJ_SIZE(ib_rwq_ind_table, mlx5_ib_rwq_ind_table,

View File

@ -888,8 +888,6 @@ struct mlx5_roce {
/* Protect mlx5_ib_get_netdev from invoking dev_hold() with a NULL
* netdev pointer
*/
rwlock_t netdev_lock;
struct net_device *netdev;
struct notifier_block nb;
struct netdev_net_notifier nn;
struct notifier_block mdev_nb;
@ -1138,6 +1136,7 @@ struct mlx5_ib_dev {
/* protect accessing data_direct_dev */
struct mutex data_direct_lock;
struct notifier_block mdev_events;
struct notifier_block lag_events;
int num_ports;
/* serialize update of capability mask
*/

View File

@ -445,6 +445,34 @@ static int _mlx5_modify_lag(struct mlx5_lag *ldev, u8 *ports)
return mlx5_cmd_modify_lag(dev0, ldev->ports, ports);
}
/*
 * Pick the net device to report for an active-backup lag.
 *
 * Under lag_lock, look up the lag device of @dev and select the net device
 * of the highest-indexed port whose tracker state has tx_enabled set. When
 * no net device was selected that way, fall back to the net device of the
 * last port.
 *
 * Returns the selected net device with a reference taken via dev_hold()
 * (the caller has to drop it), or NULL when @dev has no lag device or no
 * net device could be selected.
 */
static struct net_device *mlx5_lag_active_backup_get_netdev(struct mlx5_core_dev *dev)
{
	struct net_device *active = NULL;
	struct mlx5_lag *ldev;
	unsigned long flags;
	int idx;

	spin_lock_irqsave(&lag_lock, flags);

	ldev = mlx5_lag_dev(dev);
	if (ldev) {
		/* No early exit: a later tx-enabled port overrides an earlier one */
		for (idx = 0; idx < ldev->ports; idx++) {
			if (ldev->tracker.netdev_state[idx].tx_enabled)
				active = ldev->pf[idx].netdev;
		}

		if (!active)
			active = ldev->pf[ldev->ports - 1].netdev;

		if (active)
			dev_hold(active);
	}

	spin_unlock_irqrestore(&lag_lock, flags);

	return active;
}
void mlx5_modify_lag(struct mlx5_lag *ldev,
struct lag_tracker *tracker)
{
@ -477,9 +505,18 @@ void mlx5_modify_lag(struct mlx5_lag *ldev,
}
}
if (tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP &&
!(ldev->mode == MLX5_LAG_MODE_ROCE))
mlx5_lag_drop_rule_setup(ldev, tracker);
if (tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) {
struct net_device *ndev = mlx5_lag_active_backup_get_netdev(dev0);
if(!(ldev->mode == MLX5_LAG_MODE_ROCE))
mlx5_lag_drop_rule_setup(ldev, tracker);
/** Only sriov and roce lag should have tracker->tx_type set so
* no need to check the mode
*/
blocking_notifier_call_chain(&dev0->priv.lag_nh,
MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE,
ndev);
}
}
static int mlx5_lag_set_port_sel_mode_roce(struct mlx5_lag *ldev,
@ -613,6 +650,7 @@ static int mlx5_create_lag(struct mlx5_lag *ldev,
mlx5_core_err(dev0,
"Failed to deactivate RoCE LAG; driver restart required\n");
}
BLOCKING_INIT_NOTIFIER_HEAD(&dev0->priv.lag_nh);
return err;
}
@ -1492,38 +1530,6 @@ void mlx5_lag_enable_change(struct mlx5_core_dev *dev)
mlx5_queue_bond_work(ldev, 0);
}
/*
 * Return the net device representing a RoCE lag, with a reference taken
 * via dev_hold(), or NULL when @dev has no lag device, the lag is not a
 * RoCE lag, or no net device could be selected.
 *
 * For an active-backup lag the net device of the highest-indexed
 * tx-enabled port is used, falling back to the last port's net device.
 * For any other tx type the net device of port MLX5_LAG_P1 is used.
 */
struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev)
{
	struct net_device *chosen = NULL;
	struct mlx5_lag *ldev;
	unsigned long flags;
	int idx;

	spin_lock_irqsave(&lag_lock, flags);

	ldev = mlx5_lag_dev(dev);
	if (ldev && __mlx5_lag_is_roce(ldev)) {
		if (ldev->tracker.tx_type != NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) {
			chosen = ldev->pf[MLX5_LAG_P1].netdev;
		} else {
			/* A later tx-enabled port overrides an earlier one */
			for (idx = 0; idx < ldev->ports; idx++) {
				if (ldev->tracker.netdev_state[idx].tx_enabled)
					chosen = ldev->pf[idx].netdev;
			}
			if (!chosen)
				chosen = ldev->pf[ldev->ports - 1].netdev;
		}

		if (chosen)
			dev_hold(chosen);
	}

	spin_unlock_irqrestore(&lag_lock, flags);

	return chosen;
}
EXPORT_SYMBOL(mlx5_lag_get_roce_netdev);
u8 mlx5_lag_get_slave_port(struct mlx5_core_dev *dev,
struct net_device *slave)
{

View File

@ -371,6 +371,7 @@ enum mlx5_driver_event {
MLX5_DRIVER_EVENT_SF_PEER_DEVLINK,
MLX5_DRIVER_EVENT_AFFILIATION_DONE,
MLX5_DRIVER_EVENT_AFFILIATION_REMOVED,
MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE,
};
enum {

View File

@ -643,6 +643,7 @@ struct mlx5_priv {
struct mlx5_sf_hw_table *sf_hw_table;
struct mlx5_sf_table *sf_table;
#endif
struct blocking_notifier_head lag_nh;
};
enum mlx5_device_state {
@ -1181,7 +1182,6 @@ bool mlx5_lag_mode_is_hash(struct mlx5_core_dev *dev);
bool mlx5_lag_is_master(struct mlx5_core_dev *dev);
bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev);
bool mlx5_lag_is_mpesw(struct mlx5_core_dev *dev);
struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev);
u8 mlx5_lag_get_slave_port(struct mlx5_core_dev *dev,
struct net_device *slave);
int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev,

View File

@ -4453,6 +4453,8 @@ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u32 port,
const struct sockaddr *addr);
int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
unsigned int port);
struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
u32 port);
struct ib_wq *ib_create_wq(struct ib_pd *pd,
struct ib_wq_init_attr *init_attr);
int ib_destroy_wq_user(struct ib_wq *wq, struct ib_udata *udata);