net/mlx5e: Activate HW multipath and handle port affinity based on FIB events

To support multipath offload we are going to track SW multipath route and related nexthops. To do that we register to FIB notifier and handle the route and next-hops events and reflect that as port affinity to HW. When there is a new multipath route entry that all next-hops are the ports of an HCA we will activate LAG in HW. Egress wise, we use HW LAG as the means to emulate multipath on current HW which doesn't support port selection based on xmit hash. In the presence of multiple VFs which use multiple SQs (send queues) this yields fairly good distribution. HA wise, HW LAG buys us the ability for a given RQ (receive queue) to receive traffic from both ports and for SQs to migrate xmitting over the active port if their base port fails. When the route entry is being updated to single path we will update the HW port affinity to use that port only. If a next-hop becomes dead we update the HW port affinity to the living port. When all next-hops are alive again we reset the affinity to default. Due to FW/HW limitations, when a route is deleted we are not disabling the HW LAG since doing so will not allow us to enable it again while VFs are bounded. Typically this is just a temporary state when a routing daemon removes dead routes and later adds them back as needed. This patch only handles events for AF_INET. Signed-off-by: Roi Dayan <roid@mellanox.com> Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
2024-12-20 11:13:58 +08:00 · 2019-02-17 11:54:06 +02:00 · 2019-02-17 11:54:06 +02:00 · 544fe7c2e6
commit 544fe7c2e6
parent 724b509ca0
6 changed files with 326 additions and 0 deletions
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@ -2476,3 +2476,10 @@ bool mlx5_esw_lag_prereq(struct mlx5_core_dev *dev0, struct mlx5_core_dev *dev1)

 	return false;
 }
+
+bool mlx5_esw_multipath_prereq(struct mlx5_core_dev *dev0,
+			       struct mlx5_core_dev *dev1)
+{
+	return (dev0->priv.eswitch->mode == SRIOV_OFFLOADS &&
+		dev1->priv.eswitch->mode == SRIOV_OFFLOADS);
+}
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@ -371,6 +371,8 @@ static inline bool mlx5_eswitch_vlan_actions_supported(struct mlx5_core_dev *dev

 bool mlx5_esw_lag_prereq(struct mlx5_core_dev *dev0,
 			 struct mlx5_core_dev *dev1);
+bool mlx5_esw_multipath_prereq(struct mlx5_core_dev *dev0,
+			       struct mlx5_core_dev *dev1);

 #define MLX5_DEBUG_ESWITCH_MASK BIT(3)

--- a/drivers/net/ethernet/mellanox/mlx5/core/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.c
@ -36,6 +36,7 @@
 #include "mlx5_core.h"
 #include "eswitch.h"
 #include "lag.h"
+#include "lag_mp.h"

 /* General purpose, use for short periods of time.
 * Beware of lock dependencies (preferably, no locks should be acquired
@ -559,6 +560,7 @@ void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev)
 {
 	struct mlx5_lag *ldev = NULL;
 	struct mlx5_core_dev *tmp_dev;
+	int err;

 	if (!MLX5_CAP_GEN(dev, vport_group_manager) ||
 	    !MLX5_CAP_GEN(dev, lag_master) ||
@ -586,6 +588,11 @@ void mlx5_lag_add(struct mlx5_core_dev *dev, struct net_device *netdev)
 			mlx5_core_err(dev, "Failed to register LAG netdev notifier\n");
 		}
 	}
+
+	err = mlx5_lag_mp_init(ldev);
+	if (err)
+		mlx5_core_err(dev, "Failed to init multipath lag err=%d\n",
+			      err);
 }

 int mlx5_lag_get_pf_num(struct mlx5_core_dev *dev, int *pf_num)
@ -631,6 +638,7 @@ void mlx5_lag_remove(struct mlx5_core_dev *dev)
 	if (i == MLX5_MAX_PORTS) {
 		if (ldev->nb.notifier_call)
 			unregister_netdevice_notifier(&ldev->nb);
+		mlx5_lag_mp_cleanup(ldev);
 		cancel_delayed_work_sync(&ldev->bond_work);
 		mlx5_lag_dev_free(ldev);
 	}
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag.h
@ -5,6 +5,7 @@
 #define __MLX5_LAG_H__

 #include "mlx5_core.h"
+#include "lag_mp.h"

 enum {
 	MLX5_LAG_FLAG_ROCE   = 1 << 0,
@ -38,6 +39,7 @@ struct mlx5_lag {
 	struct workqueue_struct   *wq;
 	struct delayed_work       bond_work;
 	struct notifier_block     nb;
+	struct lag_mp             lag_mp;
 };

 static inline struct mlx5_lag *
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c
@ -3,9 +3,18 @@

 #include <linux/netdevice.h>
 #include "lag.h"
+#include "lag_mp.h"
 #include "mlx5_core.h"
 #include "eswitch.h"

+static bool mlx5_lag_multipath_check_prereq(struct mlx5_lag *ldev)
+{
+	if (!ldev->pf[0].dev || !ldev->pf[1].dev)
+		return false;
+
+	return mlx5_esw_multipath_prereq(ldev->pf[0].dev, ldev->pf[1].dev);
+}
+
 static bool __mlx5_lag_is_multipath(struct mlx5_lag *ldev)
 {
 	return !!(ldev->flags & MLX5_LAG_FLAG_MULTIPATH);
@ -21,3 +30,275 @@ bool mlx5_lag_is_multipath(struct mlx5_core_dev *dev)

 	return res;
 }
+
+/**
+ * Set lag port affinity
+ *
+ * @ldev: lag device
+ * @port:
+ *     0 - set normal affinity.
+ *     1 - set affinity to port 1.
+ *     2 - set affinity to port 2.
+ *
+ **/
+static void mlx5_lag_set_port_affinity(struct mlx5_lag *ldev, int port)
+{
+	struct lag_tracker tracker;
+
+	if (!__mlx5_lag_is_multipath(ldev))
+		return;
+
+	switch (port) {
+	case 0:
+		tracker.netdev_state[0].tx_enabled = true;
+		tracker.netdev_state[1].tx_enabled = true;
+		tracker.netdev_state[0].link_up = true;
+		tracker.netdev_state[1].link_up = true;
+		break;
+	case 1:
+		tracker.netdev_state[0].tx_enabled = true;
+		tracker.netdev_state[0].link_up = true;
+		tracker.netdev_state[1].tx_enabled = false;
+		tracker.netdev_state[1].link_up = false;
+		break;
+	case 2:
+		tracker.netdev_state[0].tx_enabled = false;
+		tracker.netdev_state[0].link_up = false;
+		tracker.netdev_state[1].tx_enabled = true;
+		tracker.netdev_state[1].link_up = true;
+		break;
+	default:
+		mlx5_core_warn(ldev->pf[0].dev, "Invalid affinity port %d",
+			       port);
+		return;
+	}
+
+	mlx5_modify_lag(ldev, &tracker);
+}
+
+static void mlx5_lag_fib_event_flush(struct notifier_block *nb)
+{
+	struct lag_mp *mp = container_of(nb, struct lag_mp, fib_nb);
+	struct mlx5_lag *ldev = container_of(mp, struct mlx5_lag, lag_mp);
+
+	flush_workqueue(ldev->wq);
+}
+
+struct mlx5_fib_event_work {
+	struct work_struct work;
+	struct mlx5_lag *ldev;
+	unsigned long event;
+	union {
+		struct fib_entry_notifier_info fen_info;
+		struct fib_nh_notifier_info fnh_info;
+	};
+};
+
+static void mlx5_lag_fib_route_event(struct mlx5_lag *ldev,
+				     unsigned long event,
+				     struct fib_info *fi)
+{
+	struct lag_mp *mp = &ldev->lag_mp;
+
+	/* Handle delete event */
+	if (event == FIB_EVENT_ENTRY_DEL) {
+		/* stop track */
+		if (mp->mfi == fi)
+			mp->mfi = NULL;
+		return;
+	}
+
+	/* Handle add/replace event */
+	if (fi->fib_nhs == 1) {
+		if (__mlx5_lag_is_active(ldev)) {
+			struct net_device *nh_dev = fi->fib_nh[0].nh_dev;
+			int i = mlx5_lag_dev_get_netdev_idx(ldev, nh_dev);
+
+			mlx5_lag_set_port_affinity(ldev, ++i);
+		}
+		return;
+	}
+
+	if (fi->fib_nhs != 2)
+		return;
+
+	/* Verify next hops are ports of the same hca */
+	if (!(fi->fib_nh[0].nh_dev == ldev->pf[0].netdev &&
+	      fi->fib_nh[1].nh_dev == ldev->pf[1].netdev) &&
+	    !(fi->fib_nh[0].nh_dev == ldev->pf[1].netdev &&
+	      fi->fib_nh[1].nh_dev == ldev->pf[0].netdev)) {
+		mlx5_core_warn(ldev->pf[0].dev, "Multipath offload require two ports of the same HCA\n");
+		return;
+	}
+
+	/* First time we see multipath route */
+	if (!mp->mfi && !__mlx5_lag_is_active(ldev)) {
+		struct lag_tracker tracker;
+
+		tracker = ldev->tracker;
+		mlx5_activate_lag(ldev, &tracker, MLX5_LAG_FLAG_MULTIPATH);
+	}
+
+	mlx5_lag_set_port_affinity(ldev, 0);
+	mp->mfi = fi;
+}
+
+static void mlx5_lag_fib_nexthop_event(struct mlx5_lag *ldev,
+				       unsigned long event,
+				       struct fib_nh *fib_nh,
+				       struct fib_info *fi)
+{
+	struct lag_mp *mp = &ldev->lag_mp;
+
+	/* Check the nh event is related to the route */
+	if (!mp->mfi || mp->mfi != fi)
+		return;
+
+	/* nh added/removed */
+	if (event == FIB_EVENT_NH_DEL) {
+		int i = mlx5_lag_dev_get_netdev_idx(ldev, fib_nh->nh_dev);
+
+		if (i >= 0) {
+			i = (i + 1) % 2 + 1; /* peer port */
+			mlx5_lag_set_port_affinity(ldev, i);
+		}
+	} else if (event == FIB_EVENT_NH_ADD &&
+		   fi->fib_nhs == 2) {
+		mlx5_lag_set_port_affinity(ldev, 0);
+	}
+}
+
+static void mlx5_lag_fib_update(struct work_struct *work)
+{
+	struct mlx5_fib_event_work *fib_work =
+		container_of(work, struct mlx5_fib_event_work, work);
+	struct mlx5_lag *ldev = fib_work->ldev;
+	struct fib_nh *fib_nh;
+
+	/* Protect internal structures from changes */
+	rtnl_lock();
+	switch (fib_work->event) {
+	case FIB_EVENT_ENTRY_REPLACE: /* fall through */
+	case FIB_EVENT_ENTRY_APPEND: /* fall through */
+	case FIB_EVENT_ENTRY_ADD: /* fall through */
+	case FIB_EVENT_ENTRY_DEL:
+		mlx5_lag_fib_route_event(ldev, fib_work->event,
+					 fib_work->fen_info.fi);
+		fib_info_put(fib_work->fen_info.fi);
+		break;
+	case FIB_EVENT_NH_ADD: /* fall through */
+	case FIB_EVENT_NH_DEL:
+		fib_nh = fib_work->fnh_info.fib_nh;
+		mlx5_lag_fib_nexthop_event(ldev,
+					   fib_work->event,
+					   fib_work->fnh_info.fib_nh,
+					   fib_nh->nh_parent);
+		fib_info_put(fib_work->fnh_info.fib_nh->nh_parent);
+		break;
+	}
+
+	rtnl_unlock();
+	kfree(fib_work);
+}
+
+static struct mlx5_fib_event_work *
+mlx5_lag_init_fib_work(struct mlx5_lag *ldev, unsigned long event)
+{
+	struct mlx5_fib_event_work *fib_work;
+
+	fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC);
+	if (WARN_ON(!fib_work))
+		return NULL;
+
+	INIT_WORK(&fib_work->work, mlx5_lag_fib_update);
+	fib_work->ldev = ldev;
+	fib_work->event = event;
+
+	return fib_work;
+}
+
+static int mlx5_lag_fib_event(struct notifier_block *nb,
+			      unsigned long event,
+			      void *ptr)
+{
+	struct lag_mp *mp = container_of(nb, struct lag_mp, fib_nb);
+	struct mlx5_lag *ldev = container_of(mp, struct mlx5_lag, lag_mp);
+	struct fib_notifier_info *info = ptr;
+	struct mlx5_fib_event_work *fib_work;
+	struct fib_entry_notifier_info *fen_info;
+	struct fib_nh_notifier_info *fnh_info;
+	struct fib_info *fi;
+
+	if (info->family != AF_INET)
+		return NOTIFY_DONE;
+
+	if (!mlx5_lag_multipath_check_prereq(ldev))
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case FIB_EVENT_ENTRY_REPLACE: /* fall through */
+	case FIB_EVENT_ENTRY_APPEND: /* fall through */
+	case FIB_EVENT_ENTRY_ADD: /* fall through */
+	case FIB_EVENT_ENTRY_DEL:
+		fen_info = container_of(info, struct fib_entry_notifier_info,
+					info);
+		fi = fen_info->fi;
+		if (fi->fib_dev != ldev->pf[0].netdev &&
+		    fi->fib_dev != ldev->pf[1].netdev) {
+			return NOTIFY_DONE;
+		}
+		fib_work = mlx5_lag_init_fib_work(ldev, event);
+		if (!fib_work)
+			return NOTIFY_DONE;
+		fib_work->fen_info = *fen_info;
+		/* Take reference on fib_info to prevent it from being
+		 * freed while work is queued. Release it afterwards.
+		 */
+		fib_info_hold(fib_work->fen_info.fi);
+		break;
+	case FIB_EVENT_NH_ADD: /* fall through */
+	case FIB_EVENT_NH_DEL:
+		fnh_info = container_of(info, struct fib_nh_notifier_info,
+					info);
+		fib_work = mlx5_lag_init_fib_work(ldev, event);
+		if (!fib_work)
+			return NOTIFY_DONE;
+		fib_work->fnh_info = *fnh_info;
+		fib_info_hold(fib_work->fnh_info.fib_nh->nh_parent);
+		break;
+	default:
+		return NOTIFY_DONE;
+	}
+
+	queue_work(ldev->wq, &fib_work->work);
+
+	return NOTIFY_DONE;
+}
+
+int mlx5_lag_mp_init(struct mlx5_lag *ldev)
+{
+	struct lag_mp *mp = &ldev->lag_mp;
+	int err;
+
+	if (mp->fib_nb.notifier_call)
+		return 0;
+
+	mp->fib_nb.notifier_call = mlx5_lag_fib_event;
+	err = register_fib_notifier(&mp->fib_nb,
+				    mlx5_lag_fib_event_flush);
+	if (err)
+		mp->fib_nb.notifier_call = NULL;
+
+	return err;
+}
+
+void mlx5_lag_mp_cleanup(struct mlx5_lag *ldev)
+{
+	struct lag_mp *mp = &ldev->lag_mp;
+
+	if (!mp->fib_nb.notifier_call)
+		return;
+
+	unregister_fib_notifier(&mp->fib_nb);
+	mp->fib_nb.notifier_call = NULL;
+}
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag_mp.h
@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2019 Mellanox Technologies. */
+
+#ifndef __MLX5_LAG_MP_H__
+#define __MLX5_LAG_MP_H__
+
+#include "lag.h"
+#include "mlx5_core.h"
+
+struct lag_mp {
+	struct notifier_block     fib_nb;
+	struct fib_info           *mfi; /* used in tracking fib events */
+};
+
+#ifdef CONFIG_MLX5_ESWITCH
+
+int mlx5_lag_mp_init(struct mlx5_lag *ldev);
+void mlx5_lag_mp_cleanup(struct mlx5_lag *ldev);
+
+#else /* CONFIG_MLX5_ESWITCH */
+
+static inline int mlx5_lag_mp_init(struct mlx5_lag *ldev) { return 0; }
+static inline void mlx5_lag_mp_cleanup(struct mlx5_lag *ldev) {}
+
+#endif /* CONFIG_MLX5_ESWITCH */
+#endif /* __MLX5_LAG_MP_H__ */