Merge branch 'net-prepare-pacing-offload-support'

Eric Dumazet says:

====================
net: prepare pacing offload support

Some network devices have the ability to offload EDT (Earliest
Departure Time) which is the model used for TCP pacing and FQ
packet scheduler.

Some of them implement the timing wheel mechanism described in
https://saeed.github.io/files/carousel-sigcomm17.pdf
with an associated 'timing wheel horizon'.

In order to upstream the NIC support, this series adds :

1) timing wheel horizon as a per-device attribute.

2) FQ packet scheduler support, to let paced packets
   below the timing wheel horizon be handled by the driver.

v1: https://lore.kernel.org/20240930152304.472767-2-edumazet@google.com
====================

Link: https://patch.msgid.link/20241003121219.2396589-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Jakub Kicinski 2024-10-04 15:37:58 -07:00
commit 0da7fb3bca
8 changed files with 44 additions and 6 deletions

View File

@ -1137,6 +1137,10 @@ attribute-sets:
name: dpll-pin
type: nest
nested-attributes: link-dpll-pin-attrs
-
name: max-pacing-offload-horizon
type: uint
doc: EDT offload horizon supported by the device (in nsec).
-
name: af-spec-attrs
attributes:

View File

@ -183,3 +183,4 @@ struct_devlink_port* devlink_port
struct_dpll_pin* dpll_pin
struct hlist_head page_pools
struct dim_irq_moder* irq_moder
u64 max_pacing_offload_horizon

View File

@ -2009,6 +2009,8 @@ enum netdev_reg_state {
* @dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem,
* where the clock is recovered.
*
* @max_pacing_offload_horizon: max EDT offload horizon in nsec.
*
* FIXME: cleanup struct net_device such that network protocol info
* moves out.
*/
@ -2399,6 +2401,8 @@ struct net_device {
/** @irq_moder: dim parameters used if IS_ENABLED(CONFIG_DIMLIB). */
struct dim_irq_moder *irq_moder;
u64 max_pacing_offload_horizon;
u8 priv[] ____cacheline_aligned
__counted_by(priv_len);
} ____cacheline_aligned;

View File

@ -377,6 +377,7 @@ enum {
IFLA_GSO_IPV4_MAX_SIZE,
IFLA_GRO_IPV4_MAX_SIZE,
IFLA_DPLL_PIN,
IFLA_MAX_PACING_OFFLOAD_HORIZON,
__IFLA_MAX
};

View File

@ -836,6 +836,8 @@ enum {
TCA_FQ_WEIGHTS, /* Weights for each band */
TCA_FQ_OFFLOAD_HORIZON, /* dequeue paced packets within this horizon immediately (us units) */
__TCA_FQ_MAX
};

View File

@ -1118,6 +1118,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */
+ rtnl_devlink_port_size(dev)
+ rtnl_dpll_pin_size(dev)
+ nla_total_size(8) /* IFLA_MAX_PACING_OFFLOAD_HORIZON */
+ 0;
}
@ -1867,6 +1868,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
READ_ONCE(dev->tso_max_size)) ||
nla_put_u32(skb, IFLA_TSO_MAX_SEGS,
READ_ONCE(dev->tso_max_segs)) ||
nla_put_uint(skb, IFLA_MAX_PACING_OFFLOAD_HORIZON,
READ_ONCE(dev->max_pacing_offload_horizon)) ||
#ifdef CONFIG_RPS
nla_put_u32(skb, IFLA_NUM_RX_QUEUES,
READ_ONCE(dev->num_rx_queues)) ||
@ -1975,6 +1978,7 @@ nla_put_failure:
}
static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_UNSPEC] = { .strict_start_type = IFLA_DPLL_PIN },
[IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 },
[IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
[IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },

View File

@ -111,6 +111,7 @@ struct fq_perband_flows {
struct fq_sched_data {
/* Read mostly cache line */
u64 offload_horizon;
u32 quantum;
u32 initial_quantum;
u32 flow_refill_delay;
@ -299,7 +300,7 @@ static void fq_gc(struct fq_sched_data *q,
}
/* Fast path can be used if :
* 1) Packet tstamp is in the past.
* 1) Packet tstamp is in the past, or within the pacing offload horizon.
* 2) FQ qlen == 0 OR
* (no flow is currently eligible for transmit,
* AND fast path queue has less than 8 packets)
@ -314,7 +315,7 @@ static bool fq_fastpath_check(const struct Qdisc *sch, struct sk_buff *skb,
const struct fq_sched_data *q = qdisc_priv(sch);
const struct sock *sk;
if (fq_skb_cb(skb)->time_to_send > now)
if (fq_skb_cb(skb)->time_to_send > now + q->offload_horizon)
return false;
if (sch->q.qlen != 0) {
@ -595,15 +596,18 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now)
unsigned long sample;
struct rb_node *p;
if (q->time_next_delayed_flow > now)
if (q->time_next_delayed_flow > now + q->offload_horizon)
return;
/* Update unthrottle latency EWMA.
* This is cheap and can help diagnosing timer/latency problems.
*/
sample = (unsigned long)(now - q->time_next_delayed_flow);
q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3;
q->unthrottle_latency_ns += sample >> 3;
if ((long)sample > 0) {
q->unthrottle_latency_ns -= q->unthrottle_latency_ns >> 3;
q->unthrottle_latency_ns += sample >> 3;
}
now += q->offload_horizon;
q->time_next_delayed_flow = ~0ULL;
while ((p = rb_first(&q->delayed)) != NULL) {
@ -687,7 +691,7 @@ begin:
u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send,
f->time_next_packet);
if (now < time_next_packet) {
if (now + q->offload_horizon < time_next_packet) {
head->first = f->next;
f->time_next_packet = time_next_packet;
fq_flow_set_throttled(q, f);
@ -925,6 +929,7 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
[TCA_FQ_HORIZON_DROP] = { .type = NLA_U8 },
[TCA_FQ_PRIOMAP] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_prio_qopt)),
[TCA_FQ_WEIGHTS] = NLA_POLICY_EXACT_LEN(FQ_BANDS * sizeof(s32)),
[TCA_FQ_OFFLOAD_HORIZON] = { .type = NLA_U32 },
};
/* compress a u8 array with all elems <= 3 to an array of 2-bit fields */
@ -1100,6 +1105,17 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt,
WRITE_ONCE(q->horizon_drop,
nla_get_u8(tb[TCA_FQ_HORIZON_DROP]));
if (tb[TCA_FQ_OFFLOAD_HORIZON]) {
u64 offload_horizon = (u64)NSEC_PER_USEC *
nla_get_u32(tb[TCA_FQ_OFFLOAD_HORIZON]);
if (offload_horizon <= qdisc_dev(sch)->max_pacing_offload_horizon) {
WRITE_ONCE(q->offload_horizon, offload_horizon);
} else {
NL_SET_ERR_MSG_MOD(extack, "invalid offload_horizon");
err = -EINVAL;
}
}
if (!err) {
sch_tree_unlock(sch);
@ -1183,6 +1199,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
.bands = FQ_BANDS,
};
struct nlattr *opts;
u64 offload_horizon;
u64 ce_threshold;
s32 weights[3];
u64 horizon;
@ -1199,6 +1216,9 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
horizon = READ_ONCE(q->horizon);
do_div(horizon, NSEC_PER_USEC);
offload_horizon = READ_ONCE(q->offload_horizon);
do_div(offload_horizon, NSEC_PER_USEC);
if (nla_put_u32(skb, TCA_FQ_PLIMIT,
READ_ONCE(sch->limit)) ||
nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT,
@ -1224,6 +1244,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
nla_put_u32(skb, TCA_FQ_TIMER_SLACK,
READ_ONCE(q->timer_slack)) ||
nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) ||
nla_put_u32(skb, TCA_FQ_OFFLOAD_HORIZON, (u32)offload_horizon) ||
nla_put_u8(skb, TCA_FQ_HORIZON_DROP,
READ_ONCE(q->horizon_drop)))
goto nla_put_failure;

View File

@ -377,6 +377,7 @@ enum {
IFLA_GSO_IPV4_MAX_SIZE,
IFLA_GRO_IPV4_MAX_SIZE,
IFLA_DPLL_PIN,
IFLA_MAX_PACING_OFFLOAD_HORIZON,
__IFLA_MAX
};