bpf: Add support for changing congestion control

Add support for SOCK_OPS bpf programs to change a socket's congestion
control through the setsockopt bpf helper function. Also add a new
SOCK_OPS op, BPF_SOCK_OPS_NEEDS_ECN, which is needed by congestion
control algorithms, such as dctcp, that must enable ECN in the SYN
packets.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Author: Lawrence Brakmo, 2017-06-30 20:02:49 -07:00 (committed by David S. Miller)
parent d9925368a6
commit 91b5b21c7c
7 changed files with 58 additions and 17 deletions
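
For context, a minimal SOCK_OPS program sketch in the style of the samples/bpf sock_ops examples (not part of this diff; the program name, section name, and build setup are illustrative). It answers the new BPF_SOCK_OPS_NEEDS_ECN query and switches established connections to dctcp via the bpf_setsockopt() helper:

/* Illustrative sketch only: a sock_ops BPF program that requests ECN in
 * the SYN handshake and then switches the socket to dctcp once the
 * connection is established.  Assumes the usual samples/bpf build setup
 * ("bpf_helpers.h" declaring bpf_setsockopt()).
 */
#include <uapi/linux/bpf.h>
#include <uapi/linux/tcp.h>
#include <linux/socket.h>
#include "bpf_helpers.h"

SEC("sockops")
int bpf_cong(struct bpf_sock_ops *skops)
{
	char cong[] = "dctcp";
	int rv = 0;

	switch (skops->op) {
	case BPF_SOCK_OPS_NEEDS_ECN:
		/* Answer "yes" so tcp_bpf_ca_needs_ecn() returns true and
		 * ECN is negotiated in the SYN/SYN-ACK.
		 */
		rv = 1;
		break;
	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
		/* Change this socket's congestion control; with this patch
		 * bpf_setsockopt() accepts TCP_CONGESTION at SOL_TCP.
		 */
		rv = bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION,
				    cong, sizeof(cong));
		break;
	default:
		rv = -1;
	}
	skops->reply = rv;
	return 1;
}

char _license[] SEC("license") = "GPL";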

include/net/tcp.h

@@ -1004,7 +1004,9 @@ void tcp_get_default_congestion_control(char *name);
 void tcp_get_available_congestion_control(char *buf, size_t len);
 void tcp_get_allowed_congestion_control(char *buf, size_t len);
 int tcp_set_allowed_congestion_control(char *allowed);
-int tcp_set_congestion_control(struct sock *sk, const char *name);
+int tcp_set_congestion_control(struct sock *sk, const char *name, bool load);
+void tcp_reinit_congestion_control(struct sock *sk,
+				   const struct tcp_congestion_ops *ca);
 u32 tcp_slow_start(struct tcp_sock *tp, u32 acked);
 void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked);
 
@@ -2078,4 +2080,9 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk)
 		rwnd = 0;
 	return rwnd;
 }
+
+static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
+{
+	return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN) == 1);
+}
 #endif /* _TCP_H */

include/uapi/linux/bpf.h

@@ -778,6 +778,9 @@ enum {
 						 * passive connection is
 						 * established
 						 */
+	BPF_SOCK_OPS_NEEDS_ECN,	/* If connection's congestion control
+				 * needs ECN
+				 */
 };
 
 #endif /* _UAPI__LINUX_BPF_H__ */

net/core/filter.c

@@ -2719,8 +2719,24 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 		}
 	} else if (level == SOL_TCP &&
 		   sk->sk_prot->setsockopt == tcp_setsockopt) {
-		/* Place holder */
+#ifdef CONFIG_INET
+		if (optname == TCP_CONGESTION) {
+			char name[TCP_CA_NAME_MAX];
+
+			strncpy(name, optval, min_t(long, optlen,
+						    TCP_CA_NAME_MAX-1));
+			name[TCP_CA_NAME_MAX-1] = 0;
+			ret = tcp_set_congestion_control(sk, name, false);
+			if (!ret && bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN)
+				/* replacing an existing ca */
+				tcp_reinit_congestion_control(sk,
+					inet_csk(sk)->icsk_ca_ops);
+		} else {
+			ret = -EINVAL;
+		}
+#else
 		ret = -EINVAL;
+#endif
 	} else {
 		ret = -EINVAL;
 	}

net/ipv4/tcp.c

@@ -2481,7 +2481,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		name[val] = 0;
 
 		lock_sock(sk);
-		err = tcp_set_congestion_control(sk, name);
+		err = tcp_set_congestion_control(sk, name, true);
 		release_sock(sk);
 		return err;
 	}

net/ipv4/tcp_cong.c

@@ -189,7 +189,7 @@ void tcp_init_congestion_control(struct sock *sk)
 		INET_ECN_dontxmit(sk);
 }
 
-static void tcp_reinit_congestion_control(struct sock *sk,
-					  const struct tcp_congestion_ops *ca)
+void tcp_reinit_congestion_control(struct sock *sk,
+				   const struct tcp_congestion_ops *ca)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
@@ -333,8 +333,12 @@ out:
 	return ret;
 }
 
-/* Change congestion control for socket */
-int tcp_set_congestion_control(struct sock *sk, const char *name)
+/* Change congestion control for socket. If load is false, then it is the
+ * responsibility of the caller to call tcp_init_congestion_control or
+ * tcp_reinit_congestion_control (if the current congestion control was
+ * already initialized).
+ */
+int tcp_set_congestion_control(struct sock *sk, const char *name, bool load)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	const struct tcp_congestion_ops *ca;
@@ -344,21 +348,29 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
 		return -EPERM;
 
 	rcu_read_lock();
-	ca = __tcp_ca_find_autoload(name);
+	if (!load)
+		ca = tcp_ca_find(name);
+	else
+		ca = __tcp_ca_find_autoload(name);
 	/* No change asking for existing value */
 	if (ca == icsk->icsk_ca_ops) {
 		icsk->icsk_ca_setsockopt = 1;
 		goto out;
 	}
-	if (!ca)
+	if (!ca) {
 		err = -ENOENT;
-	else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
-		   ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)))
-		err = -EPERM;
-	else if (!try_module_get(ca->owner))
+	} else if (!load) {
+		icsk->icsk_ca_ops = ca;
+		if (!try_module_get(ca->owner))
+			err = -EBUSY;
+	} else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
+		     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) {
+		err = -EPERM;
+	} else if (!try_module_get(ca->owner)) {
 		err = -EBUSY;
-	else
+	} else {
 		tcp_reinit_congestion_control(sk, ca);
+	}
 out:
 	rcu_read_unlock();
 	return err;

net/ipv4/tcp_input.c

@@ -6191,7 +6191,8 @@ static void tcp_ecn_create_request(struct request_sock *req,
 
 	ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst;
 
 	if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
-	    (ecn_ok_dst & DST_FEATURE_ECN_CA))
+	    (ecn_ok_dst & DST_FEATURE_ECN_CA) ||
+	    tcp_bpf_ca_needs_ecn((struct sock *)req))
 		inet_rsk(req)->ecn_ok = 1;
 }

net/ipv4/tcp_output.c

@@ -316,7 +316,8 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
 
 	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
 	if (!(tp->ecn_flags & TCP_ECN_OK))
 		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
-	else if (tcp_ca_needs_ecn(sk))
+	else if (tcp_ca_needs_ecn(sk) ||
+		 tcp_bpf_ca_needs_ecn(sk))
 		INET_ECN_xmit(sk);
 }
@@ -324,8 +325,9 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
 static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
 	bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
-		       tcp_ca_needs_ecn(sk);
+		tcp_ca_needs_ecn(sk) || bpf_needs_ecn;
 
 	if (!use_ecn) {
 		const struct dst_entry *dst = __sk_dst_get(sk);
@@ -339,7 +341,7 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
 	if (use_ecn) {
 		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
 		tp->ecn_flags = TCP_ECN_OK;
-		if (tcp_ca_needs_ecn(sk))
+		if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
 			INET_ECN_xmit(sk);
 	}
 }
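
As a usage note (not part of this commit): a sock_ops program such as the sketch above is attached to a cgroup, so every TCP socket created in that cgroup runs it and thereby hits the new BPF_SOCK_OPS_NEEDS_ECN and TCP_CONGESTION setsockopt paths. A rough loader sketch using the generic libbpf APIs follows; the object path, cgroup path, and program name are purely illustrative.

/* Illustrative loader sketch (modern libbpf API; paths and names are
 * placeholders).  Loads the sock_ops object and attaches it to a cgroup
 * with the BPF_CGROUP_SOCK_OPS attach type.
 */
#include <fcntl.h>
#include <unistd.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

int main(void)
{
	struct bpf_object *obj;
	struct bpf_program *prog;
	int cg_fd, err;

	obj = bpf_object__open_file("tcp_cong_kern.o", NULL);
	if (!obj)
		return 1;
	if (bpf_object__load(obj))
		return 1;

	prog = bpf_object__find_program_by_name(obj, "bpf_cong");
	if (!prog)
		return 1;

	/* All TCP sockets created inside this cgroup will run the program. */
	cg_fd = open("/sys/fs/cgroup/unified/test", O_RDONLY);
	if (cg_fd < 0)
		return 1;

	err = bpf_prog_attach(bpf_program__fd(prog), cg_fd,
			      BPF_CGROUP_SOCK_OPS, 0);
	close(cg_fd);
	return err ? 1 : 0;
}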