From 1f52f6c0b0e846908e9c1082dab1b3f7088b82ac Mon Sep 17 00:00:00 2001
From: brakmo <brakmo@fb.com>
Date: Tue, 28 May 2019 16:59:35 -0700
Subject: [PATCH 1/6] bpf: Create BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY

Create new macro BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY() to be used by
__cgroup_bpf_run_filter_skb for EGRESS BPF progs so BPF programs can
request cwr for TCP packets.

Current cgroup skb programs can only return 0 or 1 (0 to drop the
packet. This macro changes the behavior so the low order bit
indicates whether the packet should be dropped (0) or not (1)
and the next bit is used for congestion notification (cn).

Hence, new allowed return values of CGROUP EGRESS BPF programs are:
  0: drop packet
  1: keep packet
  2: drop packet and call cwr
  3: keep packet and call cwr

This macro then converts it to one of NET_XMIT values or -EPERM
that has the effect of dropping the packet with no cn.
  0: NET_XMIT_SUCCESS  skb should be transmitted (no cn)
  1: NET_XMIT_DROP     skb should be dropped and cwr called
  2: NET_XMIT_CN       skb should be transmitted and cwr called
  3: -EPERM            skb should be dropped (no cn)

Note that when more than one BPF program is called, the packet is
dropped if at least one of programs requests it be dropped, and
there is cn if at least one program returns cn.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 50 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ff3e00ff84d2..2cc58fc0f413 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -552,6 +552,56 @@ _out:							\
 		_ret;					\
 	 })
 
+/* To be used by __cgroup_bpf_run_filter_skb for EGRESS BPF progs
+ * so BPF programs can request cwr for TCP packets.
+ *
+ * Current cgroup skb programs can only return 0 or 1 (0 to drop the
+ * packet. This macro changes the behavior so the low order bit
+ * indicates whether the packet should be dropped (0) or not (1)
+ * and the next bit is a congestion notification bit. This could be
+ * used by TCP to call tcp_enter_cwr()
+ *
+ * Hence, new allowed return values of CGROUP EGRESS BPF programs are:
+ *   0: drop packet
+ *   1: keep packet
+ *   2: drop packet and cn
+ *   3: keep packet and cn
+ *
+ * This macro then converts it to one of the NET_XMIT or an error
+ * code that is then interpreted as drop packet (and no cn):
+ *   0: NET_XMIT_SUCCESS  skb should be transmitted
+ *   1: NET_XMIT_DROP     skb should be dropped and cn
+ *   2: NET_XMIT_CN       skb should be transmitted and cn
+ *   3: -EPERM            skb should be dropped
+ */
+#define BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(array, ctx, func)		\
+	({						\
+		struct bpf_prog_array_item *_item;	\
+		struct bpf_prog *_prog;			\
+		struct bpf_prog_array *_array;		\
+		u32 ret;				\
+		u32 _ret = 1;				\
+		u32 _cn = 0;				\
+		preempt_disable();			\
+		rcu_read_lock();			\
+		_array = rcu_dereference(array);	\
+		_item = &_array->items[0];		\
+		while ((_prog = READ_ONCE(_item->prog))) {		\
+			bpf_cgroup_storage_set(_item->cgroup_storage);	\
+			ret = func(_prog, ctx);		\
+			_ret &= (ret & 1);		\
+			_cn |= (ret & 2);		\
+			_item++;			\
+		}					\
+		rcu_read_unlock();			\
+		preempt_enable();			\
+		if (_ret)				\
+			_ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS);	\
+		else					\
+			_ret = (_cn ? NET_XMIT_DROP : -EPERM);		\
+		_ret;					\
+	})
+
 #define BPF_PROG_RUN_ARRAY(array, ctx, func)		\
 	__BPF_PROG_RUN_ARRAY(array, ctx, func, false)
 

From 5cf1e91456301f8c4f6bbc63ff76cff12f92f31b Mon Sep 17 00:00:00 2001
From: brakmo <brakmo@fb.com>
Date: Tue, 28 May 2019 16:59:36 -0700
Subject: [PATCH 2/6] bpf: cgroup inet skb programs can return 0 to 3

Allows cgroup inet skb programs to return values in the range [0, 3].
The second bit is used to deterine if congestion occurred and higher
level protocol should decrease rate. E.g. TCP would call tcp_enter_cwr()

The bpf_prog must set expected_attach_type to BPF_CGROUP_INET_EGRESS
at load time if it uses the new return values (i.e. 2 or 3).

The expected_attach_type is currently not enforced for
BPF_PROG_TYPE_CGROUP_SKB.  e.g Meaning the current bpf_prog with
expected_attach_type setting to BPF_CGROUP_INET_EGRESS can attach to
BPF_CGROUP_INET_INGRESS.  Blindly enforcing expected_attach_type will
break backward compatibility.

This patch adds a enforce_expected_attach_type bit to only
enforce the expected_attach_type when it uses the new
return value.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h |  3 ++-
 kernel/bpf/syscall.c   | 12 ++++++++++++
 kernel/bpf/verifier.c  | 16 +++++++++++++---
 3 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index ba8b65270e0d..43b45d6db36d 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -526,7 +526,8 @@ struct bpf_prog {
 				blinded:1,	/* Was blinded */
 				is_func:1,	/* program is a bpf function */
 				kprobe_override:1, /* Do we override a kprobe? */
-				has_callchain_buf:1; /* callchain buffer allocated? */
+				has_callchain_buf:1, /* callchain buffer allocated? */
+				enforce_expected_attach_type:1; /* Enforce expected_attach_type checking at attach time */
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	enum bpf_attach_type	expected_attach_type; /* For some prog types */
 	u32			len;		/* Number of filter blocks */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3d546b6f4646..1539774d78c7 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1585,6 +1585,14 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
 		default:
 			return -EINVAL;
 		}
+	case BPF_PROG_TYPE_CGROUP_SKB:
+		switch (expected_attach_type) {
+		case BPF_CGROUP_INET_INGRESS:
+		case BPF_CGROUP_INET_EGRESS:
+			return 0;
+		default:
+			return -EINVAL;
+		}
 	default:
 		return 0;
 	}
@@ -1836,6 +1844,10 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 	case BPF_PROG_TYPE_CGROUP_SOCK:
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
 		return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
+	case BPF_PROG_TYPE_CGROUP_SKB:
+		return prog->enforce_expected_attach_type &&
+			prog->expected_attach_type != attach_type ?
+			-EINVAL : 0;
 	default:
 		return 0;
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2778417e6e0c..5c2cb5bd84ce 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5508,11 +5508,16 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
 static int check_return_code(struct bpf_verifier_env *env)
 {
+	struct tnum enforce_attach_type_range = tnum_unknown;
 	struct bpf_reg_state *reg;
 	struct tnum range = tnum_range(0, 1);
 
 	switch (env->prog->type) {
 	case BPF_PROG_TYPE_CGROUP_SKB:
+		if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) {
+			range = tnum_range(0, 3);
+			enforce_attach_type_range = tnum_range(2, 3);
+		}
 	case BPF_PROG_TYPE_CGROUP_SOCK:
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
 	case BPF_PROG_TYPE_SOCK_OPS:
@@ -5531,18 +5536,23 @@ static int check_return_code(struct bpf_verifier_env *env)
 	}
 
 	if (!tnum_in(range, reg->var_off)) {
+		char tn_buf[48];
+
 		verbose(env, "At program exit the register R0 ");
 		if (!tnum_is_unknown(reg->var_off)) {
-			char tn_buf[48];
-
 			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
 			verbose(env, "has value %s", tn_buf);
 		} else {
 			verbose(env, "has unknown scalar value");
 		}
-		verbose(env, " should have been 0 or 1\n");
+		tnum_strn(tn_buf, sizeof(tn_buf), range);
+		verbose(env, " should have been %s\n", tn_buf);
 		return -EINVAL;
 	}
+
+	if (!tnum_is_unknown(enforce_attach_type_range) &&
+	    tnum_in(enforce_attach_type_range, reg->var_off))
+		env->prog->enforce_expected_attach_type = 1;
 	return 0;
 }
 

From e7a3160d092aa4dd7fb2ef23335cb3a98400aec6 Mon Sep 17 00:00:00 2001
From: brakmo <brakmo@fb.com>
Date: Tue, 28 May 2019 16:59:37 -0700
Subject: [PATCH 3/6] bpf: Update __cgroup_bpf_run_filter_skb with cn

For egress packets, __cgroup_bpf_fun_filter_skb() will now call
BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY() instead of PROG_CGROUP_RUN_ARRAY()
in order to propagate congestion notifications (cn) requests to TCP
callers.

For egress packets, this function can return:
   NET_XMIT_SUCCESS    (0)    - continue with packet output
   NET_XMIT_DROP       (1)    - drop packet and notify TCP to call cwr
   NET_XMIT_CN         (2)    - continue with packet output and notify TCP
                                to call cwr
   -EPERM                     - drop packet

For ingress packets, this function will return -EPERM if any attached
program was found and if it returned != 1 during execution. Otherwise 0
is returned.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/cgroup.c | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index ff594eb86fd7..1b65ab0df457 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -587,8 +587,16 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
  * The program type passed in via @type must be suitable for network
  * filtering. No further check is performed to assert that.
  *
- * This function will return %-EPERM if any if an attached program was found
- * and if it returned != 1 during execution. In all other cases, 0 is returned.
+ * For egress packets, this function can return:
+ *   NET_XMIT_SUCCESS    (0)	- continue with packet output
+ *   NET_XMIT_DROP       (1)	- drop packet and notify TCP to call cwr
+ *   NET_XMIT_CN         (2)	- continue with packet output and notify TCP
+ *				  to call cwr
+ *   -EPERM			- drop packet
+ *
+ * For ingress packets, this function will return -EPERM if any
+ * attached program was found and if it returned != 1 during execution.
+ * Otherwise 0 is returned.
  */
 int __cgroup_bpf_run_filter_skb(struct sock *sk,
 				struct sk_buff *skb,
@@ -614,12 +622,19 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 	/* compute pointers for the bpf prog */
 	bpf_compute_and_save_data_end(skb, &saved_data_end);
 
-	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
-				 __bpf_prog_run_save_cb);
+	if (type == BPF_CGROUP_INET_EGRESS) {
+		ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(
+			cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb);
+	} else {
+		ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
+					  __bpf_prog_run_save_cb);
+		ret = (ret == 1 ? 0 : -EPERM);
+	}
 	bpf_restore_data_end(skb, saved_data_end);
 	__skb_pull(skb, offset);
 	skb->sk = save_sk;
-	return ret == 1 ? 0 : -EPERM;
+
+	return ret;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
 

From 956fe2190820df3a6ee530204e059da508159319 Mon Sep 17 00:00:00 2001
From: brakmo <brakmo@fb.com>
Date: Tue, 28 May 2019 16:59:38 -0700
Subject: [PATCH 4/6] bpf: Update BPF_CGROUP_RUN_PROG_INET_EGRESS calls

Update BPF_CGROUP_RUN_PROG_INET_EGRESS() callers to support returning
congestion notifications from the BPF programs.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/ipv4/ip_output.c  | 34 +++++++++++++++++++++++-----------
 net/ipv6/ip6_output.c | 26 +++++++++++++++++---------
 2 files changed, 40 insertions(+), 20 deletions(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index bfd0ca554977..1217a53381c2 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -287,16 +287,9 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
 	return ret;
 }
 
-static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	unsigned int mtu;
-	int ret;
-
-	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
-	if (ret) {
-		kfree_skb(skb);
-		return ret;
-	}
 
 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 	/* Policy lookup after SNAT yielded a new policy */
@@ -315,18 +308,37 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk
 	return ip_finish_output2(net, sk, skb);
 }
 
+static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	int ret;
+
+	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+	switch (ret) {
+	case NET_XMIT_SUCCESS:
+		return __ip_finish_output(net, sk, skb);
+	case NET_XMIT_CN:
+		return __ip_finish_output(net, sk, skb) ? : ret;
+	default:
+		kfree_skb(skb);
+		return ret;
+	}
+}
+
 static int ip_mc_finish_output(struct net *net, struct sock *sk,
 			       struct sk_buff *skb)
 {
 	int ret;
 
 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
-	if (ret) {
+	switch (ret) {
+	case NET_XMIT_SUCCESS:
+		return dev_loopback_xmit(net, sk, skb);
+	case NET_XMIT_CN:
+		return dev_loopback_xmit(net, sk, skb) ? : ret;
+	default:
 		kfree_skb(skb);
 		return ret;
 	}
-
-	return dev_loopback_xmit(net, sk, skb);
 }
 
 int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index adef2236abe2..a75bc21d8c88 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -128,16 +128,8 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
 	return -EINVAL;
 }
 
-static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
-	int ret;
-
-	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
-	if (ret) {
-		kfree_skb(skb);
-		return ret;
-	}
-
 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 	/* Policy lookup after SNAT yielded a new policy */
 	if (skb_dst(skb)->xfrm) {
@@ -154,6 +146,22 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s
 		return ip6_finish_output2(net, sk, skb);
 }
 
+static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	int ret;
+
+	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+	switch (ret) {
+	case NET_XMIT_SUCCESS:
+		return __ip6_finish_output(net, sk, skb);
+	case NET_XMIT_CN:
+		return __ip6_finish_output(net, sk, skb) ? : ret;
+	default:
+		kfree_skb(skb);
+		return ret;
+	}
+}
+
 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	struct net_device *dev = skb_dst(skb)->dev;

From ffd81558d56c611b1e93f856c77f42046a2deab5 Mon Sep 17 00:00:00 2001
From: brakmo <brakmo@fb.com>
Date: Tue, 28 May 2019 16:59:39 -0700
Subject: [PATCH 5/6] bpf: Add cn support to hbm_out_kern.c

Update hbm_out_kern.c to support returning cn notifications.
Also updates relevant files to allow disabling cn notifications.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/bpf/do_hbm_test.sh | 10 +++++++---
 samples/bpf/hbm.c          | 18 +++++++++++++++---
 samples/bpf/hbm.h          |  3 ++-
 samples/bpf/hbm_out_kern.c | 26 +++++++++++++++++++++-----
 4 files changed, 45 insertions(+), 12 deletions(-)

diff --git a/samples/bpf/do_hbm_test.sh b/samples/bpf/do_hbm_test.sh
index 56c8b4115c95..e48b047d4646 100755
--- a/samples/bpf/do_hbm_test.sh
+++ b/samples/bpf/do_hbm_test.sh
@@ -13,10 +13,10 @@ Usage() {
   echo "egress or ingress bandwidht. It then uses iperf3 or netperf to create"
   echo "loads. The output is the goodput in Mbps (unless -D was used)."
   echo ""
-  echo "USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>] [-D]"
-  echo "             [-d=<delay>|--delay=<delay>] [--debug] [-E]"
+  echo "USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>]"
+  echo "             [-D] [-d=<delay>|--delay=<delay>] [--debug] [-E]"
   echo "             [-f=<#flows>|--flows=<#flows>] [-h] [-i=<id>|--id=<id >]"
-  echo "             [-l] [-N] [-p=<port>|--port=<port>] [-P]"
+  echo "             [-l] [-N] [--no_cn] [-p=<port>|--port=<port>] [-P]"
   echo "             [-q=<qdisc>] [-R] [-s=<server>|--server=<server]"
   echo "             [-S|--stats] -t=<time>|--time=<time>] [-w] [cubic|dctcp]"
   echo "  Where:"
@@ -33,6 +33,7 @@ Usage() {
   echo "    -f or --flows     number of concurrent flows (default=1)"
   echo "    -i or --id        cgroup id (an integer, default is 1)"
   echo "    -N                use netperf instead of iperf3"
+  echo "    --no_cn           Do not return CN notifications"
   echo "    -l                do not limit flows using loopback"
   echo "    -h                Help"
   echo "    -p or --port      iperf3 port (default is 5201)"
@@ -115,6 +116,9 @@ processArgs () {
     -c=*|--cc=*)
       cc="${i#*=}"
       ;;
+    --no_cn)
+      flags="$flags --no_cn"
+      ;;
     --debug)
       flags="$flags -d"
       debug_flag=1
diff --git a/samples/bpf/hbm.c b/samples/bpf/hbm.c
index a79828ab273f..c5629bae67ab 100644
--- a/samples/bpf/hbm.c
+++ b/samples/bpf/hbm.c
@@ -16,6 +16,7 @@
  *    -l	Also limit flows doing loopback
  *    -n <#>	To create cgroup \"/hbm#\" and attach prog
  *		Default is /hbm1
+ *    --no_cn   Do not return cn notifications
  *    -r <rate>	Rate limit in Mbps
  *    -s	Get HBM stats (marked, dropped, etc.)
  *    -t <time>	Exit after specified seconds (default is 0)
@@ -42,6 +43,7 @@
 
 #include <linux/bpf.h>
 #include <bpf/bpf.h>
+#include <getopt.h>
 
 #include "bpf_load.h"
 #include "bpf_rlimit.h"
@@ -59,6 +61,7 @@ bool stats_flag;
 bool loopback_flag;
 bool debugFlag;
 bool work_conserving_flag;
+bool no_cn_flag;
 
 static void Usage(void);
 static void read_trace_pipe2(void);
@@ -185,6 +188,7 @@ static int run_bpf_prog(char *prog, int cg_id)
 	qstats.rate = rate;
 	qstats.stats = stats_flag ? 1 : 0;
 	qstats.loopback = loopback_flag ? 1 : 0;
+	qstats.no_cn = no_cn_flag ? 1 : 0;
 	if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) {
 		printf("ERROR: Could not update map element\n");
 		goto err;
@@ -366,14 +370,15 @@ static void Usage(void)
 {
 	printf("This program loads a cgroup skb BPF program to enforce\n"
 	       "cgroup output (egress) bandwidth limits.\n\n"
-	       "USAGE: hbm [-o] [-d]  [-l] [-n <id>] [-r <rate>] [-s]\n"
-	       "           [-t <secs>] [-w] [-h] [prog]\n"
+	       "USAGE: hbm [-o] [-d]  [-l] [-n <id>] [--no_cn] [-r <rate>]\n"
+	       "           [-s] [-t <secs>] [-w] [-h] [prog]\n"
 	       "  Where:\n"
 	       "    -o         indicates egress direction (default)\n"
 	       "    -d         print BPF trace debug buffer\n"
 	       "    -l         also limit flows using loopback\n"
 	       "    -n <#>     to create cgroup \"/hbm#\" and attach prog\n"
 	       "               Default is /hbm1\n"
+	       "    --no_cn    disable CN notifcations\n"
 	       "    -r <rate>  Rate in Mbps\n"
 	       "    -s         Update HBM stats\n"
 	       "    -t <time>  Exit after specified seconds (default is 0)\n"
@@ -393,9 +398,16 @@ int main(int argc, char **argv)
 	int  k;
 	int cg_id = 1;
 	char *optstring = "iodln:r:st:wh";
+	struct option loptions[] = {
+		{"no_cn", 0, NULL, 1},
+		{NULL, 0, NULL, 0}
+	};
 
-	while ((k = getopt(argc, argv, optstring)) != -1) {
+	while ((k = getopt_long(argc, argv, optstring, loptions, NULL)) != -1) {
 		switch (k) {
+		case 1:
+			no_cn_flag = true;
+			break;
 		case'o':
 			break;
 		case 'd':
diff --git a/samples/bpf/hbm.h b/samples/bpf/hbm.h
index 518e8147d084..c08247cec2a7 100644
--- a/samples/bpf/hbm.h
+++ b/samples/bpf/hbm.h
@@ -19,7 +19,8 @@ struct hbm_vqueue {
 struct hbm_queue_stats {
 	unsigned long rate;		/* in Mbps*/
 	unsigned long stats:1,		/* get HBM stats (marked, dropped,..) */
-		loopback:1;		/* also limit flows using loopback */
+		loopback:1,		/* also limit flows using loopback */
+		no_cn:1;		/* do not use cn flags */
 	unsigned long long pkts_marked;
 	unsigned long long bytes_marked;
 	unsigned long long pkts_dropped;
diff --git a/samples/bpf/hbm_out_kern.c b/samples/bpf/hbm_out_kern.c
index f806863d0b79..fa3ea92e1564 100644
--- a/samples/bpf/hbm_out_kern.c
+++ b/samples/bpf/hbm_out_kern.c
@@ -119,13 +119,16 @@ int _hbm_out_cg(struct __sk_buff *skb)
 	// Set flags (drop, congestion, cwr)
 	// Dropping => we are congested, so ignore congestion flag
 	if (credit < -DROP_THRESH ||
-	    (len > LARGE_PKT_THRESH &&
-	     credit < -LARGE_PKT_DROP_THRESH)) {
-		// Very congested, set drop flag
+	    (len > LARGE_PKT_THRESH && credit < -LARGE_PKT_DROP_THRESH)) {
+		// Very congested, set drop packet
 		drop_flag = true;
+		if (pkti.ecn)
+			congestion_flag = true;
+		else if (pkti.is_tcp)
+			cwr_flag = true;
 	} else if (credit < 0) {
 		// Congested, set congestion flag
-		if (pkti.ecn) {
+		if (pkti.ecn || pkti.is_tcp) {
 			if (credit < -MARK_THRESH)
 				congestion_flag = true;
 			else
@@ -137,7 +140,15 @@ int _hbm_out_cg(struct __sk_buff *skb)
 
 	if (congestion_flag) {
 		if (!bpf_skb_ecn_set_ce(skb)) {
-			if (len > LARGE_PKT_THRESH) {
+			if (pkti.is_tcp) {
+				unsigned int rand = bpf_get_prandom_u32();
+
+				if (-credit >= MARK_THRESH +
+				    (rand % MARK_REGION_SIZE)) {
+					// Do congestion control
+					cwr_flag = true;
+				}
+			} else if (len > LARGE_PKT_THRESH) {
 				// Problem if too many small packets?
 				drop_flag = true;
 			}
@@ -146,12 +157,17 @@ int _hbm_out_cg(struct __sk_buff *skb)
 
 	if (drop_flag)
 		rv = DROP_PKT;
+	if (qsp != NULL)
+		if (qsp->no_cn)
+			cwr_flag = false;
 
 	hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag);
 
 	if (rv == DROP_PKT)
 		__sync_add_and_fetch(&(qdp->credit), len);
 
+	if (cwr_flag)
+		rv |= 2;
 	return rv;
 }
 char _license[] SEC("license") = "GPL";

From d58c6f7212f4eda59ca94cbfbaa785dde7675456 Mon Sep 17 00:00:00 2001
From: brakmo <brakmo@fb.com>
Date: Tue, 28 May 2019 16:59:40 -0700
Subject: [PATCH 6/6] bpf: Add more stats to HBM

Adds more stats to HBM, including average cwnd and rtt of all TCP
flows, percents of packets that are ecn ce marked and distribution
of return values.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/bpf/hbm.c          | 33 +++++++++++++++++++
 samples/bpf/hbm.h          |  6 ++++
 samples/bpf/hbm_kern.h     | 66 ++++++++++++++++++++++++++++++++++++--
 samples/bpf/hbm_out_kern.c | 22 ++++++++-----
 4 files changed, 117 insertions(+), 10 deletions(-)

diff --git a/samples/bpf/hbm.c b/samples/bpf/hbm.c
index c5629bae67ab..480b7ad6a1f2 100644
--- a/samples/bpf/hbm.c
+++ b/samples/bpf/hbm.c
@@ -316,6 +316,14 @@ static int run_bpf_prog(char *prog, int cg_id)
 		double percent_pkts, percent_bytes;
 		char fname[100];
 		FILE *fout;
+		int k;
+		static const char *returnValNames[] = {
+			"DROP_PKT",
+			"ALLOW_PKT",
+			"DROP_PKT_CWR",
+			"ALLOW_PKT_CWR"
+		};
+#define RET_VAL_COUNT 4
 
 // Future support of ingress
 //		if (!outFlag)
@@ -350,6 +358,31 @@ static int run_bpf_prog(char *prog, int cg_id)
 			(qstats.bytes_total + 1);
 		fprintf(fout, "pkts_dropped_percent:%6.2f\n", percent_pkts);
 		fprintf(fout, "bytes_dropped_percent:%6.2f\n", percent_bytes);
+
+		// ECN CE markings
+		percent_pkts = (qstats.pkts_ecn_ce * 100.0) /
+			(qstats.pkts_total + 1);
+		fprintf(fout, "pkts_ecn_ce:%6.2f (%d)\n", percent_pkts,
+			(int)qstats.pkts_ecn_ce);
+
+		// Average cwnd
+		fprintf(fout, "avg cwnd:%d\n",
+			(int)(qstats.sum_cwnd / (qstats.sum_cwnd_cnt + 1)));
+		// Average rtt
+		fprintf(fout, "avg rtt:%d\n",
+			(int)(qstats.sum_rtt / (qstats.pkts_total + 1)));
+		// Average credit
+		fprintf(fout, "avg credit:%d\n",
+			(int)(qstats.sum_credit /
+			      (1500 * ((int)qstats.pkts_total) + 1)));
+
+		// Return values stats
+		for (k = 0; k < RET_VAL_COUNT; k++) {
+			percent_pkts = (qstats.returnValCount[k] * 100.0) /
+				(qstats.pkts_total + 1);
+			fprintf(fout, "%s:%6.2f (%d)\n", returnValNames[k],
+				percent_pkts, (int)qstats.returnValCount[k]);
+		}
 		fclose(fout);
 	}
 
diff --git a/samples/bpf/hbm.h b/samples/bpf/hbm.h
index c08247cec2a7..f0963ed6a562 100644
--- a/samples/bpf/hbm.h
+++ b/samples/bpf/hbm.h
@@ -29,4 +29,10 @@ struct hbm_queue_stats {
 	unsigned long long bytes_total;
 	unsigned long long firstPacketTime;
 	unsigned long long lastPacketTime;
+	unsigned long long pkts_ecn_ce;
+	unsigned long long returnValCount[4];
+	unsigned long long sum_cwnd;
+	unsigned long long sum_rtt;
+	unsigned long long sum_cwnd_cnt;
+	long long sum_credit;
 };
diff --git a/samples/bpf/hbm_kern.h b/samples/bpf/hbm_kern.h
index 41384be233b9..be19cf1d5cd5 100644
--- a/samples/bpf/hbm_kern.h
+++ b/samples/bpf/hbm_kern.h
@@ -65,17 +65,43 @@ struct bpf_map_def SEC("maps") queue_stats = {
 BPF_ANNOTATE_KV_PAIR(queue_stats, int, struct hbm_queue_stats);
 
 struct hbm_pkt_info {
+	int	cwnd;
+	int	rtt;
 	bool	is_ip;
 	bool	is_tcp;
 	short	ecn;
 };
 
+static int get_tcp_info(struct __sk_buff *skb, struct hbm_pkt_info *pkti)
+{
+	struct bpf_sock *sk;
+	struct bpf_tcp_sock *tp;
+
+	sk = skb->sk;
+	if (sk) {
+		sk = bpf_sk_fullsock(sk);
+		if (sk) {
+			if (sk->protocol == IPPROTO_TCP) {
+				tp = bpf_tcp_sock(sk);
+				if (tp) {
+					pkti->cwnd = tp->snd_cwnd;
+					pkti->rtt = tp->srtt_us >> 3;
+					return 0;
+				}
+			}
+		}
+	}
+	return 1;
+}
+
 static __always_inline void hbm_get_pkt_info(struct __sk_buff *skb,
 					     struct hbm_pkt_info *pkti)
 {
 	struct iphdr iph;
 	struct ipv6hdr *ip6h;
 
+	pkti->cwnd = 0;
+	pkti->rtt = 0;
 	bpf_skb_load_bytes(skb, 0, &iph, 12);
 	if (iph.version == 6) {
 		ip6h = (struct ipv6hdr *)&iph;
@@ -91,6 +117,8 @@ static __always_inline void hbm_get_pkt_info(struct __sk_buff *skb,
 		pkti->is_tcp = false;
 		pkti->ecn = 0;
 	}
+	if (pkti->is_tcp)
+		get_tcp_info(skb, pkti);
 }
 
 static __always_inline void hbm_init_vqueue(struct hbm_vqueue *qdp, int rate)
@@ -105,8 +133,14 @@ static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp,
 					     int len,
 					     unsigned long long curtime,
 					     bool congestion_flag,
-					     bool drop_flag)
+					     bool drop_flag,
+					     bool cwr_flag,
+					     bool ecn_ce_flag,
+					     struct hbm_pkt_info *pkti,
+					     int credit)
 {
+	int rv = ALLOW_PKT;
+
 	if (qsp != NULL) {
 		// Following is needed for work conserving
 		__sync_add_and_fetch(&(qsp->bytes_total), len);
@@ -116,7 +150,7 @@ static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp,
 				qsp->firstPacketTime = curtime;
 			qsp->lastPacketTime = curtime;
 			__sync_add_and_fetch(&(qsp->pkts_total), 1);
-			if (congestion_flag || drop_flag) {
+			if (congestion_flag) {
 				__sync_add_and_fetch(&(qsp->pkts_marked), 1);
 				__sync_add_and_fetch(&(qsp->bytes_marked), len);
 			}
@@ -125,6 +159,34 @@ static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp,
 				__sync_add_and_fetch(&(qsp->bytes_dropped),
 						     len);
 			}
+			if (ecn_ce_flag)
+				__sync_add_and_fetch(&(qsp->pkts_ecn_ce), 1);
+			if (pkti->cwnd) {
+				__sync_add_and_fetch(&(qsp->sum_cwnd),
+						     pkti->cwnd);
+				__sync_add_and_fetch(&(qsp->sum_cwnd_cnt), 1);
+			}
+			if (pkti->rtt)
+				__sync_add_and_fetch(&(qsp->sum_rtt),
+						     pkti->rtt);
+			__sync_add_and_fetch(&(qsp->sum_credit), credit);
+
+			if (drop_flag)
+				rv = DROP_PKT;
+			if (cwr_flag)
+				rv |= 2;
+			if (rv == DROP_PKT)
+				__sync_add_and_fetch(&(qsp->returnValCount[0]),
+						     1);
+			else if (rv == ALLOW_PKT)
+				__sync_add_and_fetch(&(qsp->returnValCount[1]),
+						     1);
+			else if (rv == 2)
+				__sync_add_and_fetch(&(qsp->returnValCount[2]),
+						     1);
+			else if (rv == 3)
+				__sync_add_and_fetch(&(qsp->returnValCount[3]),
+						     1);
 		}
 	}
 }
diff --git a/samples/bpf/hbm_out_kern.c b/samples/bpf/hbm_out_kern.c
index fa3ea92e1564..829934bd43cb 100644
--- a/samples/bpf/hbm_out_kern.c
+++ b/samples/bpf/hbm_out_kern.c
@@ -62,11 +62,12 @@ int _hbm_out_cg(struct __sk_buff *skb)
 	unsigned int queue_index = 0;
 	unsigned long long curtime;
 	int credit;
-	signed long long delta = 0, zero = 0;
+	signed long long delta = 0, new_credit;
 	int max_credit = MAX_CREDIT;
 	bool congestion_flag = false;
 	bool drop_flag = false;
 	bool cwr_flag = false;
+	bool ecn_ce_flag = false;
 	struct hbm_vqueue *qdp;
 	struct hbm_queue_stats *qsp = NULL;
 	int rv = ALLOW_PKT;
@@ -99,9 +100,11 @@ int _hbm_out_cg(struct __sk_buff *skb)
 	 */
 	if (delta > 0) {
 		qdp->lasttime = curtime;
-		credit += CREDIT_PER_NS(delta, qdp->rate);
-		if (credit > MAX_CREDIT)
+		new_credit = credit + CREDIT_PER_NS(delta, qdp->rate);
+		if (new_credit > MAX_CREDIT)
 			credit = MAX_CREDIT;
+		else
+			credit = new_credit;
 	}
 	credit -= len;
 	qdp->credit = credit;
@@ -139,7 +142,9 @@ int _hbm_out_cg(struct __sk_buff *skb)
 	}
 
 	if (congestion_flag) {
-		if (!bpf_skb_ecn_set_ce(skb)) {
+		if (bpf_skb_ecn_set_ce(skb)) {
+			ecn_ce_flag = true;
+		} else {
 			if (pkti.is_tcp) {
 				unsigned int rand = bpf_get_prandom_u32();
 
@@ -155,16 +160,17 @@ int _hbm_out_cg(struct __sk_buff *skb)
 		}
 	}
 
-	if (drop_flag)
-		rv = DROP_PKT;
 	if (qsp != NULL)
 		if (qsp->no_cn)
 			cwr_flag = false;
 
-	hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag);
+	hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag,
+			 cwr_flag, ecn_ce_flag, &pkti, credit);
 
-	if (rv == DROP_PKT)
+	if (drop_flag) {
 		__sync_add_and_fetch(&(qdp->credit), len);
+		rv = DROP_PKT;
+	}
 
 	if (cwr_flag)
 		rv |= 2;