From 94133cf24bb33889aac267a7f0e3e6a08b8a8e5a Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 14 May 2024 14:12:21 +0100 Subject: [PATCH 01/23] bpftool: Introduce btf c dump sorting Sort bpftool's C dump output, aiming to simplify vmlinux.h diffing and to enforce a more natural ordering of type definitions. Definitions are sorted first by their BTF kind rank, then by their base type name, and finally by their own name. Type ranks: Assign ranks to btf kinds (defined in function btf_type_rank) to establish the following order: 1. Anonymous enums/enums64 2. Named enums/enums64 3. Trivial types and their typedefs (ints, then floats) 4. Structs/Unions 5. Function prototypes 6. Forward declarations Type rank is set to the maximum for unnamed reference types, structs and unions to avoid emitting those types early; they will be emitted as part of the type chain starting with a named type. Lexicographical ordering: Each type is assigned a sort_name and an own_name. sort_name is the resolved name of the final base type for reference types (typedef, pointer, array, etc.). Sorting by sort_name allows grouping typedefs of the same base type. sort_name for a non-reference type is the same as own_name. own_name is the direct name of a particular type and is used as the final sorting key. Signed-off-by: Mykyta Yatsenko Signed-off-by: Andrii Nakryiko Tested-by: Andrii Nakryiko Reviewed-by: Quentin Monnet Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240514131221.20585-1-yatsenko@meta.com Signed-off-by: Alexei Starovoitov --- .../bpf/bpftool/Documentation/bpftool-btf.rst | 6 +- tools/bpf/bpftool/bash-completion/bpftool | 3 + tools/bpf/bpftool/btf.c | 140 +++++++++++++++++- 3 files changed, 141 insertions(+), 8 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst index eaba24320fb2..3f6bca03ad2e 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst @@ -28,7 +28,7 @@ BTF COMMANDS | **bpftool** **btf help** | | *BTF_SRC* := { **id** *BTF_ID* | **prog** *PROG* | **map** *MAP* [{**key** | **value** | **kv** | **all**}] | **file** *FILE* } -| *FORMAT* := { **raw** | **c** } +| *FORMAT* := { **raw** | **c** [**unsorted**] } | *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } | *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } @@ -63,7 +63,9 @@ bpftool btf dump *BTF_SRC* pahole. **format** option can be used to override default (raw) output format. Raw - (**raw**) or C-syntax (**c**) output formats are supported. + (**raw**) or C-syntax (**c**) output formats are supported. With C-style + formatting, the output is sorted by default. Use the **unsorted** option + to avoid sorting the output. bpftool btf help Print short help message.
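To make the ordering rules above concrete, here is a small self-contained sketch of the comparison logic (toy data and a simplified comparator modeled on btf_type_compare(); this is illustrative userspace code, not part of bpftool):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Toy stand-in for bpftool's struct sort_datum. */
struct toy_datum {
	int type_rank;          /* what btf_type_rank() would return */
	const char *sort_name;  /* resolved base type name */
	const char *own_name;   /* the type's own name */
};

static int toy_compare(const void *left, const void *right)
{
	const struct toy_datum *d1 = left, *d2 = right;
	int r;

	if (d1->type_rank != d2->type_rank)
		return d1->type_rank < d2->type_rank ? -1 : 1;
	r = strcmp(d1->sort_name, d2->sort_name);
	if (r)
		return r;
	return strcmp(d1->own_name, d2->own_name);
}

int main(void)
{
	struct toy_datum types[] = {
		{ 3, "task_struct", "task_struct" }, /* named struct */
		{ 2, "int", "u32" },                 /* typedef of int */
		{ 2, "int", "s32" },                 /* typedef of int */
		{ 0, "BPF_ANY", "" },                /* anonymous enum (empty own_name) */
	};
	size_t i, n = sizeof(types) / sizeof(types[0]);

	qsort(types, n, sizeof(types[0]), toy_compare);
	/* Prints the anonymous enum first, then s32 and u32 grouped by
	 * their shared base type "int", then task_struct. */
	for (i = 0; i < n; i++)
		printf("rank %d: %s (base %s)\n", types[i].type_rank,
		       types[i].own_name, types[i].sort_name);
	return 0;
}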
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 04afe2ac2228..be99d49b8714 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -930,6 +930,9 @@ _bpftool() format) COMPREPLY=( $( compgen -W "c raw" -- "$cur" ) ) ;; + c) + COMPREPLY=( $( compgen -W "unsorted" -- "$cur" ) ) + ;; *) # emit extra options case ${words[3]} in diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index 91fcb75babe3..af047dedde38 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -43,6 +43,13 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_ENUM64] = "ENUM64", }; +struct sort_datum { + int index; + int type_rank; + const char *sort_name; + const char *own_name; +}; + static const char *btf_int_enc_str(__u8 encoding) { switch (encoding) { @@ -460,9 +467,122 @@ static void __printf(2, 0) btf_dump_printf(void *ctx, vfprintf(stdout, fmt, args); } -static int dump_btf_c(const struct btf *btf, - __u32 *root_type_ids, int root_type_cnt) +static int btf_type_rank(const struct btf *btf, __u32 index, bool has_name) { + const struct btf_type *t = btf__type_by_id(btf, index); + const int kind = btf_kind(t); + const int max_rank = 10; + + if (t->name_off) + has_name = true; + + switch (kind) { + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + return has_name ? 1 : 0; + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + return 2; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + return has_name ? 3 : max_rank; + case BTF_KIND_FUNC_PROTO: + return has_name ? 4 : max_rank; + case BTF_KIND_ARRAY: + if (has_name) + return btf_type_rank(btf, btf_array(t)->type, has_name); + return max_rank; + case BTF_KIND_TYPE_TAG: + case BTF_KIND_CONST: + case BTF_KIND_PTR: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_TYPEDEF: + case BTF_KIND_DECL_TAG: + if (has_name) + return btf_type_rank(btf, t->type, has_name); + return max_rank; + default: + return max_rank; + } +} + +static const char *btf_type_sort_name(const struct btf *btf, __u32 index, bool from_ref) +{ + const struct btf_type *t = btf__type_by_id(btf, index); + + switch (btf_kind(t)) { + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: { + int name_off = t->name_off; + + /* Use name of the first element for anonymous enums if allowed */ + if (!from_ref && !t->name_off && btf_vlen(t)) + name_off = btf_enum(t)->name_off; + + return btf__name_by_offset(btf, name_off); + } + case BTF_KIND_ARRAY: + return btf_type_sort_name(btf, btf_array(t)->type, true); + case BTF_KIND_TYPE_TAG: + case BTF_KIND_CONST: + case BTF_KIND_PTR: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_TYPEDEF: + case BTF_KIND_DECL_TAG: + return btf_type_sort_name(btf, t->type, true); + default: + return btf__name_by_offset(btf, t->name_off); + } + return NULL; +} + +static int btf_type_compare(const void *left, const void *right) +{ + const struct sort_datum *d1 = (const struct sort_datum *)left; + const struct sort_datum *d2 = (const struct sort_datum *)right; + int r; + + if (d1->type_rank != d2->type_rank) + return d1->type_rank < d2->type_rank ? 
-1 : 1; + r = strcmp(d1->sort_name, d2->sort_name); + if (r) + return r; + + return strcmp(d1->own_name, d2->own_name); +} + +static struct sort_datum *sort_btf_c(const struct btf *btf) +{ + struct sort_datum *datums; + int n; + + n = btf__type_cnt(btf); + datums = malloc(sizeof(struct sort_datum) * n); + if (!datums) + return NULL; + + for (int i = 0; i < n; ++i) { + struct sort_datum *d = datums + i; + const struct btf_type *t = btf__type_by_id(btf, i); + + d->index = i; + d->type_rank = btf_type_rank(btf, i, false); + d->sort_name = btf_type_sort_name(btf, i, false); + d->own_name = btf__name_by_offset(btf, t->name_off); + } + + qsort(datums, n, sizeof(struct sort_datum), btf_type_compare); + + return datums; +} + +static int dump_btf_c(const struct btf *btf, + __u32 *root_type_ids, int root_type_cnt, bool sort_dump) +{ + struct sort_datum *datums = NULL; struct btf_dump *d; int err = 0, i; @@ -486,8 +606,12 @@ static int dump_btf_c(const struct btf *btf, } else { int cnt = btf__type_cnt(btf); + if (sort_dump) + datums = sort_btf_c(btf); for (i = 1; i < cnt; i++) { - err = btf_dump__dump_type(d, i); + int idx = datums ? datums[i].index : i; + + err = btf_dump__dump_type(d, idx); if (err) goto done; } @@ -500,6 +624,7 @@ static int dump_btf_c(const struct btf *btf, printf("#endif /* __VMLINUX_H__ */\n"); done: + free(datums); btf_dump__free(d); return err; } @@ -549,10 +674,10 @@ static bool btf_is_kernel_module(__u32 btf_id) static int do_dump(int argc, char **argv) { + bool dump_c = false, sort_dump_c = true; struct btf *btf = NULL, *base = NULL; __u32 root_type_ids[2]; int root_type_cnt = 0; - bool dump_c = false; __u32 btf_id = -1; const char *src; int fd = -1; @@ -663,6 +788,9 @@ static int do_dump(int argc, char **argv) goto done; } NEXT_ARG(); + } else if (is_prefix(*argv, "unsorted")) { + sort_dump_c = false; + NEXT_ARG(); } else { p_err("unrecognized option: '%s'", *argv); err = -EINVAL; @@ -691,7 +819,7 @@ static int do_dump(int argc, char **argv) err = -ENOTSUP; goto done; } - err = dump_btf_c(btf, root_type_ids, root_type_cnt); + err = dump_btf_c(btf, root_type_ids, root_type_cnt, sort_dump_c); } else { err = dump_btf_raw(btf, root_type_ids, root_type_cnt); } @@ -1063,7 +1191,7 @@ static int do_help(int argc, char **argv) " %1$s %2$s help\n" "\n" " BTF_SRC := { id BTF_ID | prog PROG | map MAP [{key | value | kv | all}] | file FILE }\n" - " FORMAT := { raw | c }\n" + " FORMAT := { raw | c [unsorted] }\n" " " HELP_SPEC_MAP "\n" " " HELP_SPEC_PROGRAM "\n" " " HELP_SPEC_OPTIONS " |\n" From e7b64f9d3f5b10186038201e0b91f734cbd7fc3d Mon Sep 17 00:00:00 2001 From: Artem Savkov Date: Mon, 13 May 2024 13:26:58 +0200 Subject: [PATCH 02/23] bpftool: Fix make dependencies for vmlinux.h With pre-generated vmlinux.h there is no dependency on either vmlinux or the bootstrap bpftool. Define dependencies separately for both modes. This avoids needless rebuilds in some corner cases.
Suggested-by: Jan Stancek Signed-off-by: Artem Savkov Signed-off-by: Andrii Nakryiko Acked-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20240513112658.43691-1-asavkov@redhat.com Signed-off-by: Alexei Starovoitov --- tools/bpf/bpftool/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index dfa4f1bebbb3..ba927379eb20 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -204,10 +204,11 @@ ifeq ($(feature-clang-bpf-co-re),1) BUILD_BPF_SKELS := 1 -$(OUTPUT)vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL_BOOTSTRAP) ifeq ($(VMLINUX_H),) +$(OUTPUT)vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL_BOOTSTRAP) $(QUIET_GEN)$(BPFTOOL_BOOTSTRAP) btf dump file $< format c > $@ else +$(OUTPUT)vmlinux.h: $(VMLINUX_H) $(Q)cp "$(VMLINUX_H)" $@ endif From 34021caef79f76e70ac31247d321ecd0683c4939 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Tue, 14 May 2024 17:27:16 +0100 Subject: [PATCH 03/23] kbuild, bpf: Use test-ge check for v1.25-only pahole There is no need to set the pahole v1.25-only flags in an "ifeq" version clause; we are already in a <= v1.25 branch of the "ifeq", so combining that with a "test-ge" check for v1.25 ensures the flags are applied for v1.25 only. Suggested-by: Masahiro Yamada Signed-off-by: Alan Maguire Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240514162716.2448265-1-alan.maguire@oracle.com Signed-off-by: Alexei Starovoitov --- scripts/Makefile.btf | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/Makefile.btf b/scripts/Makefile.btf index 2d6e5ed9081e..bca8a8f26ea4 100644 --- a/scripts/Makefile.btf +++ b/scripts/Makefile.btf @@ -14,9 +14,7 @@ pahole-flags-$(call test-ge, $(pahole-ver), 121) += --btf_gen_floats pahole-flags-$(call test-ge, $(pahole-ver), 122) += -j -ifeq ($(pahole-ver), 125) -pahole-flags-y += --skip_encoding_btf_inconsistent_proto --btf_gen_optimized -endif +pahole-flags-$(call test-ge, $(pahole-ver), 125) += --skip_encoding_btf_inconsistent_proto --btf_gen_optimized else From 6f130e4d4a5f7174f98300376f3994817ad7e21c Mon Sep 17 00:00:00 2001 From: Mohammad Shehar Yaar Tausif Date: Thu, 16 May 2024 12:54:11 +0530 Subject: [PATCH 04/23] bpf: Fix order of args in call to bpf_map_kvcalloc The original function call passed the size of smap->buckets before the number of buckets, which raises the 'calloc-transposed-args' error during compilation.
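For reference, calloc-style allocators take the element count first and the element size second; a minimal userspace analogue of the bug (the struct here is illustrative), which GCC 14 reports via -Wcalloc-transposed-args when sizeof() shows up in the earlier argument:

#include <stdlib.h>

struct bucket { long head; long lock; };

int main(void)
{
	size_t nbuckets = 32;
	struct bucket *b;

	/* Transposed (the bug): size first, count second.
	 * b = calloc(sizeof(*b), nbuckets);
	 * GCC 14 flags this because sizeof() is the earlier argument. */

	/* Correct: count first, then element size, matching the fixed
	 * bpf_map_kvcalloc(&smap->map, nbuckets, sizeof(*smap->buckets), ...)
	 * call in the diff below. */
	b = calloc(nbuckets, sizeof(*b));
	free(b);
	return 0;
}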
Signed-off-by: Mohammad Shehar Yaar Tausif Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240516072411.42016-1-sheharyaar48@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_local_storage.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 976cb258a0ed..c938dea5ddbf 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -782,8 +782,8 @@ bpf_local_storage_map_alloc(union bpf_attr *attr, nbuckets = max_t(u32, 2, nbuckets); smap->bucket_log = ilog2(nbuckets); - smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets), - nbuckets, GFP_USER | __GFP_NOWARN); + smap->buckets = bpf_map_kvcalloc(&smap->map, nbuckets, + sizeof(*smap->buckets), GFP_USER | __GFP_NOWARN); if (!smap->buckets) { err = -ENOMEM; goto free_smap; From f4aba3471cfb9ccf69b476463f19b4c50fef6b14 Mon Sep 17 00:00:00 2001 From: Ivan Babrou Date: Mon, 20 May 2024 15:51:49 -0700 Subject: [PATCH 05/23] bpftool: Un-const bpf_func_info to fix it for llvm 17 and newer LLVM 17 started treating const structs as constants: * https://github.com/llvm/llvm-project/commit/0b2d5b967d98 Combined with pointer laundering via ptr_to_u64, which takes a const ptr, but in reality treats the underlying memory as mutable, this makes clang always pass zero to btf__type_by_id, which breaks full name resolution. Disassembly before (LLVM 16) and after (LLVM 17): - 8b 75 cc mov -0x34(%rbp),%esi - e8 47 8d 02 00 call 3f5b0 + 31 f6 xor %esi,%esi + e8 a9 8c 02 00 call 3f510 It's a bigger project to fix this properly (and an open question whether LLVM itself should detect this), but for now let's just fix bpftool. For more information, see this thread in the bpf mailing list: * https://lore.kernel.org/bpf/CABWYdi0ymezpYsQsPv7qzpx2fWuTkoD1-wG1eT-9x-TSREFrQg@mail.gmail.com/T/ Fixes: b662000aff84 ("bpftool: Adding support for BTF program names") Signed-off-by: Ivan Babrou Signed-off-by: Andrii Nakryiko Acked-by: Nick Desaulniers Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240520225149.5517-1-ivan@cloudflare.com --- tools/bpf/bpftool/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index 958e92acca8e..9b75639434b8 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -410,7 +410,7 @@ void get_prog_full_name(const struct bpf_prog_info *prog_info, int prog_fd, { const char *prog_name = prog_info->name; const struct btf_type *func_type; - const struct bpf_func_info finfo = {}; + struct bpf_func_info finfo = {}; struct bpf_prog_info info = {}; __u32 info_len = sizeof(info); struct btf *prog_btf = NULL; From 1b0215a3633a4c54ed7ec3af93e7a782dda8d965 Mon Sep 17 00:00:00 2001 From: Ying Zhang Date: Tue, 21 May 2024 16:17:02 +0000 Subject: [PATCH 06/23] bpf: Remove unused variable "prev_state" The variable "prev_state" is not used for any actual operations, so remove it. v2: Fix commit message and description.
Signed-off-by: Ying Zhang Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20240521161702.4339-1-yingzhang098@163.com --- samples/bpf/cpustat_kern.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/samples/bpf/cpustat_kern.c b/samples/bpf/cpustat_kern.c index 944f13fe164a..7ec7143e2757 100644 --- a/samples/bpf/cpustat_kern.c +++ b/samples/bpf/cpustat_kern.c @@ -211,7 +211,7 @@ int bpf_prog1(struct cpu_args *ctx) SEC("tracepoint/power/cpu_frequency") int bpf_prog2(struct cpu_args *ctx) { - u64 *pts, *cstate, *pstate, prev_state, cur_ts, delta; + u64 *pts, *cstate, *pstate, cur_ts, delta; u32 key, cpu, pstate_idx; u64 *val; @@ -232,7 +232,6 @@ int bpf_prog2(struct cpu_args *ctx) if (!cstate) return 0; - prev_state = *pstate; *pstate = ctx->state; if (!*pts) { From 6c8d7598dfed759bf1d9d0322b4c2b42eb7252d8 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 17 May 2024 14:21:46 +0800 Subject: [PATCH 07/23] selftests/bpf: Fix prog numbers in test_sockmap bpf_prog5 and bpf_prog7 are removed from progs/test_sockmap_kern.h in commit d79a32129b21 ("bpf: Selftests, remove prints from sockmap tests"), now there are only 9 progs in it, not 11: SEC("sk_skb1") int bpf_prog1(struct __sk_buff *skb) SEC("sk_skb2") int bpf_prog2(struct __sk_buff *skb) SEC("sk_skb3") int bpf_prog3(struct __sk_buff *skb) SEC("sockops") int bpf_sockmap(struct bpf_sock_ops *skops) SEC("sk_msg1") int bpf_prog4(struct sk_msg_md *msg) SEC("sk_msg2") int bpf_prog6(struct sk_msg_md *msg) SEC("sk_msg3") int bpf_prog8(struct sk_msg_md *msg) SEC("sk_msg4") int bpf_prog9(struct sk_msg_md *msg) SEC("sk_msg5") int bpf_prog10(struct sk_msg_md *msg) This patch updates the array sizes of prog_fd[], prog_attach_type[] and prog_type[] from 11 to 9 accordingly. Fixes: d79a32129b21 ("bpf: Selftests, remove prints from sockmap tests") Signed-off-by: Geliang Tang Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/9c10d9f974f07fcb354a43a8eca67acb2fafc587.1715926605.git.tanggeliang@kylinos.cn --- tools/testing/selftests/bpf/test_sockmap.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 92752f5eeded..4499b3cfc3a6 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -63,7 +63,7 @@ int passed; int failed; int map_fd[9]; struct bpf_map *maps[9]; -int prog_fd[11]; +int prog_fd[9]; int txmsg_pass; int txmsg_redir; @@ -1793,8 +1793,6 @@ int prog_attach_type[] = { BPF_SK_MSG_VERDICT, BPF_SK_MSG_VERDICT, BPF_SK_MSG_VERDICT, - BPF_SK_MSG_VERDICT, - BPF_SK_MSG_VERDICT, }; int prog_type[] = { @@ -1807,8 +1805,6 @@ int prog_type[] = { BPF_PROG_TYPE_SK_MSG, BPF_PROG_TYPE_SK_MSG, BPF_PROG_TYPE_SK_MSG, - BPF_PROG_TYPE_SK_MSG, - BPF_PROG_TYPE_SK_MSG, }; static int populate_progs(char *bpf_file) From ece4b296904167336d0aaab26bd7122018835202 Mon Sep 17 00:00:00 2001 From: Brad Cowie Date: Wed, 22 May 2024 17:07:11 +1200 Subject: [PATCH 08/23] net: netfilter: Make ct zone opts configurable for bpf ct helpers Add ct zone id and direction to bpf_ct_opts so that arbitrary ct zones can be used for xdp/tc bpf ct helper functions bpf_{xdp,skb}_ct_alloc and bpf_{xdp,skb}_ct_lookup. 
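As a rough usage sketch of the zone-aware lookup (BPF side; it assumes the usual vmlinux.h and bpf_helpers.h includes, declares the kfunc in the style of the in-tree selftests, and spells out the zone-direction constant, so everything except the kfunc names is illustrative):

#define NF_CT_ZONE_DIR_ORIG (1 << 0)	/* i.e. 1 << IP_CT_DIR_ORIGINAL */

/* Extended 16-byte options layout introduced by this patch. */
struct bpf_ct_opts___new {
	s32 netns_id;
	s32 error;
	u8 l4proto;
	u8 dir;
	u16 ct_zone_id;
	u8 ct_zone_dir;
	u8 reserved[3];
};

struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *skb_ctx,
				  struct bpf_sock_tuple *bpf_tuple, u32 tuple__sz,
				  struct bpf_ct_opts___new *opts, u32 opts__sz) __ksym;
void bpf_ct_release(struct nf_conn *ct) __ksym;

SEC("tc")
int ct_zone_lookup(struct __sk_buff *skb)
{
	struct bpf_ct_opts___new opts = {
		.netns_id = -1,
		.l4proto = IPPROTO_TCP,
		.ct_zone_id = 10,			/* non-default ct zone */
		.ct_zone_dir = NF_CT_ZONE_DIR_ORIG,	/* zone applies to orig dir */
	};
	struct bpf_sock_tuple tuple = {};
	struct nf_conn *ct;

	/* A real program would fill 'tuple' from the packet headers. */
	ct = bpf_skb_ct_lookup(skb, &tuple, sizeof(tuple.ipv4),
			       &opts, sizeof(opts));	/* opts__sz == 16 */
	if (ct)
		bpf_ct_release(ct);
	return 0; /* TC_ACT_OK */
}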
Signed-off-by: Brad Cowie Link: https://lore.kernel.org/r/20240522050712.732558-1-brad@faucet.nz Signed-off-by: Martin KaFai Lau --- net/netfilter/nf_conntrack_bpf.c | 68 ++++++++++++++++++++++++++------ 1 file changed, 55 insertions(+), 13 deletions(-) diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index d2492d050fe6..4a136fc3a9c0 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -32,7 +32,9 @@ * -EINVAL - Passed NULL for bpf_tuple pointer * -EINVAL - opts->reserved is not 0 * -EINVAL - netns_id is less than -1 - * -EINVAL - opts__sz isn't NF_BPF_CT_OPTS_SZ (12) + * -EINVAL - opts__sz isn't NF_BPF_CT_OPTS_SZ (16) or 12 + * -EINVAL - opts->ct_zone_id set when + opts__sz isn't NF_BPF_CT_OPTS_SZ (16) * -EPROTO - l4proto isn't one of IPPROTO_TCP or IPPROTO_UDP * -ENONET - No network namespace found for netns_id * -ENOENT - Conntrack lookup could not find entry for tuple @@ -42,6 +44,8 @@ * Values: * IPPROTO_TCP, IPPROTO_UDP * @dir: - connection tracking tuple direction. + * @ct_zone_id - connection tracking zone id. + * @ct_zone_dir - connection tracking zone direction. * @reserved - Reserved member, will be reused for more options in future * Values: * 0 @@ -51,11 +55,13 @@ struct bpf_ct_opts { s32 error; u8 l4proto; u8 dir; - u8 reserved[2]; + u16 ct_zone_id; + u8 ct_zone_dir; + u8 reserved[3]; }; enum { - NF_BPF_CT_OPTS_SZ = 12, + NF_BPF_CT_OPTS_SZ = 16, }; static int bpf_nf_ct_tuple_parse(struct bpf_sock_tuple *bpf_tuple, @@ -104,12 +110,21 @@ __bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple, u32 timeout) { struct nf_conntrack_tuple otuple, rtuple; + struct nf_conntrack_zone ct_zone; struct nf_conn *ct; int err; - if (!opts || !bpf_tuple || opts->reserved[0] || opts->reserved[1] || - opts_len != NF_BPF_CT_OPTS_SZ) + if (!opts || !bpf_tuple) return ERR_PTR(-EINVAL); + if (!(opts_len == NF_BPF_CT_OPTS_SZ || opts_len == 12)) + return ERR_PTR(-EINVAL); + if (opts_len == NF_BPF_CT_OPTS_SZ) { + if (opts->reserved[0] || opts->reserved[1] || opts->reserved[2]) + return ERR_PTR(-EINVAL); + } else { + if (opts->ct_zone_id) + return ERR_PTR(-EINVAL); + } if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS)) return ERR_PTR(-EINVAL); @@ -130,7 +145,16 @@ __bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple, return ERR_PTR(-ENONET); } - ct = nf_conntrack_alloc(net, &nf_ct_zone_dflt, &otuple, &rtuple, + if (opts_len == NF_BPF_CT_OPTS_SZ) { + if (opts->ct_zone_dir == 0) + opts->ct_zone_dir = NF_CT_DEFAULT_ZONE_DIR; + nf_ct_zone_init(&ct_zone, + opts->ct_zone_id, opts->ct_zone_dir, 0); + } else { + ct_zone = nf_ct_zone_dflt; + } + + ct = nf_conntrack_alloc(net, &ct_zone, &otuple, &rtuple, GFP_ATOMIC); if (IS_ERR(ct)) goto out; @@ -152,12 +176,21 @@ static struct nf_conn *__bpf_nf_ct_lookup(struct net *net, { struct nf_conntrack_tuple_hash *hash; struct nf_conntrack_tuple tuple; + struct nf_conntrack_zone ct_zone; struct nf_conn *ct; int err; - if (!opts || !bpf_tuple || opts->reserved[0] || opts->reserved[1] || - opts_len != NF_BPF_CT_OPTS_SZ) + if (!opts || !bpf_tuple) return ERR_PTR(-EINVAL); + if (!(opts_len == NF_BPF_CT_OPTS_SZ || opts_len == 12)) + return ERR_PTR(-EINVAL); + if (opts_len == NF_BPF_CT_OPTS_SZ) { + if (opts->reserved[0] || opts->reserved[1] || opts->reserved[2]) + return ERR_PTR(-EINVAL); + } else { + if (opts->ct_zone_id) + return ERR_PTR(-EINVAL); + } if (unlikely(opts->l4proto != IPPROTO_TCP && opts->l4proto != IPPROTO_UDP)) return ERR_PTR(-EPROTO); if 
(unlikely(opts->netns_id < BPF_F_CURRENT_NETNS)) @@ -174,7 +207,16 @@ static struct nf_conn *__bpf_nf_ct_lookup(struct net *net, return ERR_PTR(-ENONET); } - hash = nf_conntrack_find_get(net, &nf_ct_zone_dflt, &tuple); + if (opts_len == NF_BPF_CT_OPTS_SZ) { + if (opts->ct_zone_dir == 0) + opts->ct_zone_dir = NF_CT_DEFAULT_ZONE_DIR; + nf_ct_zone_init(&ct_zone, + opts->ct_zone_id, opts->ct_zone_dir, 0); + } else { + ct_zone = nf_ct_zone_dflt; + } + + hash = nf_conntrack_find_get(net, &ct_zone, &tuple); if (opts->netns_id >= 0) put_net(net); if (!hash) @@ -245,7 +287,7 @@ __bpf_kfunc_start_defs(); * @opts - Additional options for allocation (documented above) * Cannot be NULL * @opts__sz - Length of the bpf_ct_opts structure - * Must be NF_BPF_CT_OPTS_SZ (12) + * Must be NF_BPF_CT_OPTS_SZ (16) or 12 */ __bpf_kfunc struct nf_conn___init * bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, @@ -279,7 +321,7 @@ bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, * @opts - Additional options for lookup (documented above) * Cannot be NULL * @opts__sz - Length of the bpf_ct_opts structure - * Must be NF_BPF_CT_OPTS_SZ (12) + * Must be NF_BPF_CT_OPTS_SZ (16) or 12 */ __bpf_kfunc struct nf_conn * bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, @@ -312,7 +354,7 @@ bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, * @opts - Additional options for allocation (documented above) * Cannot be NULL * @opts__sz - Length of the bpf_ct_opts structure - * Must be NF_BPF_CT_OPTS_SZ (12) + * Must be NF_BPF_CT_OPTS_SZ (16) or 12 */ __bpf_kfunc struct nf_conn___init * bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, @@ -347,7 +389,7 @@ bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, * @opts - Additional options for lookup (documented above) * Cannot be NULL * @opts__sz - Length of the bpf_ct_opts structure - * Must be NF_BPF_CT_OPTS_SZ (12) + * Must be NF_BPF_CT_OPTS_SZ (16) or 12 */ __bpf_kfunc struct nf_conn * bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, From a87f34e742d279d54d529e4bc4763fdaab32a466 Mon Sep 17 00:00:00 2001 From: Brad Cowie Date: Wed, 22 May 2024 17:07:12 +1200 Subject: [PATCH 09/23] selftests/bpf: Update tests for new ct zone opts for nf_conntrack kfuncs Add test for allocating and looking up ct entry in a non-default ct zone with kfuncs bpf_{xdp,skb}_ct_alloc and bpf_{xdp,skb}_ct_lookup. Add negative tests for looking up ct entry in a different ct zone to where it was allocated and with a different direction. Update reserved test for old struct definition to test for ct_zone_id being set when opts size isn't NF_BPF_CT_OPTS_SZ (16). 
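One naming detail worth knowing before reading the test diff: the triple-underscore-suffixed struct names below (bpf_ct_opts___local, bpf_ct_opts___new) use libbpf's "type flavor" convention, where everything from the triple underscore onward is ignored during CO-RE type matching, so both flavors resolve against the kernel's struct bpf_ct_opts, and the kfuncs tell callers apart purely by the sizeof() passed as opts__sz (12 for the legacy layout, 16 for the zone-aware one). A generic sketch of the convention (struct and field names here are illustrative):

/* Both of these CO-RE-match a kernel 'struct kernel_opts'; the part
 * after the triple underscore is stripped by libbpf when matching. */
struct kernel_opts___old {
	int a;
};

struct kernel_opts___cur {
	int a;
	int b;	/* field added by a newer kernel */
} __attribute__((preserve_access_index));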
Signed-off-by: Brad Cowie Link: https://lore.kernel.org/r/20240522050712.732558-2-brad@faucet.nz Signed-off-by: Martin KaFai Lau --- tools/testing/selftests/bpf/config | 1 + .../testing/selftests/bpf/prog_tests/bpf_nf.c | 7 ++ .../testing/selftests/bpf/progs/test_bpf_nf.c | 108 ++++++++++++++++++ 3 files changed, 116 insertions(+) diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index eeabd798bc3a..2fb16da78dce 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -80,6 +80,7 @@ CONFIG_NETFILTER_XT_TARGET_CT=y CONFIG_NETKIT=y CONFIG_NF_CONNTRACK=y CONFIG_NF_CONNTRACK_MARK=y +CONFIG_NF_CONNTRACK_ZONES=y CONFIG_NF_DEFRAG_IPV4=y CONFIG_NF_DEFRAG_IPV6=y CONFIG_NF_NAT=y diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c index b30ff6b3b81a..a4a1f93878d4 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c @@ -104,6 +104,7 @@ static void test_bpf_nf_ct(int mode) ASSERT_EQ(skel->bss->test_einval_bpf_tuple, -EINVAL, "Test EINVAL for NULL bpf_tuple"); ASSERT_EQ(skel->bss->test_einval_reserved, -EINVAL, "Test EINVAL for reserved not set to 0"); + ASSERT_EQ(skel->bss->test_einval_reserved_new, -EINVAL, "Test EINVAL for reserved in new struct not set to 0"); ASSERT_EQ(skel->bss->test_einval_netns_id, -EINVAL, "Test EINVAL for netns_id < -1"); ASSERT_EQ(skel->bss->test_einval_len_opts, -EINVAL, "Test EINVAL for len__opts != NF_BPF_CT_OPTS_SZ"); ASSERT_EQ(skel->bss->test_eproto_l4proto, -EPROTO, "Test EPROTO for l4proto != TCP or UDP"); @@ -122,6 +123,12 @@ static void test_bpf_nf_ct(int mode) ASSERT_EQ(skel->bss->test_exist_lookup_mark, 43, "Test existing connection lookup ctmark"); ASSERT_EQ(skel->data->test_snat_addr, 0, "Test for source natting"); ASSERT_EQ(skel->data->test_dnat_addr, 0, "Test for destination natting"); + ASSERT_EQ(skel->data->test_ct_zone_id_alloc_entry, 0, "Test for alloc new entry in specified ct zone"); + ASSERT_EQ(skel->data->test_ct_zone_id_insert_entry, 0, "Test for insert new entry in specified ct zone"); + ASSERT_EQ(skel->data->test_ct_zone_id_succ_lookup, 0, "Test for successful lookup in specified ct_zone"); + ASSERT_EQ(skel->bss->test_ct_zone_dir_enoent_lookup, -ENOENT, "Test ENOENT for lookup with wrong ct zone dir"); + ASSERT_EQ(skel->bss->test_ct_zone_id_enoent_lookup, -ENOENT, "Test ENOENT for lookup in wrong ct zone"); + end: if (client_fd != -1) close(client_fd); diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf.c b/tools/testing/selftests/bpf/progs/test_bpf_nf.c index 77ad8adf68da..0289d8ce2b80 100644 --- a/tools/testing/selftests/bpf/progs/test_bpf_nf.c +++ b/tools/testing/selftests/bpf/progs/test_bpf_nf.c @@ -9,10 +9,14 @@ #define EINVAL 22 #define ENOENT 2 +#define NF_CT_ZONE_DIR_ORIG (1 << IP_CT_DIR_ORIGINAL) +#define NF_CT_ZONE_DIR_REPL (1 << IP_CT_DIR_REPLY) + extern unsigned long CONFIG_HZ __kconfig; int test_einval_bpf_tuple = 0; int test_einval_reserved = 0; +int test_einval_reserved_new = 0; int test_einval_netns_id = 0; int test_einval_len_opts = 0; int test_eproto_l4proto = 0; @@ -22,6 +26,11 @@ int test_eafnosupport = 0; int test_alloc_entry = -EINVAL; int test_insert_entry = -EAFNOSUPPORT; int test_succ_lookup = -ENOENT; +int test_ct_zone_id_alloc_entry = -EINVAL; +int test_ct_zone_id_insert_entry = -EAFNOSUPPORT; +int test_ct_zone_id_succ_lookup = -ENOENT; +int test_ct_zone_dir_enoent_lookup = 0; +int test_ct_zone_id_enoent_lookup = 0; u32 
test_delta_timeout = 0; u32 test_status = 0; u32 test_insert_lookup_mark = 0; @@ -45,6 +54,17 @@ struct bpf_ct_opts___local { s32 netns_id; s32 error; u8 l4proto; + u8 dir; + u8 reserved[2]; +}; + +struct bpf_ct_opts___new { + s32 netns_id; + s32 error; + u8 l4proto; + u8 dir; + u16 ct_zone_id; + u8 ct_zone_dir; u8 reserved[3]; } __attribute__((preserve_access_index)); @@ -220,10 +240,97 @@ nf_ct_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32, } } +static __always_inline void +nf_ct_opts_new_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32, + struct bpf_ct_opts___new *, u32), + struct nf_conn *(*alloc_fn)(void *, struct bpf_sock_tuple *, u32, + struct bpf_ct_opts___new *, u32), + void *ctx) +{ + struct bpf_ct_opts___new opts_def = { .l4proto = IPPROTO_TCP, .netns_id = -1 }; + struct bpf_sock_tuple bpf_tuple; + struct nf_conn *ct; + + __builtin_memset(&bpf_tuple, 0, sizeof(bpf_tuple.ipv4)); + + opts_def.reserved[0] = 1; + ct = lookup_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, + sizeof(opts_def)); + opts_def.reserved[0] = 0; + if (ct) + bpf_ct_release(ct); + else + test_einval_reserved_new = opts_def.error; + + bpf_tuple.ipv4.saddr = bpf_get_prandom_u32(); /* src IP */ + bpf_tuple.ipv4.daddr = bpf_get_prandom_u32(); /* dst IP */ + bpf_tuple.ipv4.sport = bpf_get_prandom_u32(); /* src port */ + bpf_tuple.ipv4.dport = bpf_get_prandom_u32(); /* dst port */ + + /* use non-default ct zone */ + opts_def.ct_zone_id = 10; + opts_def.ct_zone_dir = NF_CT_ZONE_DIR_ORIG; + ct = alloc_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, + sizeof(opts_def)); + if (ct) { + __u16 sport = bpf_get_prandom_u32(); + __u16 dport = bpf_get_prandom_u32(); + union nf_inet_addr saddr = {}; + union nf_inet_addr daddr = {}; + struct nf_conn *ct_ins; + + bpf_ct_set_timeout(ct, 10000); + + /* snat */ + saddr.ip = bpf_get_prandom_u32(); + bpf_ct_set_nat_info(ct, &saddr, sport, NF_NAT_MANIP_SRC___local); + /* dnat */ + daddr.ip = bpf_get_prandom_u32(); + bpf_ct_set_nat_info(ct, &daddr, dport, NF_NAT_MANIP_DST___local); + + ct_ins = bpf_ct_insert_entry(ct); + if (ct_ins) { + struct nf_conn *ct_lk; + + /* entry should exist in same ct zone we inserted it */ + ct_lk = lookup_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), + &opts_def, sizeof(opts_def)); + if (ct_lk) { + bpf_ct_release(ct_lk); + test_ct_zone_id_succ_lookup = 0; + } + + /* entry should not exist with wrong direction */ + opts_def.ct_zone_dir = NF_CT_ZONE_DIR_REPL; + ct_lk = lookup_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), + &opts_def, sizeof(opts_def)); + opts_def.ct_zone_dir = NF_CT_ZONE_DIR_ORIG; + if (ct_lk) + bpf_ct_release(ct_lk); + else + test_ct_zone_dir_enoent_lookup = opts_def.error; + + /* entry should not exist in default ct zone */ + opts_def.ct_zone_id = 0; + ct_lk = lookup_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), + &opts_def, sizeof(opts_def)); + if (ct_lk) + bpf_ct_release(ct_lk); + else + test_ct_zone_id_enoent_lookup = opts_def.error; + + bpf_ct_release(ct_ins); + test_ct_zone_id_insert_entry = 0; + } + test_ct_zone_id_alloc_entry = 0; + } +} + SEC("xdp") int nf_xdp_ct_test(struct xdp_md *ctx) { nf_ct_test((void *)bpf_xdp_ct_lookup, (void *)bpf_xdp_ct_alloc, ctx); + nf_ct_opts_new_test((void *)bpf_xdp_ct_lookup, (void *)bpf_xdp_ct_alloc, ctx); return 0; } @@ -231,6 +338,7 @@ SEC("tc") int nf_skb_ct_test(struct __sk_buff *ctx) { nf_ct_test((void *)bpf_skb_ct_lookup, (void *)bpf_skb_ct_alloc, ctx); + nf_ct_opts_new_test((void *)bpf_skb_ct_lookup, (void *)bpf_skb_ct_alloc, ctx); return 
0; } From 4d25ca2d6801cfcf26f7f39c561611ba5be99bf8 Mon Sep 17 00:00:00 2001 From: Abhishek Chauhan Date: Thu, 9 May 2024 14:18:32 -0700 Subject: [PATCH 10/23] net: Rename mono_delivery_time to tstamp_type for scalability mono_delivery_time was added to check if skb->tstamp has the delivery time in mono clock base (i.e. EDT); otherwise skb->tstamp has the timestamp at ingress and the delivery_time at egress. Renaming the bitfield from mono_delivery_time to tstamp_type is for extensibility to other timestamps, such as a userspace timestamp (i.e. SO_TXTIME) set via sock opts. As we are renaming mono_delivery_time to tstamp_type, it makes sense to start assigning tstamp_type based on the enum defined in this commit. Earlier we used a bool arg flag to check if the tstamp is mono in function skb_set_delivery_time; now the signature of the function accepts tstamp_type to distinguish between mono and real time. Also, skb_set_delivery_type_by_clockid is a new function which accepts a clockid to determine the tstamp_type. In the future, tstamp_type:1 can be extended to support userspace timestamps by widening the bitfield. Signed-off-by: Abhishek Chauhan Reviewed-by: Willem de Bruijn Reviewed-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240509211834.3235191-2-quic_abchauha@quicinc.com Signed-off-by: Martin KaFai Lau --- include/linux/skbuff.h | 52 ++++++++++++++++------ include/net/inet_frag.h | 4 +- net/bridge/netfilter/nf_conntrack_bridge.c | 6 +-- net/core/dev.c | 2 +- net/core/filter.c | 10 ++--- net/ieee802154/6lowpan/reassembly.c | 2 +- net/ipv4/inet_fragment.c | 2 +- net/ipv4/ip_fragment.c | 2 +- net/ipv4/ip_output.c | 9 ++-- net/ipv4/tcp_output.c | 14 +++--- net/ipv6/ip6_output.c | 6 +-- net/ipv6/netfilter.c | 6 +-- net/ipv6/netfilter/nf_conntrack_reasm.c | 2 +- net/ipv6/reassembly.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- net/sched/act_bpf.c | 4 +- net/sched/cls_bpf.c | 4 +- 17 files changed, 78 insertions(+), 51 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c0b97c93a6de..3a721cc3b644 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -706,6 +706,11 @@ typedef unsigned int sk_buff_data_t; typedef unsigned char *sk_buff_data_t; #endif +enum skb_tstamp_type { + SKB_CLOCK_REALTIME, + SKB_CLOCK_MONOTONIC, +}; + /** * DOC: Basic sk_buff geometry * @@ -823,10 +828,8 @@ typedef unsigned char *sk_buff_data_t; * @dst_pending_confirm: need to confirm neighbour * @decrypted: Decrypted SKB * @slow_gro: state present at GRO time, slower prepare step required - * @mono_delivery_time: When set, skb->tstamp has the - * delivery_time in mono clock base (i.e. EDT). Otherwise, the - * skb->tstamp has the (rcv) timestamp at ingress and - * delivery_time at egress. + * @tstamp_type: When set, skb->tstamp has the + * delivery_time clock base of skb->tstamp. * @napi_id: id of the NAPI struct this skb came from * @sender_cpu: (aka @napi_id) source CPU in XPS * @alloc_cpu: CPU which did the skb allocation.
@@ -954,7 +957,7 @@ struct sk_buff { /* private: */ __u8 __mono_tc_offset[0]; /* public: */ - __u8 mono_delivery_time:1; /* See SKB_MONO_DELIVERY_TIME_MASK */ + __u8 tstamp_type:1; /* See skb_tstamp_type */ #ifdef CONFIG_NET_XGRESS __u8 tc_at_ingress:1; /* See TC_AT_INGRESS_MASK */ __u8 tc_skip_classify:1; @@ -4183,7 +4186,7 @@ static inline void skb_get_new_timestampns(const struct sk_buff *skb, static inline void __net_timestamp(struct sk_buff *skb) { skb->tstamp = ktime_get_real(); - skb->mono_delivery_time = 0; + skb->tstamp_type = SKB_CLOCK_REALTIME; } static inline ktime_t net_timedelta(ktime_t t) @@ -4192,10 +4195,33 @@ static inline ktime_t net_timedelta(ktime_t t) } static inline void skb_set_delivery_time(struct sk_buff *skb, ktime_t kt, - bool mono) + u8 tstamp_type) { skb->tstamp = kt; - skb->mono_delivery_time = kt && mono; + + if (kt) + skb->tstamp_type = tstamp_type; + else + skb->tstamp_type = SKB_CLOCK_REALTIME; +} + +static inline void skb_set_delivery_type_by_clockid(struct sk_buff *skb, + ktime_t kt, clockid_t clockid) +{ + u8 tstamp_type = SKB_CLOCK_REALTIME; + + switch (clockid) { + case CLOCK_REALTIME: + break; + case CLOCK_MONOTONIC: + tstamp_type = SKB_CLOCK_MONOTONIC; + break; + default: + WARN_ON_ONCE(1); + kt = 0; + } + + skb_set_delivery_time(skb, kt, tstamp_type); } DECLARE_STATIC_KEY_FALSE(netstamp_needed_key); @@ -4205,8 +4231,8 @@ DECLARE_STATIC_KEY_FALSE(netstamp_needed_key); */ static inline void skb_clear_delivery_time(struct sk_buff *skb) { - if (skb->mono_delivery_time) { - skb->mono_delivery_time = 0; + if (skb->tstamp_type) { + skb->tstamp_type = SKB_CLOCK_REALTIME; if (static_branch_unlikely(&netstamp_needed_key)) skb->tstamp = ktime_get_real(); else @@ -4216,7 +4242,7 @@ static inline void skb_clear_delivery_time(struct sk_buff *skb) static inline void skb_clear_tstamp(struct sk_buff *skb) { - if (skb->mono_delivery_time) + if (skb->tstamp_type) return; skb->tstamp = 0; @@ -4224,7 +4250,7 @@ static inline void skb_clear_tstamp(struct sk_buff *skb) static inline ktime_t skb_tstamp(const struct sk_buff *skb) { - if (skb->mono_delivery_time) + if (skb->tstamp_type) return 0; return skb->tstamp; @@ -4232,7 +4258,7 @@ static inline ktime_t skb_tstamp(const struct sk_buff *skb) static inline ktime_t skb_tstamp_cond(const struct sk_buff *skb, bool cond) { - if (!skb->mono_delivery_time && skb->tstamp) + if (skb->tstamp_type != SKB_CLOCK_MONOTONIC && skb->tstamp) return skb->tstamp; if (static_branch_unlikely(&netstamp_needed_key) || cond) diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 153960663ce4..5af6eb14c5db 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -76,7 +76,7 @@ struct frag_v6_compare_key { * @stamp: timestamp of the last received fragment * @len: total length of the original datagram * @meat: length of received fragments so far - * @mono_delivery_time: stamp has a mono delivery time (EDT) + * @tstamp_type: stamp has a mono delivery time (EDT) * @flags: fragment queue flags * @max_size: maximum received fragment size * @fqdir: pointer to struct fqdir @@ -97,7 +97,7 @@ struct inet_frag_queue { ktime_t stamp; int len; int meat; - u8 mono_delivery_time; + u8 tstamp_type; __u8 flags; u16 max_size; struct fqdir *fqdir; diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index c3c51b9a6826..816bb0fde718 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -32,7 +32,7 @@ static int nf_br_ip_fragment(struct 
net *net, struct sock *sk, struct sk_buff *)) { int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; - bool mono_delivery_time = skb->mono_delivery_time; + u8 tstamp_type = skb->tstamp_type; unsigned int hlen, ll_rs, mtu; ktime_t tstamp = skb->tstamp; struct ip_frag_state state; @@ -82,7 +82,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, if (iter.frag) ip_fraglist_prepare(skb, &iter); - skb_set_delivery_time(skb, tstamp, mono_delivery_time); + skb_set_delivery_time(skb, tstamp, tstamp_type); err = output(net, sk, data, skb); if (err || !iter.frag) break; @@ -113,7 +113,7 @@ slow_path: goto blackhole; } - skb_set_delivery_time(skb2, tstamp, mono_delivery_time); + skb_set_delivery_time(skb2, tstamp, tstamp_type); err = output(net, sk, data, skb2); if (err) goto blackhole; diff --git a/net/core/dev.c b/net/core/dev.c index e1bb6d7856d9..85fe8138f3e4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2160,7 +2160,7 @@ EXPORT_SYMBOL(net_disable_timestamp); static inline void net_timestamp_set(struct sk_buff *skb) { skb->tstamp = 0; - skb->mono_delivery_time = 0; + skb->tstamp_type = SKB_CLOCK_REALTIME; if (static_branch_unlikely(&netstamp_needed_key)) skb->tstamp = ktime_get_real(); } diff --git a/net/core/filter.c b/net/core/filter.c index 2510464692af..a3781a796da4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7730,13 +7730,13 @@ BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb, if (!tstamp) return -EINVAL; skb->tstamp = tstamp; - skb->mono_delivery_time = 1; + skb->tstamp_type = SKB_CLOCK_MONOTONIC; break; case BPF_SKB_TSTAMP_UNSPEC: if (tstamp) return -EINVAL; skb->tstamp = 0; - skb->mono_delivery_time = 0; + skb->tstamp_type = SKB_CLOCK_REALTIME; break; default: return -EINVAL; @@ -9443,7 +9443,7 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK); *insn++ = BPF_JMP32_IMM(BPF_JNE, tmp_reg, TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK, 2); - /* skb->tc_at_ingress && skb->mono_delivery_time, + /* skb->tc_at_ingress && skb->tstamp_type, * read 0 as the (rcv) timestamp. */ *insn++ = BPF_MOV64_IMM(value_reg, 0); @@ -9468,7 +9468,7 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, * the bpf prog is aware the tstamp could have delivery time. * Thus, write skb->tstamp as is if tstamp_type_access is true. * Otherwise, writing at ingress will have to clear the - * mono_delivery_time bit also. + * skb->tstamp_type bit also. 
*/ if (!prog->tstamp_type_access) { __u8 tmp_reg = BPF_REG_AX; @@ -9478,7 +9478,7 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); /* goto */ *insn++ = BPF_JMP_A(2); - /* : mono_delivery_time */ + /* : skb->tstamp_type */ *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_MONO_DELIVERY_TIME_MASK); *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET); } diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 56ef873828f4..867d637d86f0 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -130,7 +130,7 @@ static int lowpan_frag_queue(struct lowpan_frag_queue *fq, goto err; fq->q.stamp = skb->tstamp; - fq->q.mono_delivery_time = skb->mono_delivery_time; + fq->q.tstamp_type = skb->tstamp_type; if (frag_type == LOWPAN_DISPATCH_FRAG1) fq->q.flags |= INET_FRAG_FIRST_IN; diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index faaec92a46ac..d179a2c84222 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -619,7 +619,7 @@ void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, skb_mark_not_on_list(head); head->prev = NULL; head->tstamp = q->stamp; - head->mono_delivery_time = q->mono_delivery_time; + head->tstamp_type = q->tstamp_type; if (sk) refcount_add(sum_truesize - head_truesize, &sk->sk_wmem_alloc); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 08e2c92e25ab..a92664a5ef2e 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -355,7 +355,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) qp->iif = dev->ifindex; qp->q.stamp = skb->tstamp; - qp->q.mono_delivery_time = skb->mono_delivery_time; + qp->q.tstamp_type = skb->tstamp_type; qp->q.meat += skb->len; qp->ecn |= ecn; add_frag_mem_limit(qp->q.fqdir, skb->truesize); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 9500031a1f55..fe86cadfa85b 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -764,7 +764,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, { struct iphdr *iph; struct sk_buff *skb2; - bool mono_delivery_time = skb->mono_delivery_time; + u8 tstamp_type = skb->tstamp_type; struct rtable *rt = skb_rtable(skb); unsigned int mtu, hlen, ll_rs; struct ip_fraglist_iter iter; @@ -856,7 +856,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, } } - skb_set_delivery_time(skb, tstamp, mono_delivery_time); + skb_set_delivery_time(skb, tstamp, tstamp_type); err = output(net, sk, skb); if (!err) @@ -912,7 +912,7 @@ slow_path: /* * Put this fragment into the sending queue. 
*/ - skb_set_delivery_time(skb2, tstamp, mono_delivery_time); + skb_set_delivery_time(skb2, tstamp, tstamp_type); err = output(net, sk, skb2); if (err) goto fail; @@ -1649,7 +1649,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; - nskb->mono_delivery_time = !!transmit_time; + if (transmit_time) + nskb->tstamp_type = SKB_CLOCK_MONOTONIC; if (txhash) skb_set_hash(nskb, txhash, PKT_HASH_TYPE_L4); ip_push_pending_frames(sk, &fl4); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 95caf8aaa8be..d44371cfa6ec 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1301,7 +1301,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, tp = tcp_sk(sk); prior_wstamp = tp->tcp_wstamp_ns; tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); - skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); if (clone_it) { oskb = skb; @@ -1655,7 +1655,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, skb_split(skb, buff, len); - skb_set_delivery_time(buff, skb->tstamp, true); + skb_set_delivery_time(buff, skb->tstamp, SKB_CLOCK_MONOTONIC); tcp_fragment_tstamp(skb, buff); old_factor = tcp_skb_pcount(skb); @@ -2764,7 +2764,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp_ns" is used as a start point for the retransmit timer */ tp->tcp_wstamp_ns = tp->tcp_clock_cache; - skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); tcp_init_tso_segs(skb, mss_now); goto repair; /* Skip network transmission */ @@ -3752,11 +3752,11 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, #ifdef CONFIG_SYN_COOKIES if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok)) skb_set_delivery_time(skb, cookie_init_timestamp(req, now), - true); + SKB_CLOCK_MONOTONIC); else #endif { - skb_set_delivery_time(skb, now, true); + skb_set_delivery_time(skb, now, SKB_CLOCK_MONOTONIC); if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */ tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb); } @@ -3843,7 +3843,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb, synack_type, &opts); - skb_set_delivery_time(skb, now, true); + skb_set_delivery_time(skb, now, SKB_CLOCK_MONOTONIC); tcp_add_tx_delay(skb, tp); return skb; @@ -4027,7 +4027,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); - skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, true); + skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, SKB_CLOCK_MONOTONIC); /* Now full SYN+DATA was cloned and sent (or not), * remove the SYN from the original skb (syn_data) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 27d8725445e3..1ab0f23d37bf 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -859,7 +859,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? 
inet6_sk(skb->sk) : NULL; - bool mono_delivery_time = skb->mono_delivery_time; + u8 tstamp_type = skb->tstamp_type; struct ip6_frag_state state; unsigned int mtu, hlen, nexthdr_offset; ktime_t tstamp = skb->tstamp; @@ -955,7 +955,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, if (iter.frag) ip6_fraglist_prepare(skb, &iter); - skb_set_delivery_time(skb, tstamp, mono_delivery_time); + skb_set_delivery_time(skb, tstamp, tstamp_type); err = output(net, sk, skb); if (!err) IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), @@ -1016,7 +1016,7 @@ slow_path: /* * Put this fragment into the sending queue. */ - skb_set_delivery_time(frag, tstamp, mono_delivery_time); + skb_set_delivery_time(frag, tstamp, tstamp_type); err = output(net, sk, frag); if (err) goto fail; diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 53d255838e6a..e0c2347b4dc6 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -126,7 +126,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct sk_buff *)) { int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; - bool mono_delivery_time = skb->mono_delivery_time; + u8 tstamp_type = skb->tstamp_type; ktime_t tstamp = skb->tstamp; struct ip6_frag_state state; u8 *prevhdr, nexthdr = 0; @@ -192,7 +192,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, if (iter.frag) ip6_fraglist_prepare(skb, &iter); - skb_set_delivery_time(skb, tstamp, mono_delivery_time); + skb_set_delivery_time(skb, tstamp, tstamp_type); err = output(net, sk, data, skb); if (err || !iter.frag) break; @@ -225,7 +225,7 @@ slow_path: goto blackhole; } - skb_set_delivery_time(skb2, tstamp, mono_delivery_time); + skb_set_delivery_time(skb2, tstamp, tstamp_type); err = output(net, sk, data, skb2); if (err) goto blackhole; diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 5e1b50c6a44d..6f0844c9315d 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -263,7 +263,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, fq->iif = dev->ifindex; fq->q.stamp = skb->tstamp; - fq->q.mono_delivery_time = skb->mono_delivery_time; + fq->q.tstamp_type = skb->tstamp_type; fq->q.meat += skb->len; fq->ecn |= ecn; if (payload_len > fq->q.max_size) diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 327caca64257..a48be617a8ab 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -198,7 +198,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, fq->iif = dev->ifindex; fq->q.stamp = skb->tstamp; - fq->q.mono_delivery_time = skb->mono_delivery_time; + fq->q.tstamp_type = skb->tstamp_type; fq->q.meat += skb->len; fq->ecn |= ecn; add_frag_mem_limit(fq->q.fqdir, skb->truesize); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4c3605485b68..8333005c5c2e 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -975,7 +975,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 mark = inet_twsk(sk)->tw_mark; else mark = READ_ONCE(sk->sk_mark); - skb_set_delivery_time(buff, tcp_transmit_time(sk), true); + skb_set_delivery_time(buff, tcp_transmit_time(sk), SKB_CLOCK_MONOTONIC); } if (txhash) { /* autoflowlabel/skb_get_hash_flowi6 rely on buff->hash */ diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 0e3cf11ae5fc..396b576390d0 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -54,8 +54,8 @@ TC_INDIRECT_SCOPE int 
tcf_bpf_act(struct sk_buff *skb, bpf_compute_data_pointers(skb); filter_res = bpf_prog_run(filter, skb); } - if (unlikely(!skb->tstamp && skb->mono_delivery_time)) - skb->mono_delivery_time = 0; + if (unlikely(!skb->tstamp && skb->tstamp_type)) + skb->tstamp_type = SKB_CLOCK_REALTIME; if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK) skb_orphan(skb); diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 5e83e890f6a4..1941ebec23ff 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -104,8 +104,8 @@ TC_INDIRECT_SCOPE int cls_bpf_classify(struct sk_buff *skb, bpf_compute_data_pointers(skb); filter_res = bpf_prog_run(prog->filter, skb); } - if (unlikely(!skb->tstamp && skb->mono_delivery_time)) - skb->mono_delivery_time = 0; + if (unlikely(!skb->tstamp && skb->tstamp_type)) + skb->tstamp_type = SKB_CLOCK_REALTIME; if (prog->exts_integrated) { res->class = 0; From 1693c5db6ab8262e6f5263f9d211855959aa5acd Mon Sep 17 00:00:00 2001 From: Abhishek Chauhan Date: Thu, 9 May 2024 14:18:33 -0700 Subject: [PATCH 11/23] net: Add additional bit to support clockid_t timestamp type tstamp_type is now set based on the actual clockid_t, compressed into 2 bits. To make the design scalable for future needs, this commit extends tstamp_type:1 to tstamp_type:2 to support other clockid_t timestamps. With this commit we now support CLOCK_TAI as part of tstamp_type, alongside the existing CLOCK_MONOTONIC and CLOCK_REALTIME support. Signed-off-by: Abhishek Chauhan Reviewed-by: Willem de Bruijn Reviewed-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240509211834.3235191-3-quic_abchauha@quicinc.com Signed-off-by: Martin KaFai Lau --- include/linux/skbuff.h | 18 ++++++++----- include/uapi/linux/bpf.h | 15 +++++++---- net/core/filter.c | 46 +++++++++++++++++++--------------- net/ipv4/ip_output.c | 5 +++- net/ipv4/raw.c | 2 +- net/ipv4/tcp_ipv4.c | 2 ++ net/ipv6/ip6_output.c | 5 +++- net/ipv6/raw.c | 2 +- net/ipv6/tcp_ipv6.c | 10 ++++++-- net/packet/af_packet.c | 7 +++--- tools/include/uapi/linux/bpf.h | 15 +++++++---- 11 files changed, 81 insertions(+), 46 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3a721cc3b644..1e5c97daaa37 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -709,6 +709,8 @@ typedef unsigned char *sk_buff_data_t; enum skb_tstamp_type { SKB_CLOCK_REALTIME, SKB_CLOCK_MONOTONIC, + SKB_CLOCK_TAI, + __SKB_CLOCK_MAX = SKB_CLOCK_TAI, }; /** @@ -957,7 +959,7 @@ struct sk_buff { /* private: */ __u8 __mono_tc_offset[0]; /* public: */ - __u8 tstamp_type:1; /* See skb_tstamp_type */ + __u8 tstamp_type:2; /* See skb_tstamp_type */ #ifdef CONFIG_NET_XGRESS __u8 tc_at_ingress:1; /* See TC_AT_INGRESS_MASK */ __u8 tc_skip_classify:1; @@ -1087,15 +1089,16 @@ struct sk_buff { #endif #define PKT_TYPE_OFFSET offsetof(struct sk_buff, __pkt_type_offset) -/* if you move tc_at_ingress or mono_delivery_time +/* if you move tc_at_ingress or tstamp_type
*/ #ifdef __BIG_ENDIAN_BITFIELD -#define SKB_MONO_DELIVERY_TIME_MASK (1 << 7) -#define TC_AT_INGRESS_MASK (1 << 6) +#define SKB_TSTAMP_TYPE_MASK (3 << 6) +#define SKB_TSTAMP_TYPE_RSHIFT (6) +#define TC_AT_INGRESS_MASK (1 << 5) #else -#define SKB_MONO_DELIVERY_TIME_MASK (1 << 0) -#define TC_AT_INGRESS_MASK (1 << 1) +#define SKB_TSTAMP_TYPE_MASK (3) +#define TC_AT_INGRESS_MASK (1 << 2) #endif #define SKB_BF_MONO_TC_OFFSET offsetof(struct sk_buff, __mono_tc_offset) @@ -4216,6 +4219,9 @@ static inline void skb_set_delivery_type_by_clockid(struct sk_buff *skb, case CLOCK_MONOTONIC: tstamp_type = SKB_CLOCK_MONOTONIC; break; + case CLOCK_TAI: + tstamp_type = SKB_CLOCK_TAI; + break; default: WARN_ON_ONCE(1); kt = 0; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 90706a47f6ff..25ea393cf084 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6207,12 +6207,17 @@ union { \ __u64 :64; \ } __attribute__((aligned(8))) +/* The enum used in skb->tstamp_type. It specifies the clock type + * of the time stored in the skb->tstamp. + */ enum { - BPF_SKB_TSTAMP_UNSPEC, - BPF_SKB_TSTAMP_DELIVERY_MONO, /* tstamp has mono delivery time */ - /* For any BPF_SKB_TSTAMP_* that the bpf prog cannot handle, - * the bpf prog should handle it like BPF_SKB_TSTAMP_UNSPEC - * and try to deduce it by ingress, egress or skb->sk->sk_clockid. + BPF_SKB_TSTAMP_UNSPEC = 0, /* DEPRECATED */ + BPF_SKB_TSTAMP_DELIVERY_MONO = 1, /* DEPRECATED */ + BPF_SKB_CLOCK_REALTIME = 0, + BPF_SKB_CLOCK_MONOTONIC = 1, + BPF_SKB_CLOCK_TAI = 2, + /* For any future BPF_SKB_CLOCK_* that the bpf prog cannot handle, + * the bpf prog can try to deduce it by ingress/egress/skb->sk->sk_clockid. */ }; diff --git a/net/core/filter.c b/net/core/filter.c index a3781a796da4..c6edfe9f41bc 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7726,17 +7726,21 @@ BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb, return -EOPNOTSUPP; switch (tstamp_type) { - case BPF_SKB_TSTAMP_DELIVERY_MONO: + case BPF_SKB_CLOCK_REALTIME: + skb->tstamp = tstamp; + skb->tstamp_type = SKB_CLOCK_REALTIME; + break; + case BPF_SKB_CLOCK_MONOTONIC: if (!tstamp) return -EINVAL; skb->tstamp = tstamp; skb->tstamp_type = SKB_CLOCK_MONOTONIC; break; - case BPF_SKB_TSTAMP_UNSPEC: - if (tstamp) + case BPF_SKB_CLOCK_TAI: + if (!tstamp) return -EINVAL; - skb->tstamp = 0; - skb->tstamp_type = SKB_CLOCK_REALTIME; + skb->tstamp = tstamp; + skb->tstamp_type = SKB_CLOCK_TAI; break; default: return -EINVAL; @@ -9387,16 +9391,17 @@ static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si, { __u8 value_reg = si->dst_reg; __u8 skb_reg = si->src_reg; - /* AX is needed because src_reg and dst_reg could be the same */ - __u8 tmp_reg = BPF_REG_AX; - - *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, - SKB_BF_MONO_TC_OFFSET); - *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, - SKB_MONO_DELIVERY_TIME_MASK, 2); - *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC); - *insn++ = BPF_JMP_A(1); - *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_DELIVERY_MONO); + BUILD_BUG_ON(__SKB_CLOCK_MAX != (int)BPF_SKB_CLOCK_TAI); + BUILD_BUG_ON(SKB_CLOCK_REALTIME != (int)BPF_SKB_CLOCK_REALTIME); + BUILD_BUG_ON(SKB_CLOCK_MONOTONIC != (int)BPF_SKB_CLOCK_MONOTONIC); + BUILD_BUG_ON(SKB_CLOCK_TAI != (int)BPF_SKB_CLOCK_TAI); + *insn++ = BPF_LDX_MEM(BPF_B, value_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); + *insn++ = BPF_ALU32_IMM(BPF_AND, value_reg, SKB_TSTAMP_TYPE_MASK); +#ifdef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_RSH, value_reg, 
SKB_TSTAMP_TYPE_RSHIFT); +#else + BUILD_BUG_ON(!(SKB_TSTAMP_TYPE_MASK & 0x1)); +#endif return insn; } @@ -9439,10 +9444,11 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, __u8 tmp_reg = BPF_REG_AX; *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, - TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK); - *insn++ = BPF_JMP32_IMM(BPF_JNE, tmp_reg, - TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK, 2); + /* check if ingress mask bits is set */ + *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); + *insn++ = BPF_JMP_A(4); + *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, SKB_TSTAMP_TYPE_MASK, 1); + *insn++ = BPF_JMP_A(2); /* skb->tc_at_ingress && skb->tstamp_type, * read 0 as the (rcv) timestamp. */ @@ -9479,7 +9485,7 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, /* goto */ *insn++ = BPF_JMP_A(2); /* : skb->tstamp_type */ - *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_MONO_DELIVERY_TIME_MASK); + *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_TSTAMP_TYPE_MASK); *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET); } #endif diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index fe86cadfa85b..b90d0f78ac80 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1457,7 +1457,10 @@ struct sk_buff *__ip_make_skb(struct sock *sk, skb->priority = (cork->tos != -1) ? cork->priority: READ_ONCE(sk->sk_priority); skb->mark = cork->mark; - skb->tstamp = cork->transmit_time; + if (sk_is_tcp(sk)) + skb_set_delivery_time(skb, cork->transmit_time, SKB_CLOCK_MONOTONIC); + else + skb_set_delivery_type_by_clockid(skb, cork->transmit_time, sk->sk_clockid); /* * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec * on dst refcount diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 4cb43401e0e0..1a0953650356 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -360,7 +360,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, skb->protocol = htons(ETH_P_IP); skb->priority = READ_ONCE(sk->sk_priority); skb->mark = sockc->mark; - skb->tstamp = sockc->transmit_time; + skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid); skb_dst_set(skb, &rt->dst); *rtp = NULL; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 30ef0c8f5e92..8f70b8d1d1e5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3625,6 +3625,8 @@ void __init tcp_v4_init(void) */ inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; + sk->sk_clockid = CLOCK_MONOTONIC; + per_cpu(ipv4_tcp_sk, cpu) = sk; } if (register_pernet_subsys(&tcp_sk_ops)) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 1ab0f23d37bf..e7a19df3125e 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1924,7 +1924,10 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, skb->priority = READ_ONCE(sk->sk_priority); skb->mark = cork->base.mark; - skb->tstamp = cork->base.transmit_time; + if (sk_is_tcp(sk)) + skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC); + else + skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid); ip6_cork_steal_dst(skb, cork); IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 2eedf255600b..f838366e8256 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -621,7 +621,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb->protocol = htons(ETH_P_IPV6); skb->priority = 
READ_ONCE(sk->sk_priority); skb->mark = sockc->mark; - skb->tstamp = sockc->transmit_time; + skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid); skb_put(skb, length); skb_reset_network_header(skb); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 8333005c5c2e..750aa681779c 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2387,8 +2387,14 @@ static struct inet_protosw tcpv6_protosw = { static int __net_init tcpv6_net_init(struct net *net) { - return inet_ctl_sock_create(&net->ipv6.tcp_sk, PF_INET6, - SOCK_RAW, IPPROTO_TCP, net); + int res; + + res = inet_ctl_sock_create(&net->ipv6.tcp_sk, PF_INET6, + SOCK_RAW, IPPROTO_TCP, net); + if (!res) + net->ipv6.tcp_sk->sk_clockid = CLOCK_MONOTONIC; + + return res; } static void __net_exit tcpv6_net_exit(struct net *net) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index ea3ebc160e25..fce390887591 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2056,8 +2056,7 @@ retry: skb->dev = dev; skb->priority = READ_ONCE(sk->sk_priority); skb->mark = READ_ONCE(sk->sk_mark); - skb->tstamp = sockc.transmit_time; - + skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid); skb_setup_tx_timestamp(skb, sockc.tsflags); if (unlikely(extra_len == 4)) @@ -2584,7 +2583,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, skb->dev = dev; skb->priority = READ_ONCE(po->sk.sk_priority); skb->mark = READ_ONCE(po->sk.sk_mark); - skb->tstamp = sockc->transmit_time; + skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, po->sk.sk_clockid); skb_setup_tx_timestamp(skb, sockc->tsflags); skb_zcopy_set_nouarg(skb, ph.raw); @@ -3062,7 +3061,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) skb->dev = dev; skb->priority = READ_ONCE(sk->sk_priority); skb->mark = sockc.mark; - skb->tstamp = sockc.transmit_time; + skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid); if (unlikely(extra_len == 4)) skb->no_fcs = 1; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 90706a47f6ff..25ea393cf084 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6207,12 +6207,17 @@ union { \ __u64 :64; \ } __attribute__((aligned(8))) +/* The enum used in skb->tstamp_type. It specifies the clock type + * of the time stored in the skb->tstamp. + */ enum { - BPF_SKB_TSTAMP_UNSPEC, - BPF_SKB_TSTAMP_DELIVERY_MONO, /* tstamp has mono delivery time */ - /* For any BPF_SKB_TSTAMP_* that the bpf prog cannot handle, - * the bpf prog should handle it like BPF_SKB_TSTAMP_UNSPEC - * and try to deduce it by ingress, egress or skb->sk->sk_clockid. + BPF_SKB_TSTAMP_UNSPEC = 0, /* DEPRECATED */ + BPF_SKB_TSTAMP_DELIVERY_MONO = 1, /* DEPRECATED */ + BPF_SKB_CLOCK_REALTIME = 0, + BPF_SKB_CLOCK_MONOTONIC = 1, + BPF_SKB_CLOCK_TAI = 2, + /* For any future BPF_SKB_CLOCK_* that the bpf prog cannot handle, + * the bpf prog can try to deduce it by ingress/egress/skb->sk->sk_clockid. */ }; From c34e3ab2a76e6a55a64e0d56acc5607062c2bad9 Mon Sep 17 00:00:00 2001 From: Abhishek Chauhan Date: Thu, 9 May 2024 14:18:34 -0700 Subject: [PATCH 12/23] selftests/bpf: Handle forwarding of UDP CLOCK_TAI packets With changes in the design to forward CLOCK_TAI in the skbuff framework, existing selftest framework needs modification to handle forwarding of UDP packets with CLOCK_TAI as clockid. 
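For illustration only, a minimal sketch of a tc program exercising the new clock types: the BPF_SKB_CLOCK_* values, skb->tstamp_type, and the bpf_skb_set_tstamp()/bpf_ktime_get_tai_ns() helpers come from the kernel changes above, while the program itself is a hypothetical example rather than part of this series:

    #include <linux/bpf.h>
    #include <linux/pkt_cls.h>
    #include <bpf/bpf_helpers.h>

    SEC("tc")
    int tstamp_probe(struct __sk_buff *skb)
    {
            /* tstamp_type now reports the clock base directly */
            if (skb->tstamp_type == BPF_SKB_CLOCK_TAI && skb->tstamp)
                    return TC_ACT_OK; /* e.g. UDP with SO_TXTIME on CLOCK_TAI */

            /* re-stamp with a TAI time; a zero tstamp is rejected (-EINVAL) */
            if (bpf_skb_set_tstamp(skb, bpf_ktime_get_tai_ns(),
                                   BPF_SKB_CLOCK_TAI))
                    return TC_ACT_SHOT;

            return TC_ACT_OK;
    }

    char _license[] SEC("license") = "GPL";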
Signed-off-by: Abhishek Chauhan Reviewed-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20240509211834.3235191-4-quic_abchauha@quicinc.com Signed-off-by: Martin KaFai Lau --- .../selftests/bpf/prog_tests/ctx_rewrite.c | 10 +++-- .../selftests/bpf/prog_tests/tc_redirect.c | 3 -- .../selftests/bpf/progs/test_tc_dtime.c | 41 ++++++++----------- 3 files changed, 24 insertions(+), 30 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c index 3b7c57fe55a5..08b6391f2f56 100644 --- a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c +++ b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c @@ -69,15 +69,17 @@ static struct test_case test_cases[] = { { N(SCHED_CLS, struct __sk_buff, tstamp), .read = "r11 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);" - "w11 &= 3;" - "if w11 != 0x3 goto pc+2;" + "if w11 & 0x4 goto pc+1;" + "goto pc+4;" + "if w11 & 0x3 goto pc+1;" + "goto pc+2;" "$dst = 0;" "goto pc+1;" "$dst = *(u64 *)($ctx + sk_buff::tstamp);", .write = "r11 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);" - "if w11 & 0x2 goto pc+1;" + "if w11 & 0x4 goto pc+1;" "goto pc+2;" - "w11 &= -2;" + "w11 &= -4;" "*(u8 *)($ctx + sk_buff::__mono_tc_offset) = r11;" "*(u64 *)($ctx + sk_buff::tstamp) = $src;", }, diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c index b1073d36d77a..327d51f59142 100644 --- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c @@ -890,9 +890,6 @@ static void test_udp_dtime(struct test_tc_dtime *skel, int family, bool bpf_fwd) ASSERT_EQ(dtimes[INGRESS_FWDNS_P100], 0, dtime_cnt_str(t, INGRESS_FWDNS_P100)); - /* non mono delivery time is not forwarded */ - ASSERT_EQ(dtimes[INGRESS_FWDNS_P101], 0, - dtime_cnt_str(t, INGRESS_FWDNS_P101)); for (i = EGRESS_FWDNS_P100; i < SET_DTIME; i++) ASSERT_GT(dtimes[i], 0, dtime_cnt_str(t, i)); diff --git a/tools/testing/selftests/bpf/progs/test_tc_dtime.c b/tools/testing/selftests/bpf/progs/test_tc_dtime.c index 74ec09f040b7..ca8e8734d901 100644 --- a/tools/testing/selftests/bpf/progs/test_tc_dtime.c +++ b/tools/testing/selftests/bpf/progs/test_tc_dtime.c @@ -222,16 +222,20 @@ int egress_host(struct __sk_buff *skb) return TC_ACT_OK; if (skb_proto(skb_type) == IPPROTO_TCP) { - if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO && + if (skb->tstamp_type == BPF_SKB_CLOCK_MONOTONIC && + skb->tstamp) + inc_dtimes(EGRESS_ENDHOST); + else + inc_errs(EGRESS_ENDHOST); + } else if (skb_proto(skb_type) == IPPROTO_UDP) { + if (skb->tstamp_type == BPF_SKB_CLOCK_TAI && skb->tstamp) inc_dtimes(EGRESS_ENDHOST); else inc_errs(EGRESS_ENDHOST); } else { - if (skb->tstamp_type == BPF_SKB_TSTAMP_UNSPEC && + if (skb->tstamp_type == BPF_SKB_CLOCK_REALTIME && skb->tstamp) - inc_dtimes(EGRESS_ENDHOST); - else inc_errs(EGRESS_ENDHOST); } @@ -252,7 +256,7 @@ int ingress_host(struct __sk_buff *skb) if (!skb_type) return TC_ACT_OK; - if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO && + if (skb->tstamp_type == BPF_SKB_CLOCK_MONOTONIC && skb->tstamp == EGRESS_FWDNS_MAGIC) inc_dtimes(INGRESS_ENDHOST); else @@ -315,7 +319,6 @@ int egress_fwdns_prio100(struct __sk_buff *skb) SEC("tc") int ingress_fwdns_prio101(struct __sk_buff *skb) { - __u64 expected_dtime = EGRESS_ENDHOST_MAGIC; int skb_type; skb_type = skb_get_type(skb); @@ -323,29 +326,24 @@ int ingress_fwdns_prio101(struct __sk_buff *skb) /* Should have handled in prio100 */ return TC_ACT_SHOT; - if 
(skb_proto(skb_type) == IPPROTO_UDP) - expected_dtime = 0; - if (skb->tstamp_type) { if (fwdns_clear_dtime() || - skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_MONO || - skb->tstamp != expected_dtime) + (skb->tstamp_type != BPF_SKB_CLOCK_MONOTONIC && + skb->tstamp_type != BPF_SKB_CLOCK_TAI) || + skb->tstamp != EGRESS_ENDHOST_MAGIC) inc_errs(INGRESS_FWDNS_P101); else inc_dtimes(INGRESS_FWDNS_P101); } else { - if (!fwdns_clear_dtime() && expected_dtime) + if (!fwdns_clear_dtime()) inc_errs(INGRESS_FWDNS_P101); } - if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO) { + if (skb->tstamp_type == BPF_SKB_CLOCK_MONOTONIC) { skb->tstamp = INGRESS_FWDNS_MAGIC; } else { if (bpf_skb_set_tstamp(skb, INGRESS_FWDNS_MAGIC, - BPF_SKB_TSTAMP_DELIVERY_MONO)) - inc_errs(SET_DTIME); - if (!bpf_skb_set_tstamp(skb, INGRESS_FWDNS_MAGIC, - BPF_SKB_TSTAMP_UNSPEC)) + BPF_SKB_CLOCK_MONOTONIC)) inc_errs(SET_DTIME); } @@ -370,7 +368,7 @@ int egress_fwdns_prio101(struct __sk_buff *skb) if (skb->tstamp_type) { if (fwdns_clear_dtime() || - skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_MONO || + skb->tstamp_type != BPF_SKB_CLOCK_MONOTONIC || skb->tstamp != INGRESS_FWDNS_MAGIC) inc_errs(EGRESS_FWDNS_P101); else @@ -380,14 +378,11 @@ int egress_fwdns_prio101(struct __sk_buff *skb) inc_errs(EGRESS_FWDNS_P101); } - if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO) { + if (skb->tstamp_type == BPF_SKB_CLOCK_MONOTONIC) { skb->tstamp = EGRESS_FWDNS_MAGIC; } else { if (bpf_skb_set_tstamp(skb, EGRESS_FWDNS_MAGIC, - BPF_SKB_TSTAMP_DELIVERY_MONO)) - inc_errs(SET_DTIME); - if (!bpf_skb_set_tstamp(skb, INGRESS_FWDNS_MAGIC, - BPF_SKB_TSTAMP_UNSPEC)) + BPF_SKB_CLOCK_MONOTONIC)) inc_errs(SET_DTIME); } From c12603e76ef666ce5c51a9d6faf155c9e3de7601 Mon Sep 17 00:00:00 2001 From: Xiao Wang Date: Thu, 16 May 2024 17:04:30 +0800 Subject: [PATCH 13/23] riscv, bpf: Optimize zextw insn with Zba extension The Zba extension provides add.uw insn which can be used to implement zext.w with rs2 set as ZERO. Signed-off-by: Xiao Wang Signed-off-by: Daniel Borkmann Tested-by: Pu Lehui Reviewed-by: Andrew Jones Reviewed-by: Pu Lehui Link: https://lore.kernel.org/bpf/20240516090430.493122-1-xiao.w.wang@intel.com --- arch/riscv/Kconfig | 12 ++++++++++++ arch/riscv/net/bpf_jit.h | 18 ++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 9e87287942dc..6b8f1059594d 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -595,6 +595,18 @@ config TOOLCHAIN_HAS_VECTOR_CRYPTO def_bool $(as-instr, .option arch$(comma) +v$(comma) +zvkb) depends on AS_HAS_OPTION_ARCH +config RISCV_ISA_ZBA + bool "Zba extension support for bit manipulation instructions" + default y + help + Add support for enabling optimisations in the kernel when the Zba + extension is detected at boot. + + The Zba extension provides instructions to accelerate the generation + of addresses that index into arrays of basic data types. + + If you don't know what to do here, say Y. 
+ config RISCV_ISA_ZBB bool "Zbb extension support for bit manipulation instructions" depends on TOOLCHAIN_HAS_ZBB diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h index fdbf88ca8b70..97041b58237a 100644 --- a/arch/riscv/net/bpf_jit.h +++ b/arch/riscv/net/bpf_jit.h @@ -18,6 +18,11 @@ static inline bool rvc_enabled(void) return IS_ENABLED(CONFIG_RISCV_ISA_C); } +static inline bool rvzba_enabled(void) +{ + return IS_ENABLED(CONFIG_RISCV_ISA_ZBA) && riscv_has_extension_likely(RISCV_ISA_EXT_ZBA); +} + static inline bool rvzbb_enabled(void) { return IS_ENABLED(CONFIG_RISCV_ISA_ZBB) && riscv_has_extension_likely(RISCV_ISA_EXT_ZBB); @@ -939,6 +944,14 @@ static inline u16 rvc_sdsp(u32 imm9, u8 rs2) return rv_css_insn(0x7, imm, rs2, 0x2); } +/* RV64-only ZBA instructions. */ + +static inline u32 rvzba_zextw(u8 rd, u8 rs1) +{ + /* add.uw rd, rs1, ZERO */ + return rv_r_insn(0x04, RV_REG_ZERO, rs1, 0, rd, 0x3b); +} + #endif /* __riscv_xlen == 64 */ /* Helper functions that emit RVC instructions when possible. */ @@ -1161,6 +1174,11 @@ static inline void emit_zexth(u8 rd, u8 rs, struct rv_jit_context *ctx) static inline void emit_zextw(u8 rd, u8 rs, struct rv_jit_context *ctx) { + if (rvzba_enabled()) { + emit(rvzba_zextw(rd, rs), ctx); + return; + } + emit_slli(rd, rs, 32, ctx); emit_srli(rd, rd, 32, ctx); } From e944fc8152744a41dc62e720995538e48b053bb9 Mon Sep 17 00:00:00 2001 From: Xiao Wang Date: Thu, 23 May 2024 11:18:35 +0800 Subject: [PATCH 14/23] riscv, bpf: Use STACK_ALIGN macro for size rounding up Use the macro STACK_ALIGN that is defined in asm/processor.h for stack size rounding up, just like bpf_jit_comp32.c does. Signed-off-by: Xiao Wang Signed-off-by: Daniel Borkmann Reviewed-by: Pu Lehui Link: https://lore.kernel.org/bpf/20240523031835.3977713-1-xiao.w.wang@intel.com --- arch/riscv/net/bpf_jit_comp64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 79a001d5533e..c21a0ff23415 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -868,7 +868,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, stack_size += 8; sreg_off = stack_size; - stack_size = round_up(stack_size, 16); + stack_size = round_up(stack_size, STACK_ALIGN); if (!is_struct_ops) { /* For the trampoline called from function entry, @@ -1960,7 +1960,7 @@ void bpf_jit_build_prologue(struct rv_jit_context *ctx, bool is_subprog) { int i, stack_adjust = 0, store_offset, bpf_stack_adjust; - bpf_stack_adjust = round_up(ctx->prog->aux->stack_depth, 16); + bpf_stack_adjust = round_up(ctx->prog->aux->stack_depth, STACK_ALIGN); if (bpf_stack_adjust) mark_fp(ctx); @@ -1982,7 +1982,7 @@ void bpf_jit_build_prologue(struct rv_jit_context *ctx, bool is_subprog) if (ctx->arena_vm_start) stack_adjust += 8; - stack_adjust = round_up(stack_adjust, 16); + stack_adjust = round_up(stack_adjust, STACK_ALIGN); stack_adjust += bpf_stack_adjust; store_offset = stack_adjust - 8; From 99fa63d9ca60c4c1cc843fde205e4bc6e86b218f Mon Sep 17 00:00:00 2001 From: Xiao Wang Date: Sun, 19 May 2024 13:05:07 +0800 Subject: [PATCH 15/23] riscv, bpf: Try RVC for reg move within BPF_CMPXCHG JIT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We can emit a compressed instruction for the register move operation during the BPF_CMPXCHG JIT; the instruction compression has no impact on the jump offsets of the following forward and backward jump instructions. 
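As a sketch of what the swap buys, the two encodings of a register move can be modeled in plain user-space C; rv_addi_enc()/rvc_mv_enc() are hypothetical names rather than the JIT's own helpers, and the point is only that the compressed form halves the size while the JIT stays offset-correct by measuring code in 16-bit units:

    #include <stdint.h>
    #include <stdio.h>

    /* 32-bit I-type: mv rd, rs1 == addi rd, rs1, 0 */
    static uint32_t rv_addi_enc(uint8_t rd, uint8_t rs1, int16_t imm)
    {
            return ((uint32_t)(imm & 0xfff) << 20) | ((uint32_t)rs1 << 15) |
                   ((uint32_t)rd << 7) | 0x13;
    }

    /* 16-bit CR-type: c.mv rd, rs2 (funct4=0b1000, op=0b10) */
    static uint16_t rvc_mv_enc(uint8_t rd, uint8_t rs2)
    {
            return (0x8 << 12) | ((uint16_t)rd << 7) | ((uint16_t)rs2 << 2) | 0x2;
    }

    int main(void)
    {
            printf("mv a0, a1   -> 0x%08x\n", rv_addi_enc(10, 11, 0)); /* 0x00058513 */
            printf("c.mv a0, a1 -> 0x%04x\n", rvc_mv_enc(10, 11));     /* 0x852e */
            return 0;
    }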
Signed-off-by: Xiao Wang Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20240519050507.2217791-1-xiao.w.wang@intel.com --- arch/riscv/net/bpf_jit_comp64.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index c21a0ff23415..2bc4c14ea59a 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -537,8 +537,10 @@ static void emit_atomic(u8 rd, u8 rs, s16 off, s32 imm, bool is64, /* r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg); */ case BPF_CMPXCHG: r0 = bpf_to_rv_reg(BPF_REG_0, ctx); - emit(is64 ? rv_addi(RV_REG_T2, r0, 0) : - rv_addiw(RV_REG_T2, r0, 0), ctx); + if (is64) + emit_mv(RV_REG_T2, r0, ctx); + else + emit_addiw(RV_REG_T2, r0, 0, ctx); emit(is64 ? rv_lr_d(r0, 0, rd, 0, 0) : rv_lr_w(r0, 0, rd, 0, 0), ctx); jmp_offset = ninsns_rvoff(8); From 2c1713a8f1c94033a6e00aae4693ab03e8a3b9f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 18 May 2024 16:58:47 +0200 Subject: [PATCH 16/23] bpf: constify member bpf_sysctl_kern::table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sysctl core is preparing to only expose instances of struct ctl_table as "const". This will also affect the ctl_table argument of sysctl handlers, for which bpf_sysctl_kern::table is also used. As the function prototype of all sysctl handlers throughout the tree needs to stay consistent, that change will be done in one commit. To reduce the size of that final commit, switch this utility type, which is not bound by "typedef proc_handler", to "const struct ctl_table". No functional change. Signed-off-by: Thomas Weißschuh Signed-off-by: Daniel Borkmann Reviewed-by: Joel Granados Link: https://lore.kernel.org/bpf/20240518-sysctl-const-handler-bpf-v1-1-f0d7186743c1@weissschuh.net --- include/linux/filter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 0f12cf01070e..b02aea291b7e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1406,7 +1406,7 @@ struct bpf_sock_ops_kern { struct bpf_sysctl_kern { struct ctl_table_header *head; - struct ctl_table *table; + const struct ctl_table *table; void *cur_val; size_t cur_len; void *new_val; From 4652072e7b9d643edc9ebb04e3e2c021461b7af0 Mon Sep 17 00:00:00 2001 From: Dave Thaler Date: Fri, 17 May 2024 08:34:45 -0700 Subject: [PATCH 17/23] bpf, docs: Move sentence about returning R0 to abi.rst As discussed at LSF/MM/BPF, the sentence about using R0 for returning values from calls is part of the calling convention and belongs in abi.rst. Any further additions or clarifications to this text are left for future patches on abi.rst. The current patch is simply to unblock progression of instruction-set.rst to a standard. In contrast, the restriction of register numbers to the range 0-10 is untouched, left in the instruction-set.rst definition of the src_reg and dst_reg fields. 
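As a hedged illustration of the sentence being moved (example code, not text from either document): in BPF C the compiler realizes this convention automatically, materializing the function's return value in R0 before the EXIT instruction:

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    SEC("socket")
    int ret_in_r0(struct __sk_buff *skb)
    {
            /* clang lowers this to "r0 = ...; exit" */
            return skb->len > 0;
    }

    char _license[] SEC("license") = "GPL";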
Signed-off-by: Dave Thaler Link: https://lore.kernel.org/r/20240517153445.3914-1-dthaler1968@gmail.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/standardization/abi.rst | 3 +++ Documentation/bpf/standardization/instruction-set.rst | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/bpf/standardization/abi.rst b/Documentation/bpf/standardization/abi.rst index 0c2e10eeb89a..41514137cb7b 100644 --- a/Documentation/bpf/standardization/abi.rst +++ b/Documentation/bpf/standardization/abi.rst @@ -23,3 +23,6 @@ The BPF calling convention is defined as: R0 - R5 are scratch registers and BPF programs needs to spill/fill them if necessary across calls. + +The BPF program needs to store the return value into register R0 before doing an +``EXIT``. diff --git a/Documentation/bpf/standardization/instruction-set.rst b/Documentation/bpf/standardization/instruction-set.rst index 00c93eb42613..e2f6c0a4a559 100644 --- a/Documentation/bpf/standardization/instruction-set.rst +++ b/Documentation/bpf/standardization/instruction-set.rst @@ -476,9 +476,6 @@ the jump instruction. Thus 'PC += 1' skips execution of the next instruction if it's a basic instruction or results in undefined behavior if the next instruction is a 128-bit wide instruction. -The BPF program needs to store the return value into register R0 before doing an -``EXIT``. - Example: ``{JSGE, X, JMP32}`` means:: From a985fdca5e7e665d58dc40c92a67c8b67b6291db Mon Sep 17 00:00:00 2001 From: Dave Thaler Date: Fri, 17 May 2024 09:58:55 -0700 Subject: [PATCH 18/23] bpf, docs: Use RFC 2119 language for ISA requirements Per IETF convention and discussion at LSF/MM/BPF, use MUST etc. keywords, as requested by the IETF Area Director review. Also as requested, indicate that documenting BTF is out of scope of this document and will be covered by a separate IETF specification. Added a paragraph about the terminology; it is required IETF boilerplate and must be worded exactly as shown. Signed-off-by: Dave Thaler Acked-by: David Vernet Link: https://lore.kernel.org/r/20240517165855.4688-1-dthaler1968@gmail.com Signed-off-by: Alexei Starovoitov --- .../bpf/standardization/instruction-set.rst | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/Documentation/bpf/standardization/instruction-set.rst b/Documentation/bpf/standardization/instruction-set.rst index e2f6c0a4a559..02e2490433c6 100644 --- a/Documentation/bpf/standardization/instruction-set.rst +++ b/Documentation/bpf/standardization/instruction-set.rst @@ -14,6 +14,13 @@ set architecture (ISA). Documentation conventions ========================= +The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", +"SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and +"OPTIONAL" in this document are to be interpreted as described in +BCP 14 `RFC2119 <https://www.rfc-editor.org/info/rfc2119>`_ +`RFC8174 <https://www.rfc-editor.org/info/rfc8174>`_ +when, and only when, they appear in all capitals, as shown here. + For brevity and consistency, this document refers to families of types using a shorthand syntax and refers to several expository, mnemonic functions when describing the semantics of instructions. @@ -106,9 +113,9 @@ Conformance groups An implementation does not need to support all instructions specified in this document (e.g., deprecated instructions). Instead, a number of conformance -groups are specified. 
An implementation must support the base32 conformance -group and may support additional conformance groups, where supporting a -conformance group means it must support all instructions in that conformance +groups are specified. An implementation MUST support the base32 conformance +group and MAY support additional conformance groups, where supporting a +conformance group means it MUST support all instructions in that conformance group. The use of named conformance groups enables interoperability between a runtime @@ -209,7 +216,7 @@ For example:: 07 1 0 00 00 11 22 33 44 r1 += 0x11223344 // big Note that most instructions do not use all of the fields. -Unused fields shall be cleared to zero. +Unused fields SHALL be cleared to zero. Wide instruction encoding -------------------------- @@ -374,7 +381,7 @@ interpreted as a 64-bit signed value. Note that there are varying definitions of the signed modulo operation when the dividend or divisor are negative, where implementations often vary by language such that Python, Ruby, etc. differ from C, Go, Java, -etc. This specification requires that signed modulo use truncated division +etc. This specification requires that signed modulo MUST use truncated division (where -13 % 3 == -1) as implemented in C, Go, etc.:: a % n = a - n * trunc(a / n) @@ -404,7 +411,7 @@ only and do not use a separate source register or immediate value. For ``ALU``, the 1-bit source operand field in the opcode is used to select what byte order the operation converts from or to. For ``ALU64``, the 1-bit source operand field in the opcode is reserved -and must be set to 0. +and MUST be set to 0. ===== ======== ===== ================================================= class source value description ===== ======== ===== ================================================= @@ -512,7 +519,8 @@ for each program type, but static IDs are unique across all program types. Platforms that support the BPF Type Format (BTF) support identifying a helper function by a BTF ID encoded in the 'imm' field, where the BTF ID -identifies the helper name and type. +identifies the helper name and type. Further documentation of BTF +is outside the scope of this document and is left for future work. Program-local functions ~~~~~~~~~~~~~~~~~~~~~~~ @@ -722,5 +730,5 @@ carried over from classic BPF. These instructions used an instruction class of ``LD``, a size modifier of ``W``, ``H``, or ``B``, and a mode modifier of ``ABS`` or ``IND``. The 'dst_reg' and 'offset' fields were set to zero, and 'src_reg' was set to zero for ``ABS``. However, these -instructions are deprecated and should no longer be used. All legacy packet +instructions are deprecated and SHOULD no longer be used. All legacy packet access instructions belong to the "packet" conformance group. From 4e1215d9a1903fc9e976aa8903674d050c7af5ff Mon Sep 17 00:00:00 2001 From: Dave Thaler Date: Mon, 20 May 2024 14:52:55 -0700 Subject: [PATCH 19/23] bpf, docs: clarify sign extension of 64-bit use of 32-bit imm imm is defined as a 32-bit signed integer. {MOV, K, ALU64} says it does "dst = src" (where src is 'imm'), and it indeed does dst = (s64)imm, which in that sense sign extends imm. The MOVSX instruction is explained as sign extending, so an example of {MOV, K, ALU64} was added to make this clearer. {JLE, K, JMP} says it does "PC += offset if dst <= src" (where src is 'imm', and the comparison is unsigned). This was apparently ambiguous to some readers as to whether the comparison is "dst <= (u64)(u32)imm" or "dst <= (u64)(s64)imm", so an example was added to make this clearer. 
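The two cases can also be modeled in plain C; this is an illustrative sketch of the examples added by this patch, with imm standing in for the 32-bit signed immediate:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            int32_t imm = -1;
            uint64_t dst;

            /* {MOV, K, ALU64}: dst = (s64)imm */
            dst = (uint64_t)(int64_t)imm;
            printf("mov64: 0x%016llx\n", (unsigned long long)dst); /* all ones */

            /* {JLE, K, JMP}: unsigned compare against (u64)(s64)imm */
            dst = 5;
            printf("jle taken: %d\n", dst <= (uint64_t)(int64_t)imm); /* 1 */
            return 0;
    }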
v1 -> v2: Address comments from Yonghong Signed-off-by: Dave Thaler Acked-by: Yonghong Song Acked-by: David Vernet Link: https://lore.kernel.org/r/20240520215255.10595-1-dthaler1968@gmail.com Signed-off-by: Alexei Starovoitov --- .../bpf/standardization/instruction-set.rst | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/Documentation/bpf/standardization/instruction-set.rst b/Documentation/bpf/standardization/instruction-set.rst index 02e2490433c6..6b11d1109f12 100644 --- a/Documentation/bpf/standardization/instruction-set.rst +++ b/Documentation/bpf/standardization/instruction-set.rst @@ -393,6 +393,19 @@ The ``MOVSX`` instruction does a move operation with sign extension. operands into 64-bit operands. Unlike other arithmetic instructions, ``MOVSX`` is only defined for register source operands (``X``). +``{MOV, K, ALU64}`` means:: + + dst = (s64)imm + +``{MOV, X, ALU}`` means:: + + dst = (u32)src + +``{MOVSX, X, ALU}`` with 'offset' 8 means:: + + dst = (u32)(s32)(s8)src + + The ``NEG`` instruction is only defined when the source bit is clear (``K``). @@ -491,6 +504,10 @@ Example: where 's>=' indicates a signed '>=' comparison. +``{JLE, K, JMP}`` means:: + + if dst <= (u64)(s64)imm goto +offset + ``{JA, K, JMP32}`` means:: gotol +imm From 6a6d8b6f00ade597e0030669fae3fdf57cfba33b Mon Sep 17 00:00:00 2001 From: Dave Thaler Date: Fri, 24 May 2024 09:46:18 -0700 Subject: [PATCH 20/23] bpf, docs: Add table captions As suggested by Ines Robles in his IETF GENART review at https://datatracker.ietf.org/doc/review-ietf-bpf-isa-02-genart-lc-robles-2024-05-16/ Signed-off-by: Dave Thaler Link: https://lore.kernel.org/r/20240524164618.18894-1-dthaler1968@gmail.com Signed-off-by: Alexei Starovoitov --- .../bpf/standardization/instruction-set.rst | 184 ++++++++++-------- 1 file changed, 102 insertions(+), 82 deletions(-) diff --git a/Documentation/bpf/standardization/instruction-set.rst b/Documentation/bpf/standardization/instruction-set.rst index 6b11d1109f12..1602454fe7cf 100644 --- a/Documentation/bpf/standardization/instruction-set.rst +++ b/Documentation/bpf/standardization/instruction-set.rst @@ -32,7 +32,7 @@ Types This document refers to integer types with the notation `SN` to specify a type's signedness (`S`) and bit width (`N`), respectively. -.. table:: Meaning of signedness notation. +.. table:: Meaning of signedness notation ==== ========= S Meaning @@ -41,7 +41,7 @@ a type's signedness (`S`) and bit width (`N`), respectively. s signed ==== ========= -.. table:: Meaning of bit-width notation. +.. 
table:: Meaning of bit-width notation ===== ========= N Bit width @@ -263,18 +263,20 @@ Instruction classes The three least significant bits of the 'opcode' field store the instruction class: -===== ===== =============================== =================================== -class value description reference -===== ===== =============================== =================================== -LD 0x0 non-standard load operations `Load and store instructions`_ -LDX 0x1 load into register operations `Load and store instructions`_ -ST 0x2 store from immediate operations `Load and store instructions`_ -STX 0x3 store from register operations `Load and store instructions`_ -ALU 0x4 32-bit arithmetic operations `Arithmetic and jump instructions`_ -JMP 0x5 64-bit jump operations `Arithmetic and jump instructions`_ -JMP32 0x6 32-bit jump operations `Arithmetic and jump instructions`_ -ALU64 0x7 64-bit arithmetic operations `Arithmetic and jump instructions`_ -===== ===== =============================== =================================== +.. table:: Instruction class + + ===== ===== =============================== =================================== + class value description reference + ===== ===== =============================== =================================== + LD 0x0 non-standard load operations `Load and store instructions`_ + LDX 0x1 load into register operations `Load and store instructions`_ + ST 0x2 store from immediate operations `Load and store instructions`_ + STX 0x3 store from register operations `Load and store instructions`_ + ALU 0x4 32-bit arithmetic operations `Arithmetic and jump instructions`_ + JMP 0x5 64-bit jump operations `Arithmetic and jump instructions`_ + JMP32 0x6 32-bit jump operations `Arithmetic and jump instructions`_ + ALU64 0x7 64-bit arithmetic operations `Arithmetic and jump instructions`_ + ===== ===== =============================== =================================== Arithmetic and jump instructions ================================ @@ -292,6 +294,8 @@ For arithmetic and jump instructions (``ALU``, ``ALU64``, ``JMP`` and **s (source)** the source operand location, which unless otherwise specified is one of: + .. table:: Source operand location + ====== ===== ============================================== source value description ====== ===== ============================================== @@ -312,27 +316,29 @@ The 'code' field encodes the operation as below, where 'src' refers to the the source operand and 'dst' refers to the value of the destination register. -===== ===== ======= ========================================================== -name code offset description -===== ===== ======= ========================================================== -ADD 0x0 0 dst += src -SUB 0x1 0 dst -= src -MUL 0x2 0 dst \*= src -DIV 0x3 0 dst = (src != 0) ? (dst / src) : 0 -SDIV 0x3 1 dst = (src != 0) ? (dst s/ src) : 0 -OR 0x4 0 dst \|= src -AND 0x5 0 dst &= src -LSH 0x6 0 dst <<= (src & mask) -RSH 0x7 0 dst >>= (src & mask) -NEG 0x8 0 dst = -dst -MOD 0x9 0 dst = (src != 0) ? (dst % src) : dst -SMOD 0x9 1 dst = (src != 0) ? (dst s% src) : dst -XOR 0xa 0 dst ^= src -MOV 0xb 0 dst = src -MOVSX 0xb 8/16/32 dst = (s8,s16,s32)src -ARSH 0xc 0 :term:`sign extending` dst >>= (src & mask) -END 0xd 0 byte swap operations (see `Byte swap instructions`_ below) -===== ===== ======= ========================================================== +.. 
table:: Arithmetic instructions + + ===== ===== ======= ========================================================== + name code offset description + ===== ===== ======= ========================================================== + ADD 0x0 0 dst += src + SUB 0x1 0 dst -= src + MUL 0x2 0 dst \*= src + DIV 0x3 0 dst = (src != 0) ? (dst / src) : 0 + SDIV 0x3 1 dst = (src != 0) ? (dst s/ src) : 0 + OR 0x4 0 dst \|= src + AND 0x5 0 dst &= src + LSH 0x6 0 dst <<= (src & mask) + RSH 0x7 0 dst >>= (src & mask) + NEG 0x8 0 dst = -dst + MOD 0x9 0 dst = (src != 0) ? (dst % src) : dst + SMOD 0x9 1 dst = (src != 0) ? (dst s% src) : dst + XOR 0xa 0 dst ^= src + MOV 0xb 0 dst = src + MOVSX 0xb 8/16/32 dst = (s8,s16,s32)src + ARSH 0xc 0 :term:`sign extending` dst >>= (src & mask) + END 0xd 0 byte swap operations (see `Byte swap instructions`_ below) + ===== ===== ======= ========================================================== Underflow and overflow are allowed during arithmetic operations, meaning the 64-bit or 32-bit value will wrap. If BPF program execution would @@ -426,13 +432,15 @@ select what byte order the operation converts from or to. For ``ALU64``, the 1-bit source operand field in the opcode is reserved and MUST be set to 0. -===== ======== ===== ================================================= -class source value description -===== ======== ===== ================================================= -ALU TO_LE 0 convert between host byte order and little endian -ALU TO_BE 1 convert between host byte order and big endian -ALU64 Reserved 0 do byte swap unconditionally -===== ======== ===== ================================================= +.. table:: Byte swap instructions + + ===== ======== ===== ================================================= + class source value description + ===== ======== ===== ================================================= + ALU TO_LE 0 convert between host byte order and little endian + ALU TO_BE 1 convert between host byte order and big endian + ALU64 Reserved 0 do byte swap unconditionally + ===== ======== ===== ================================================= The 'imm' field encodes the width of the swap operations. The following widths are supported: 16, 32 and 64. Width 64 operations belong to the base64 @@ -468,27 +476,29 @@ otherwise identical operations, and indicates the base64 conformance group unless otherwise specified. 
The 'code' field encodes the operation as below: -======== ===== ======= ================================= =================================================== -code value src_reg description notes -======== ===== ======= ================================= =================================================== -JA 0x0 0x0 PC += offset {JA, K, JMP} only -JA 0x0 0x0 PC += imm {JA, K, JMP32} only -JEQ 0x1 any PC += offset if dst == src -JGT 0x2 any PC += offset if dst > src unsigned -JGE 0x3 any PC += offset if dst >= src unsigned -JSET 0x4 any PC += offset if dst & src -JNE 0x5 any PC += offset if dst != src -JSGT 0x6 any PC += offset if dst > src signed -JSGE 0x7 any PC += offset if dst >= src signed -CALL 0x8 0x0 call helper function by static ID {CALL, K, JMP} only, see `Helper functions`_ -CALL 0x8 0x1 call PC += imm {CALL, K, JMP} only, see `Program-local functions`_ -CALL 0x8 0x2 call helper function by BTF ID {CALL, K, JMP} only, see `Helper functions`_ -EXIT 0x9 0x0 return {CALL, K, JMP} only -JLT 0xa any PC += offset if dst < src unsigned -JLE 0xb any PC += offset if dst <= src unsigned -JSLT 0xc any PC += offset if dst < src signed -JSLE 0xd any PC += offset if dst <= src signed -======== ===== ======= ================================= =================================================== +.. table:: Jump instructions + + ======== ===== ======= ================================= =================================================== + code value src_reg description notes + ======== ===== ======= ================================= =================================================== + JA 0x0 0x0 PC += offset {JA, K, JMP} only + JA 0x0 0x0 PC += imm {JA, K, JMP32} only + JEQ 0x1 any PC += offset if dst == src + JGT 0x2 any PC += offset if dst > src unsigned + JGE 0x3 any PC += offset if dst >= src unsigned + JSET 0x4 any PC += offset if dst & src + JNE 0x5 any PC += offset if dst != src + JSGT 0x6 any PC += offset if dst > src signed + JSGE 0x7 any PC += offset if dst >= src signed + CALL 0x8 0x0 call helper function by static ID {CALL, K, JMP} only, see `Helper functions`_ + CALL 0x8 0x1 call PC += imm {CALL, K, JMP} only, see `Program-local functions`_ + CALL 0x8 0x2 call helper function by BTF ID {CALL, K, JMP} only, see `Helper functions`_ + EXIT 0x9 0x0 return {CALL, K, JMP} only + JLT 0xa any PC += offset if dst < src unsigned + JLE 0xb any PC += offset if dst <= src unsigned + JSLT 0xc any PC += offset if dst < src signed + JSLE 0xd any PC += offset if dst <= src signed + ======== ===== ======= ================================= =================================================== where 'PC' denotes the program counter, and the offset to increment by is in units of 64-bit instructions relative to the instruction following @@ -559,6 +569,8 @@ For load and store instructions (``LD``, ``LDX``, ``ST``, and ``STX``), the **mode** The mode modifier is one of: + .. table:: Mode modifier + ============= ===== ==================================== ============= mode modifier value description reference ============= ===== ==================================== ============= @@ -573,6 +585,8 @@ For load and store instructions (``LD``, ``LDX``, ``ST``, and ``STX``), the **sz (size)** The size modifier is one of: + .. table:: Size modifier + ==== ===== ===================== size value description ==== ===== ===================== @@ -641,14 +655,16 @@ The 'imm' field is used to encode the actual atomic operation. 
Simple atomic operation use a subset of the values defined to encode arithmetic operations in the 'imm' field to encode the atomic operation: -======== ===== =========== -imm value description -======== ===== =========== -ADD 0x00 atomic add -OR 0x40 atomic or -AND 0x50 atomic and -XOR 0xa0 atomic xor -======== ===== =========== +.. table:: Simple atomic operations + + ======== ===== =========== + imm value description + ======== ===== =========== + ADD 0x00 atomic add + OR 0x40 atomic or + AND 0x50 atomic and + XOR 0xa0 atomic xor + ======== ===== =========== ``{ATOMIC, W, STX}`` with 'imm' = ADD means:: @@ -662,6 +678,8 @@ XOR 0xa0 atomic xor In addition to the simple atomic operations, there also is a modifier and two complex atomic operations: +.. table:: Complex atomic operations + =========== ================ =========================== imm value description =========== ================ =========================== @@ -695,17 +713,19 @@ The following table defines a set of ``{IMM, DW, LD}`` instructions with opcode subtypes in the 'src_reg' field, using new terms such as "map" defined further below: -======= ========================================= =========== ============== -src_reg pseudocode imm type dst type -======= ========================================= =========== ============== -0x0 dst = (next_imm << 32) | imm integer integer -0x1 dst = map_by_fd(imm) map fd map -0x2 dst = map_val(map_by_fd(imm)) + next_imm map fd data address -0x3 dst = var_addr(imm) variable id data address -0x4 dst = code_addr(imm) integer code address -0x5 dst = map_by_idx(imm) map index map -0x6 dst = map_val(map_by_idx(imm)) + next_imm map index data address -======= ========================================= =========== ============== +.. table:: 64-bit immediate instructions + + ======= ========================================= =========== ============== + src_reg pseudocode imm type dst type + ======= ========================================= =========== ============== + 0x0 dst = (next_imm << 32) | imm integer integer + 0x1 dst = map_by_fd(imm) map fd map + 0x2 dst = map_val(map_by_fd(imm)) + next_imm map fd data address + 0x3 dst = var_addr(imm) variable id data address + 0x4 dst = code_addr(imm) integer code address + 0x5 dst = map_by_idx(imm) map index map + 0x6 dst = map_val(map_by_idx(imm)) + next_imm map index data address + ======= ========================================= =========== ============== where From f980f13e4eb299abba6692365315196e1ba6fd2c Mon Sep 17 00:00:00 2001 From: Dave Thaler Date: Sat, 25 May 2024 08:33:32 -0700 Subject: [PATCH 21/23] bpf, docs: Clarify call local offset In the Jump instructions section it explains that the offset is "relative to the instruction following the jump instruction". But the program-local section confusingly said "referenced by offset from the call instruction, similar to JA". This patch updates that sentence with consistent wording, saying it's relative to the instruction following the call instruction. 
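A small arithmetic sketch of the clarified rule (hypothetical instruction indices, not values from the spec):

    #include <stdio.h>

    int main(void)
    {
            int call_pc = 10; /* index of the call insn, in 64-bit insn units */
            int imm = 5;      /* 'imm' field of the program-local call */

            /* the target is relative to the insn following the call, like JA */
            printf("target: insn %d\n", call_pc + 1 + imm); /* 16 */
            return 0;
    }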
Signed-off-by: Dave Thaler Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240525153332.21355-1-dthaler1968@gmail.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/standardization/instruction-set.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Documentation/bpf/standardization/instruction-set.rst b/Documentation/bpf/standardization/instruction-set.rst index 1602454fe7cf..08f614b10a56 100644 --- a/Documentation/bpf/standardization/instruction-set.rst +++ b/Documentation/bpf/standardization/instruction-set.rst @@ -552,9 +552,10 @@ is outside the scope of this document and is left for future work. Program-local functions ~~~~~~~~~~~~~~~~~~~~~~~ Program-local functions are functions exposed by the same BPF program as the -caller, and are referenced by offset from the call instruction, similar to -``JA``. The offset is encoded in the 'imm' field of the call instruction. -An ``EXIT`` within the program-local function will return to the caller. +caller, and are referenced by offset from the instruction following the call +instruction, similar to ``JA``. The offset is encoded in the 'imm' field of +the call instruction. An ``EXIT`` within the program-local function will +return to the caller. Load and store instructions =========================== From e245ef8a0b06fa5a18ca3bdcf82430166419283e Mon Sep 17 00:00:00 2001 From: Dave Thaler Date: Sat, 25 May 2024 23:18:15 -0700 Subject: [PATCH 22/23] bpf, docs: Fix instruction.rst indentation The table captions patch corrected the indentation of most tables to work with the table directive for adding a caption, but missed two of them. Signed-off-by: Dave Thaler Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240526061815.22497-1-dthaler1968@gmail.com Signed-off-by: Alexei Starovoitov --- .../bpf/standardization/instruction-set.rst | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/Documentation/bpf/standardization/instruction-set.rst b/Documentation/bpf/standardization/instruction-set.rst index 08f614b10a56..8d19810504b8 100644 --- a/Documentation/bpf/standardization/instruction-set.rst +++ b/Documentation/bpf/standardization/instruction-set.rst @@ -296,12 +296,12 @@ For arithmetic and jump instructions (``ALU``, ``ALU64``, ``JMP`` and .. table:: Source operand location - ====== ===== ============================================== - source value description - ====== ===== ============================================== - K 0 use 32-bit 'imm' value as source operand - X 1 use 'src_reg' register value as source operand - ====== ===== ============================================== + ====== ===== ============================================== + source value description + ====== ===== ============================================== + K 0 use 32-bit 'imm' value as source operand + X 1 use 'src_reg' register value as source operand + ====== ===== ============================================== **instruction class** the instruction class (see `Instruction classes`_) @@ -681,13 +681,13 @@ two complex atomic operations: .. 
table:: Complex atomic operations -=========== ================ =========================== -imm value description -=========== ================ =========================== -FETCH 0x01 modifier: return old value -XCHG 0xe0 | FETCH atomic exchange -CMPXCHG 0xf0 | FETCH atomic compare and exchange -=========== ================ =========================== + =========== ================ =========================== + imm value description + =========== ================ =========================== + FETCH 0x01 modifier: return old value + XCHG 0xe0 | FETCH atomic exchange + CMPXCHG 0xf0 | FETCH atomic compare and exchange + =========== ================ =========================== The ``FETCH`` modifier is optional for simple atomic operations, and always set for the complex atomic operations. If the ``FETCH`` flag From d9cbd8343b010016fcaabc361c37720dcafddcbe Mon Sep 17 00:00:00 2001 From: yunshui Date: Thu, 23 May 2024 11:35:20 +0800 Subject: [PATCH 23/23] bpf, net: Use DEV_STATS_INC() syzbot/KCSAN reported that races happen when multiple CPUs update dev->stats.tx_errors concurrently. Adopt the SMP-safe DEV_STATS_INC() to update the dev->stats fields. Reported-by: syzbot Signed-off-by: yunshui Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20240523033520.4029314-1-jiangyunshui@kylinos.cn --- net/core/filter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index c6edfe9f41bc..7c46ecba3b01 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2274,12 +2274,12 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, err = bpf_out_neigh_v6(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); else ret = NET_XMIT_SUCCESS; goto out_xmit; out_drop: - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); out_xmit: return ret; @@ -2380,12 +2380,12 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, err = bpf_out_neigh_v4(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); else ret = NET_XMIT_SUCCESS; goto out_xmit; out_drop: - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); out_xmit: return ret;
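As a user-space analogue of the fix (illustrative only; in the kernel, DEV_STATS_INC() is roughly a relaxed atomic_long_inc() on the per-field counter), the difference between the racy and the SMP-safe increment looks like this:

    #include <stdatomic.h>
    #include <stdio.h>

    static unsigned long tx_errors_racy;  /* like dev->stats.tx_errors++ */
    static atomic_ulong tx_errors_safe;   /* like DEV_STATS_INC(dev, tx_errors) */

    static void on_tx_error(void)
    {
            tx_errors_racy++; /* load/modify/store: a data race on SMP */
            atomic_fetch_add_explicit(&tx_errors_safe, 1, memory_order_relaxed);
    }

    int main(void)
    {
            on_tx_error();
            printf("%lu %lu\n", tx_errors_racy,
                   (unsigned long)atomic_load(&tx_errors_safe));
            return 0;
    }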