From 820a185896b77814557302b981b092a9e7b36814 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 24 Jul 2024 15:15:35 +0200
Subject: [PATCH 01/58] fcntl: add F_CREATED_QUERY

Systemd has a helper called openat_report_new() that returns whether a
file was created anew or it already existed before for cases where
O_CREAT has to be used without O_EXCL (cf. [1]). That apparently isn't
something that's specific to systemd but it's where I noticed it.

The current logic is that it first attempts to open the file without
O_CREAT | O_EXCL and if it gets ENOENT the helper tries again with both
flags. If that succeeds all is well. If it now reports EEXIST it
retries.

That works fairly well but some corner cases make this more involved. If
this operates on a dangling symlink the first openat() without O_CREAT |
O_EXCL will return ENOENT but the second openat() with O_CREAT | O_EXCL
will fail with EEXIST. The reason is that openat() without O_CREAT |
O_EXCL follows the symlink while O_CREAT | O_EXCL doesn't for security
reasons. So it's not something we can really change unless we add an
explicit opt-in via O_FOLLOW which seems really ugly.

The caller could try and use fanotify() to register to listen for
creation events in the directory before calling openat(). The caller
could then compare the returned tid to its own tid to ensure that even
in threaded environments it actually created the file. That might work
but is a lot of work for something that should be fairly simple and I'm
uncertain about it's reliability.

The caller could use a bpf lsm hook to hook into security_file_open() to
figure out whether they created the file. That also seems a bit wild.

So let's add F_CREATED_QUERY which allows the caller to check whether
they actually did create the file. That has caveats of course but I
don't think they are problematic:

* In multi-threaded environments a thread can only be sure that it did
  create the file if it calls openat() with O_CREAT. In other words,
  it's obviously not enough to just go through it's fdtable and check
  these fds because another thread could've created the file.

* If there's any codepaths where an openat() with O_CREAT would yield
  the same struct file as that of another thread it would obviously
  cause wrong results. I'm not aware of any such codepaths from openat()
  itself. Imho, that would be a bug.

* Related to the previous point, calling the new fcntl() on files created
  and opened via special-purpose system calls or ioctl()s would cause
  wrong results only if the affected subsystem a) raises FMODE_CREATED
  and b) may return the same struct file for two different calls. I'm
  not seeing anything outside of regular VFS code that raises
  FMODE_CREATED.

  There is code for b) in e.g., the drm layer where the same struct file
  is resurfaced but again FMODE_CREATED isn't used and it would be very
  misleading if it did.

Link: https://github.com/systemd/systemd/blob/11d5e2b5fbf9f6bfa5763fd45b56829ad4f0777f/src/basic/fs-util.c#L1078 [1]
Link: https://lore.kernel.org/r/20240724-work-fcntl-v1-1-e8153a2f1991@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/fcntl.c                 | 10 ++++++++++
 include/uapi/linux/fcntl.h |  3 +++
 2 files changed, 13 insertions(+)

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 300e5d9ad913..22ec683ad8f8 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -343,6 +343,12 @@ static long f_dupfd_query(int fd, struct file *filp)
 	return f.file == filp;
 }
 
+/* Let the caller figure out whether a given file was just created. */
+static long f_created_query(const struct file *filp)
+{
+	return !!(filp->f_mode & FMODE_CREATED);
+}
+
 static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 		struct file *filp)
 {
@@ -352,6 +358,9 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 	long err = -EINVAL;
 
 	switch (cmd) {
+	case F_CREATED_QUERY:
+		err = f_created_query(filp);
+		break;
 	case F_DUPFD:
 		err = f_dupfd(argi, filp, 0);
 		break;
@@ -463,6 +472,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 static int check_fcntl_cmd(unsigned cmd)
 {
 	switch (cmd) {
+	case F_CREATED_QUERY:
 	case F_DUPFD:
 	case F_DUPFD_CLOEXEC:
 	case F_DUPFD_QUERY:
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index c0bcc185fa48..e55a3314bcb0 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -16,6 +16,9 @@
 
 #define F_DUPFD_QUERY	(F_LINUX_SPECIFIC_BASE + 3)
 
+/* Was the file just created? */
+#define F_CREATED_QUERY	(F_LINUX_SPECIFIC_BASE + 4)
+
 /*
  * Cancel a blocking posix lock; internal use only until we expose an
  * asynchronous lock api to userspace:

From d0fe8920cbe42547798fd806f078eeaaba05df18 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 24 Jul 2024 15:15:36 +0200
Subject: [PATCH 02/58] selftests: add F_CREATED_QUERY tests

Add simple selftests for fcntl(fd, F_CREATED_QUERY, 0).

Link: https://lore.kernel.org/r/20240724-work-fcntl-v1-2-e8153a2f1991@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 .../testing/selftests/core/close_range_test.c | 39 +++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/tools/testing/selftests/core/close_range_test.c b/tools/testing/selftests/core/close_range_test.c
index 12b4eb9d0434..e0d9851fe1c9 100644
--- a/tools/testing/selftests/core/close_range_test.c
+++ b/tools/testing/selftests/core/close_range_test.c
@@ -26,6 +26,10 @@
 #define F_DUPFD_QUERY (F_LINUX_SPECIFIC_BASE + 3)
 #endif
 
+#ifndef F_CREATED_QUERY
+#define F_CREATED_QUERY (F_LINUX_SPECIFIC_BASE + 4)
+#endif
+
 static inline int sys_close_range(unsigned int fd, unsigned int max_fd,
 				  unsigned int flags)
 {
@@ -624,4 +628,39 @@ TEST(close_range_bitmap_corruption)
 	EXPECT_EQ(0, WEXITSTATUS(status));
 }
 
+TEST(fcntl_created)
+{
+	for (int i = 0; i < 101; i++) {
+		int fd;
+		char path[PATH_MAX];
+
+		fd = open("/dev/null", O_RDONLY | O_CLOEXEC);
+		ASSERT_GE(fd, 0) {
+			if (errno == ENOENT)
+				SKIP(return,
+					   "Skipping test since /dev/null does not exist");
+		}
+
+		/* We didn't create "/dev/null". */
+		EXPECT_EQ(fcntl(fd, F_CREATED_QUERY, 0), 0);
+		close(fd);
+
+		sprintf(path, "aaaa_%d", i);
+		fd = open(path, O_CREAT | O_RDONLY | O_CLOEXEC, 0600);
+		ASSERT_GE(fd, 0);
+
+		/* We created "aaaa_%d". */
+		EXPECT_EQ(fcntl(fd, F_CREATED_QUERY, 0), 1);
+		close(fd);
+
+		fd = open(path, O_RDONLY | O_CLOEXEC);
+		ASSERT_GE(fd, 0);
+
+		/* We're opening it again, so no positive creation check. */
+		EXPECT_EQ(fcntl(fd, F_CREATED_QUERY, 0), 0);
+		close(fd);
+		unlink(path);
+	}
+}
+
 TEST_HARNESS_MAIN

From e02bdb7208e7b5432cbe68392ae062810a7649fb Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 25 Jul 2024 15:31:24 +0200
Subject: [PATCH 03/58] Merge patch series "Add an fcntl() to check file
 creation"

Christian Brauner <brauner@kernel.org> says:

Systemd has a helper called openat_report_new() that returns whether a
file was created anew or it already existed before for cases where
O_CREAT has to be used without O_EXCL (cf. [1]). That apparently isn't
something that's specific to systemd but it's where I noticed it.

The current logic is that it first attempts to open the file without
O_CREAT | O_EXCL and if it gets ENOENT the helper tries again with both
flags. If that succeeds all is well. If it now reports EEXIST it
retries.

That works fairly well but some corner cases make this more involved. If
this operates on a dangling symlink the first openat() without O_CREAT |
O_EXCL will return ENOENT but the second openat() with O_CREAT | O_EXCL
will fail with EEXIST. The reason is that openat() without O_CREAT |
O_EXCL follows the symlink while O_CREAT | O_EXCL doesn't for security
reasons. So it's not something we can really change unless we add an
explicit opt-in via O_FOLLOW which seems really ugly.

The caller could try and use fanotify() to register to listen for
creation events in the directory before calling openat(). The caller
could then compare the returned tid to its own tid to ensure that even
in threaded environments it actually created the file. That might work
but is a lot of work for something that should be fairly simple and I'm
uncertain about it's reliability.

The caller could use a bpf lsm hook to hook into security_file_open() to
figure out whether they created the file. That also seems a bit wild.

So let's add F_CREATED_QUERY which allows the caller to check whether
they actually did create the file. That has caveats of course but I
don't
think they are problematic:

* In multi-threaded environments a thread can only be sure that it did
  create the file if it calls openat() with O_CREAT. In other words,
  it's obviously not enough to just go through it's fdtable and check
  these fds because another thread could've created the file.

* If there's any codepaths where an openat() with O_CREAT would yield
  the same struct file as that of another thread it would obviously
  cause wrong results. I'm not aware of any such codepaths from openat()
  itself. Imho, that would be a bug.

* Related to the previous point, calling the new fcntl() on files created
  and opened via special-purpose system calls or ioctl()s would cause
  wrong results only if the affected subsystem a) raises FMODE_CREATED
  and b) may return the same struct file for two different calls. I'm
  not seeing anything outside of regular VFS code that raises
  FMODE_CREATED.

  There is code for b) in e.g., the drm layer where the same struct file
  is resurfaced but again FMODE_CREATED isn't used and it would be very
  misleading if it did.

[1]: https://github.com/systemd/systemd/blob/11d5e2b5fbf9f6bfa5763fd45b56829ad4f0777f/src/basic/fs-util.c#L1078

* patches from https://lore.kernel.org/r/20240724-work-fcntl-v1-0-e8153a2f1991@kernel.org:
  selftests: add F_CREATED_QUERY tests
  fcntl: add F_CREATED_QUERY

Signed-off-by: Christian Brauner <brauner@kernel.org>

From c393eaa85349941badf3ce5f087400dbaf3bbe02 Mon Sep 17 00:00:00 2001
From: Haifeng Xu <haifeng.xu@shopee.com>
Date: Fri, 26 Jul 2024 11:05:25 +0800
Subject: [PATCH 04/58] fs: don't flush in-flight wb switches for superblocks
 without cgroup writeback

When deactivating any type of superblock, it had to wait for the in-flight
wb switches to be completed. wb switches are executed in inode_switch_wbs_work_fn()
which needs to acquire the wb_switch_rwsem and races against sync_inodes_sb().
If there are too much dirty data in the superblock, the waiting time may increase
significantly.

For superblocks without cgroup writeback such as tmpfs, they have nothing to
do with the wb swithes, so the flushing can be avoided.

Signed-off-by: Haifeng Xu <haifeng.xu@shopee.com>
Link: https://lore.kernel.org/r/20240726030525.180330-1-haifeng.xu@shopee.com
Reviewed-by: Jan Kara <jack@suse.cz>
Suggested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/fs-writeback.c         | 7 ++++++-
 fs/super.c                | 2 +-
 include/linux/writeback.h | 4 ++--
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index b865a3fa52f3..4451ecff37c4 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1132,6 +1132,7 @@ out_bdi_put:
 
 /**
  * cgroup_writeback_umount - flush inode wb switches for umount
+ * @sb: target super_block
  *
  * This function is called when a super_block is about to be destroyed and
  * flushes in-flight inode wb switches.  An inode wb switch goes through
@@ -1140,8 +1141,12 @@ out_bdi_put:
  * rare occurrences and synchronize_rcu() can take a while, perform
  * flushing iff wb switches are in flight.
  */
-void cgroup_writeback_umount(void)
+void cgroup_writeback_umount(struct super_block *sb)
 {
+
+	if (!(sb->s_bdi->capabilities & BDI_CAP_WRITEBACK))
+		return;
+
 	/*
 	 * SB_ACTIVE should be reliably cleared before checking
 	 * isw_nr_in_flight, see generic_shutdown_super().
diff --git a/fs/super.c b/fs/super.c
index 38d72a3cf6fc..28f42502bdf9 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -621,7 +621,7 @@ void generic_shutdown_super(struct super_block *sb)
 		sync_filesystem(sb);
 		sb->s_flags &= ~SB_ACTIVE;
 
-		cgroup_writeback_umount();
+		cgroup_writeback_umount(sb);
 
 		/* Evict all inodes with zero refcount. */
 		evict_inodes(sb);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 1a54676d843a..56b85841ae4c 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -217,7 +217,7 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
 			      size_t bytes);
 int cgroup_writeback_by_id(u64 bdi_id, int memcg_id,
 			   enum wb_reason reason, struct wb_completion *done);
-void cgroup_writeback_umount(void);
+void cgroup_writeback_umount(struct super_block *sb);
 bool cleanup_offline_cgwb(struct bdi_writeback *wb);
 
 /**
@@ -324,7 +324,7 @@ static inline void wbc_account_cgroup_owner(struct writeback_control *wbc,
 {
 }
 
-static inline void cgroup_writeback_umount(void)
+static inline void cgroup_writeback_umount(struct super_block *sb)
 {
 }
 

From 160210cea0356d31ffaf64b63a3ebeb26c7c0cd4 Mon Sep 17 00:00:00 2001
From: Youling Tang <tangyouling@kylinos.cn>
Date: Mon, 3 Jun 2024 09:48:34 +0800
Subject: [PATCH 05/58] fs/direct-io: Remove linux/prefetch.h include

After commit c22198e78d52 ("direct-io: remove random prefetches"), Nothing
in this file needs anything from `linux/prefetch.h`.

Signed-off-by: Youling Tang <tangyouling@kylinos.cn>
Link: https://lore.kernel.org/r/20240603014834.45294-1-youling.tang@linux.dev
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/direct-io.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index b0aafe640fa4..bbd05f1a2145 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -37,7 +37,6 @@
 #include <linux/rwsem.h>
 #include <linux/uio.h>
 #include <linux/atomic.h>
-#include <linux/prefetch.h>
 
 #include "internal.h"
 
@@ -1121,11 +1120,6 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 	struct blk_plug plug;
 	unsigned long align = offset | iov_iter_alignment(iter);
 
-	/*
-	 * Avoid references to bdev if not absolutely needed to give
-	 * the early prefetch in the caller enough time.
-	 */
-
 	/* watch out for a 0 len io from a tricksy fs */
 	if (iov_iter_rw(iter) == READ && !count)
 		return 0;

From de7007f27d9108fb00d801a6be8aae2ce4dd6101 Mon Sep 17 00:00:00 2001
From: Mohit0404 <mohitpawar@mitaoe.ac.in>
Date: Sat, 27 Jul 2024 12:51:34 +0530
Subject: [PATCH 06/58] Fixed: fs: file_table_c: Missing blank line warnings
 and struct declaration improved

Fixed-
	WARNING: Missing a blank line after declarations
	WARNING: Missing a blank line after declarations
	Declaration format: improved struct file declaration format

Signed-off-by: Mohit0404 <mohitpawar@mitaoe.ac.in>
Link: https://lore.kernel.org/r/20240727072134.130962-2-mohitpawar@mitaoe.ac.in
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/file_table.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/file_table.c b/fs/file_table.c
index ca7843dde56d..35f2d5d9ca76 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -136,6 +136,7 @@ static int __init init_fs_stat_sysctls(void)
 	register_sysctl_init("fs", fs_stat_sysctls);
 	if (IS_ENABLED(CONFIG_BINFMT_MISC)) {
 		struct ctl_table_header *hdr;
+
 		hdr = register_sysctl_mount_point("fs/binfmt_misc");
 		kmemleak_not_leak(hdr);
 	}
@@ -383,7 +384,9 @@ EXPORT_SYMBOL_GPL(alloc_file_pseudo_noaccount);
 struct file *alloc_file_clone(struct file *base, int flags,
 				const struct file_operations *fops)
 {
-	struct file *f = alloc_file(&base->f_path, flags, fops);
+	struct file *f;
+
+	f = alloc_file(&base->f_path, flags, fops);
 	if (!IS_ERR(f)) {
 		path_get(&f->f_path);
 		f->f_mapping = base->f_mapping;

From 4bcda1eaf184e308f07f9c61d3a535f9ce477ce8 Mon Sep 17 00:00:00 2001
From: Olaf Hering <olaf@aepfle.de>
Date: Tue, 30 Jul 2024 10:58:13 +0200
Subject: [PATCH 07/58] mount: handle OOM on mnt_warn_timestamp_expiry

If no page could be allocated, an error pointer was used as format
string in pr_warn.

Rearrange the code to return early in case of OOM. Also add a check
for the return value of d_path.

Fixes: f8b92ba67c5d ("mount: Add mount warning for impending timestamp expiry")
Signed-off-by: Olaf Hering <olaf@aepfle.de>
Link: https://lore.kernel.org/r/20240730085856.32385-1-olaf@aepfle.de
[brauner: rewrite commit and commit message]
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 328087a4df8a..155c6abda71d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2921,8 +2921,15 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *
 	if (!__mnt_is_readonly(mnt) &&
 	   (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) &&
 	   (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) {
-		char *buf = (char *)__get_free_page(GFP_KERNEL);
-		char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM);
+		char *buf, *mntpath;
+
+		buf = (char *)__get_free_page(GFP_KERNEL);
+		if (buf)
+			mntpath = d_path(mountpoint, buf, PAGE_SIZE);
+		else
+			mntpath = ERR_PTR(-ENOMEM);
+		if (IS_ERR(mntpath))
+			mntpath = "(unknown)";
 
 		pr_warn("%s filesystem being %s at %s supports timestamps until %ptTd (0x%llx)\n",
 			sb->s_type->name,
@@ -2930,8 +2937,9 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *
 			mntpath, &sb->s_time_max,
 			(unsigned long long)sb->s_time_max);
 
-		free_page((unsigned long)buf);
 		sb->s_iflags |= SB_I_TS_EXPIRY_WARNED;
+		if (buf)
+			free_page((unsigned long)buf);
 	}
 }
 

From 6c203968f5af720ee2f4a8a17734e40aabe7b96b Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Tue, 30 Jul 2024 14:18:34 +0200
Subject: [PATCH 08/58] fs/aio: Fix __percpu annotation of *cpu pointer in
 struct kioctx

__percpu annotation of *cpu pointer in struct kioctx is put at
the wrong place, resulting in several sparse warnings:

aio.c:623:24: warning: incorrect type in argument 1 (different address spaces)
aio.c:623:24:    expected void [noderef] __percpu *__pdata
aio.c:623:24:    got struct kioctx_cpu *cpu
aio.c:788:18: warning: incorrect type in assignment (different address spaces)
aio.c:788:18:    expected struct kioctx_cpu *cpu
aio.c:788:18:    got struct kioctx_cpu [noderef] __percpu *
aio.c:835:24: warning: incorrect type in argument 1 (different address spaces)
aio.c:835:24:    expected void [noderef] __percpu *__pdata
aio.c:835:24:    got struct kioctx_cpu *cpu
aio.c:940:16: warning: incorrect type in initializer (different address spaces)
aio.c:940:16:    expected void const [noderef] __percpu *__vpp_verify
aio.c:940:16:    got struct kioctx_cpu *
aio.c:958:16: warning: incorrect type in initializer (different address spaces)
aio.c:958:16:    expected void const [noderef] __percpu *__vpp_verify
aio.c:958:16:    got struct kioctx_cpu *

Put __percpu annotation at the right place to fix these warnings.

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Link: https://lore.kernel.org/r/20240730121915.4514-1-ubizjak@gmail.com
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/aio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/aio.c b/fs/aio.c
index 6066f64967b3..e8920178b50f 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -100,7 +100,7 @@ struct kioctx {
 
 	unsigned long		user_id;
 
-	struct __percpu kioctx_cpu *cpu;
+	struct kioctx_cpu __percpu *cpu;
 
 	/*
 	 * For percpu reqs_available, number of slots we move to/from global

From 6a64c5220c5df235448b846aeff3c0660d4cc83e Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Wed, 31 Jul 2024 23:10:27 +1000
Subject: [PATCH 09/58] autofs: fix missing fput for FSCONFIG_SET_FD

If you pass an fd using FSCONFIG_SET_FD, autofs_parse_fd() "steals" the
param->file and so the fs_context infrastructure will not do fput() for
us.

Fixes: e6ec453bd0f0 ("autofs: convert autofs to use the new mount api")
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
Link: https://lore.kernel.org/r/20240731-fsconfig-fsparam_fd-fixes-v2-1-e7c472224417@cyphar.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/autofs/inode.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index cf792d4de4f1..64faa6c51f60 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -172,8 +172,7 @@ static int autofs_parse_fd(struct fs_context *fc, struct autofs_sb_info *sbi,
 	ret = autofs_check_pipe(pipe);
 	if (ret < 0) {
 		errorf(fc, "Invalid/unusable pipe");
-		if (param->type != fs_value_is_file)
-			fput(pipe);
+		fput(pipe);
 		return -EBADF;
 	}
 

From 66e5cfee6280a361c429b6a6bfd66044e4644e86 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Wed, 31 Jul 2024 23:10:28 +1000
Subject: [PATCH 10/58] coda: use param->file for FSCONFIG_SET_FD

While the old code did support FSCONFIG_SET_FD, there's no need to
re-get the file the fs_context infrastructure already grabbed for us.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
Link: https://lore.kernel.org/r/20240731-fsconfig-fsparam_fd-fixes-v2-2-e7c472224417@cyphar.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/coda/inode.c | 43 ++++++++++++++++++++++++++++++-------------
 1 file changed, 30 insertions(+), 13 deletions(-)

diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 6898dc621011..6896fce122e1 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -119,31 +119,43 @@ static const struct fs_parameter_spec coda_param_specs[] = {
 	{}
 };
 
-static int coda_parse_fd(struct fs_context *fc, int fd)
+static int coda_set_idx(struct fs_context *fc, struct file *file)
 {
 	struct coda_fs_context *ctx = fc->fs_private;
-	struct fd f;
 	struct inode *inode;
 	int idx;
 
-	f = fdget(fd);
-	if (!f.file)
-		return -EBADF;
-	inode = file_inode(f.file);
+	inode = file_inode(file);
 	if (!S_ISCHR(inode->i_mode) || imajor(inode) != CODA_PSDEV_MAJOR) {
-		fdput(f);
-		return invalf(fc, "code: Not coda psdev");
+		return invalf(fc, "coda: Not coda psdev");
 	}
-
 	idx = iminor(inode);
-	fdput(f);
-
 	if (idx < 0 || idx >= MAX_CODADEVS)
 		return invalf(fc, "coda: Bad minor number");
 	ctx->idx = idx;
 	return 0;
 }
 
+static int coda_parse_fd(struct fs_context *fc, struct fs_parameter *param,
+			 struct fs_parse_result *result)
+{
+	struct file *file;
+	int err;
+
+	if (param->type == fs_value_is_file) {
+		file = param->file;
+		param->file = NULL;
+	} else {
+		file = fget(result->uint_32);
+	}
+	if (!file)
+		return -EBADF;
+
+	err = coda_set_idx(fc, file);
+	fput(file);
+	return err;
+}
+
 static int coda_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
 	struct fs_parse_result result;
@@ -155,7 +167,7 @@ static int coda_parse_param(struct fs_context *fc, struct fs_parameter *param)
 
 	switch (opt) {
 	case Opt_fd:
-		return coda_parse_fd(fc, result.uint_32);
+		return coda_parse_fd(fc, param, &result);
 	}
 
 	return 0;
@@ -167,6 +179,7 @@ static int coda_parse_param(struct fs_context *fc, struct fs_parameter *param)
  */
 static int coda_parse_monolithic(struct fs_context *fc, void *_data)
 {
+	struct file *file;
 	struct coda_mount_data *data = _data;
 
 	if (!data)
@@ -175,7 +188,11 @@ static int coda_parse_monolithic(struct fs_context *fc, void *_data)
 	if (data->version != CODA_MOUNT_VERSION)
 		return invalf(fc, "coda: Bad mount version");
 
-	coda_parse_fd(fc, data->fd);
+	file = fget(data->fd);
+	if (file) {
+		coda_set_idx(fc, file);
+		fput(file);
+	}
 	return 0;
 }
 

From c01a5d89e5c8abe638107be2a4ea9e4c7fcdd7f6 Mon Sep 17 00:00:00 2001
From: Wang Long <w@laoqinren.net>
Date: Fri, 2 Aug 2024 17:19:01 +0800
Subject: [PATCH 11/58] percpu-rwsem: remove the unused parameter 'read'

In the function percpu_rwsem_release, the parameter `read`
is unused, so remove it.

Signed-off-by: Wang Long <w@laoqinren.net>
Link: https://lore.kernel.org/r/20240802091901.2546797-1-w@laoqinren.net
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/super.c                   | 2 +-
 include/linux/fs.h           | 2 +-
 include/linux/percpu-rwsem.h | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index 28f42502bdf9..33f35abb1990 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1905,7 +1905,7 @@ static void lockdep_sb_freeze_release(struct super_block *sb)
 	int level;
 
 	for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--)
-		percpu_rwsem_release(sb->s_writers.rw_sem + level, 0, _THIS_IP_);
+		percpu_rwsem_release(sb->s_writers.rw_sem + level, _THIS_IP_);
 }
 
 /*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fb0426f349fc..c0286d479c9d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1683,7 +1683,7 @@ static inline bool __sb_start_write_trylock(struct super_block *sb, int level)
 #define __sb_writers_acquired(sb, lev)	\
 	percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
 #define __sb_writers_release(sb, lev)	\
-	percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
+	percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], _THIS_IP_)
 
 /**
  * __sb_write_started - check if sb freeze level is held
diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index 36b942b67b7d..c012df33a9f0 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -145,7 +145,7 @@ extern void percpu_free_rwsem(struct percpu_rw_semaphore *);
 #define percpu_rwsem_assert_held(sem)	lockdep_assert_held(sem)
 
 static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem,
-					bool read, unsigned long ip)
+					unsigned long ip)
 {
 	lock_release(&sem->dep_map, ip);
 }

From 2e91f69afa7eb4e0bbff2254bb4cc8f97b76c009 Mon Sep 17 00:00:00 2001
From: Yue Haibing <yuehaibing@huawei.com>
Date: Sat, 3 Aug 2024 19:50:00 +0800
Subject: [PATCH 12/58] fs: mounts: Remove unused declaration mnt_cursor_del()

Commit 2eea9ce4310d ("mounts: keep list of mounts in an rbtree")
removed the implementation but leave declaration.

Signed-off-by: Yue Haibing <yuehaibing@huawei.com>
Link: https://lore.kernel.org/r/20240803115000.589872-1-yuehaibing@huawei.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/mount.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/mount.h b/fs/mount.h
index ad4b1ddebb54..0a78f85cf737 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -153,5 +153,4 @@ static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
 	list_add_tail(&mnt->mnt_list, dt_list);
 }
 
-extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);
 bool has_locked_children(struct mount *mnt, struct dentry *dentry);

From 45fab40d34650fad9ddbaeaed1a89997d7736ea0 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Fri, 2 Aug 2024 17:45:02 -0400
Subject: [PATCH 13/58] fs: remove comment about d_rcu_to_refcount

This function no longer exists.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://lore.kernel.org/r/20240802-openfast-v1-1-a1cff2a33063@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/dcache.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 3d8daaecb6d1..1af75fa68638 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2163,9 +2163,6 @@ seqretry:
  * without taking d_lock and checking d_seq sequence count against @seq
  * returned here.
  *
- * A refcount may be taken on the found dentry with the d_rcu_to_refcount
- * function.
- *
  * Alternatively, __d_lookup_rcu may be called again to look up the child of
  * the returned dentry, so long as its parent's seqlock is checked after the
  * child is looked up. Thus, an interlocking stepping of sequence lock checks

From 46460c1d42e823842db8b529156069187d566c83 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Fri, 2 Aug 2024 17:45:03 -0400
Subject: [PATCH 14/58] fs: add a kerneldoc header over lookup_fast

The lookup_fast helper in fs/namei.c has some subtlety in how dentries
are returned. Document them.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://lore.kernel.org/r/20240802-openfast-v1-2-a1cff2a33063@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namei.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/fs/namei.c b/fs/namei.c
index 5512cb10fa89..c3459785704d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1639,6 +1639,20 @@ struct dentry *lookup_one_qstr_excl(const struct qstr *name,
 }
 EXPORT_SYMBOL(lookup_one_qstr_excl);
 
+/**
+ * lookup_fast - do fast lockless (but racy) lookup of a dentry
+ * @nd: current nameidata
+ *
+ * Do a fast, but racy lookup in the dcache for the given dentry, and
+ * revalidate it. Returns a valid dentry pointer or NULL if one wasn't
+ * found. On error, an ERR_PTR will be returned.
+ *
+ * If this function returns a valid dentry and the walk is no longer
+ * lazy, the dentry will carry a reference that must later be put. If
+ * RCU mode is still in force, then this is not the case and the dentry
+ * must be legitimized before use. If this returns NULL, then the walk
+ * will no longer be in RCU mode.
+ */
 static struct dentry *lookup_fast(struct nameidata *nd)
 {
 	struct dentry *dentry, *parent = nd->path.dentry;

From 0d196e7589cefe207d5d41f37a0a28a1fdeeb7c6 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Mon, 5 Aug 2024 15:17:21 +0200
Subject: [PATCH 15/58] exec: don't WARN for racy path_noexec check

Both i_mode and noexec checks wrapped in WARN_ON stem from an artifact
of the previous implementation. They used to legitimately check for the
condition, but that got moved up in two commits:
633fb6ac3980 ("exec: move S_ISREG() check earlier")
0fd338b2d2cd ("exec: move path_noexec() check earlier")

Instead of being removed said checks are WARN_ON'ed instead, which
has some debug value.

However, the spurious path_noexec check is racy, resulting in
unwarranted warnings should someone race with setting the noexec flag.

One can note there is more to perm-checking whether execve is allowed
and none of the conditions are guaranteed to still hold after they were
tested for.

Additionally this does not validate whether the code path did any perm
checking to begin with -- it will pass if the inode happens to be
regular.

Keep the redundant path_noexec() check even though it's mindless
nonsense checking for guarantee that isn't given so drop the WARN.

Reword the commentary and do small tidy ups while here.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://lore.kernel.org/r/20240805131721.765484-1-mjguzik@gmail.com
[brauner: keep redundant path_noexec() check]
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/exec.c | 31 ++++++++++++-------------------
 1 file changed, 12 insertions(+), 19 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 50e76cc633c4..caae051c5a95 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -145,13 +145,11 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
 		goto out;
 
 	/*
-	 * may_open() has already checked for this, so it should be
-	 * impossible to trip now. But we need to be extra cautious
-	 * and check again at the very end too.
+	 * Check do_open_execat() for an explanation.
 	 */
 	error = -EACCES;
-	if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) ||
-			 path_noexec(&file->f_path)))
+	if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) ||
+	    path_noexec(&file->f_path))
 		goto exit;
 
 	error = -ENOEXEC;
@@ -954,7 +952,6 @@ EXPORT_SYMBOL(transfer_args_to_stack);
 static struct file *do_open_execat(int fd, struct filename *name, int flags)
 {
 	struct file *file;
-	int err;
 	struct open_flags open_exec_flags = {
 		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
 		.acc_mode = MAY_EXEC,
@@ -971,24 +968,20 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
 
 	file = do_filp_open(fd, name, &open_exec_flags);
 	if (IS_ERR(file))
-		goto out;
+		return file;
 
 	/*
-	 * may_open() has already checked for this, so it should be
-	 * impossible to trip now. But we need to be extra cautious
-	 * and check again at the very end too.
+	 * In the past the regular type check was here. It moved to may_open() in
+	 * 633fb6ac3980 ("exec: move S_ISREG() check earlier"). Since then it is
+	 * an invariant that all non-regular files error out before we get here.
 	 */
-	err = -EACCES;
-	if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) ||
-			 path_noexec(&file->f_path)))
-		goto exit;
+	if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) ||
+	    path_noexec(&file->f_path)) {
+		fput(file);
+		return ERR_PTR(-EACCES);
+	}
 
-out:
 	return file;
-
-exit:
-	fput(file);
-	return ERR_PTR(err);
 }
 
 /**

From c5ae8e5e5a34963513bf973e77dd15da7333054e Mon Sep 17 00:00:00 2001
From: Yuesong Li <liyuesong@vivo.com>
Date: Tue, 6 Aug 2024 11:47:10 +0800
Subject: [PATCH 16/58] fs/namespace.c: Fix typo in comment

replace 'permanetly' with 'permanently' in the comment &
replace 'propogated' with 'propagated' in the comment

Signed-off-by: Yuesong Li <liyuesong@vivo.com>
Link: https://lore.kernel.org/r/20240806034710.2807788-1-liyuesong@vivo.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 155c6abda71d..5f2dddee0074 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1774,7 +1774,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
 		list_del_init(&p->mnt_child);
 	}
 
-	/* Add propogated mounts to the tmp_list */
+	/* Add propagated mounts to the tmp_list */
 	if (how & UMOUNT_PROPAGATE)
 		propagate_umount(&tmp_list);
 
@@ -5613,7 +5613,7 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
 			/* Only worry about locked mounts */
 			if (!(child->mnt.mnt_flags & MNT_LOCKED))
 				continue;
-			/* Is the directory permanetly empty? */
+			/* Is the directory permanently empty? */
 			if (!is_empty_dir_inode(inode))
 				goto next;
 		}

From 215ab0d8af59cfc394fa83a702f0af21a5e126c7 Mon Sep 17 00:00:00 2001
From: Joel Savitz <jsavitz@redhat.com>
Date: Fri, 2 Aug 2024 22:54:55 -0400
Subject: [PATCH 17/58] file: remove outdated comment after close_fd()

Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: linux-fsdevel@vger.kernel.org

The comment on EXPORT_SYMBOL(close_fd) was added in commit 2ca2a09d6215
("fs: add ksys_close() wrapper; remove in-kernel calls to sys_close()"),
before commit 8760c909f54a ("file: Rename __close_fd to close_fd and remove
the files parameter") gave the function its current name, however commit
1572bfdf21d4 ("file: Replace ksys_close with close_fd") removes the
referenced caller entirely, obsoleting this comment.

Signed-off-by: Joel Savitz <jsavitz@redhat.com>
Link: https://lore.kernel.org/r/20240803025455.239276-1-jsavitz@redhat.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/file.c b/fs/file.c
index 655338effe9c..976ecd4ce2c6 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -672,7 +672,7 @@ int close_fd(unsigned fd)
 
 	return filp_close(file, files);
 }
-EXPORT_SYMBOL(close_fd); /* for ksys_close() */
+EXPORT_SYMBOL(close_fd);
 
 /**
  * last_fd - return last valid index into fd table

From 087adb4f0f91ee330446a70af899e6a996e5cc13 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Tue, 6 Aug 2024 19:28:46 +0200
Subject: [PATCH 18/58] vfs: dodge smp_mb in break_lease and break_deleg in the
 common case

These inlines show up in the fast path (e.g., in do_dentry_open()) and
induce said full barrier regarding i_flctx access when in most cases the
pointer is NULL.

The pointer can be safely checked before issuing the barrier, dodging it
in most cases as a result.

It is plausible the consume fence would be sufficient, but I don't want
to go audit all callers regarding what they before calling here.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://lore.kernel.org/r/20240806172846.886570-1-mjguzik@gmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/filelock.h | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/include/linux/filelock.h b/include/linux/filelock.h
index daee999d05f3..bb44224c6676 100644
--- a/include/linux/filelock.h
+++ b/include/linux/filelock.h
@@ -420,28 +420,38 @@ static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl)
 #ifdef CONFIG_FILE_LOCKING
 static inline int break_lease(struct inode *inode, unsigned int mode)
 {
+	struct file_lock_context *flctx;
+
 	/*
 	 * Since this check is lockless, we must ensure that any refcounts
 	 * taken are done before checking i_flctx->flc_lease. Otherwise, we
 	 * could end up racing with tasks trying to set a new lease on this
 	 * file.
 	 */
+	flctx = READ_ONCE(inode->i_flctx);
+	if (!flctx)
+		return 0;
 	smp_mb();
-	if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease))
+	if (!list_empty_careful(&flctx->flc_lease))
 		return __break_lease(inode, mode, FL_LEASE);
 	return 0;
 }
 
 static inline int break_deleg(struct inode *inode, unsigned int mode)
 {
+	struct file_lock_context *flctx;
+
 	/*
 	 * Since this check is lockless, we must ensure that any refcounts
 	 * taken are done before checking i_flctx->flc_lease. Otherwise, we
 	 * could end up racing with tasks trying to set a new lease on this
 	 * file.
 	 */
+	flctx = READ_ONCE(inode->i_flctx);
+	if (!flctx)
+		return 0;
 	smp_mb();
-	if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease))
+	if (!list_empty_careful(&flctx->flc_lease))
 		return __break_lease(inode, mode, FL_DELEG);
 	return 0;
 }

From 28c7658b2c4a21223c194084a59d712765e8112d Mon Sep 17 00:00:00 2001
From: Xiaxi Shen <shenxiaxi26@gmail.com>
Date: Wed, 7 Aug 2024 00:05:36 -0700
Subject: [PATCH 19/58] Fix spelling and gramatical errors

Fixed 3 typos in design.rst

Signed-off-by: Xiaxi Shen <shenxiaxi26@gmail.com>
Link: https://lore.kernel.org/r/20240807070536.14536-1-shenxiaxi26@gmail.com
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 Documentation/filesystems/iomap/design.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Documentation/filesystems/iomap/design.rst b/Documentation/filesystems/iomap/design.rst
index f8ee3427bc1a..37594e1c5914 100644
--- a/Documentation/filesystems/iomap/design.rst
+++ b/Documentation/filesystems/iomap/design.rst
@@ -142,9 +142,9 @@ Definitions
  * **pure overwrite**: A write operation that does not require any
    metadata or zeroing operations to perform during either submission
    or completion.
-   This implies that the fileystem must have already allocated space
+   This implies that the filesystem must have already allocated space
    on disk as ``IOMAP_MAPPED`` and the filesystem must not place any
-   constaints on IO alignment or size.
+   constraints on IO alignment or size.
    The only constraints on I/O alignment are device level (minimum I/O
    size and alignment, typically sector size).
 
@@ -394,7 +394,7 @@ iomap is concerned:
 
  * The **upper** level primitive is provided by the filesystem to
    coordinate access to different iomap operations.
-   The exact primitive is specifc to the filesystem and operation,
+   The exact primitive is specific to the filesystem and operation,
    but is often a VFS inode, pagecache invalidation, or folio lock.
    For example, a filesystem might take ``i_rwsem`` before calling
    ``iomap_file_buffered_write`` and ``iomap_file_unshare`` to prevent

From 4f98f380f4662829e394c840062ea677956b57ec Mon Sep 17 00:00:00 2001
From: Joe Damato <jdamato@fastly.com>
Date: Wed, 7 Aug 2024 10:52:31 +0000
Subject: [PATCH 20/58] eventpoll: Don't re-zero eventpoll fields

Remove redundant and unnecessary code.

ep_alloc uses kzalloc to create struct eventpoll, so there is no need to
set fields to defaults of 0. This was accidentally introduced in commit
85455c795c07 ("eventpoll: support busy poll per epoll instance") and
expanded on in follow-up commits.

Signed-off-by: Joe Damato <jdamato@fastly.com>
Link: https://lore.kernel.org/r/20240807105231.179158-1-jdamato@fastly.com
Reviewed-by: Martin Karsten <mkarsten@uwaterloo.ca>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/eventpoll.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index f53ca4f7fced..6c0a1e9715ea 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2200,11 +2200,6 @@ static int do_epoll_create(int flags)
 		error = PTR_ERR(file);
 		goto out_free_fd;
 	}
-#ifdef CONFIG_NET_RX_BUSY_POLL
-	ep->busy_poll_usecs = 0;
-	ep->busy_poll_budget = 0;
-	ep->prefer_busy_poll = false;
-#endif
 	ep->file = file;
 	fd_install(fd, file);
 	return fd;

From b9ca079dd6b09e08863aa998edf5c47597806c05 Mon Sep 17 00:00:00 2001
From: Martin Karsten <mkarsten@uwaterloo.ca>
Date: Tue, 6 Aug 2024 12:33:01 +0000
Subject: [PATCH 21/58] eventpoll: Annotate data-race of busy_poll_usecs

A struct eventpoll's busy_poll_usecs field can be modified via a user
ioctl at any time. All reads of this field should be annotated with
READ_ONCE.

Fixes: 85455c795c07 ("eventpoll: support busy poll per epoll instance")
Cc: stable@vger.kernel.org
Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
Link: https://lore.kernel.org/r/20240806123301.167557-1-jdamato@fastly.com
Reviewed-by: Joe Damato <jdamato@fastly.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/eventpoll.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 6c0a1e9715ea..145f5349c612 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -420,7 +420,7 @@ static bool busy_loop_ep_timeout(unsigned long start_time,
 
 static bool ep_busy_loop_on(struct eventpoll *ep)
 {
-	return !!ep->busy_poll_usecs || net_busy_loop_on();
+	return !!READ_ONCE(ep->busy_poll_usecs) || net_busy_loop_on();
 }
 
 static bool ep_busy_loop_end(void *p, unsigned long start_time)

From e747e15156b79efeea0ad056df8de14b93d318c2 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Wed, 7 Aug 2024 08:10:27 -0400
Subject: [PATCH 22/58] fs: try an opportunistic lookup for O_CREAT opens too

Today, when opening a file we'll typically do a fast lookup, but if
O_CREAT is set, the kernel always takes the exclusive inode lock. I
assume this was done with the expectation that O_CREAT means that we
always expect to do the create, but that's often not the case. Many
programs set O_CREAT even in scenarios where the file already exists.

This patch rearranges the pathwalk-for-open code to also attempt a
fast_lookup in certain O_CREAT cases. If a positive dentry is found, the
inode_lock can be avoided altogether, and if auditing isn't enabled, it
can stay in rcuwalk mode for the last step_into.

One notable exception that is hopefully temporary: if we're doing an
rcuwalk and auditing is enabled, skip the lookup_fast. Legitimizing the
dentry in that case is more expensive than taking the i_rwsem for now.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Link: https://lore.kernel.org/r/20240807-openfast-v3-1-040d132d2559@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namei.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 64 insertions(+), 10 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index c3459785704d..3e34f4d97d83 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3605,6 +3605,49 @@ out_dput:
 	return ERR_PTR(error);
 }
 
+static inline bool trailing_slashes(struct nameidata *nd)
+{
+	return (bool)nd->last.name[nd->last.len];
+}
+
+static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag)
+{
+	struct dentry *dentry;
+
+	if (open_flag & O_CREAT) {
+		/* Don't bother on an O_EXCL create */
+		if (open_flag & O_EXCL)
+			return NULL;
+
+		/*
+		 * FIXME: If auditing is enabled, then we'll have to unlazy to
+		 * use the dentry. For now, don't do this, since it shifts
+		 * contention from parent's i_rwsem to its d_lockref spinlock.
+		 * Reconsider this once dentry refcounting handles heavy
+		 * contention better.
+		 */
+		if ((nd->flags & LOOKUP_RCU) && !audit_dummy_context())
+			return NULL;
+	}
+
+	if (trailing_slashes(nd))
+		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+
+	dentry = lookup_fast(nd);
+	if (IS_ERR_OR_NULL(dentry))
+		return dentry;
+
+	if (open_flag & O_CREAT) {
+		/* Discard negative dentries. Need inode_lock to do the create */
+		if (!dentry->d_inode) {
+			if (!(nd->flags & LOOKUP_RCU))
+				dput(dentry);
+			dentry = NULL;
+		}
+	}
+	return dentry;
+}
+
 static const char *open_last_lookups(struct nameidata *nd,
 		   struct file *file, const struct open_flags *op)
 {
@@ -3622,28 +3665,39 @@ static const char *open_last_lookups(struct nameidata *nd,
 		return handle_dots(nd, nd->last_type);
 	}
 
+	/* We _can_ be in RCU mode here */
+	dentry = lookup_fast_for_open(nd, open_flag);
+	if (IS_ERR(dentry))
+		return ERR_CAST(dentry);
+
 	if (!(open_flag & O_CREAT)) {
-		if (nd->last.name[nd->last.len])
-			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
-		/* we _can_ be in RCU mode here */
-		dentry = lookup_fast(nd);
-		if (IS_ERR(dentry))
-			return ERR_CAST(dentry);
 		if (likely(dentry))
 			goto finish_lookup;
 
 		if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU))
 			return ERR_PTR(-ECHILD);
 	} else {
-		/* create side of things */
 		if (nd->flags & LOOKUP_RCU) {
-			if (!try_to_unlazy(nd))
+			bool unlazied;
+
+			/* can stay in rcuwalk if not auditing */
+			if (dentry && audit_dummy_context()) {
+				if (trailing_slashes(nd))
+					return ERR_PTR(-EISDIR);
+				goto finish_lookup;
+			}
+			unlazied = dentry ? try_to_unlazy_next(nd, dentry) :
+					    try_to_unlazy(nd);
+			if (!unlazied)
 				return ERR_PTR(-ECHILD);
 		}
 		audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
-		/* trailing slashes? */
-		if (unlikely(nd->last.name[nd->last.len]))
+		if (trailing_slashes(nd)) {
+			dput(dentry);
 			return ERR_PTR(-EISDIR);
+		}
+		if (dentry)
+			goto finish_lookup;
 	}
 
 	if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {

From c65d41c5a5279738fc07f99c0e912b28a691c46f Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 8 Aug 2024 10:16:42 +0200
Subject: [PATCH 23/58] fs: move audit parent inode

During O_CREAT we unconditionally audit the parent inode. This makes it
difficult to support a fastpath for O_CREAT when the file already exists
because we have to drop out of RCU lookup needlessly.

We worked around this by checking whether audit was actually active but
that's also suboptimal. Instead, move the audit of the parent inode down
into lookup_open() at a point where it's mostly certain that the file
needs to be created.

This also reduced the inconsistency that currently exists: while audit
on the parent is done independent of whether or no the file already
existed an audit on the file is only performed if it has been created.

By moving the audit down a bit we emit the audit a little later but it
will allow us to simplify the fastpath for O_CREAT significantly.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namei.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/namei.c b/fs/namei.c
index 3e34f4d97d83..745415fcda57 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3535,6 +3535,9 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
 		return dentry;
 	}
 
+	if (open_flag & O_CREAT)
+		audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
+
 	/*
 	 * Checking write permission is tricky, bacuse we don't know if we are
 	 * going to actually need it: O_CREAT opens should work as long as the
@@ -3691,7 +3694,6 @@ static const char *open_last_lookups(struct nameidata *nd,
 			if (!unlazied)
 				return ERR_PTR(-ECHILD);
 		}
-		audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
 		if (trailing_slashes(nd)) {
 			dput(dentry);
 			return ERR_PTR(-EISDIR);

From 4770d96a6d89c7dd5675056629c0008f7f8106bf Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 8 Aug 2024 11:12:50 +0200
Subject: [PATCH 24/58] fs: pull up trailing slashes check for O_CREAT

Perform the check for trailing slashes right in the fastpath check and
don't bother with any additional work.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namei.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 745415fcda57..08eb9a53beb7 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3618,6 +3618,9 @@ static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag)
 	struct dentry *dentry;
 
 	if (open_flag & O_CREAT) {
+		if (trailing_slashes(nd))
+			return ERR_PTR(-EISDIR);
+
 		/* Don't bother on an O_EXCL create */
 		if (open_flag & O_EXCL)
 			return NULL;
@@ -3684,20 +3687,13 @@ static const char *open_last_lookups(struct nameidata *nd,
 			bool unlazied;
 
 			/* can stay in rcuwalk if not auditing */
-			if (dentry && audit_dummy_context()) {
-				if (trailing_slashes(nd))
-					return ERR_PTR(-EISDIR);
+			if (dentry && audit_dummy_context())
 				goto finish_lookup;
-			}
 			unlazied = dentry ? try_to_unlazy_next(nd, dentry) :
 					    try_to_unlazy(nd);
 			if (!unlazied)
 				return ERR_PTR(-ECHILD);
 		}
-		if (trailing_slashes(nd)) {
-			dput(dentry);
-			return ERR_PTR(-EISDIR);
-		}
 		if (dentry)
 			goto finish_lookup;
 	}

From d459c52ab378cdcd53c57ddcbfd5af648b20a150 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 8 Aug 2024 11:15:41 +0200
Subject: [PATCH 25/58] fs: remove audit dummy context check

Now that we audit later during lookup_open() we can remove the audit
dummy context check. This simplifies things a lot.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namei.c | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 08eb9a53beb7..96c79dec7184 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3624,16 +3624,6 @@ static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag)
 		/* Don't bother on an O_EXCL create */
 		if (open_flag & O_EXCL)
 			return NULL;
-
-		/*
-		 * FIXME: If auditing is enabled, then we'll have to unlazy to
-		 * use the dentry. For now, don't do this, since it shifts
-		 * contention from parent's i_rwsem to its d_lockref spinlock.
-		 * Reconsider this once dentry refcounting handles heavy
-		 * contention better.
-		 */
-		if ((nd->flags & LOOKUP_RCU) && !audit_dummy_context())
-			return NULL;
 	}
 
 	if (trailing_slashes(nd))
@@ -3687,7 +3677,7 @@ static const char *open_last_lookups(struct nameidata *nd,
 			bool unlazied;
 
 			/* can stay in rcuwalk if not auditing */
-			if (dentry && audit_dummy_context())
+			if (dentry)
 				goto finish_lookup;
 			unlazied = dentry ? try_to_unlazy_next(nd, dentry) :
 					    try_to_unlazy(nd);

From 0f93bb54a3a502077bca4d7beb1fe2a90d3b59db Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 8 Aug 2024 11:16:42 +0200
Subject: [PATCH 26/58] fs: rearrange general fastpath check now that O_CREAT
 uses it

If we find a positive dentry we can now simply try and open it. All
prelimiary checks are already done with or without O_CREAT.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namei.c | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 96c79dec7184..2699601bf8e9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3666,26 +3666,17 @@ static const char *open_last_lookups(struct nameidata *nd,
 	if (IS_ERR(dentry))
 		return ERR_CAST(dentry);
 
-	if (!(open_flag & O_CREAT)) {
-		if (likely(dentry))
-			goto finish_lookup;
+	if (likely(dentry))
+		goto finish_lookup;
 
+	if (!(open_flag & O_CREAT)) {
 		if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU))
 			return ERR_PTR(-ECHILD);
 	} else {
 		if (nd->flags & LOOKUP_RCU) {
-			bool unlazied;
-
-			/* can stay in rcuwalk if not auditing */
-			if (dentry)
-				goto finish_lookup;
-			unlazied = dentry ? try_to_unlazy_next(nd, dentry) :
-					    try_to_unlazy(nd);
-			if (!unlazied)
+			if (!try_to_unlazy(nd))
 				return ERR_PTR(-ECHILD);
 		}
-		if (dentry)
-			goto finish_lookup;
 	}
 
 	if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {

From 193b72792fdba54aa2af6c3ad72ce16c3f06b4f7 Mon Sep 17 00:00:00 2001
From: Thorsten Blum <thorsten.blum@toblux.com>
Date: Thu, 8 Aug 2024 17:00:24 +0200
Subject: [PATCH 27/58] fs/select: Annotate struct poll_list with
 __counted_by()

Add the __counted_by compiler attribute to the flexible array member
entries to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and
CONFIG_FORTIFY_SOURCE.

Signed-off-by: Thorsten Blum <thorsten.blum@toblux.com>
Link: https://lore.kernel.org/r/20240808150023.72578-2-thorsten.blum@toblux.com
Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/select.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/select.c b/fs/select.c
index 9515c3fa1a03..1a4849e2afb9 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -840,7 +840,7 @@ SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
 struct poll_list {
 	struct poll_list *next;
 	unsigned int len;
-	struct pollfd entries[];
+	struct pollfd entries[] __counted_by(len);
 };
 
 #define POLLFD_PER_PAGE  ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))

From 8447d848e1dc6d42d1dcd00f133d7715fc732c47 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Sat, 10 Aug 2024 08:47:53 +0200
Subject: [PATCH 28/58] vfs: only read fops once in fops_get/put

In do_dentry_open() the usage is:
	f->f_op = fops_get(inode->i_fop);

In generated asm the compiler emits 2 reads from inode->i_fop instead of
just one.

This popped up due to false-sharing where loads from that offset end up
bouncing a cacheline during parallel open. While this is going to be fixed,
the spurious load does not need to be there.

This makes do_dentry_open() go down from 1177 to 1154 bytes.

fops_put() is patched to maintain some consistency.

No functional changes.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://lore.kernel.org/r/20240810064753.1211441-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index c0286d479c9d..696c620f90e0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2553,10 +2553,17 @@ struct super_block *sget(struct file_system_type *type,
 struct super_block *sget_dev(struct fs_context *fc, dev_t dev);
 
 /* Alas, no aliases. Too much hassle with bringing module.h everywhere */
-#define fops_get(fops) \
-	(((fops) && try_module_get((fops)->owner) ? (fops) : NULL))
-#define fops_put(fops) \
-	do { if (fops) module_put((fops)->owner); } while(0)
+#define fops_get(fops) ({						\
+	const struct file_operations *_fops = (fops);			\
+	(((_fops) && try_module_get((_fops)->owner) ? (_fops) : NULL));	\
+})
+
+#define fops_put(fops) ({						\
+	const struct file_operations *_fops = (fops);			\
+	if (_fops)							\
+		module_put((_fops)->owner);				\
+})
+
 /*
  * This one is to be used *ONLY* from ->open() instances.
  * fops must be non-NULL, pinned down *and* module dependencies

From 641bb4394f405cba498b100b44541ffc0aed5be1 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 9 Aug 2024 12:38:56 +0200
Subject: [PATCH 29/58] fs: move FMODE_UNSIGNED_OFFSET to fop_flags

This is another flag that is statically set and doesn't need to use up
an FMODE_* bit. Move it to ->fop_flags and free up another FMODE_* bit.

(1) mem_open() used from proc_mem_operations
(2) adi_open() used from adi_fops
(3) drm_open_helper():
    (3.1) accel_open() used from DRM_ACCEL_FOPS
    (3.2) drm_open() used from
    (3.2.1) amdgpu_driver_kms_fops
    (3.2.2) psb_gem_fops
    (3.2.3) i915_driver_fops
    (3.2.4) nouveau_driver_fops
    (3.2.5) panthor_drm_driver_fops
    (3.2.6) radeon_driver_kms_fops
    (3.2.7) tegra_drm_fops
    (3.2.8) vmwgfx_driver_fops
    (3.2.9) xe_driver_fops
    (3.2.10) DRM_GEM_FOPS
    (3.2.11) DEFINE_DRM_GEM_DMA_FOPS
(4) struct memdev sets fmode flags based on type of device opened. For
    devices using struct mem_fops unsigned offset is used.

Mark all these file operations as FOP_UNSIGNED_OFFSET and add asserts
into the open helper to ensure that the flag is always set.

Link: https://lore.kernel.org/r/20240809-work-fop_unsigned-v1-1-658e054d893e@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 drivers/char/adi.c                      |  8 +-------
 drivers/char/mem.c                      |  3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  1 +
 drivers/gpu/drm/drm_file.c              |  3 ++-
 drivers/gpu/drm/gma500/psb_drv.c        |  1 +
 drivers/gpu/drm/i915/i915_driver.c      |  1 +
 drivers/gpu/drm/nouveau/nouveau_drm.c   |  1 +
 drivers/gpu/drm/radeon/radeon_drv.c     |  1 +
 drivers/gpu/drm/tegra/drm.c             |  1 +
 drivers/gpu/drm/vmwgfx/vmwgfx_drv.c     |  1 +
 drivers/gpu/drm/xe/xe_device.c          |  1 +
 fs/proc/base.c                          | 10 ++++------
 fs/read_write.c                         |  2 +-
 include/drm/drm_accel.h                 |  3 ++-
 include/drm/drm_gem.h                   |  3 ++-
 include/drm/drm_gem_dma_helper.h        |  1 +
 include/linux/fs.h                      |  5 +++--
 mm/mmap.c                               |  2 +-
 18 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/drivers/char/adi.c b/drivers/char/adi.c
index 751d7cc0da1b..1c76c8758f0f 100644
--- a/drivers/char/adi.c
+++ b/drivers/char/adi.c
@@ -14,12 +14,6 @@
 
 #define MAX_BUF_SZ	PAGE_SIZE
 
-static int adi_open(struct inode *inode, struct file *file)
-{
-	file->f_mode |= FMODE_UNSIGNED_OFFSET;
-	return 0;
-}
-
 static int read_mcd_tag(unsigned long addr)
 {
 	long err;
@@ -206,9 +200,9 @@ static loff_t adi_llseek(struct file *file, loff_t offset, int whence)
 static const struct file_operations adi_fops = {
 	.owner		= THIS_MODULE,
 	.llseek		= adi_llseek,
-	.open		= adi_open,
 	.read		= adi_read,
 	.write		= adi_write,
+	.fop_flags	= FOP_UNSIGNED_OFFSET,
 };
 
 static struct miscdevice adi_miscdev = {
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 7c359cc406d5..169eed162a7f 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -643,6 +643,7 @@ static const struct file_operations __maybe_unused mem_fops = {
 	.get_unmapped_area = get_unmapped_area_mem,
 	.mmap_capabilities = memory_mmap_capabilities,
 #endif
+	.fop_flags	= FOP_UNSIGNED_OFFSET,
 };
 
 static const struct file_operations null_fops = {
@@ -693,7 +694,7 @@ static const struct memdev {
 	umode_t mode;
 } devlist[] = {
 #ifdef CONFIG_DEVMEM
-	[DEVMEM_MINOR] = { "mem", &mem_fops, FMODE_UNSIGNED_OFFSET, 0 },
+	[DEVMEM_MINOR] = { "mem", &mem_fops, 0, 0 },
 #endif
 	[3] = { "null", &null_fops, FMODE_NOWAIT, 0666 },
 #ifdef CONFIG_DEVPORT
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 094498a0964b..d7ef8cbecf6c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2908,6 +2908,7 @@ static const struct file_operations amdgpu_driver_kms_fops = {
 #ifdef CONFIG_PROC_FS
 	.show_fdinfo = drm_show_fdinfo,
 #endif
+	.fop_flags = FOP_UNSIGNED_OFFSET,
 };
 
 int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv **fpriv)
diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c
index 714e42b05108..f8de3cba1a08 100644
--- a/drivers/gpu/drm/drm_file.c
+++ b/drivers/gpu/drm/drm_file.c
@@ -318,6 +318,8 @@ int drm_open_helper(struct file *filp, struct drm_minor *minor)
 	if (dev->switch_power_state != DRM_SWITCH_POWER_ON &&
 	    dev->switch_power_state != DRM_SWITCH_POWER_DYNAMIC_OFF)
 		return -EINVAL;
+	if (WARN_ON_ONCE(!(filp->f_op->fop_flags & FOP_UNSIGNED_OFFSET)))
+		return -EINVAL;
 
 	drm_dbg_core(dev, "comm=\"%s\", pid=%d, minor=%d\n",
 		     current->comm, task_pid_nr(current), minor->index);
@@ -335,7 +337,6 @@ int drm_open_helper(struct file *filp, struct drm_minor *minor)
 	}
 
 	filp->private_data = priv;
-	filp->f_mode |= FMODE_UNSIGNED_OFFSET;
 	priv->filp = filp;
 
 	mutex_lock(&dev->filelist_mutex);
diff --git a/drivers/gpu/drm/gma500/psb_drv.c b/drivers/gpu/drm/gma500/psb_drv.c
index 8b64f61ffaf9..d67c2b3ad901 100644
--- a/drivers/gpu/drm/gma500/psb_drv.c
+++ b/drivers/gpu/drm/gma500/psb_drv.c
@@ -498,6 +498,7 @@ static const struct file_operations psb_gem_fops = {
 	.mmap = drm_gem_mmap,
 	.poll = drm_poll,
 	.read = drm_read,
+	.fop_flags = FOP_UNSIGNED_OFFSET,
 };
 
 static const struct drm_driver driver = {
diff --git a/drivers/gpu/drm/i915/i915_driver.c b/drivers/gpu/drm/i915/i915_driver.c
index fb8e9c2fcea5..cf276299bccb 100644
--- a/drivers/gpu/drm/i915/i915_driver.c
+++ b/drivers/gpu/drm/i915/i915_driver.c
@@ -1671,6 +1671,7 @@ static const struct file_operations i915_driver_fops = {
 #ifdef CONFIG_PROC_FS
 	.show_fdinfo = drm_show_fdinfo,
 #endif
+	.fop_flags = FOP_UNSIGNED_OFFSET,
 };
 
 static int
diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c
index a58c31089613..e243b42f8582 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_drm.c
@@ -1274,6 +1274,7 @@ nouveau_driver_fops = {
 	.compat_ioctl = nouveau_compat_ioctl,
 #endif
 	.llseek = noop_llseek,
+	.fop_flags = FOP_UNSIGNED_OFFSET,
 };
 
 static struct drm_driver
diff --git a/drivers/gpu/drm/radeon/radeon_drv.c b/drivers/gpu/drm/radeon/radeon_drv.c
index 7bf08164140e..ac49779ed03d 100644
--- a/drivers/gpu/drm/radeon/radeon_drv.c
+++ b/drivers/gpu/drm/radeon/radeon_drv.c
@@ -520,6 +520,7 @@ static const struct file_operations radeon_driver_kms_fops = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl = radeon_kms_compat_ioctl,
 #endif
+	.fop_flags = FOP_UNSIGNED_OFFSET,
 };
 
 static const struct drm_ioctl_desc radeon_ioctls_kms[] = {
diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c
index 03d1c76aec2d..108c26a33edb 100644
--- a/drivers/gpu/drm/tegra/drm.c
+++ b/drivers/gpu/drm/tegra/drm.c
@@ -801,6 +801,7 @@ static const struct file_operations tegra_drm_fops = {
 	.read = drm_read,
 	.compat_ioctl = drm_compat_ioctl,
 	.llseek = noop_llseek,
+	.fop_flags = FOP_UNSIGNED_OFFSET,
 };
 
 static int tegra_drm_context_cleanup(int id, void *p, void *data)
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
index 50ad3105c16e..2825dd3149ed 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
@@ -1609,6 +1609,7 @@ static const struct file_operations vmwgfx_driver_fops = {
 	.compat_ioctl = vmw_compat_ioctl,
 #endif
 	.llseek = noop_llseek,
+	.fop_flags = FOP_UNSIGNED_OFFSET,
 };
 
 static const struct drm_driver driver = {
diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index f2f1d8ddb221..2e5537d6608d 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -238,6 +238,7 @@ static const struct file_operations xe_driver_fops = {
 #ifdef CONFIG_PROC_FS
 	.show_fdinfo = drm_show_fdinfo,
 #endif
+	.fop_flags = FOP_UNSIGNED_OFFSET,
 };
 
 static struct drm_driver driver = {
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 72a1acd03675..1409d1003101 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -827,12 +827,9 @@ static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
 
 static int mem_open(struct inode *inode, struct file *file)
 {
-	int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH);
-
-	/* OK to pass negative loff_t, we can catch out-of-range */
-	file->f_mode |= FMODE_UNSIGNED_OFFSET;
-
-	return ret;
+	if (WARN_ON_ONCE(!(file->f_op->fop_flags & FOP_UNSIGNED_OFFSET)))
+		return -EINVAL;
+	return __mem_open(inode, file, PTRACE_MODE_ATTACH);
 }
 
 static ssize_t mem_rw(struct file *file, char __user *buf,
@@ -932,6 +929,7 @@ static const struct file_operations proc_mem_operations = {
 	.write		= mem_write,
 	.open		= mem_open,
 	.release	= mem_release,
+	.fop_flags	= FOP_UNSIGNED_OFFSET,
 };
 
 static int environ_open(struct inode *inode, struct file *file)
diff --git a/fs/read_write.c b/fs/read_write.c
index 90e283b31ca1..89d4af0e3b93 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -36,7 +36,7 @@ EXPORT_SYMBOL(generic_ro_fops);
 
 static inline bool unsigned_offsets(struct file *file)
 {
-	return file->f_mode & FMODE_UNSIGNED_OFFSET;
+	return file->f_op->fop_flags & FOP_UNSIGNED_OFFSET;
 }
 
 /**
diff --git a/include/drm/drm_accel.h b/include/drm/drm_accel.h
index f4d3784b1dce..41c78b7d712c 100644
--- a/include/drm/drm_accel.h
+++ b/include/drm/drm_accel.h
@@ -28,7 +28,8 @@
 	.poll		= drm_poll,\
 	.read		= drm_read,\
 	.llseek		= noop_llseek, \
-	.mmap		= drm_gem_mmap
+	.mmap		= drm_gem_mmap, \
+	.fop_flags	= FOP_UNSIGNED_OFFSET
 
 /**
  * DEFINE_DRM_ACCEL_FOPS() - macro to generate file operations for accelerators drivers
diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h
index bae4865b2101..d8b86df2ec0d 100644
--- a/include/drm/drm_gem.h
+++ b/include/drm/drm_gem.h
@@ -447,7 +447,8 @@ struct drm_gem_object {
 	.poll		= drm_poll,\
 	.read		= drm_read,\
 	.llseek		= noop_llseek,\
-	.mmap		= drm_gem_mmap
+	.mmap		= drm_gem_mmap, \
+	.fop_flags	= FOP_UNSIGNED_OFFSET
 
 /**
  * DEFINE_DRM_GEM_FOPS() - macro to generate file operations for GEM drivers
diff --git a/include/drm/drm_gem_dma_helper.h b/include/drm/drm_gem_dma_helper.h
index a827bde494f6..f2678e7ecb98 100644
--- a/include/drm/drm_gem_dma_helper.h
+++ b/include/drm/drm_gem_dma_helper.h
@@ -267,6 +267,7 @@ unsigned long drm_gem_dma_get_unmapped_area(struct file *filp,
 		.read		= drm_read,\
 		.llseek		= noop_llseek,\
 		.mmap		= drm_gem_mmap,\
+		.fop_flags = FOP_UNSIGNED_OFFSET, \
 		DRM_GEM_DMA_UNMAPPED_AREA_FOPS \
 	}
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 696c620f90e0..0145bda465ff 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -146,8 +146,7 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 /* Expect random access pattern */
 #define FMODE_RANDOM		((__force fmode_t)(1 << 12))
 
-/* File is huge (eg. /dev/mem): treat loff_t as unsigned */
-#define FMODE_UNSIGNED_OFFSET	((__force fmode_t)(1 << 13))
+/* FMODE_* bit 13 */
 
 /* File is opened with O_PATH; almost nothing can be done with it */
 #define FMODE_PATH		((__force fmode_t)(1 << 14))
@@ -2073,6 +2072,8 @@ struct file_operations {
 #define FOP_DIO_PARALLEL_WRITE	((__force fop_flags_t)(1 << 3))
 /* Contains huge pages */
 #define FOP_HUGE_PAGES		((__force fop_flags_t)(1 << 4))
+/* Treat loff_t as unsigned (e.g., /dev/mem) */
+#define FOP_UNSIGNED_OFFSET	((__force fop_flags_t)(1 << 5))
 
 /* Wrap a directory iterator that needs exclusive inode access */
 int wrap_directory_iterator(struct file *, struct dir_context *,
diff --git a/mm/mmap.c b/mm/mmap.c
index d0dfc85b209b..6ddb278a5ee8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1229,7 +1229,7 @@ static inline u64 file_mmap_size_max(struct file *file, struct inode *inode)
 		return MAX_LFS_FILESIZE;
 
 	/* Special "we do even unsigned file positions" case */
-	if (file->f_mode & FMODE_UNSIGNED_OFFSET)
+	if (file->f_op->fop_flags & FOP_UNSIGNED_OFFSET)
 		return 0;
 
 	/* Yes, random drivers might want more. But I'm tired of buggy drivers */

From 122381a46954ad592ee93d7da2bef5074b396247 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Mon, 15 Jul 2024 09:13:24 +0200
Subject: [PATCH 30/58] vfs: use RCU in ilookup

A soft lockup in ilookup was reported when stress-testing a 512-way
system [1] (see [2] for full context) and it was verified that not
taking the lock shifts issues back to mm.

[1] https://lore.kernel.org/linux-mm/56865e57-c250-44da-9713-cf1404595bcc@amd.com/
[2] https://lore.kernel.org/linux-mm/d2841226-e27b-4d3d-a578-63587a3aa4f3@amd.com/

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://lore.kernel.org/r/20240715071324.265879-1-mjguzik@gmail.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/inode.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/fs/inode.c b/fs/inode.c
index 10c4619faeef..de0969d6009c 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1570,9 +1570,7 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino)
 	struct hlist_head *head = inode_hashtable + hash(sb, ino);
 	struct inode *inode;
 again:
-	spin_lock(&inode_hash_lock);
-	inode = find_inode_fast(sb, head, ino, true);
-	spin_unlock(&inode_hash_lock);
+	inode = find_inode_fast(sb, head, ino, false);
 
 	if (inode) {
 		if (IS_ERR(inode))

From 433f9d76a01056dfeaefc15167b11e514e56f956 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Wed, 14 Aug 2024 17:02:31 +0800
Subject: [PATCH 31/58] autofs: add per dentry expire timeout

Add ability to set per-dentry mount expire timeout to autofs.

There are two fairly well known automounter map formats, the autofs
format and the amd format (more or less System V and Berkley).

Some time ago Linux autofs added an amd map format parser that
implemented a fair amount of the amd functionality. This was done
within the autofs infrastructure and some functionality wasn't
implemented because it either didn't make sense or required extra
kernel changes. The idea was to restrict changes to be within the
existing autofs functionality as much as possible and leave changes
with a wider scope to be considered later.

One of these changes is implementing the amd options:
1) "unmount", expire this mount according to a timeout (same as the
   current autofs default).
2) "nounmount", don't expire this mount (same as setting the autofs
   timeout to 0 except only for this specific mount) .
3) "utimeout=<seconds>", expire this mount using the specified
   timeout (again same as setting the autofs timeout but only for
   this mount).

To implement these options per-dentry expire timeouts need to be
implemented for autofs indirect mounts. This is because all map keys
(mounts) for autofs indirect mounts use an expire timeout stored in
the autofs mount super block info. structure and all indirect mounts
use the same expire timeout.

Now I have a request to add the "nounmount" option so I need to add
the per-dentry expire handling to the kernel implementation to do this.

The implementation uses the trailing path component to identify the
mount (and is also used as the autofs map key) which is passed in the
autofs_dev_ioctl structure path field. The expire timeout is passed
in autofs_dev_ioctl timeout field (well, of the timeout union).

If the passed in timeout is equal to -1 the per-dentry timeout and
flag are cleared providing for the "unmount" option. If the timeout
is greater than or equal to 0 the timeout is set to the value and the
flag is also set. If the dentry timeout is 0 the dentry will not expire
by timeout which enables the implementation of the "nounmount" option
for the specific mount. When the dentry timeout is greater than zero it
allows for the implementation of the "utimeout=<seconds>" option.

Signed-off-by: Ian Kent <raven@themaw.net>
Link: https://lore.kernel.org/r/20240814090231.963520-1-raven@themaw.net
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/autofs/autofs_i.h         |  4 ++
 fs/autofs/dev-ioctl.c        | 97 ++++++++++++++++++++++++++++++++++--
 fs/autofs/expire.c           |  7 ++-
 fs/autofs/inode.c            |  2 +
 include/uapi/linux/auto_fs.h |  2 +-
 5 files changed, 104 insertions(+), 8 deletions(-)

diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 8c1d587b3eef..77c7991d89aa 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -62,6 +62,7 @@ struct autofs_info {
 	struct list_head expiring;
 
 	struct autofs_sb_info *sbi;
+	unsigned long exp_timeout;
 	unsigned long last_used;
 	int count;
 
@@ -81,6 +82,9 @@ struct autofs_info {
 					*/
 #define AUTOFS_INF_PENDING	(1<<2) /* dentry pending mount */
 
+#define AUTOFS_INF_EXPIRE_SET	(1<<3) /* per-dentry expire timeout set for
+					  this mount point.
+					*/
 struct autofs_wait_queue {
 	wait_queue_head_t queue;
 	struct autofs_wait_queue *next;
diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
index 5bf781ea6d67..f011e026358e 100644
--- a/fs/autofs/dev-ioctl.c
+++ b/fs/autofs/dev-ioctl.c
@@ -128,7 +128,13 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
 			goto out;
 		}
 
+		/* Setting the per-dentry expire timeout requires a trailing
+		 * path component, ie. no '/', so invert the logic of the
+		 * check_name() return for AUTOFS_DEV_IOCTL_TIMEOUT_CMD.
+		 */
 		err = check_name(param->path);
+		if (cmd == AUTOFS_DEV_IOCTL_TIMEOUT_CMD)
+			err = err ? 0 : -EINVAL;
 		if (err) {
 			pr_warn("invalid path supplied for cmd(0x%08x)\n",
 				cmd);
@@ -396,16 +402,97 @@ static int autofs_dev_ioctl_catatonic(struct file *fp,
 	return 0;
 }
 
-/* Set the autofs mount timeout */
+/*
+ * Set the autofs mount expire timeout.
+ *
+ * There are two places an expire timeout can be set, in the autofs
+ * super block info. (this is all that's needed for direct and offset
+ * mounts because there's a distinct mount corresponding to each of
+ * these) and per-dentry within within the dentry info. If a per-dentry
+ * timeout is set it will override the expire timeout set in the parent
+ * autofs super block info.
+ *
+ * If setting the autofs super block expire timeout the autofs_dev_ioctl
+ * size field will be equal to the autofs_dev_ioctl structure size. If
+ * setting the per-dentry expire timeout the mount point name is passed
+ * in the autofs_dev_ioctl path field and the size field updated to
+ * reflect this.
+ *
+ * Setting the autofs mount expire timeout sets the timeout in the super
+ * block info. struct. Setting the per-dentry timeout does a little more.
+ * If the timeout is equal to -1 the per-dentry timeout (and flag) is
+ * cleared which reverts to using the super block timeout, otherwise if
+ * timeout is 0 the timeout is set to this value and the flag is left
+ * set which disables expiration for the mount point, lastly the flag
+ * and the timeout are set enabling the dentry to use this timeout.
+ */
 static int autofs_dev_ioctl_timeout(struct file *fp,
 				    struct autofs_sb_info *sbi,
 				    struct autofs_dev_ioctl *param)
 {
-	unsigned long timeout;
+	unsigned long timeout = param->timeout.timeout;
+
+	/* If setting the expire timeout for an individual indirect
+	 * mount point dentry the mount trailing component path is
+	 * placed in param->path and param->size adjusted to account
+	 * for it otherwise param->size it is set to the structure
+	 * size.
+	 */
+	if (param->size == AUTOFS_DEV_IOCTL_SIZE) {
+		param->timeout.timeout = sbi->exp_timeout / HZ;
+		sbi->exp_timeout = timeout * HZ;
+	} else {
+		struct dentry *base = fp->f_path.dentry;
+		struct inode *inode = base->d_inode;
+		int path_len = param->size - AUTOFS_DEV_IOCTL_SIZE - 1;
+		struct dentry *dentry;
+		struct autofs_info *ino;
+
+		if (!autofs_type_indirect(sbi->type))
+			return -EINVAL;
+
+		/* An expire timeout greater than the superblock timeout
+		 * could be a problem at shutdown but the super block
+		 * timeout itself can change so all we can really do is
+		 * warn the user.
+		 */
+		if (timeout >= sbi->exp_timeout)
+			pr_warn("per-mount expire timeout is greater than "
+				"the parent autofs mount timeout which could "
+				"prevent shutdown\n");
+
+		inode_lock_shared(inode);
+		dentry = try_lookup_one_len(param->path, base, path_len);
+		inode_unlock_shared(inode);
+		if (IS_ERR_OR_NULL(dentry))
+			return dentry ? PTR_ERR(dentry) : -ENOENT;
+		ino = autofs_dentry_ino(dentry);
+		if (!ino) {
+			dput(dentry);
+			return -ENOENT;
+		}
+
+		if (ino->exp_timeout && ino->flags & AUTOFS_INF_EXPIRE_SET)
+			param->timeout.timeout = ino->exp_timeout / HZ;
+		else
+			param->timeout.timeout = sbi->exp_timeout / HZ;
+
+		if (timeout == -1) {
+			/* Revert to using the super block timeout */
+			ino->flags &= ~AUTOFS_INF_EXPIRE_SET;
+			ino->exp_timeout = 0;
+		} else {
+			/* Set the dentry expire flag and timeout.
+			 *
+			 * If timeout is 0 it will prevent the expire
+			 * of this particular automount.
+			 */
+			ino->flags |= AUTOFS_INF_EXPIRE_SET;
+			ino->exp_timeout = timeout * HZ;
+		}
+		dput(dentry);
+	}
 
-	timeout = param->timeout.timeout;
-	param->timeout.timeout = sbi->exp_timeout / HZ;
-	sbi->exp_timeout = timeout * HZ;
 	return 0;
 }
 
diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c
index 39d8c84c16f4..5c2d459e1e48 100644
--- a/fs/autofs/expire.c
+++ b/fs/autofs/expire.c
@@ -429,8 +429,6 @@ static struct dentry *autofs_expire_indirect(struct super_block *sb,
 	if (!root)
 		return NULL;
 
-	timeout = sbi->exp_timeout;
-
 	dentry = NULL;
 	while ((dentry = get_next_positive_subdir(dentry, root))) {
 		spin_lock(&sbi->fs_lock);
@@ -441,6 +439,11 @@ static struct dentry *autofs_expire_indirect(struct super_block *sb,
 		}
 		spin_unlock(&sbi->fs_lock);
 
+		if (ino->flags & AUTOFS_INF_EXPIRE_SET)
+			timeout = ino->exp_timeout;
+		else
+			timeout = sbi->exp_timeout;
+
 		expired = should_expire(dentry, mnt, timeout, how);
 		if (!expired)
 			continue;
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index 64faa6c51f60..ee2edccaef70 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -19,6 +19,7 @@ struct autofs_info *autofs_new_ino(struct autofs_sb_info *sbi)
 		INIT_LIST_HEAD(&ino->expiring);
 		ino->last_used = jiffies;
 		ino->sbi = sbi;
+		ino->exp_timeout = -1;
 		ino->count = 1;
 	}
 	return ino;
@@ -28,6 +29,7 @@ void autofs_clean_ino(struct autofs_info *ino)
 {
 	ino->uid = GLOBAL_ROOT_UID;
 	ino->gid = GLOBAL_ROOT_GID;
+	ino->exp_timeout = -1;
 	ino->last_used = jiffies;
 }
 
diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h
index 1f7925afad2d..8081df849743 100644
--- a/include/uapi/linux/auto_fs.h
+++ b/include/uapi/linux/auto_fs.h
@@ -23,7 +23,7 @@
 #define AUTOFS_MIN_PROTO_VERSION	3
 #define AUTOFS_MAX_PROTO_VERSION	5
 
-#define AUTOFS_PROTO_SUBVERSION		5
+#define AUTOFS_PROTO_SUBVERSION		6
 
 /*
  * The wait_queue_token (autofs_wqt_t) is part of a structure which is passed

From b381fbbccb4cb60aefa1ab7f8e3766ddb4a461db Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Thu, 15 Aug 2024 10:33:10 +0200
Subject: [PATCH 32/58] vfs: elide smp_mb in iversion handling in the common
 case

According to bpftrace on these routines most calls result in cmpxchg,
which already provides the same guarantee.

In inode_maybe_inc_iversion elision is possible because even if the
wrong value was read due to now missing smp_mb fence, the issue is going
to correct itself after cmpxchg. If it appears cmpxchg wont be issued,
the fence + reload are there bringing back previous behavior.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://lore.kernel.org/r/20240815083310.3865-1-mjguzik@gmail.com
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/libfs.c | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/fs/libfs.c b/fs/libfs.c
index 02602d00939e..7874b23364e1 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -2003,13 +2003,19 @@ bool inode_maybe_inc_iversion(struct inode *inode, bool force)
 	 * information, but the legacy inode_inc_iversion code used a spinlock
 	 * to serialize increments.
 	 *
-	 * Here, we add full memory barriers to ensure that any de-facto
-	 * ordering with other info is preserved.
+	 * We add a full memory barrier to ensure that any de facto ordering
+	 * with other state is preserved (either implicitly coming from cmpxchg
+	 * or explicitly from smp_mb if we don't know upfront if we will execute
+	 * the former).
 	 *
-	 * This barrier pairs with the barrier in inode_query_iversion()
+	 * These barriers pair with inode_query_iversion().
 	 */
-	smp_mb();
 	cur = inode_peek_iversion_raw(inode);
+	if (!force && !(cur & I_VERSION_QUERIED)) {
+		smp_mb();
+		cur = inode_peek_iversion_raw(inode);
+	}
+
 	do {
 		/* If flag is clear then we needn't do anything */
 		if (!force && !(cur & I_VERSION_QUERIED))
@@ -2038,20 +2044,22 @@ EXPORT_SYMBOL(inode_maybe_inc_iversion);
 u64 inode_query_iversion(struct inode *inode)
 {
 	u64 cur, new;
+	bool fenced = false;
 
+	/*
+	 * Memory barriers (implicit in cmpxchg, explicit in smp_mb) pair with
+	 * inode_maybe_inc_iversion(), see that routine for more details.
+	 */
 	cur = inode_peek_iversion_raw(inode);
 	do {
 		/* If flag is already set, then no need to swap */
 		if (cur & I_VERSION_QUERIED) {
-			/*
-			 * This barrier (and the implicit barrier in the
-			 * cmpxchg below) pairs with the barrier in
-			 * inode_maybe_inc_iversion().
-			 */
-			smp_mb();
+			if (!fenced)
+				smp_mb();
 			break;
 		}
 
+		fenced = true;
 		new = cur | I_VERSION_QUERIED;
 	} while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new));
 	return cur >> I_VERSION_QUERIED_SHIFT;

From 1aeb6defd11f28cf5d474f0a00875323c0d143b3 Mon Sep 17 00:00:00 2001
From: Hongbo Li <lihongbo22@huawei.com>
Date: Fri, 16 Aug 2024 14:38:49 +0800
Subject: [PATCH 33/58] fs: Use in_group_or_capable() helper to simplify the
 code

Since in_group_or_capable has been exported, we can use
it to simplify the code when check group and capable.

Signed-off-by: Hongbo Li <lihongbo22@huawei.com>
Link: https://lore.kernel.org/r/20240816063849.1989856-1-lihongbo22@huawei.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/posix_acl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 3f87297dbfdb..6c66a37522d0 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -715,8 +715,8 @@ int posix_acl_update_mode(struct mnt_idmap *idmap,
 		return error;
 	if (error == 0)
 		*acl = NULL;
-	if (!vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode)) &&
-	    !capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID))
+	if (!in_group_or_capable(idmap, inode,
+				 i_gid_into_vfsgid(idmap, inode)))
 		mode &= ~S_ISGID;
 	*mode_p = mode;
 	return 0;

From 3717bbcb5905e55153667d6a277b2c3545e83b38 Mon Sep 17 00:00:00 2001
From: Hongbo Li <lihongbo22@huawei.com>
Date: Fri, 16 Aug 2024 14:36:11 +0800
Subject: [PATCH 34/58] doc: correcting the idmapping mount example

In step 2, we obtain the kernel id `k1000`. So in next step (step
3), we should translate the `k1000` not `k21000`.

Signed-off-by: Hongbo Li <lihongbo22@huawei.com>
Link: https://lore.kernel.org/r/20240816063611.1961910-1-lihongbo22@huawei.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 Documentation/filesystems/idmappings.rst | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Documentation/filesystems/idmappings.rst b/Documentation/filesystems/idmappings.rst
index ac0af679e61e..77930c77fcfe 100644
--- a/Documentation/filesystems/idmappings.rst
+++ b/Documentation/filesystems/idmappings.rst
@@ -821,7 +821,7 @@ the same idmapping to the mount. We now perform three steps:
       /* Map the userspace id down into a kernel id in the filesystem's idmapping. */
       make_kuid(u0:k20000:r10000, u1000) = k21000
 
-2. Verify that the caller's kernel ids can be mapped to userspace ids in the
+3. Verify that the caller's kernel ids can be mapped to userspace ids in the
    filesystem's idmapping::
 
     from_kuid(u0:k20000:r10000, k21000) = u1000
@@ -854,10 +854,10 @@ The same translation algorithm works with the third example.
        /* Map the userspace id down into a kernel id in the filesystem's idmapping. */
        make_kuid(u0:k0:r4294967295, u1000) = k1000
 
-2. Verify that the caller's kernel ids can be mapped to userspace ids in the
+3. Verify that the caller's kernel ids can be mapped to userspace ids in the
    filesystem's idmapping::
 
-    from_kuid(u0:k0:r4294967295, k21000) = u1000
+    from_kuid(u0:k0:r4294967295, k1000) = u1000
 
 So the ownership that lands on disk will be ``u1000``.
 
@@ -994,7 +994,7 @@ from above:::
       /* Map the userspace id down into a kernel id in the filesystem's idmapping. */
       make_kuid(u0:k0:r4294967295, u1000) = k1000
 
-2. Verify that the caller's filesystem ids can be mapped to userspace ids in the
+3. Verify that the caller's filesystem ids can be mapped to userspace ids in the
    filesystem's idmapping::
 
     from_kuid(u0:k0:r4294967295, k1000) = u1000

From 1c48d441468c425e878d81c5c3e7ff8a9a594cc0 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 16 Aug 2024 16:35:52 +0200
Subject: [PATCH 35/58] inode: remove __I_DIO_WAKEUP

Afaict, we can just rely on inode->i_dio_count for waiting instead of
this awkward indirection through __I_DIO_WAKEUP. This survives LTP dio
and xfstests dio tests.

Link: https://lore.kernel.org/r/20240816-vfs-misc-dio-v1-1-80fe21a2c710@kernel.org
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/inode.c         | 23 +++++++++++------------
 fs/netfs/locking.c | 22 +++++-----------------
 include/linux/fs.h |  8 +++-----
 3 files changed, 19 insertions(+), 34 deletions(-)

diff --git a/fs/inode.c b/fs/inode.c
index de0969d6009c..248a131a02c3 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2500,18 +2500,11 @@ EXPORT_SYMBOL(inode_owner_or_capable);
 /*
  * Direct i/o helper functions
  */
-static void __inode_dio_wait(struct inode *inode)
+bool inode_dio_finished(const struct inode *inode)
 {
-	wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
-	DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
-
-	do {
-		prepare_to_wait(wq, &q.wq_entry, TASK_UNINTERRUPTIBLE);
-		if (atomic_read(&inode->i_dio_count))
-			schedule();
-	} while (atomic_read(&inode->i_dio_count));
-	finish_wait(wq, &q.wq_entry);
+	return atomic_read(&inode->i_dio_count) == 0;
 }
+EXPORT_SYMBOL(inode_dio_finished);
 
 /**
  * inode_dio_wait - wait for outstanding DIO requests to finish
@@ -2525,11 +2518,17 @@ static void __inode_dio_wait(struct inode *inode)
  */
 void inode_dio_wait(struct inode *inode)
 {
-	if (atomic_read(&inode->i_dio_count))
-		__inode_dio_wait(inode);
+	wait_var_event(&inode->i_dio_count, inode_dio_finished(inode));
 }
 EXPORT_SYMBOL(inode_dio_wait);
 
+void inode_dio_wait_interruptible(struct inode *inode)
+{
+	wait_var_event_interruptible(&inode->i_dio_count,
+				     inode_dio_finished(inode));
+}
+EXPORT_SYMBOL(inode_dio_wait_interruptible);
+
 /*
  * inode_set_flags - atomically set some inode flags
  *
diff --git a/fs/netfs/locking.c b/fs/netfs/locking.c
index 75dc52a49b3a..21eab56ee2f9 100644
--- a/fs/netfs/locking.c
+++ b/fs/netfs/locking.c
@@ -19,25 +19,13 @@
  * Must be called under a lock that serializes taking new references
  * to i_dio_count, usually by inode->i_mutex.
  */
-static int inode_dio_wait_interruptible(struct inode *inode)
+static int netfs_inode_dio_wait_interruptible(struct inode *inode)
 {
-	if (!atomic_read(&inode->i_dio_count))
+	if (inode_dio_finished(inode))
 		return 0;
 
-	wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
-	DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
-
-	for (;;) {
-		prepare_to_wait(wq, &q.wq_entry, TASK_INTERRUPTIBLE);
-		if (!atomic_read(&inode->i_dio_count))
-			break;
-		if (signal_pending(current))
-			break;
-		schedule();
-	}
-	finish_wait(wq, &q.wq_entry);
-
-	return atomic_read(&inode->i_dio_count) ? -ERESTARTSYS : 0;
+	inode_dio_wait_interruptible(inode);
+	return !inode_dio_finished(inode) ? -ERESTARTSYS : 0;
 }
 
 /* Call with exclusively locked inode->i_rwsem */
@@ -46,7 +34,7 @@ static int netfs_block_o_direct(struct netfs_inode *ictx)
 	if (!test_bit(NETFS_ICTX_ODIRECT, &ictx->flags))
 		return 0;
 	clear_bit(NETFS_ICTX_ODIRECT, &ictx->flags);
-	return inode_dio_wait_interruptible(&ictx->inode);
+	return netfs_inode_dio_wait_interruptible(&ictx->inode);
 }
 
 /**
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0145bda465ff..8f8d274929d7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2373,8 +2373,6 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
  *
  * I_REFERENCED		Marks the inode as recently references on the LRU list.
  *
- * I_DIO_WAKEUP		Never set.  Only used as a key for wait_on_bit().
- *
  * I_WB_SWITCH		Cgroup bdi_writeback switching in progress.  Used to
  *			synchronize competing switching instances and to tell
  *			wb stat updates to grab the i_pages lock.  See
@@ -2409,8 +2407,6 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
 #define __I_SYNC		7
 #define I_SYNC			(1 << __I_SYNC)
 #define I_REFERENCED		(1 << 8)
-#define __I_DIO_WAKEUP		9
-#define I_DIO_WAKEUP		(1 << __I_DIO_WAKEUP)
 #define I_LINKABLE		(1 << 10)
 #define I_DIRTY_TIME		(1 << 11)
 #define I_WB_SWITCH		(1 << 13)
@@ -3227,7 +3223,9 @@ static inline ssize_t blockdev_direct_IO(struct kiocb *iocb,
 }
 #endif
 
+bool inode_dio_finished(const struct inode *inode);
 void inode_dio_wait(struct inode *inode);
+void inode_dio_wait_interruptible(struct inode *inode);
 
 /**
  * inode_dio_begin - signal start of a direct I/O requests
@@ -3251,7 +3249,7 @@ static inline void inode_dio_begin(struct inode *inode)
 static inline void inode_dio_end(struct inode *inode)
 {
 	if (atomic_dec_and_test(&inode->i_dio_count))
-		wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
+		wake_up_var(&inode->i_dio_count);
 }
 
 extern void inode_set_flags(struct inode *inode, unsigned int flags,

From 3a987b88a42593875f6345188ca33731c7df728c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc=20Aur=C3=A8le=20La=20France?= <tsi@tuyoix.net>
Date: Sat, 10 Aug 2024 13:25:27 -0600
Subject: [PATCH 36/58] debugfs show actual source in /proc/mounts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After its conversion to the new mount API, debugfs displays "none" in
/proc/mounts instead of the actual source.  Fix this by recognising its
"source" mount option.

Signed-off-by: Marc Aurèle La France <tsi@tuyoix.net>
Link: https://lore.kernel.org/r/e439fae2-01da-234b-75b9-2a7951671e27@tuyoix.net
Fixes: a20971c18752 ("vfs: Convert debugfs to use the new mount API")
Cc: stable@vger.kernel.org # 6.10.x: 49abee5991e1: debugfs: Convert to new uid/gid option parsing helpers
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/debugfs/inode.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 91521576f500..66d9b3b4c588 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -89,12 +89,14 @@ enum {
 	Opt_uid,
 	Opt_gid,
 	Opt_mode,
+	Opt_source,
 };
 
 static const struct fs_parameter_spec debugfs_param_specs[] = {
 	fsparam_gid	("gid",		Opt_gid),
 	fsparam_u32oct	("mode",	Opt_mode),
 	fsparam_uid	("uid",		Opt_uid),
+	fsparam_string	("source",	Opt_source),
 	{}
 };
 
@@ -126,6 +128,12 @@ static int debugfs_parse_param(struct fs_context *fc, struct fs_parameter *param
 	case Opt_mode:
 		opts->mode = result.uint_32 & S_IALLUGO;
 		break;
+	case Opt_source:
+		if (fc->source)
+			return invalfc(fc, "Multiple sources specified");
+		fc->source = param->string;
+		param->string = NULL;
+		break;
 	/*
 	 * We might like to report bad mount options here;
 	 * but traditionally debugfs has ignored all mount options

From 57510c58b5832c8cb36516cdba48543133f3ab85 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Tue, 13 Aug 2024 16:36:26 +0200
Subject: [PATCH 37/58] vfs: drop one lock trip in evict()

Most commonly neither I_LRU_ISOLATING nor I_SYNC are set, but the stock
kernel takes a back-to-back relock trip to check for them.

It probably can be avoided altogether, but for now massage things back
to just one lock acquire.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://lore.kernel.org/r/20240813143626.1573445-1-mjguzik@gmail.com
Reviewed-by: Zhihao Cheng <chengzhihao1@huawei.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/fs-writeback.c | 17 +++--------------
 fs/inode.c        |  5 +++--
 2 files changed, 6 insertions(+), 16 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 4451ecff37c4..1a5006329f6f 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1510,13 +1510,12 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc)
  * Wait for writeback on an inode to complete. Called with i_lock held.
  * Caller must make sure inode cannot go away when we drop i_lock.
  */
-static void __inode_wait_for_writeback(struct inode *inode)
-	__releases(inode->i_lock)
-	__acquires(inode->i_lock)
+void inode_wait_for_writeback(struct inode *inode)
 {
 	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
 	wait_queue_head_t *wqh;
 
+	lockdep_assert_held(&inode->i_lock);
 	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
 	while (inode->i_state & I_SYNC) {
 		spin_unlock(&inode->i_lock);
@@ -1526,16 +1525,6 @@ static void __inode_wait_for_writeback(struct inode *inode)
 	}
 }
 
-/*
- * Wait for writeback on an inode to complete. Caller must have inode pinned.
- */
-void inode_wait_for_writeback(struct inode *inode)
-{
-	spin_lock(&inode->i_lock);
-	__inode_wait_for_writeback(inode);
-	spin_unlock(&inode->i_lock);
-}
-
 /*
  * Sleep until I_SYNC is cleared. This function must be called with i_lock
  * held and drops it. It is aimed for callers not holding any inode reference
@@ -1757,7 +1746,7 @@ static int writeback_single_inode(struct inode *inode,
 		 */
 		if (wbc->sync_mode != WB_SYNC_ALL)
 			goto out;
-		__inode_wait_for_writeback(inode);
+		inode_wait_for_writeback(inode);
 	}
 	WARN_ON(inode->i_state & I_SYNC);
 	/*
diff --git a/fs/inode.c b/fs/inode.c
index 248a131a02c3..154f8689457f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -507,7 +507,7 @@ static void inode_unpin_lru_isolating(struct inode *inode)
 
 static void inode_wait_for_lru_isolating(struct inode *inode)
 {
-	spin_lock(&inode->i_lock);
+	lockdep_assert_held(&inode->i_lock);
 	if (inode->i_state & I_LRU_ISOLATING) {
 		DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LRU_ISOLATING);
 		wait_queue_head_t *wqh;
@@ -518,7 +518,6 @@ static void inode_wait_for_lru_isolating(struct inode *inode)
 		spin_lock(&inode->i_lock);
 		WARN_ON(inode->i_state & I_LRU_ISOLATING);
 	}
-	spin_unlock(&inode->i_lock);
 }
 
 /**
@@ -690,6 +689,7 @@ static void evict(struct inode *inode)
 
 	inode_sb_list_del(inode);
 
+	spin_lock(&inode->i_lock);
 	inode_wait_for_lru_isolating(inode);
 
 	/*
@@ -699,6 +699,7 @@ static void evict(struct inode *inode)
 	 * the inode.  We just have to wait for running writeback to finish.
 	 */
 	inode_wait_for_writeback(inode);
+	spin_unlock(&inode->i_lock);
 
 	if (op->evict_inode) {
 		op->evict_inode(inode);

From 029c3f27fe84c5fd29d49a99cc5176433bf12209 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 22 Aug 2024 11:28:38 +0200
Subject: [PATCH 38/58] fs: remove unused path_put_init()

This helper has been unused for a while now.

Link: https://lore.kernel.org/r/20240822-bewuchs-werktag-46672b3c0606@brauner
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/path.h | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/include/linux/path.h b/include/linux/path.h
index ca073e70decd..7ea389dc764b 100644
--- a/include/linux/path.h
+++ b/include/linux/path.h
@@ -18,12 +18,6 @@ static inline int path_equal(const struct path *path1, const struct path *path2)
 	return path1->mnt == path2->mnt && path1->dentry == path2->dentry;
 }
 
-static inline void path_put_init(struct path *path)
-{
-	path_put(path);
-	*path = (struct path) { };
-}
-
 /*
  * Cleanup macro for use with __free(path_put). Avoids dereference and
  * copying @path unlike DEFINE_FREE(). path_put() will handle the empty

From 71ff58ce3428b2471efae5b00594e892b285c97c Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 22 Aug 2024 11:30:58 +0200
Subject: [PATCH 39/58] fs: s/__u32/u32/ for s_fsnotify_mask

The underscore variants are for uapi whereas the non-underscore variants
are for in-kernel consumers.

Link: https://lore.kernel.org/r/20240822-anwerben-nutzung-1cd6c82a565f@brauner
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8f8d274929d7..cda1f2545ea0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1266,7 +1266,7 @@ struct super_block {
 	time64_t		   s_time_min;
 	time64_t		   s_time_max;
 #ifdef CONFIG_FSNOTIFY
-	__u32			s_fsnotify_mask;
+	u32			s_fsnotify_mask;
 	struct fsnotify_sb_info	*s_fsnotify_info;
 #endif
 

From 0ac3396ea4a2ab1c80ea560b4dd22f2a3c13a7e9 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 20 Aug 2024 12:51:09 -0700
Subject: [PATCH 40/58] MAINTAINERS: add the VFS git tree

The VFS git tree is missing from MAINTAINERS.  Add it.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Link: https://lore.kernel.org/r/20240820195109.38906-1-ebiggers@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index f328373463b0..c63e588e87dd 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8597,6 +8597,7 @@ M:	Christian Brauner <brauner@kernel.org>
 R:	Jan Kara <jack@suse.cz>
 L:	linux-fsdevel@vger.kernel.org
 S:	Maintained
+T:	git https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git
 F:	fs/*
 F:	include/linux/fs.h
 F:	include/linux/fs_types.h

From da18ecbf0fb6fb73e6f9273e7aa15610f148ce17 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 23 Aug 2024 14:47:35 +0200
Subject: [PATCH 41/58] fs: add i_state helpers

The i_state member is an unsigned long so that it can be used with the
wait bit infrastructure which expects unsigned long. This wastes 4 bytes
which we're unlikely to ever use. Switch to using the var event wait
mechanism using the address of the bit. Thanks to Linus for the address
idea.

Link: https://lore.kernel.org/r/20240823-work-i_state-v3-1-5cd5fd207a57@kernel.org
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/inode.c         | 11 +++++++++++
 include/linux/fs.h | 15 +++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/fs/inode.c b/fs/inode.c
index ba1645a09603..1b0f52ebae27 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -472,6 +472,17 @@ static void __inode_add_lru(struct inode *inode, bool rotate)
 		inode->i_state |= I_REFERENCED;
 }
 
+struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe,
+					    struct inode *inode, u32 bit)
+{
+        void *bit_address;
+
+        bit_address = inode_state_wait_address(inode, bit);
+        init_wait_var_entry(wqe, bit_address, 0);
+        return __var_waitqueue(bit_address);
+}
+EXPORT_SYMBOL(inode_bit_waitqueue);
+
 /*
  * Add inode to LRU if needed (inode is unused and clean).
  *
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cda1f2545ea0..d35de348d2ec 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -744,6 +744,21 @@ struct inode {
 	void			*i_private; /* fs or device private pointer */
 } __randomize_layout;
 
+/*
+ * Get bit address from inode->i_state to use with wait_var_event()
+ * infrastructre.
+ */
+#define inode_state_wait_address(inode, bit) ((char *)&(inode)->i_state + (bit))
+
+struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe,
+					    struct inode *inode, u32 bit);
+
+static inline void inode_wake_up_bit(struct inode *inode, u32 bit)
+{
+	/* Caller is responsible for correct memory barriers. */
+	wake_up_var(inode_state_wait_address(inode, bit));
+}
+
 struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode);
 
 static inline unsigned int i_blocksize(const struct inode *node)

From 2ed634c96ed1d6e4ee0497be176f9980866e81cb Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 23 Aug 2024 14:47:36 +0200
Subject: [PATCH 42/58] fs: reorder i_state bits

so that we can use the first bits to derive unique addresses from
i_state.

Link: https://lore.kernel.org/r/20240823-work-i_state-v3-2-5cd5fd207a57@kernel.org
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 38 +++++++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index d35de348d2ec..a868c9823c10 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2410,28 +2410,32 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
  *			i_count.
  *
  * Q: What is the difference between I_WILL_FREE and I_FREEING?
+ *
+ * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait
+ * upon. There's one free address left.
  */
-#define I_DIRTY_SYNC		(1 << 0)
-#define I_DIRTY_DATASYNC	(1 << 1)
-#define I_DIRTY_PAGES		(1 << 2)
-#define __I_NEW			3
+#define __I_NEW			0
 #define I_NEW			(1 << __I_NEW)
-#define I_WILL_FREE		(1 << 4)
-#define I_FREEING		(1 << 5)
-#define I_CLEAR			(1 << 6)
-#define __I_SYNC		7
+#define __I_SYNC		1
 #define I_SYNC			(1 << __I_SYNC)
-#define I_REFERENCED		(1 << 8)
+#define __I_LRU_ISOLATING	2
+#define I_LRU_ISOLATING		(1 << __I_LRU_ISOLATING)
+
+#define I_DIRTY_SYNC		(1 << 3)
+#define I_DIRTY_DATASYNC	(1 << 4)
+#define I_DIRTY_PAGES		(1 << 5)
+#define I_WILL_FREE		(1 << 6)
+#define I_FREEING		(1 << 7)
+#define I_CLEAR			(1 << 8)
+#define I_REFERENCED		(1 << 9)
 #define I_LINKABLE		(1 << 10)
 #define I_DIRTY_TIME		(1 << 11)
-#define I_WB_SWITCH		(1 << 13)
-#define I_OVL_INUSE		(1 << 14)
-#define I_CREATING		(1 << 15)
-#define I_DONTCACHE		(1 << 16)
-#define I_SYNC_QUEUED		(1 << 17)
-#define I_PINNING_NETFS_WB	(1 << 18)
-#define __I_LRU_ISOLATING	19
-#define I_LRU_ISOLATING		(1 << __I_LRU_ISOLATING)
+#define I_WB_SWITCH		(1 << 12)
+#define I_OVL_INUSE		(1 << 13)
+#define I_CREATING		(1 << 14)
+#define I_DONTCACHE		(1 << 15)
+#define I_SYNC_QUEUED		(1 << 16)
+#define I_PINNING_NETFS_WB	(1 << 17)
 
 #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
 #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)

From 532980cb1bff9b942c23fe94324dee560f5f57a5 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 23 Aug 2024 14:47:37 +0200
Subject: [PATCH 43/58] inode: port __I_SYNC to var event

Port the __I_SYNC mechanism to use the new var event mechanism.

Link: https://lore.kernel.org/r/20240823-work-i_state-v3-3-5cd5fd207a57@kernel.org
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/fs-writeback.c | 45 +++++++++++++++++++++++++++++----------------
 1 file changed, 29 insertions(+), 16 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1a5006329f6f..d8bec3c1bb1f 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1386,12 +1386,13 @@ static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
 
 static void inode_sync_complete(struct inode *inode)
 {
+	assert_spin_locked(&inode->i_lock);
+
 	inode->i_state &= ~I_SYNC;
 	/* If inode is clean an unused, put it into LRU now... */
 	inode_add_lru(inode);
-	/* Waiters must see I_SYNC cleared before being woken up */
-	smp_mb();
-	wake_up_bit(&inode->i_state, __I_SYNC);
+	/* Called with inode->i_lock which ensures memory ordering. */
+	inode_wake_up_bit(inode, __I_SYNC);
 }
 
 static bool inode_dirtied_after(struct inode *inode, unsigned long t)
@@ -1512,17 +1513,25 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc)
  */
 void inode_wait_for_writeback(struct inode *inode)
 {
-	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
-	wait_queue_head_t *wqh;
+	struct wait_bit_queue_entry wqe;
+	struct wait_queue_head *wq_head;
 
-	lockdep_assert_held(&inode->i_lock);
-	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
-	while (inode->i_state & I_SYNC) {
+	assert_spin_locked(&inode->i_lock);
+
+	if (!(inode->i_state & I_SYNC))
+		return;
+
+	wq_head = inode_bit_waitqueue(&wqe, inode, __I_SYNC);
+	for (;;) {
+		prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE);
+		/* Checking I_SYNC with inode->i_lock guarantees memory ordering. */
+		if (!(inode->i_state & I_SYNC))
+			break;
 		spin_unlock(&inode->i_lock);
-		__wait_on_bit(wqh, &wq, bit_wait,
-			      TASK_UNINTERRUPTIBLE);
+		schedule();
 		spin_lock(&inode->i_lock);
 	}
+	finish_wait(wq_head, &wqe.wq_entry);
 }
 
 /*
@@ -1533,16 +1542,20 @@ void inode_wait_for_writeback(struct inode *inode)
 static void inode_sleep_on_writeback(struct inode *inode)
 	__releases(inode->i_lock)
 {
-	DEFINE_WAIT(wait);
-	wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
-	int sleep;
+	struct wait_bit_queue_entry wqe;
+	struct wait_queue_head *wq_head;
+	bool sleep;
 
-	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
-	sleep = inode->i_state & I_SYNC;
+	assert_spin_locked(&inode->i_lock);
+
+	wq_head = inode_bit_waitqueue(&wqe, inode, __I_SYNC);
+	prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE);
+	/* Checking I_SYNC with inode->i_lock guarantees memory ordering. */
+	sleep = !!(inode->i_state & I_SYNC);
 	spin_unlock(&inode->i_lock);
 	if (sleep)
 		schedule();
-	finish_wait(wqh, &wait);
+	finish_wait(wq_head, &wqe.wq_entry);
 }
 
 /*

From 0fe340a98b584a31b07c664dc5ff0c923730581e Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 23 Aug 2024 14:47:38 +0200
Subject: [PATCH 44/58] inode: port __I_NEW to var event

Port the __I_NEW mechanism to use the new var event mechanism.

Link: https://lore.kernel.org/r/20240823-work-i_state-v3-4-5cd5fd207a57@kernel.org
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/bcachefs/fs.c          | 10 ++++++----
 fs/dcache.c               |  7 ++++++-
 fs/inode.c                | 32 ++++++++++++++++++++++++--------
 include/linux/writeback.h |  3 ++-
 4 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 94c392abef65..c0900c0c0f8a 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -1644,14 +1644,16 @@ again:
 				break;
 			}
 		} else if (clean_pass && this_pass_clean) {
-			wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW);
-			DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);
+			struct wait_bit_queue_entry wqe;
+			struct wait_queue_head *wq_head;
 
-			prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+			wq_head = inode_bit_waitqueue(&wqe, &inode->v, __I_NEW);
+			prepare_to_wait_event(wq_head, &wqe.wq_entry,
+					      TASK_UNINTERRUPTIBLE);
 			mutex_unlock(&c->vfs_inodes_lock);
 
 			schedule();
-			finish_wait(wq, &wait.wq_entry);
+			finish_wait(wq_head, &wqe.wq_entry);
 			goto again;
 		}
 	}
diff --git a/fs/dcache.c b/fs/dcache.c
index 1af75fa68638..894e38cdf4d0 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1908,8 +1908,13 @@ void d_instantiate_new(struct dentry *entry, struct inode *inode)
 	__d_instantiate(entry, inode);
 	WARN_ON(!(inode->i_state & I_NEW));
 	inode->i_state &= ~I_NEW & ~I_CREATING;
+	/*
+	 * Pairs with the barrier in prepare_to_wait_event() to make sure
+	 * ___wait_var_event() either sees the bit cleared or
+	 * waitqueue_active() check in wake_up_var() sees the waiter.
+	 */
 	smp_mb();
-	wake_up_bit(&inode->i_state, __I_NEW);
+	inode_wake_up_bit(inode, __I_NEW);
 	spin_unlock(&inode->i_lock);
 }
 EXPORT_SYMBOL(d_instantiate_new);
diff --git a/fs/inode.c b/fs/inode.c
index 1b0f52ebae27..1aa785421f13 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -734,7 +734,13 @@ static void evict(struct inode *inode)
 	 * used as an indicator whether blocking on it is safe.
 	 */
 	spin_lock(&inode->i_lock);
-	wake_up_bit(&inode->i_state, __I_NEW);
+	/*
+	 * Pairs with the barrier in prepare_to_wait_event() to make sure
+	 * ___wait_var_event() either sees the bit cleared or
+	 * waitqueue_active() check in wake_up_var() sees the waiter.
+	 */
+	smp_mb();
+	inode_wake_up_bit(inode, __I_NEW);
 	BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
 	spin_unlock(&inode->i_lock);
 
@@ -1146,8 +1152,13 @@ void unlock_new_inode(struct inode *inode)
 	spin_lock(&inode->i_lock);
 	WARN_ON(!(inode->i_state & I_NEW));
 	inode->i_state &= ~I_NEW & ~I_CREATING;
+	/*
+	 * Pairs with the barrier in prepare_to_wait_event() to make sure
+	 * ___wait_var_event() either sees the bit cleared or
+	 * waitqueue_active() check in wake_up_var() sees the waiter.
+	 */
 	smp_mb();
-	wake_up_bit(&inode->i_state, __I_NEW);
+	inode_wake_up_bit(inode, __I_NEW);
 	spin_unlock(&inode->i_lock);
 }
 EXPORT_SYMBOL(unlock_new_inode);
@@ -1158,8 +1169,13 @@ void discard_new_inode(struct inode *inode)
 	spin_lock(&inode->i_lock);
 	WARN_ON(!(inode->i_state & I_NEW));
 	inode->i_state &= ~I_NEW;
+	/*
+	 * Pairs with the barrier in prepare_to_wait_event() to make sure
+	 * ___wait_var_event() either sees the bit cleared or
+	 * waitqueue_active() check in wake_up_var() sees the waiter.
+	 */
 	smp_mb();
-	wake_up_bit(&inode->i_state, __I_NEW);
+	inode_wake_up_bit(inode, __I_NEW);
 	spin_unlock(&inode->i_lock);
 	iput(inode);
 }
@@ -2348,8 +2364,8 @@ EXPORT_SYMBOL(inode_needs_sync);
  */
 static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_locked)
 {
-	wait_queue_head_t *wq;
-	DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
+	struct wait_bit_queue_entry wqe;
+	struct wait_queue_head *wq_head;
 
 	/*
 	 * Handle racing against evict(), see that routine for more details.
@@ -2360,14 +2376,14 @@ static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_lock
 		return;
 	}
 
-	wq = bit_waitqueue(&inode->i_state, __I_NEW);
-	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+	wq_head = inode_bit_waitqueue(&wqe, inode, __I_NEW);
+	prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE);
 	spin_unlock(&inode->i_lock);
 	rcu_read_unlock();
 	if (is_inode_hash_locked)
 		spin_unlock(&inode_hash_lock);
 	schedule();
-	finish_wait(wq, &wait.wq_entry);
+	finish_wait(wq_head, &wqe.wq_entry);
 	if (is_inode_hash_locked)
 		spin_lock(&inode_hash_lock);
 	rcu_read_lock();
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 56b85841ae4c..8f651bb0a1a5 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -200,7 +200,8 @@ void inode_io_list_del(struct inode *inode);
 /* writeback.h requires fs.h; it, too, is not included from here. */
 static inline void wait_on_inode(struct inode *inode)
 {
-	wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE);
+	wait_var_event(inode_state_wait_address(inode, __I_NEW),
+		       !(READ_ONCE(inode->i_state) & I_NEW));
 }
 
 #ifdef CONFIG_CGROUP_WRITEBACK

From 88b1afbf0f6b221f6c5bb66cc80cd3b38d696687 Mon Sep 17 00:00:00 2001
From: Julian Sun <sunjunchao2870@gmail.com>
Date: Fri, 23 Aug 2024 21:07:30 +0800
Subject: [PATCH 45/58] vfs: fix race between evice_inodes() and
 find_inode()&iput()

Hi, all

Recently I noticed a bug[1] in btrfs, after digged it into
and I believe it'a race in vfs.

Let's assume there's a inode (ie ino 261) with i_count 1 is
called by iput(), and there's a concurrent thread calling
generic_shutdown_super().

cpu0:                              cpu1:
iput() // i_count is 1
  ->spin_lock(inode)
  ->dec i_count to 0
  ->iput_final()                    generic_shutdown_super()
    ->__inode_add_lru()               ->evict_inodes()
      // cause some reason[2]           ->if (atomic_read(inode->i_count)) continue;
      // return before                  // inode 261 passed the above check
      // list_lru_add_obj()             // and then schedule out
   ->spin_unlock()
// note here: the inode 261
// was still at sb list and hash list,
// and I_FREEING|I_WILL_FREE was not been set

btrfs_iget()
  // after some function calls
  ->find_inode()
    // found the above inode 261
    ->spin_lock(inode)
   // check I_FREEING|I_WILL_FREE
   // and passed
      ->__iget()
    ->spin_unlock(inode)                // schedule back
                                        ->spin_lock(inode)
                                        // check (I_NEW|I_FREEING|I_WILL_FREE) flags,
                                        // passed and set I_FREEING
iput()                                  ->spin_unlock(inode)
  ->spin_lock(inode)			  ->evict()
  // dec i_count to 0
  ->iput_final()
    ->spin_unlock()
    ->evict()

Now, we have two threads simultaneously evicting
the same inode, which may trigger the BUG(inode->i_state & I_CLEAR)
statement both within clear_inode() and iput().

To fix the bug, recheck the inode->i_count after holding i_lock.
Because in the most scenarios, the first check is valid, and
the overhead of spin_lock() can be reduced.

If there is any misunderstanding, please let me know, thanks.

[1]: https://lore.kernel.org/linux-btrfs/000000000000eabe1d0619c48986@google.com/
[2]: The reason might be 1. SB_ACTIVE was removed or 2. mapping_shrinkable()
return false when I reproduced the bug.

Reported-by: syzbot+67ba3c42bcbb4665d3ad@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=67ba3c42bcbb4665d3ad
CC: stable@vger.kernel.org
Fixes: 63997e98a3be ("split invalidate_inodes()")
Signed-off-by: Julian Sun <sunjunchao2870@gmail.com>
Link: https://lore.kernel.org/r/20240823130730.658881-1-sunjunchao2870@gmail.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/inode.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/inode.c b/fs/inode.c
index 154f8689457f..ba1645a09603 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -771,6 +771,10 @@ again:
 			continue;
 
 		spin_lock(&inode->i_lock);
+		if (atomic_read(&inode->i_count)) {
+			spin_unlock(&inode->i_lock);
+			continue;
+		}
 		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
 			spin_unlock(&inode->i_lock);
 			continue;

From f469e6e6f51ba6d973d30bea1d66e17da73fc1bb Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 23 Aug 2024 14:47:39 +0200
Subject: [PATCH 46/58] inode: port __I_LRU_ISOLATING to var event

Port the __I_LRU_ISOLATING mechanism to use the new var event mechanism.

Link: https://lore.kernel.org/r/20240823-work-i_state-v3-5-5cd5fd207a57@kernel.org
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/inode.c | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/fs/inode.c b/fs/inode.c
index 1aa785421f13..aacd05749c1f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -511,24 +511,35 @@ static void inode_unpin_lru_isolating(struct inode *inode)
 	spin_lock(&inode->i_lock);
 	WARN_ON(!(inode->i_state & I_LRU_ISOLATING));
 	inode->i_state &= ~I_LRU_ISOLATING;
-	smp_mb();
-	wake_up_bit(&inode->i_state, __I_LRU_ISOLATING);
+	/* Called with inode->i_lock which ensures memory ordering. */
+	inode_wake_up_bit(inode, __I_LRU_ISOLATING);
 	spin_unlock(&inode->i_lock);
 }
 
 static void inode_wait_for_lru_isolating(struct inode *inode)
 {
-	lockdep_assert_held(&inode->i_lock);
-	if (inode->i_state & I_LRU_ISOLATING) {
-		DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LRU_ISOLATING);
-		wait_queue_head_t *wqh;
+	struct wait_bit_queue_entry wqe;
+	struct wait_queue_head *wq_head;
 
-		wqh = bit_waitqueue(&inode->i_state, __I_LRU_ISOLATING);
+	lockdep_assert_held(&inode->i_lock);
+	if (!(inode->i_state & I_LRU_ISOLATING))
+		return;
+
+	wq_head = inode_bit_waitqueue(&wqe, inode, __I_LRU_ISOLATING);
+	for (;;) {
+		prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE);
+		/*
+		 * Checking I_LRU_ISOLATING with inode->i_lock guarantees
+		 * memory ordering.
+		 */
+		if (!(inode->i_state & I_LRU_ISOLATING))
+			break;
 		spin_unlock(&inode->i_lock);
-		__wait_on_bit(wqh, &wq, bit_wait, TASK_UNINTERRUPTIBLE);
+		schedule();
 		spin_lock(&inode->i_lock);
-		WARN_ON(inode->i_state & I_LRU_ISOLATING);
 	}
+	finish_wait(wq_head, &wqe.wq_entry);
+	WARN_ON(inode->i_state & I_LRU_ISOLATING);
 }
 
 /**

From 2b111edbe0a9c441605be5cfb73001dc98ec686f Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 23 Aug 2024 14:47:40 +0200
Subject: [PATCH 47/58] inode: make i_state a u32

Now that we use the wait var event mechanism make i_state a u32 and free
up 4 bytes. This means we currently have two 4 byte holes in struct
inode which we can pack.

Link: https://lore.kernel.org/r/20240823-work-i_state-v3-6-5cd5fd207a57@kernel.org
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index a868c9823c10..a1ef8c65d828 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -681,7 +681,8 @@ struct inode {
 #endif
 
 	/* Misc */
-	unsigned long		i_state;
+	u32			i_state;
+	/* 32-bit hole */
 	struct rw_semaphore	i_rwsem;
 
 	unsigned long		dirtied_when;	/* jiffies of first dirtying */

From 73ce1c9fce70ec6f1fcfaf3da51d0b58368b44f3 Mon Sep 17 00:00:00 2001
From: Hongbo Li <lihongbo22@huawei.com>
Date: Wed, 21 Aug 2024 14:54:56 +0800
Subject: [PATCH 48/58] fs: use LIST_HEAD() to simplify code

list_head can be initialized automatically with LIST_HEAD()
instead of calling INIT_LIST_HEAD().

Signed-off-by: Hongbo Li <lihongbo22@huawei.com>
Link: https://lore.kernel.org/r/20240821065456.2294216-1-lihongbo22@huawei.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/buffer.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index e55ad471c530..31a9062cad7e 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -774,12 +774,11 @@ EXPORT_SYMBOL(block_dirty_folio);
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 {
 	struct buffer_head *bh;
-	struct list_head tmp;
 	struct address_space *mapping;
 	int err = 0, err2;
 	struct blk_plug plug;
+	LIST_HEAD(tmp);
 
-	INIT_LIST_HEAD(&tmp);
 	blk_start_plug(&plug);
 
 	spin_lock(lock);

From 3c58a9575e02c2b90a3180007d57105ceaa7c246 Mon Sep 17 00:00:00 2001
From: Baokun Li <libaokun1@huawei.com>
Date: Mon, 26 Aug 2024 19:34:04 +0800
Subject: [PATCH 49/58] netfs: Delete subtree of 'fs/netfs' when netfs module
 exits

In netfs_init() or fscache_proc_init(), we create dentry under 'fs/netfs',
but in netfs_exit(), we only delete the proc entry of 'fs/netfs' without
deleting its subtree. This triggers the following WARNING:

==================================================================
remove_proc_entry: removing non-empty directory 'fs/netfs', leaking at least 'requests'
WARNING: CPU: 4 PID: 566 at fs/proc/generic.c:717 remove_proc_entry+0x160/0x1c0
Modules linked in: netfs(-)
CPU: 4 UID: 0 PID: 566 Comm: rmmod Not tainted 6.11.0-rc3 #860
RIP: 0010:remove_proc_entry+0x160/0x1c0
Call Trace:
 <TASK>
 netfs_exit+0x12/0x620 [netfs]
 __do_sys_delete_module.isra.0+0x14c/0x2e0
 do_syscall_64+0x4b/0x110
 entry_SYSCALL_64_after_hwframe+0x76/0x7e
==================================================================

Therefore use remove_proc_subtree() instead of remove_proc_entry() to
fix the above problem.

Fixes: 7eb5b3e3a0a5 ("netfs, fscache: Move /proc/fs/fscache to /proc/fs/netfs and put in a symlink")
Cc: stable@kernel.org
Signed-off-by: Baokun Li <libaokun1@huawei.com>
Link: https://lore.kernel.org/r/20240826113404.3214786-1-libaokun@huaweicloud.com
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/netfs/main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/netfs/main.c b/fs/netfs/main.c
index 5f0f438e5d21..9d6b49dc6694 100644
--- a/fs/netfs/main.c
+++ b/fs/netfs/main.c
@@ -142,7 +142,7 @@ static int __init netfs_init(void)
 
 error_fscache:
 error_procfile:
-	remove_proc_entry("fs/netfs", NULL);
+	remove_proc_subtree("fs/netfs", NULL);
 error_proc:
 	mempool_exit(&netfs_subrequest_pool);
 error_subreqpool:
@@ -159,7 +159,7 @@ fs_initcall(netfs_init);
 static void __exit netfs_exit(void)
 {
 	fscache_exit();
-	remove_proc_entry("fs/netfs", NULL);
+	remove_proc_subtree("fs/netfs", NULL);
 	mempool_exit(&netfs_subrequest_pool);
 	kmem_cache_destroy(netfs_subrequest_slab);
 	mempool_exit(&netfs_request_pool);

From 75e4c6bcb88a60402c71ae87328813babb0e679a Mon Sep 17 00:00:00 2001
From: Yu Jiaoliang <yujiaoliang@vivo.com>
Date: Fri, 23 Aug 2024 09:55:41 +0800
Subject: [PATCH 50/58] mnt_idmapping: Use kmemdup_array instead of kmemdup for
 multiple allocation

Let the kememdup_array() take care about multiplication and possible
overflows.

v2:Add a new modification for reverse array.

Signed-off-by: Yu Jiaoliang <yujiaoliang@vivo.com>
Link: https://lore.kernel.org/r/20240823015542.3006262-1-yujiaoliang@vivo.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/mnt_idmapping.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c
index 3c60f1eaca61..79491663dbc0 100644
--- a/fs/mnt_idmapping.c
+++ b/fs/mnt_idmapping.c
@@ -228,15 +228,15 @@ static int copy_mnt_idmap(struct uid_gid_map *map_from,
 		return 0;
 	}
 
-	forward = kmemdup(map_from->forward,
-			  nr_extents * sizeof(struct uid_gid_extent),
-			  GFP_KERNEL_ACCOUNT);
+	forward = kmemdup_array(map_from->forward, nr_extents,
+				sizeof(struct uid_gid_extent),
+				GFP_KERNEL_ACCOUNT);
 	if (!forward)
 		return -ENOMEM;
 
-	reverse = kmemdup(map_from->reverse,
-			  nr_extents * sizeof(struct uid_gid_extent),
-			  GFP_KERNEL_ACCOUNT);
+	reverse = kmemdup_array(map_from->reverse, nr_extents,
+				sizeof(struct uid_gid_extent),
+				GFP_KERNEL_ACCOUNT);
 	if (!reverse) {
 		kfree(forward);
 		return -ENOMEM;

From 7f7b850689ac06a62befe26e1fd1806799e7f152 Mon Sep 17 00:00:00 2001
From: Li Zhijian <lizhijian@fujitsu.com>
Date: Mon, 26 Aug 2024 13:55:03 +0800
Subject: [PATCH 51/58] fs/inode: Prevent dump_mapping() accessing invalid
 dentry.d_name.name

It's observed that a crash occurs during hot-remove a memory device,
in which user is accessing the hugetlb. See calltrace as following:

------------[ cut here ]------------
WARNING: CPU: 1 PID: 14045 at arch/x86/mm/fault.c:1278 do_user_addr_fault+0x2a0/0x790
Modules linked in: kmem device_dax cxl_mem cxl_pmem cxl_port cxl_pci dax_hmem dax_pmem nd_pmem cxl_acpi nd_btt cxl_core crc32c_intel nvme virtiofs fuse nvme_core nfit libnvdimm dm_multipath scsi_dh_rdac scsi_dh_emc s
mirror dm_region_hash dm_log dm_mod
CPU: 1 PID: 14045 Comm: daxctl Not tainted 6.10.0-rc2-lizhijian+ #492
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
RIP: 0010:do_user_addr_fault+0x2a0/0x790
Code: 48 8b 00 a8 04 0f 84 b5 fe ff ff e9 1c ff ff ff 4c 89 e9 4c 89 e2 be 01 00 00 00 bf 02 00 00 00 e8 b5 ef 24 00 e9 42 fe ff ff <0f> 0b 48 83 c4 08 4c 89 ea 48 89 ee 4c 89 e7 5b 5d 41 5c 41 5d 41
RSP: 0000:ffffc90000a575f0 EFLAGS: 00010046
RAX: ffff88800c303600 RBX: 0000000000000000 RCX: 0000000000000000
RDX: 0000000000001000 RSI: ffffffff82504162 RDI: ffffffff824b2c36
RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: ffffc90000a57658
R13: 0000000000001000 R14: ffff88800bc2e040 R15: 0000000000000000
FS:  00007f51cb57d880(0000) GS:ffff88807fd00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000001000 CR3: 00000000072e2004 CR4: 00000000001706f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 <TASK>
 ? __warn+0x8d/0x190
 ? do_user_addr_fault+0x2a0/0x790
 ? report_bug+0x1c3/0x1d0
 ? handle_bug+0x3c/0x70
 ? exc_invalid_op+0x14/0x70
 ? asm_exc_invalid_op+0x16/0x20
 ? do_user_addr_fault+0x2a0/0x790
 ? exc_page_fault+0x31/0x200
 exc_page_fault+0x68/0x200
<...snip...>
BUG: unable to handle page fault for address: 0000000000001000
 #PF: supervisor read access in kernel mode
 #PF: error_code(0x0000) - not-present page
 PGD 800000000ad92067 P4D 800000000ad92067 PUD 7677067 PMD 0
 Oops: Oops: 0000 [#1] PREEMPT SMP PTI
 ---[ end trace 0000000000000000 ]---
 BUG: unable to handle page fault for address: 0000000000001000
 #PF: supervisor read access in kernel mode
 #PF: error_code(0x0000) - not-present page
 PGD 800000000ad92067 P4D 800000000ad92067 PUD 7677067 PMD 0
 Oops: Oops: 0000 [#1] PREEMPT SMP PTI
 CPU: 1 PID: 14045 Comm: daxctl Kdump: loaded Tainted: G        W          6.10.0-rc2-lizhijian+ #492
 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
 RIP: 0010:dentry_name+0x1f4/0x440
<...snip...>
? dentry_name+0x2fa/0x440
vsnprintf+0x1f3/0x4f0
vprintk_store+0x23a/0x540
vprintk_emit+0x6d/0x330
_printk+0x58/0x80
dump_mapping+0x10b/0x1a0
? __pfx_free_object_rcu+0x10/0x10
__dump_page+0x26b/0x3e0
? vprintk_emit+0xe0/0x330
? _printk+0x58/0x80
? dump_page+0x17/0x50
dump_page+0x17/0x50
do_migrate_range+0x2f7/0x7f0
? do_migrate_range+0x42/0x7f0
? offline_pages+0x2f4/0x8c0
offline_pages+0x60a/0x8c0
memory_subsys_offline+0x9f/0x1c0
? lockdep_hardirqs_on+0x77/0x100
? _raw_spin_unlock_irqrestore+0x38/0x60
device_offline+0xe3/0x110
state_store+0x6e/0xc0
kernfs_fop_write_iter+0x143/0x200
vfs_write+0x39f/0x560
ksys_write+0x65/0xf0
do_syscall_64+0x62/0x130

Previously, some sanity check have been done in dump_mapping() before
the print facility parsing '%pd' though, it's still possible to run into
an invalid dentry.d_name.name.

Since dump_mapping() only needs to dump the filename only, retrieve it
by itself in a safer way to prevent an unnecessary crash.

Note that either retrieving the filename with '%pd' or
strncpy_from_kernel_nofault(), the filename could be unreliable.

Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
Link: https://lore.kernel.org/r/20240826055503.1522320-1-lizhijian@fujitsu.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/inode.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/fs/inode.c b/fs/inode.c
index aacd05749c1f..af78f515403f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -616,6 +616,7 @@ void dump_mapping(const struct address_space *mapping)
 	struct hlist_node *dentry_first;
 	struct dentry *dentry_ptr;
 	struct dentry dentry;
+	char fname[64] = {};
 	unsigned long ino;
 
 	/*
@@ -652,11 +653,14 @@ void dump_mapping(const struct address_space *mapping)
 		return;
 	}
 
+	if (strncpy_from_kernel_nofault(fname, dentry.d_name.name, 63) < 0)
+		strscpy(fname, "<invalid>");
 	/*
-	 * if dentry is corrupted, the %pd handler may still crash,
-	 * but it's unlikely that we reach here with a corrupt mapping
+	 * Even if strncpy_from_kernel_nofault() succeeded,
+	 * the fname could be unreliable
 	 */
-	pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n", a_ops, ino, &dentry);
+	pr_warn("aops:%ps ino:%lx dentry name(?):\"%s\"\n",
+		a_ops, ino, fname);
 }
 
 void clear_inode(struct inode *inode)

From 459ca85ae1feff78d1518344df88bb79a092780c Mon Sep 17 00:00:00 2001
From: Julian Sun <sunjunchao2870@gmail.com>
Date: Wed, 28 Aug 2024 16:13:59 +0800
Subject: [PATCH 52/58] writeback: Refine the show_inode_state() macro
 definition
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, the show_inode_state() macro only prints
part of the state of inode->i_state. Let’s improve it
to display more of its state.

Signed-off-by: Julian Sun <sunjunchao2870@gmail.com>
Link: https://lore.kernel.org/r/20240828081359.62429-1-sunjunchao2870@gmail.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/trace/events/writeback.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 54e353c9f919..a261e86e61fa 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -20,7 +20,15 @@
 		{I_CLEAR,		"I_CLEAR"},		\
 		{I_SYNC,		"I_SYNC"},		\
 		{I_DIRTY_TIME,		"I_DIRTY_TIME"},	\
-		{I_REFERENCED,		"I_REFERENCED"}		\
+		{I_REFERENCED,		"I_REFERENCED"},	\
+		{I_LINKABLE,		"I_LINKABLE"},		\
+		{I_WB_SWITCH,		"I_WB_SWITCH"},		\
+		{I_OVL_INUSE,		"I_OVL_INUSE"},		\
+		{I_CREATING,		"I_CREATING"},		\
+		{I_DONTCACHE,		"I_DONTCACHE"},		\
+		{I_SYNC_QUEUED,		"I_SYNC_QUEUED"},	\
+		{I_PINNING_NETFS_WB,	"I_PINNING_NETFS_WB"},	\
+		{I_LRU_ISOLATING,	"I_LRU_ISOLATING"}	\
 	)
 
 /* enums need to be exported to user space */

From 5c40e050e6ac0218af7c520095729d440cc87e6b Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Thu, 29 Aug 2024 15:06:40 +0200
Subject: [PATCH 53/58] fs: drop GFP_NOFAIL mode from alloc_page_buffers

There is only one called of alloc_page_buffers and it doesn't require
__GFP_NOFAIL so drop this allocation mode.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Link: https://lore.kernel.org/r/20240829130640.1397970-1-mhocko@kernel.org
Acked-by: Song Liu <song@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 drivers/md/md-bitmap.c      | 2 +-
 fs/buffer.c                 | 5 +----
 include/linux/buffer_head.h | 3 +--
 3 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 08232d8dc815..db5330d97348 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -360,7 +360,7 @@ static int read_file_page(struct file *file, unsigned long index,
 	pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE,
 		 (unsigned long long)index << PAGE_SHIFT);
 
-	bh = alloc_page_buffers(page, blocksize, false);
+	bh = alloc_page_buffers(page, blocksize);
 	if (!bh) {
 		ret = -ENOMEM;
 		goto out;
diff --git a/fs/buffer.c b/fs/buffer.c
index 31a9062cad7e..74f4eb5c7087 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -957,12 +957,9 @@ no_grow:
 }
 EXPORT_SYMBOL_GPL(folio_alloc_buffers);
 
-struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
-				       bool retry)
+struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size)
 {
 	gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
-	if (retry)
-		gfp |= __GFP_NOFAIL;
 
 	return folio_alloc_buffers(page_folio(page), size, gfp);
 }
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 14acf1bbe0ce..7e903457967a 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -199,8 +199,7 @@ void folio_set_bh(struct buffer_head *bh, struct folio *folio,
 		  unsigned long offset);
 struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
 					gfp_t gfp);
-struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
-		bool retry);
+struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size);
 struct buffer_head *create_empty_buffers(struct folio *folio,
 		unsigned long blocksize, unsigned long b_state);
 void end_buffer_read_sync(struct buffer_head *bh, int uptodate);

From b4fef22c2fb97fa204f0c99c7c7f1c6b422ef0aa Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Wed, 28 Aug 2024 20:19:42 +1000
Subject: [PATCH 54/58] uapi: explain how per-syscall AT_* flags should be
 allocated

Unfortunately, the way we have gone about adding new AT_* flags has
been a little messy. In the beginning, all of the AT_* flags had generic
meanings and so it made sense to share the flag bits indiscriminately.
However, we inevitably ran into syscalls that needed their own
syscall-specific flags. Due to the lack of a planned out policy, we
ended up with the following situations:

 * Existing syscalls adding new features tended to use new AT_* bits,
   with some effort taken to try to re-use bits for flags that were so
   obviously syscall specific that they only make sense for a single
   syscall (such as the AT_EACCESS/AT_REMOVEDIR/AT_HANDLE_FID triplet).

   Given the constraints of bitflags, this works well in practice, but
   ideally (to avoid future confusion) we would plan ahead and define a
   set of "per-syscall bits" ahead of time so that when allocating new
   bits we don't end up with a complete mish-mash of which bits are
   supposed to be per-syscall and which aren't.

 * New syscalls dealt with this in several ways:

   - Some syscalls (like renameat2(2), move_mount(2), fsopen(2), and
     fspick(2)) created their separate own flag spaces that have no
     overlap with the AT_* flags. Most of these ended up allocating
     their bits sequentually.

     In the case of move_mount(2) and fspick(2), several flags have
     identical meanings to AT_* flags but were allocated in their own
     flag space.

     This makes sense for syscalls that will never share AT_* flags, but
     for some syscalls this leads to duplication with AT_* flags in a
     way that could cause confusion (if renameat2(2) grew a
     RENAME_EMPTY_PATH it seems likely that users could mistake it for
     AT_EMPTY_PATH since it is an *at(2) syscall).

   - Some syscalls unfortunately ended up both creating their own flag
     space while also using bits from other flag spaces. The most
     obvious example is open_tree(2), where the standard usage ends up
     using flags from *THREE* separate flag spaces:

       open_tree(AT_FDCWD, "/foo", OPEN_TREE_CLONE|O_CLOEXEC|AT_RECURSIVE);

     (Note that O_CLOEXEC is also platform-specific, so several future
     OPEN_TREE_* bits are also made unusable in one fell swoop.)

It's not entirely clear to me what the "right" choice is for new
syscalls. Just saying that all future VFS syscalls should use AT_* flags
doesn't seem practical. openat2(2) has RESOLVE_* flags (many of which
don't make much sense to burn generic AT_* flags for) and move_mount(2)
has separate AT_*-like flags for both the source and target so separate
flags are needed anyway (though it seems possible that renameat2(2)
could grow *_EMPTY_PATH flags at some point, and it's a bit of a shame
they can't be reused).

But at least for syscalls that _do_ choose to use AT_* flags, we should
explicitly state the policy that 0x2ff is currently intended for
per-syscall flags and that new flags should err on the side of
overlapping with existing flag bits (so we can extend the scope of
generic flags in the future if necessary).

And add AT_* aliases for the RENAME_* flags to further cement that
renameat2(2) is an *at(2) flag, just with its own per-syscall flags.

Suggested-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
Link: https://lore.kernel.org/r/20240828-exportfs-u64-mount-id-v3-1-10c2c4c16708@cyphar.com
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/uapi/linux/fcntl.h | 80 ++++++++++++++++++++++++++------------
 1 file changed, 56 insertions(+), 24 deletions(-)

diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index e55a3314bcb0..38a6d66d9e88 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -90,37 +90,69 @@
 #define DN_ATTRIB	0x00000020	/* File changed attibutes */
 #define DN_MULTISHOT	0x80000000	/* Don't remove notifier */
 
+#define AT_FDCWD		-100    /* Special value for dirfd used to
+					   indicate openat should use the
+					   current working directory. */
+
+
+/* Generic flags for the *at(2) family of syscalls. */
+
+/* Reserved for per-syscall flags	0xff. */
+#define AT_SYMLINK_NOFOLLOW		0x100   /* Do not follow symbolic
+						   links. */
+/* Reserved for per-syscall flags	0x200 */
+#define AT_SYMLINK_FOLLOW		0x400   /* Follow symbolic links. */
+#define AT_NO_AUTOMOUNT			0x800	/* Suppress terminal automount
+						   traversal. */
+#define AT_EMPTY_PATH			0x1000	/* Allow empty relative
+						   pathname to operate on dirfd
+						   directly. */
 /*
- * The constants AT_REMOVEDIR and AT_EACCESS have the same value.  AT_EACCESS is
- * meaningful only to faccessat, while AT_REMOVEDIR is meaningful only to
- * unlinkat.  The two functions do completely different things and therefore,
- * the flags can be allowed to overlap.  For example, passing AT_REMOVEDIR to
- * faccessat would be undefined behavior and thus treating it equivalent to
- * AT_EACCESS is valid undefined behavior.
+ * These flags are currently statx(2)-specific, but they could be made generic
+ * in the future and so they should not be used for other per-syscall flags.
  */
-#define AT_FDCWD		-100    /* Special value used to indicate
-                                           openat should use the current
-                                           working directory. */
-#define AT_SYMLINK_NOFOLLOW	0x100   /* Do not follow symbolic links.  */
+#define AT_STATX_SYNC_TYPE		0x6000	/* Type of synchronisation required from statx() */
+#define AT_STATX_SYNC_AS_STAT		0x0000	/* - Do whatever stat() does */
+#define AT_STATX_FORCE_SYNC		0x2000	/* - Force the attributes to be sync'd with the server */
+#define AT_STATX_DONT_SYNC		0x4000	/* - Don't sync attributes with the server */
+
+#define AT_RECURSIVE			0x8000	/* Apply to the entire subtree */
+
+/*
+ * Per-syscall flags for the *at(2) family of syscalls.
+ *
+ * These are flags that are so syscall-specific that a user passing these flags
+ * to the wrong syscall is so "clearly wrong" that we can safely call such
+ * usage "undefined behaviour".
+ *
+ * For example, the constants AT_REMOVEDIR and AT_EACCESS have the same value.
+ * AT_EACCESS is meaningful only to faccessat, while AT_REMOVEDIR is meaningful
+ * only to unlinkat. The two functions do completely different things and
+ * therefore, the flags can be allowed to overlap. For example, passing
+ * AT_REMOVEDIR to faccessat would be undefined behavior and thus treating it
+ * equivalent to AT_EACCESS is valid undefined behavior.
+ *
+ * Note for implementers: When picking a new per-syscall AT_* flag, try to
+ * reuse already existing flags first. This leaves us with as many unused bits
+ * as possible, so we can use them for generic bits in the future if necessary.
+ */
+
+/* Flags for renameat2(2) (must match legacy RENAME_* flags). */
+#define AT_RENAME_NOREPLACE	0x0001
+#define AT_RENAME_EXCHANGE	0x0002
+#define AT_RENAME_WHITEOUT	0x0004
+
+/* Flag for faccessat(2). */
 #define AT_EACCESS		0x200	/* Test access permitted for
                                            effective IDs, not real IDs.  */
+/* Flag for unlinkat(2). */
 #define AT_REMOVEDIR		0x200   /* Remove directory instead of
                                            unlinking file.  */
-#define AT_SYMLINK_FOLLOW	0x400   /* Follow symbolic links.  */
-#define AT_NO_AUTOMOUNT		0x800	/* Suppress terminal automount traversal */
-#define AT_EMPTY_PATH		0x1000	/* Allow empty relative pathname */
+/* Flags for name_to_handle_at(2). */
+#define AT_HANDLE_FID		0x200	/* File handle is needed to compare
+					   object identity and may not be
+					   usable with open_by_handle_at(2). */
 
-#define AT_STATX_SYNC_TYPE	0x6000	/* Type of synchronisation required from statx() */
-#define AT_STATX_SYNC_AS_STAT	0x0000	/* - Do whatever stat() does */
-#define AT_STATX_FORCE_SYNC	0x2000	/* - Force the attributes to be sync'd with the server */
-#define AT_STATX_DONT_SYNC	0x4000	/* - Don't sync attributes with the server */
-
-#define AT_RECURSIVE		0x8000	/* Apply to the entire subtree */
-
-/* Flags for name_to_handle_at(2). We reuse AT_ flag space to save bits... */
-#define AT_HANDLE_FID		AT_REMOVEDIR	/* file handle is needed to
-					compare object identity and may not
-					be usable to open_by_handle_at(2) */
 #if defined(__KERNEL__)
 #define AT_GETATTR_NOSEC	0x80000000
 #endif

From 4356d575ef0f39a3e8e0ce0c40d84ce900ac3b61 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Wed, 28 Aug 2024 20:19:43 +1000
Subject: [PATCH 55/58] fhandle: expose u64 mount id to name_to_handle_at(2)

Now that we provide a unique 64-bit mount ID interface in statx(2), we
can now provide a race-free way for name_to_handle_at(2) to provide a
file handle and corresponding mount without needing to worry about
racing with /proc/mountinfo parsing or having to open a file just to do
statx(2).

While this is not necessary if you are using AT_EMPTY_PATH and don't
care about an extra statx(2) call, users that pass full paths into
name_to_handle_at(2) need to know which mount the file handle comes from
(to make sure they don't try to open_by_handle_at a file handle from a
different filesystem) and switching to AT_EMPTY_PATH would require
allocating a file for every name_to_handle_at(2) call, turning

  err = name_to_handle_at(-EBADF, "/foo/bar/baz", &handle, &mntid,
                          AT_HANDLE_MNT_ID_UNIQUE);

into

  int fd = openat(-EBADF, "/foo/bar/baz", O_PATH | O_CLOEXEC);
  err1 = name_to_handle_at(fd, "", &handle, &unused_mntid, AT_EMPTY_PATH);
  err2 = statx(fd, "", AT_EMPTY_PATH, STATX_MNT_ID_UNIQUE, &statxbuf);
  mntid = statxbuf.stx_mnt_id;
  close(fd);

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
Link: https://lore.kernel.org/r/20240828-exportfs-u64-mount-id-v3-2-10c2c4c16708@cyphar.com
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/fhandle.c               | 29 ++++++++++++++++++++++-------
 include/linux/syscalls.h   |  2 +-
 include/uapi/linux/fcntl.h |  1 +
 3 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/fs/fhandle.c b/fs/fhandle.c
index 6e8cea16790e..8cb665629f4a 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -16,7 +16,8 @@
 
 static long do_sys_name_to_handle(const struct path *path,
 				  struct file_handle __user *ufh,
-				  int __user *mnt_id, int fh_flags)
+				  void __user *mnt_id, bool unique_mntid,
+				  int fh_flags)
 {
 	long retval;
 	struct file_handle f_handle;
@@ -69,9 +70,19 @@ static long do_sys_name_to_handle(const struct path *path,
 	} else
 		retval = 0;
 	/* copy the mount id */
-	if (put_user(real_mount(path->mnt)->mnt_id, mnt_id) ||
-	    copy_to_user(ufh, handle,
-			 struct_size(handle, f_handle, handle_bytes)))
+	if (unique_mntid) {
+		if (put_user(real_mount(path->mnt)->mnt_id_unique,
+			     (u64 __user *) mnt_id))
+			retval = -EFAULT;
+	} else {
+		if (put_user(real_mount(path->mnt)->mnt_id,
+			     (int __user *) mnt_id))
+			retval = -EFAULT;
+	}
+	/* copy the handle */
+	if (retval != -EFAULT &&
+		copy_to_user(ufh, handle,
+			     struct_size(handle, f_handle, handle_bytes)))
 		retval = -EFAULT;
 	kfree(handle);
 	return retval;
@@ -83,6 +94,7 @@ static long do_sys_name_to_handle(const struct path *path,
  * @name: name that should be converted to handle.
  * @handle: resulting file handle
  * @mnt_id: mount id of the file system containing the file
+ *          (u64 if AT_HANDLE_MNT_ID_UNIQUE, otherwise int)
  * @flag: flag value to indicate whether to follow symlink or not
  *        and whether a decodable file handle is required.
  *
@@ -92,7 +104,7 @@ static long do_sys_name_to_handle(const struct path *path,
  * value required.
  */
 SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
-		struct file_handle __user *, handle, int __user *, mnt_id,
+		struct file_handle __user *, handle, void __user *, mnt_id,
 		int, flag)
 {
 	struct path path;
@@ -100,7 +112,8 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
 	int fh_flags;
 	int err;
 
-	if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID))
+	if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID |
+		     AT_HANDLE_MNT_ID_UNIQUE))
 		return -EINVAL;
 
 	lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0;
@@ -109,7 +122,9 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
 		lookup_flags |= LOOKUP_EMPTY;
 	err = user_path_at(dfd, name, lookup_flags, &path);
 	if (!err) {
-		err = do_sys_name_to_handle(&path, handle, mnt_id, fh_flags);
+		err = do_sys_name_to_handle(&path, handle, mnt_id,
+					    flag & AT_HANDLE_MNT_ID_UNIQUE,
+					    fh_flags);
 		path_put(&path);
 	}
 	return err;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 4bcf6754738d..5758104921e6 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -870,7 +870,7 @@ asmlinkage long sys_fanotify_mark(int fanotify_fd, unsigned int flags,
 #endif
 asmlinkage long sys_name_to_handle_at(int dfd, const char __user *name,
 				      struct file_handle __user *handle,
-				      int __user *mnt_id, int flag);
+				      void __user *mnt_id, int flag);
 asmlinkage long sys_open_by_handle_at(int mountdirfd,
 				      struct file_handle __user *handle,
 				      int flags);
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 38a6d66d9e88..87e2dec79fea 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -152,6 +152,7 @@
 #define AT_HANDLE_FID		0x200	/* File handle is needed to compare
 					   object identity and may not be
 					   usable with open_by_handle_at(2). */
+#define AT_HANDLE_MNT_ID_UNIQUE	0x001	/* Return the u64 unique mount ID. */
 
 #if defined(__KERNEL__)
 #define AT_GETATTR_NOSEC	0x80000000

From 33d8525dc18f743ec698cb91749132a27cd9c8a8 Mon Sep 17 00:00:00 2001
From: Kienan Stewart <kstewart@efficios.com>
Date: Wed, 4 Sep 2024 10:13:29 -0400
Subject: [PATCH 56/58] fs/pipe: Correct imprecise wording in comment

The comment inaccurately describes what pipefs is - that is, a file
system.

Signed-off-by: Kienan Stewart <kstewart@efficios.com>
Link: https://lore.kernel.org/r/20240904-pipe-correct_imprecise_wording-v1-1-2b07843472c2@efficios.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/pipe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/pipe.c b/fs/pipe.c
index 7dff2aa50a6d..9a6dfe39f012 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1427,7 +1427,7 @@ static const struct super_operations pipefs_ops = {
 
 /*
  * pipefs should _never_ be mounted by userland - too much of security hassle,
- * no real gain from having the whole whorehouse mounted. So we don't need
+ * no real gain from having the whole file system mounted. So we don't need
  * any operations on the root directory. However, we need a non-trivial
  * d_name - pipe: will go nicely and kill the special-casing in procfs.
  */

From 698e7d1680544ef114203b0cf656faa0c1216ebc Mon Sep 17 00:00:00 2001
From: Yan Zhen <yanzhen@vivo.com>
Date: Mon, 9 Sep 2024 14:33:53 +0800
Subject: [PATCH 57/58] proc: Fix typo in the comment

The deference here confuses me.

Maybe here want to say that because show_fd_locks() does not dereference
the files pointer, using the stale value of the files pointer is safe.

Correctly spelled comments make it easier for the reader to understand
the code.

replace 'deferences' with 'dereferences' in the comment &
replace 'inialized' with 'initialized' in the comment.

Signed-off-by: Yan Zhen <yanzhen@vivo.com>
Link: https://lore.kernel.org/r/20240909063353.2246419-1-yanzhen@vivo.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/proc/fd.c    | 2 +-
 fs/proc/kcore.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 586bbc84ca04..7baafb1eba13 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -59,7 +59,7 @@ static int seq_show(struct seq_file *m, void *v)
 		   real_mount(file->f_path.mnt)->mnt_id,
 		   file_inode(file)->i_ino);
 
-	/* show_fd_locks() never deferences files so a stale value is safe */
+	/* show_fd_locks() never dereferences files, so a stale value is safe */
 	show_fd_locks(m, file, files);
 	if (seq_has_overflowed(m))
 		goto out;
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 8e08a9a1b7ed..7d0acdad74e2 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -235,7 +235,7 @@ static int kcore_ram_list(struct list_head *list)
 	int nid, ret;
 	unsigned long end_pfn;
 
-	/* Not inialized....update now */
+	/* Not initialized....update now */
 	/* find out "max pfn" */
 	end_pfn = 0;
 	for_each_node_state(nid, N_MEMORY) {

From 2077006d4725c82c6e9612cec3a6c140921b067f Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 10 Sep 2024 10:16:39 +0200
Subject: [PATCH 58/58] uidgid: make sure we fit into one cacheline

When I expanded uidgid mappings I intended for a struct uid_gid_map to
fit into a single cacheline on x86 as they tend to be pretty
performance sensitive (idmapped mounts etc). But a 4 byte hole was added
that brought it over 64 bytes. Fix that by adding the static extent
array and the extent counter into a substruct. C's type punning for
unions guarantees that we can access ->nr_extents even if the last
written to member wasn't within the same object. This is also what we
rely on in struct_group() and friends. This of course relies on
non-strict aliasing which we don't do.

99) If the member used to read the contents of a union object is not the
    same as the member last used to store a value in the object, the
    appropriate part of the object representation of the value is
    reinterpreted as an object representation in the new type as
    described in 6.2.6 (a process sometimes called "type punning").

Link: https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2310.pdf
Link: https://lore.kernel.org/r/20240910-work-uid_gid_map-v1-1-e6bc761363ed@kernel.org
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/user_namespace.h | 6 ++++--
 kernel/user.c                  | 6 +++---
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 6030a8235617..3625096d5f85 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -21,9 +21,11 @@ struct uid_gid_extent {
 };
 
 struct uid_gid_map { /* 64 bytes -- 1 cache line */
-	u32 nr_extents;
 	union {
-		struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS];
+		struct {
+			struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS];
+			u32 nr_extents;
+		};
 		struct {
 			struct uid_gid_extent *forward;
 			struct uid_gid_extent *reverse;
diff --git a/kernel/user.c b/kernel/user.c
index aa1162deafe4..f46b1d41163b 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -36,33 +36,33 @@ EXPORT_SYMBOL_GPL(init_binfmt_misc);
  */
 struct user_namespace init_user_ns = {
 	.uid_map = {
-		.nr_extents = 1,
 		{
 			.extent[0] = {
 				.first = 0,
 				.lower_first = 0,
 				.count = 4294967295U,
 			},
+			.nr_extents = 1,
 		},
 	},
 	.gid_map = {
-		.nr_extents = 1,
 		{
 			.extent[0] = {
 				.first = 0,
 				.lower_first = 0,
 				.count = 4294967295U,
 			},
+			.nr_extents = 1,
 		},
 	},
 	.projid_map = {
-		.nr_extents = 1,
 		{
 			.extent[0] = {
 				.first = 0,
 				.lower_first = 0,
 				.count = 4294967295U,
 			},
+			.nr_extents = 1,
 		},
 	},
 	.ns.count = REFCOUNT_INIT(3),