From 1934b212615dc617ac84fc306333ab2b9fc3b04f Mon Sep 17 00:00:00 2001
From: Christian Brauner
Date: Fri, 9 Aug 2024 18:00:01 +0200
Subject: [PATCH 01/25] file: reclaim 24 bytes from f_owner

We embed struct fown_struct in struct file, where it takes up 32 bytes in total. We could tweak struct fown_struct to be more compact, but really it shouldn't be embedded in struct file in the first place. Instead, actual users of struct fown_struct should allocate the struct on demand. This frees up 24 bytes in struct file.

This has some potentially user-visible consequences for the ownership fcntl()s: some of them can now fail due to allocation failures, e.g., F_SETOWN may now fail with ENOMEM. In practice, that will almost never happen as the allocations are small and happen only once per file.

The fown_struct is used during kill_fasync(), which is used by e.g. pipes to generate a SIGIO signal. Sending such signals is conditional on userspace having set an owner for the file using one of the F_OWNER fcntl()s. Such users are unaffected if struct fown_struct is allocated during the fcntl() call.

There are a few subsystems that call __f_setown() expecting file->f_owner to be allocated:

(1) tun devices

    file->f_op->fasync::tun_chr_fasync()
    -> __f_setown()

    There are no other callers of tun_chr_fasync().

(2) tty devices

    file->f_op->fasync::tty_fasync()
    -> __tty_fasync()
       -> __f_setown()

    tty_fasync() has no additional callers, but __tty_fasync() has. Note that __tty_fasync() only calls __f_setown() if the @on argument is true. It's called from:

    file->f_op->release::tty_release()
    -> __tty_fasync()
       -> __f_setown()

    tty_release() calls __tty_fasync() with @on false
    => __f_setown() is never called from tty_release().
    => All callers of tty_release() are safe as well.

    __tty_hangup()
    -> __tty_fasync()
       -> __f_setown()

    __tty_hangup() calls __tty_fasync() with @on false
    => __f_setown() is never called from __tty_hangup().
    => All callers of __tty_hangup() are safe as well.

From the callchains it's obvious that (1) and (2) end up getting called via file->f_op->fasync(). That can happen either through the F_SETFL fcntl() with the FASYNC flag raised or via the FIOASYNC ioctl(). If FASYNC is requested and the file isn't already FASYNC, then file->f_op->fasync() is called with @on true, which ends up causing both (1) and (2) to call __f_setown().

(1) and (2) are the only subsystems that call __f_setown() from the file->f_op->fasync() handler. So both (1) and (2) have been updated to allocate a struct fown_struct prior to calling fasync_helper() to register with the fasync infrastructure. That's safe as they both call fasync_helper(), which also does allocations if @on is true.

The other interesting case is file leases:

(3) file leases

    lease_manager_ops->lm_setup::lease_setup()
    -> __f_setown()

    Which in turn is called from:

    generic_add_lease()
    -> lease_manager_ops->lm_setup::lease_setup()
       -> __f_setown()

So here again we can simply make generic_add_lease() allocate struct fown_struct prior to the lease_manager_ops->lm_setup::lease_setup() call, which happens under a spinlock.

With that, the two remaining subsystems that call __f_setown() are:

(4) dnotify
(5) sockets

Both have their own custom ioctls to set struct fown_struct, and both have been converted to allocate a struct fown_struct on demand from their respective ioctls.

Interactions with O_PATH are fine as well: e.g., when opening /dev/tty via O_PATH, no file->f_op->open() happens, thus no file->f_owner is allocated.
That's fine as no file operation will be set for those and the device has never been opened. fcntl()s called on such things will just allocate a ->f_owner on demand. Although I have zero idea why you'd care about f_owner on an O_PATH fd. Link: https://lore.kernel.org/r/20240813-work-f_owner-v2-1-4e9343a79f9f@kernel.org Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- drivers/net/tun.c | 6 ++ drivers/tty/tty_io.c | 6 ++ fs/fcntl.c | 166 ++++++++++++++++++++++++++++-------- fs/file_table.c | 3 +- fs/internal.h | 1 + fs/locks.c | 6 +- fs/notify/dnotify/dnotify.c | 6 +- include/linux/fs.h | 11 ++- net/core/sock.c | 2 +- security/selinux/hooks.c | 2 +- security/smack/smack_lsm.c | 2 +- 11 files changed, 168 insertions(+), 43 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 1d06c560c5e6..6fe5e8f7017c 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -3451,6 +3451,12 @@ static int tun_chr_fasync(int fd, struct file *file, int on) struct tun_file *tfile = file->private_data; int ret; + if (on) { + ret = file_f_owner_allocate(file); + if (ret) + goto out; + } + if ((ret = fasync_helper(fd, file, on, &tfile->fasync)) < 0) goto out; diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 407b0d87b7c1..7ae0c8934f42 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -2225,6 +2225,12 @@ static int __tty_fasync(int fd, struct file *filp, int on) if (tty_paranoia_check(tty, file_inode(filp), "tty_fasync")) goto out; + if (on) { + retval = file_f_owner_allocate(filp); + if (retval) + goto out; + } + retval = fasync_helper(fd, filp, on, &tty->fasync); if (retval <= 0) goto out; diff --git a/fs/fcntl.c b/fs/fcntl.c index 300e5d9ad913..0587a0e221a6 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -33,6 +33,8 @@ #include #include +#include "internal.h" + #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME) static int setfl(int fd, struct file * filp, unsigned int arg) @@ -87,22 +89,64 @@ static int setfl(int fd, struct file * filp, unsigned int arg) return error; } +/* + * Allocate a file->f_owner struct if it doesn't exist, handling racing + * allocations correctly. + */ +int file_f_owner_allocate(struct file *file) +{ + struct fown_struct *f_owner; + + f_owner = file_f_owner(file); + if (f_owner) + return 0; + + f_owner = kzalloc(sizeof(struct fown_struct), GFP_KERNEL); + if (!f_owner) + return -ENOMEM; + + rwlock_init(&f_owner->lock); + f_owner->file = file; + /* If someone else raced us, drop our allocation.
*/ + if (unlikely(cmpxchg(&file->f_owner, NULL, f_owner))) + kfree(f_owner); + return 0; +} +EXPORT_SYMBOL(file_f_owner_allocate); + +void file_f_owner_release(struct file *file) +{ + struct fown_struct *f_owner; + + f_owner = file_f_owner(file); + if (f_owner) { + put_pid(f_owner->pid); + kfree(f_owner); + } +} + static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, int force) { - write_lock_irq(&filp->f_owner.lock); - if (force || !filp->f_owner.pid) { - put_pid(filp->f_owner.pid); - filp->f_owner.pid = get_pid(pid); - filp->f_owner.pid_type = type; + struct fown_struct *f_owner; + + f_owner = file_f_owner(filp); + if (WARN_ON_ONCE(!f_owner)) + return; + + write_lock_irq(&f_owner->lock); + if (force || !f_owner->pid) { + put_pid(f_owner->pid); + f_owner->pid = get_pid(pid); + f_owner->pid_type = type; if (pid) { const struct cred *cred = current_cred(); - filp->f_owner.uid = cred->uid; - filp->f_owner.euid = cred->euid; + f_owner->uid = cred->uid; + f_owner->euid = cred->euid; } } - write_unlock_irq(&filp->f_owner.lock); + write_unlock_irq(&f_owner->lock); } void __f_setown(struct file *filp, struct pid *pid, enum pid_type type, @@ -119,6 +163,8 @@ int f_setown(struct file *filp, int who, int force) struct pid *pid = NULL; int ret = 0; + might_sleep(); + type = PIDTYPE_TGID; if (who < 0) { /* avoid overflow below */ @@ -129,6 +175,10 @@ int f_setown(struct file *filp, int who, int force) who = -who; } + ret = file_f_owner_allocate(filp); + if (ret) + return ret; + rcu_read_lock(); if (who) { pid = find_vpid(who); @@ -152,16 +202,21 @@ void f_delown(struct file *filp) pid_t f_getown(struct file *filp) { pid_t pid = 0; + struct fown_struct *f_owner; - read_lock_irq(&filp->f_owner.lock); + f_owner = file_f_owner(filp); + if (!f_owner) + return pid; + + read_lock_irq(&f_owner->lock); rcu_read_lock(); - if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) { - pid = pid_vnr(filp->f_owner.pid); - if (filp->f_owner.pid_type == PIDTYPE_PGID) + if (pid_task(f_owner->pid, f_owner->pid_type)) { + pid = pid_vnr(f_owner->pid); + if (f_owner->pid_type == PIDTYPE_PGID) pid = -pid; } rcu_read_unlock(); - read_unlock_irq(&filp->f_owner.lock); + read_unlock_irq(&f_owner->lock); return pid; } @@ -194,6 +249,10 @@ static int f_setown_ex(struct file *filp, unsigned long arg) return -EINVAL; } + ret = file_f_owner_allocate(filp); + if (ret) + return ret; + rcu_read_lock(); pid = find_vpid(owner.pid); if (owner.pid && !pid) @@ -210,13 +269,20 @@ static int f_getown_ex(struct file *filp, unsigned long arg) struct f_owner_ex __user *owner_p = (void __user *)arg; struct f_owner_ex owner = {}; int ret = 0; + struct fown_struct *f_owner; + enum pid_type pid_type = PIDTYPE_PID; - read_lock_irq(&filp->f_owner.lock); - rcu_read_lock(); - if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) - owner.pid = pid_vnr(filp->f_owner.pid); - rcu_read_unlock(); - switch (filp->f_owner.pid_type) { + f_owner = file_f_owner(filp); + if (f_owner) { + read_lock_irq(&f_owner->lock); + rcu_read_lock(); + if (pid_task(f_owner->pid, f_owner->pid_type)) + owner.pid = pid_vnr(f_owner->pid); + rcu_read_unlock(); + pid_type = f_owner->pid_type; + } + + switch (pid_type) { case PIDTYPE_PID: owner.type = F_OWNER_TID; break; @@ -234,7 +300,8 @@ static int f_getown_ex(struct file *filp, unsigned long arg) ret = -EINVAL; break; } - read_unlock_irq(&filp->f_owner.lock); + if (f_owner) + read_unlock_irq(&f_owner->lock); if (!ret) { ret = copy_to_user(owner_p, &owner, sizeof(owner)); @@ -248,14 +315,18 @@ static int 
f_getown_ex(struct file *filp, unsigned long arg) static int f_getowner_uids(struct file *filp, unsigned long arg) { struct user_namespace *user_ns = current_user_ns(); + struct fown_struct *f_owner; uid_t __user *dst = (void __user *)arg; - uid_t src[2]; + uid_t src[2] = {0, 0}; int err; - read_lock_irq(&filp->f_owner.lock); - src[0] = from_kuid(user_ns, filp->f_owner.uid); - src[1] = from_kuid(user_ns, filp->f_owner.euid); - read_unlock_irq(&filp->f_owner.lock); + f_owner = file_f_owner(filp); + if (f_owner) { + read_lock_irq(&f_owner->lock); + src[0] = from_kuid(user_ns, f_owner->uid); + src[1] = from_kuid(user_ns, f_owner->euid); + read_unlock_irq(&f_owner->lock); + } err = put_user(src[0], &dst[0]); err |= put_user(src[1], &dst[1]); @@ -343,6 +414,30 @@ static long f_dupfd_query(int fd, struct file *filp) return f.file == filp; } +static int f_owner_sig(struct file *filp, int signum, bool setsig) +{ + int ret = 0; + struct fown_struct *f_owner; + + might_sleep(); + + if (setsig) { + if (!valid_signal(signum)) + return -EINVAL; + + ret = file_f_owner_allocate(filp); + if (ret) + return ret; + } + + f_owner = file_f_owner(filp); + if (setsig) + f_owner->signum = signum; + else if (f_owner) + ret = f_owner->signum; + return ret; +} + static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp) { @@ -421,15 +516,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, err = f_getowner_uids(filp, arg); break; case F_GETSIG: - err = filp->f_owner.signum; + err = f_owner_sig(filp, 0, false); break; case F_SETSIG: - /* arg == 0 restores default behaviour. */ - if (!valid_signal(argi)) { - break; - } - err = 0; - filp->f_owner.signum = argi; + err = f_owner_sig(filp, argi, true); break; case F_GETLEASE: err = fcntl_getlease(filp); @@ -844,14 +934,19 @@ static void send_sigurg_to_task(struct task_struct *p, do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, type); } -int send_sigurg(struct fown_struct *fown) +int send_sigurg(struct file *file) { + struct fown_struct *fown; struct task_struct *p; enum pid_type type; struct pid *pid; unsigned long flags; int ret = 0; + fown = file_f_owner(file); + if (!fown) + return 0; + read_lock_irqsave(&fown->lock, flags); type = fown->pid_type; @@ -1027,13 +1122,16 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) } read_lock_irqsave(&fa->fa_lock, flags); if (fa->fa_file) { - fown = &fa->fa_file->f_owner; + fown = file_f_owner(fa->fa_file); + if (!fown) + goto next; /* Don't send SIGURG to processes which have not set a queued signum: SIGURG has its own default signalling mechanism. 
*/ if (!(sig == SIGURG && fown->signum == 0)) send_sigio(fown, fa->fa_fd, band); } +next: read_unlock_irqrestore(&fa->fa_lock, flags); fa = rcu_dereference(fa->fa_next); } diff --git a/fs/file_table.c b/fs/file_table.c index ca7843dde56d..fdf98709dde2 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -155,7 +155,6 @@ static int init_file(struct file *f, int flags, const struct cred *cred) return error; } - rwlock_init(&f->f_owner.lock); spin_lock_init(&f->f_lock); mutex_init(&f->f_pos_lock); f->f_flags = flags; @@ -425,7 +424,7 @@ static void __fput(struct file *file) cdev_put(inode->i_cdev); } fops_put(file->f_op); - put_pid(file->f_owner.pid); + file_f_owner_release(file); put_file_access(file); dput(dentry); if (unlikely(mode & FMODE_NEED_UNMOUNT)) diff --git a/fs/internal.h b/fs/internal.h index cdd73209eecb..8c1b7acbbe8f 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -337,3 +337,4 @@ static inline bool path_mounted(const struct path *path) { return path->mnt->mnt_root == path->dentry; } +void file_f_owner_release(struct file *file); diff --git a/fs/locks.c b/fs/locks.c index e45cad40f8b6..b51b1c395ce6 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1451,7 +1451,7 @@ int lease_modify(struct file_lease *fl, int arg, struct list_head *dispose) struct file *filp = fl->c.flc_file; f_delown(filp); - filp->f_owner.signum = 0; + file_f_owner(filp)->signum = 0; fasync_helper(0, fl->c.flc_file, 0, &fl->fl_fasync); if (fl->fl_fasync != NULL) { printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync); @@ -1783,6 +1783,10 @@ generic_add_lease(struct file *filp, int arg, struct file_lease **flp, void **pr lease = *flp; trace_generic_add_lease(inode, lease); + error = file_f_owner_allocate(filp); + if (error) + return error; + /* Note that arg is never F_UNLCK here */ ctx = locks_get_lock_context(inode, arg); if (!ctx) diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index f3669403fabf..46440fbb8662 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -110,7 +110,7 @@ static int dnotify_handle_event(struct fsnotify_mark *inode_mark, u32 mask, prev = &dn->dn_next; continue; } - fown = &dn->dn_filp->f_owner; + fown = file_f_owner(dn->dn_filp); send_sigio(fown, dn->dn_fd, POLL_MSG); if (dn->dn_mask & FS_DN_MULTISHOT) prev = &dn->dn_next; @@ -316,6 +316,10 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg) goto out_err; } + error = file_f_owner_allocate(filp); + if (error) + goto out_err; + /* set up the new_fsn_mark and new_dn_mark */ new_fsn_mark = &new_dn_mark->fsn_mark; fsnotify_init_mark(new_fsn_mark, dnotify_group); diff --git a/include/linux/fs.h b/include/linux/fs.h index fb0426f349fc..7af239ca87e2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -947,6 +947,7 @@ static inline unsigned imajor(const struct inode *inode) } struct fown_struct { + struct file *file; /* backpointer for security modules */ rwlock_t lock; /* protects pid, uid, euid fields */ struct pid *pid; /* pid or -pgrp where SIGIO should be sent */ enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */ @@ -1011,7 +1012,7 @@ struct file { struct mutex f_pos_lock; loff_t f_pos; unsigned int f_flags; - struct fown_struct f_owner; + struct fown_struct *f_owner; const struct cred *f_cred; struct file_ra_state f_ra; struct path f_path; @@ -1076,6 +1077,12 @@ struct file_lease; #define OFFT_OFFSET_MAX type_max(off_t) #endif +int file_f_owner_allocate(struct file *file); +static inline struct fown_struct *file_f_owner(const 
struct file *file) +{ + return READ_ONCE(file->f_owner); +} + extern void send_sigio(struct fown_struct *fown, int fd, int band); static inline struct inode *file_inode(const struct file *f) @@ -1124,7 +1131,7 @@ extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force extern int f_setown(struct file *filp, int who, int force); extern void f_delown(struct file *filp); extern pid_t f_getown(struct file *filp); -extern int send_sigurg(struct fown_struct *fown); +extern int send_sigurg(struct file *file); /* * sb->s_flags. Note that these mirror the equivalent MS_* flags where diff --git a/net/core/sock.c b/net/core/sock.c index 9abc4fe25953..bbe4c58470c3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3429,7 +3429,7 @@ static void sock_def_destruct(struct sock *sk) void sk_send_sigurg(struct sock *sk) { if (sk->sk_socket && sk->sk_socket->file) - if (send_sigurg(&sk->sk_socket->file->f_owner)) + if (send_sigurg(sk->sk_socket->file)) sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); } EXPORT_SYMBOL(sk_send_sigurg); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index bfa61e005aac..577e1fcbf6df 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3950,7 +3950,7 @@ static int selinux_file_send_sigiotask(struct task_struct *tsk, struct file_security_struct *fsec; /* struct fown_struct is never outside the context of a struct file */ - file = container_of(fown, struct file, f_owner); + file = fown->file; fsec = selinux_file(file); diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 4164699cd4f6..cb33920ab67c 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -1950,7 +1950,7 @@ static int smack_file_send_sigiotask(struct task_struct *tsk, /* * struct fown_struct is never outside the context of a struct file */ - file = container_of(fown, struct file, f_owner); + file = fown->file; /* we don't log here as rc can be overriden */ blob = smack_file(file); From a55d1cbd1720679cfe9837bce250e397ec513989 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Aug 2024 16:14:46 +0200 Subject: [PATCH 02/25] fs: switch f_iocb_flags and f_ra Now that we shrank struct file by 24 bytes we still have a 4 byte hole. If we move struct file_ra_state into the union and f_iocb_flags out of the union we close that hole and bring down struct file to 192 bytes. Which means struct file is 3 cachelines and we managed to shrink it by 40 bytes this cycle. I've tried to audit all codepaths that use f_ra and none of them seem to rely on it in file->f_op->release() and never have since commit 1da177e4c3f4 ("Linux-2.6.12-rc2"). Link: https://lore.kernel.org/r/20240823-luftdicht-berappen-d69a2166a0db@brauner Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/fs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 7af239ca87e2..095a956aeb29 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -999,9 +999,9 @@ struct file { struct callback_head f_task_work; /* fput() must use workqueue (most kernel threads). */ struct llist_node f_llist; - unsigned int f_iocb_flags; + /* Invalid after last fput(). */ + struct file_ra_state f_ra; }; - /* * Protects f_ep, f_flags. * Must not be taken from IRQ context.
@@ -1012,9 +1012,9 @@ struct file { struct mutex f_pos_lock; loff_t f_pos; unsigned int f_flags; + unsigned int f_iocb_flags; struct fown_struct *f_owner; const struct cred *f_cred; - struct file_ra_state f_ra; struct path f_path; struct inode *f_inode; /* cached value */ const struct file_operations *f_op; From c0390d541128e8820af8177a572d9d87ff68a3bb Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 23 Aug 2024 21:06:58 +0200 Subject: [PATCH 03/25] fs: pack struct file Now that we shrunk struct file to 192 bytes aka 3 cachelines, reorder struct file to not leave any holes or have members cross cachelines. Add a short comment to each of the fields and mark the cachelines. It's possible that we may have to tweak this based on profiling in the future. So far I had Jens test this comparing io_uring with non-fixed and fixed files and it improved performance. The layout is a combination of Jens' and my changes. Link: https://lore.kernel.org/r/20240824-peinigen-hocken-7384b977c643@brauner Signed-off-by: Christian Brauner --- include/linux/fs.h | 91 ++++++++++++++++++++++++++-------------------- 1 file changed, 51 insertions(+), 40 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 095a956aeb29..af8bbd4eeb3a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -987,52 +987,63 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) index < ra->start + ra->size); } -/* - * f_{lock,count,pos_lock} members can be highly contended and share - * the same cacheline. f_{lock,mode} are very frequently used together - * and so share the same cacheline as well. The read-mostly - * f_{path,inode,op} are kept on a separate cacheline. +/** + * struct file - Represents a file + * @f_count: reference count + * @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context. + * @f_mode: FMODE_* flags often used in hotpaths + * @f_op: file operations + * @f_mapping: Contents of a cacheable, mappable object. + * @private_data: filesystem or driver specific data + * @f_inode: cached inode + * @f_flags: file flags + * @f_iocb_flags: iocb flags + * @f_cred: stashed credentials of creator/opener + * @f_path: path of the file + * @f_pos_lock: lock protecting file position + * @f_pos: file position + * @f_version: file version + * @f_security: LSM security context of this file + * @f_owner: file owner + * @f_wb_err: writeback error + * @f_sb_err: per sb writeback errors + * @f_ep: link of all epoll hooks for this file + * @f_task_work: task work entry point + * @f_llist: work queue entrypoint + * @f_ra: file's readahead state */ struct file { + atomic_long_t f_count; + spinlock_t f_lock; + fmode_t f_mode; + const struct file_operations *f_op; + struct address_space *f_mapping; + void *private_data; + struct inode *f_inode; + unsigned int f_flags; + unsigned int f_iocb_flags; + const struct cred *f_cred; + /* --- cacheline 1 boundary (64 bytes) --- */ + struct path f_path; + struct mutex f_pos_lock; + loff_t f_pos; + u64 f_version; + /* --- cacheline 2 boundary (128 bytes) --- */ +#ifdef CONFIG_SECURITY + void *f_security; +#endif + struct fown_struct *f_owner; + errseq_t f_wb_err; + errseq_t f_sb_err; +#ifdef CONFIG_EPOLL + struct hlist_head *f_ep; +#endif union { - /* fput() uses task work when closing and freeing file (default). */ - struct callback_head f_task_work; - /* fput() must use workqueue (most kernel threads). */ + struct callback_head f_task_work; struct llist_node f_llist; - /* Invalid after last fput().
*/ struct file_ra_state f_ra; }; - /* - * Protects f_ep, f_flags. - * Must not be taken from IRQ context. - */ - spinlock_t f_lock; - fmode_t f_mode; - atomic_long_t f_count; - struct mutex f_pos_lock; - loff_t f_pos; - unsigned int f_flags; - unsigned int f_iocb_flags; - struct fown_struct *f_owner; - const struct cred *f_cred; - struct path f_path; - struct inode *f_inode; /* cached value */ - const struct file_operations *f_op; - - u64 f_version; -#ifdef CONFIG_SECURITY - void *f_security; -#endif - /* needed for tty driver, and maybe others */ - void *private_data; - -#ifdef CONFIG_EPOLL - /* Used by fs/eventpoll.c to link all the hooks to this file */ - struct hlist_head *f_ep; -#endif /* #ifdef CONFIG_EPOLL */ - struct address_space *f_mapping; - errseq_t f_wb_err; - errseq_t f_sb_err; /* for syncfs */ + /* --- cacheline 3 boundary (192 bytes) --- */ } __randomize_layout __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ From e446f18e98e89fb7de2b320620ce983929bb2486 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 28 Aug 2024 12:56:23 +0200 Subject: [PATCH 04/25] mm: remove unused argument from create_cache() That root_cache argument is unused so remove it. Link: https://lore.kernel.org/r/20240828-work-kmem_cache-rcu-v3-1-5460bc1f09f6@kernel.org Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner --- mm/slab_common.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 40b582a014b8..c8dd7e08c5f6 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -204,8 +204,7 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align, static struct kmem_cache *create_cache(const char *name, unsigned int object_size, unsigned int align, slab_flags_t flags, unsigned int useroffset, - unsigned int usersize, void (*ctor)(void *), - struct kmem_cache *root_cache) + unsigned int usersize, void (*ctor)(void *)) { struct kmem_cache *s; int err; @@ -334,7 +333,7 @@ kmem_cache_create_usercopy(const char *name, s = create_cache(cache_name, size, calculate_alignment(flags, align, size), - flags, useroffset, usersize, ctor, NULL); + flags, useroffset, usersize, ctor); if (IS_ERR(s)) { err = PTR_ERR(s); kfree_const(cache_name); From d345bd2e9834e2da505977e154a1c179c793b7b2 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 28 Aug 2024 12:56:24 +0200 Subject: [PATCH 05/25] mm: add kmem_cache_create_rcu() When a kmem cache is created with SLAB_TYPESAFE_BY_RCU the free pointer must be located outside of the object because we don't know what part of the memory can safely be overwritten as it may be needed to prevent object recycling. That has the consequence that SLAB_TYPESAFE_BY_RCU may end up adding a new cacheline. This is the case for e.g., struct file. After having it shrunk down by 40 bytes and having it fit in three cachelines we still have SLAB_TYPESAFE_BY_RCU adding a fourth cacheline because it needs to accommodate the free pointer. Add a new kmem_cache_create_rcu() function that allows the caller to specify an offset where the free pointer is supposed to be placed. 
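To make the intended usage concrete, here is a rough sketch of a caller that wants the free pointer stored in a dedicated member of its object (illustrative only: struct foo, foo_freeptr, and foo_cachep are made-up names; the struct file conversion later in this series is the real in-tree user):

    /* Object with a slot reserved for the slab freelist pointer. */
    struct foo {
            int refcnt;
            /* ... fields that lockless readers may still inspect ... */
            freeptr_t foo_freeptr;  /* overwritten by the allocator on free */
    };

    static struct kmem_cache *foo_cachep;

    void __init foo_cache_init(void)
    {
            /* SLAB_TYPESAFE_BY_RCU is ORed in by kmem_cache_create_rcu() itself. */
            foo_cachep = kmem_cache_create_rcu("foo", sizeof(struct foo),
                                               offsetof(struct foo, foo_freeptr),
                                               SLAB_HWCACHE_ALIGN | SLAB_PANIC);
    }

The requested offset must lie within the object and be aligned to sizeof(freeptr_t), and it must not cover a field that readers relying on SLAB_TYPESAFE_BY_RCU semantics still dereference, since the allocator reuses that memory for the freelist pointer once the object is freed.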
Link: https://lore.kernel.org/r/20240828-work-kmem_cache-rcu-v3-2-5460bc1f09f6@kernel.org Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner --- include/linux/slab.h | 9 +++ mm/slab.h | 2 + mm/slab_common.c | 136 ++++++++++++++++++++++++++++++------------- mm/slub.c | 20 ++++--- 4 files changed, 121 insertions(+), 46 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index eb2bf4629157..5b2da2cf31a8 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -212,6 +212,12 @@ enum _slab_flag_bits { #define SLAB_NO_OBJ_EXT __SLAB_FLAG_UNUSED #endif +/* + * freeptr_t represents a SLUB freelist pointer, which might be encoded + * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. + */ +typedef struct { unsigned long v; } freeptr_t; + /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. * @@ -242,6 +248,9 @@ struct kmem_cache *kmem_cache_create_usercopy(const char *name, slab_flags_t flags, unsigned int useroffset, unsigned int usersize, void (*ctor)(void *)); +struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size, + unsigned int freeptr_offset, + slab_flags_t flags); void kmem_cache_destroy(struct kmem_cache *s); int kmem_cache_shrink(struct kmem_cache *s); diff --git a/mm/slab.h b/mm/slab.h index dcdb56b8e7f5..a6051385186e 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -261,6 +261,8 @@ struct kmem_cache { unsigned int object_size; /* Object size without metadata */ struct reciprocal_value reciprocal_size; unsigned int offset; /* Free pointer offset */ + /* Specific free pointer requested (if not UINT_MAX) */ + unsigned int rcu_freeptr_offset; #ifdef CONFIG_SLUB_CPU_PARTIAL /* Number of per cpu partial objects to keep around */ unsigned int cpu_partial; diff --git a/mm/slab_common.c b/mm/slab_common.c index c8dd7e08c5f6..887f6b9855dd 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -202,9 +202,10 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align, } static struct kmem_cache *create_cache(const char *name, - unsigned int object_size, unsigned int align, - slab_flags_t flags, unsigned int useroffset, - unsigned int usersize, void (*ctor)(void *)) + unsigned int object_size, unsigned int freeptr_offset, + unsigned int align, slab_flags_t flags, + unsigned int useroffset, unsigned int usersize, + void (*ctor)(void *)) { struct kmem_cache *s; int err; @@ -212,6 +213,13 @@ static struct kmem_cache *create_cache(const char *name, if (WARN_ON(useroffset + usersize > object_size)) useroffset = usersize = 0; + /* If a custom freelist pointer is requested make sure it's sane. */ + err = -EINVAL; + if (freeptr_offset != UINT_MAX && + (freeptr_offset >= object_size || !(flags & SLAB_TYPESAFE_BY_RCU) || + !IS_ALIGNED(freeptr_offset, sizeof(freeptr_t)))) + goto out; + err = -ENOMEM; s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); if (!s) @@ -219,13 +227,13 @@ static struct kmem_cache *create_cache(const char *name, s->name = name; s->size = s->object_size = object_size; + s->rcu_freeptr_offset = freeptr_offset; s->align = align; s->ctor = ctor; #ifdef CONFIG_HARDENED_USERCOPY s->useroffset = useroffset; s->usersize = usersize; #endif - err = __kmem_cache_create(s, flags); if (err) goto out_free_cache; @@ -240,38 +248,10 @@ out: return ERR_PTR(err); } -/** - * kmem_cache_create_usercopy - Create a cache with a region suitable - * for copying to userspace - * @name: A string which is used in /proc/slabinfo to identify this cache. 
- * @size: The size of objects to be created in this cache. - * @align: The required alignment for the objects. - * @flags: SLAB flags - * @useroffset: Usercopy region offset - * @usersize: Usercopy region size - * @ctor: A constructor for the objects. - * - * Cannot be called within a interrupt, but can be interrupted. - * The @ctor is run when new pages are allocated by the cache. - * - * The flags are - * - * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) - * to catch references to uninitialised memory. - * - * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check - * for buffer overruns. - * - * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware - * cacheline. This can be beneficial if you're counting cycles as closely - * as davem. - * - * Return: a pointer to the cache on success, NULL on failure. - */ -struct kmem_cache * -kmem_cache_create_usercopy(const char *name, - unsigned int size, unsigned int align, - slab_flags_t flags, +static struct kmem_cache * +do_kmem_cache_create_usercopy(const char *name, + unsigned int size, unsigned int freeptr_offset, + unsigned int align, slab_flags_t flags, unsigned int useroffset, unsigned int usersize, void (*ctor)(void *)) { @@ -331,7 +311,7 @@ kmem_cache_create_usercopy(const char *name, goto out_unlock; } - s = create_cache(cache_name, size, + s = create_cache(cache_name, size, freeptr_offset, calculate_alignment(flags, align, size), flags, useroffset, usersize, ctor); if (IS_ERR(s)) { @@ -355,6 +335,45 @@ out_unlock: } return s; } + +/** + * kmem_cache_create_usercopy - Create a cache with a region suitable + * for copying to userspace + * @name: A string which is used in /proc/slabinfo to identify this cache. + * @size: The size of objects to be created in this cache. + * @freeptr_offset: Custom offset for the free pointer in RCU caches + * @align: The required alignment for the objects. + * @flags: SLAB flags + * @useroffset: Usercopy region offset + * @usersize: Usercopy region size + * @ctor: A constructor for the objects. + * + * Cannot be called within a interrupt, but can be interrupted. + * The @ctor is run when new pages are allocated by the cache. + * + * The flags are + * + * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) + * to catch references to uninitialised memory. + * + * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check + * for buffer overruns. + * + * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware + * cacheline. This can be beneficial if you're counting cycles as closely + * as davem. + * + * Return: a pointer to the cache on success, NULL on failure. + */ +struct kmem_cache * +kmem_cache_create_usercopy(const char *name, unsigned int size, + unsigned int align, slab_flags_t flags, + unsigned int useroffset, unsigned int usersize, + void (*ctor)(void *)) +{ + return do_kmem_cache_create_usercopy(name, size, UINT_MAX, align, flags, + useroffset, usersize, ctor); +} EXPORT_SYMBOL(kmem_cache_create_usercopy); /** @@ -386,11 +405,50 @@ struct kmem_cache * kmem_cache_create(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)) { - return kmem_cache_create_usercopy(name, size, align, flags, 0, 0, - ctor); + return do_kmem_cache_create_usercopy(name, size, UINT_MAX, align, flags, + 0, 0, ctor); } EXPORT_SYMBOL(kmem_cache_create); +/** + * kmem_cache_create_rcu - Create a SLAB_TYPESAFE_BY_RCU cache. 
+ * @name: A string which is used in /proc/slabinfo to identify this cache. + * @size: The size of objects to be created in this cache. + * @freeptr_offset: The offset into the memory to the free pointer + * @flags: SLAB flags + * + * Cannot be called within an interrupt, but can be interrupted. + * + * See kmem_cache_create() for an explanation of possible @flags. + * + * By default SLAB_TYPESAFE_BY_RCU caches place the free pointer outside + * of the object. This might cause the object to grow in size. Callers + * that have a reason to avoid this can specify a custom free pointer + * offset in their struct where the free pointer will be placed. + * + * Note that placing the free pointer inside the object requires the + * caller to ensure that no fields are invalidated that are required to + * guard against object recycling (See SLAB_TYPESAFE_BY_RCU for + * details.). + * + * Using zero as a value for @freeptr_offset is valid. To request no + * offset UINT_MAX must be specified. + * + * Note that @ctor isn't supported with custom free pointers as a @ctor + * requires an external free pointer. + * + * Return: a pointer to the cache on success, NULL on failure. + */ +struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size, + unsigned int freeptr_offset, + slab_flags_t flags) +{ + return do_kmem_cache_create_usercopy(name, size, freeptr_offset, 0, + flags | SLAB_TYPESAFE_BY_RCU, 0, 0, + NULL); +} +EXPORT_SYMBOL(kmem_cache_create_rcu); + static struct kmem_cache *kmem_buckets_cache __ro_after_init; /** diff --git a/mm/slub.c b/mm/slub.c index c9d8a2497fd6..9aa5da1e8e27 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -465,12 +465,6 @@ static struct workqueue_struct *flushwq; * Core slab cache functions *******************************************************************/ -/* - * freeptr_t represents a SLUB freelist pointer, which might be encoded - * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. - */ -typedef struct { unsigned long v; } freeptr_t; - /* * Returns freelist pointer (ptr). With hardening, this is obfuscated * with an XOR of the address where the pointer is held and a per-cache @@ -3921,6 +3915,9 @@ static void *__slab_alloc_node(struct kmem_cache *s, /* * If the object has been wiped upon free, make sure it's fully initialized by * zeroing out freelist pointer. + * + * Note that we also wipe custom freelist pointers specified via + * s->rcu_freeptr_offset. */ static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, void *obj) @@ -5144,6 +5141,12 @@ static void set_cpu_partial(struct kmem_cache *s) #endif } +/* Was a valid freeptr offset requested? */ +static inline bool has_freeptr_offset(const struct kmem_cache *s) +{ + return s->rcu_freeptr_offset != UINT_MAX; +} + /* * calculate_sizes() determines the order and the distribution of data within * a slab object. 
@@ -5189,7 +5192,8 @@ static int calculate_sizes(struct kmem_cache *s) */ s->inuse = size; - if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || s->ctor || + if (((flags & SLAB_TYPESAFE_BY_RCU) && !has_freeptr_offset(s)) || + (flags & SLAB_POISON) || s->ctor || ((flags & SLAB_RED_ZONE) && (s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) { /* @@ -5210,6 +5214,8 @@ static int calculate_sizes(struct kmem_cache *s) */ s->offset = size; size += sizeof(void *); + } else if ((flags & SLAB_TYPESAFE_BY_RCU) && has_freeptr_offset(s)) { + s->offset = s->rcu_freeptr_offset; } else { /* * Store freelist pointer near middle of object to keep From ea566e18b4deea6e998088de4f1a76d1f39c8d3f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 28 Aug 2024 12:56:25 +0200 Subject: [PATCH 06/25] fs: use kmem_cache_create_rcu() Switch to the new kmem_cache_create_rcu() helper which allows us to use a custom free pointer offset avoiding the need to have an external free pointer which would grow struct file behind our backs. Link: https://lore.kernel.org/r/20240828-work-kmem_cache-rcu-v3-3-5460bc1f09f6@kernel.org Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner --- fs/file_table.c | 6 +++--- include/linux/fs.h | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/file_table.c b/fs/file_table.c index fdf98709dde2..3ef558f27a1c 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -511,9 +511,9 @@ EXPORT_SYMBOL(__fput_sync); void __init files_init(void) { - filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, - SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN | - SLAB_PANIC | SLAB_ACCOUNT, NULL); + filp_cachep = kmem_cache_create_rcu("filp", sizeof(struct file), + offsetof(struct file, f_freeptr), + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); percpu_counter_init(&nr_files, 0, GFP_KERNEL); } diff --git a/include/linux/fs.h b/include/linux/fs.h index af8bbd4eeb3a..58c91a52cad1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1011,6 +1011,7 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) * @f_task_work: task work entry point * @f_llist: work queue entrypoint * @f_ra: file's readahead state + * @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.) */ struct file { atomic_long_t f_count; @@ -1042,6 +1043,7 @@ struct file { struct callback_head f_task_work; struct llist_node f_llist; struct file_ra_state f_ra; + freeptr_t f_freeptr; }; /* --- cacheline 3 boundary (192 bytes) --- */ } __randomize_layout From 0f389adb4b80eef29920db2e2c99392aa5d06811 Mon Sep 17 00:00:00 2001 From: R Sundar Date: Mon, 2 Sep 2024 07:35:55 +0530 Subject: [PATCH 07/25] mm: Removed @freeptr_offset to prevent doc warning Removed @freeptr_offset to fix below doc warning. ./mm/slab_common.c:385: warning: Excess function parameter 'freeptr_offset' description in 'kmem_cache_create_usercopy' Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202408292249.5oUpnCbS-lkp@intel.com/ Signed-off-by: R Sundar Link: https://lore.kernel.org/r/20240902020555.11506-1-prosunofficial@gmail.com Signed-off-by: Christian Brauner --- mm/slab_common.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 887f6b9855dd..95db3702f8d6 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -341,7 +341,6 @@ out_unlock: * for copying to userspace * @name: A string which is used in /proc/slabinfo to identify this cache. 
* @size: The size of objects to be created in this cache. - * @freeptr_offset: Custom offset for the free pointer in RCU caches * @align: The required alignment for the objects. * @flags: SLAB flags * @useroffset: Usercopy region offset From 53d3d210864e532ee5d8ae48f66168c12dd8f530 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:44 +0200 Subject: [PATCH 08/25] slab: s/__kmem_cache_create/do_kmem_cache_create/g Free up the double-underscore variant for reuse in follow-up patches. Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- mm/slab.h | 2 +- mm/slab_common.c | 4 ++-- mm/slub.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index a6051385186e..684bb48c4f39 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -424,7 +424,7 @@ kmalloc_slab(size_t size, kmem_buckets *b, gfp_t flags, unsigned long caller) gfp_t kmalloc_fix_flags(gfp_t flags); /* Functions provided by the slab allocators */ -int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags); +int do_kmem_cache_create(struct kmem_cache *, slab_flags_t flags); void __init kmem_cache_init(void); extern void create_boot_cache(struct kmem_cache *, const char *name, diff --git a/mm/slab_common.c b/mm/slab_common.c index 95db3702f8d6..91e0e36e4379 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -234,7 +234,7 @@ static struct kmem_cache *create_cache(const char *name, s->useroffset = useroffset; s->usersize = usersize; #endif - err = __kmem_cache_create(s, flags); + err = do_kmem_cache_create(s, flags); if (err) goto out_free_cache; @@ -778,7 +778,7 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, s->usersize = usersize; #endif - err = __kmem_cache_create(s, flags); + err = do_kmem_cache_create(s, flags); if (err) panic("Creation of kmalloc slab %s size=%u failed. Reason %d\n", diff --git a/mm/slub.c b/mm/slub.c index 9aa5da1e8e27..23d9d783ff26 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5902,7 +5902,7 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, return s; } -int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) +int do_kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) { int err; From 879fb3c274c12cb0892718e1e54f85fc406e3c7b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:45 +0200 Subject: [PATCH 09/25] slab: add struct kmem_cache_args Currently we have multiple kmem_cache_create*() variants that take up to seven separate parameters, with one of the functions having to grow an eighth parameter in the future to handle both usercopy and a custom freelist pointer. Add a struct kmem_cache_args structure and move less common parameters into it. Core parameters such as name, object size, and flags continue to be passed separately. Add a new function __kmem_cache_create_args() that takes a struct kmem_cache_args pointer and port do_kmem_cache_create_usercopy() over to it. In follow-up patches we will port the other kmem_cache_create*() variants over to it as well.
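To illustrate the resulting calling convention, a caller fills in only the members it needs and leaves the rest zeroed (a sketch with made-up names; the actual conversions of kmem_cache_create(), kmem_cache_create_usercopy(), and kmem_cache_create_rcu() follow in the next patches):

    struct kmem_cache_args args = {
            .align      = 64,
            .useroffset = offsetof(struct foo, iov),
            .usersize   = sizeof_field(struct foo, iov),
    };
    struct kmem_cache *s;

    s = __kmem_cache_create_args("foo", sizeof(struct foo), &args,
                                 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT);

Members that are left out keep their zero value, which matches the old defaults (no constructor, no custom free pointer offset, no usercopy region), so designated initializers keep call sites short and let the argument list grow without touching existing callers.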
Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 22 +++++++++++++++ mm/slab_common.c | 65 ++++++++++++++++++++++++++++++++++---------- 2 files changed, 72 insertions(+), 15 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 5b2da2cf31a8..2b8eeca7fd2c 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -240,6 +240,28 @@ struct mem_cgroup; */ bool slab_is_available(void); +/** + * struct kmem_cache_args - Less common arguments for kmem_cache_create() + * @align: The required alignment for the objects. + * @useroffset: Usercopy region offset + * @usersize: Usercopy region size + * @freeptr_offset: Custom offset for the free pointer in RCU caches + * @use_freeptr_offset: Whether a @freeptr_offset is used + * @ctor: A constructor for the objects. + */ +struct kmem_cache_args { + unsigned int align; + unsigned int useroffset; + unsigned int usersize; + unsigned int freeptr_offset; + bool use_freeptr_offset; + void (*ctor)(void *); +}; + +struct kmem_cache *__kmem_cache_create_args(const char *name, + unsigned int object_size, + struct kmem_cache_args *args, + slab_flags_t flags); struct kmem_cache *kmem_cache_create(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)); diff --git a/mm/slab_common.c b/mm/slab_common.c index 91e0e36e4379..0f13c045b8d1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -248,14 +248,24 @@ out: return ERR_PTR(err); } -static struct kmem_cache * -do_kmem_cache_create_usercopy(const char *name, - unsigned int size, unsigned int freeptr_offset, - unsigned int align, slab_flags_t flags, - unsigned int useroffset, unsigned int usersize, - void (*ctor)(void *)) +/** + * __kmem_cache_create_args - Create a kmem cache + * @name: A string which is used in /proc/slabinfo to identify this cache. + * @object_size: The size of objects to be created in this cache. + * @args: Arguments for the cache creation (see struct kmem_cache_args). + * @flags: See %SLAB_* flags for an explanation of individual @flags. + * + * Cannot be called within a interrupt, but can be interrupted. + * + * Return: a pointer to the cache on success, NULL on failure. + */ +struct kmem_cache *__kmem_cache_create_args(const char *name, + unsigned int object_size, + struct kmem_cache_args *args, + slab_flags_t flags) { struct kmem_cache *s = NULL; + unsigned int freeptr_offset = UINT_MAX; const char *cache_name; int err; @@ -275,7 +285,7 @@ do_kmem_cache_create_usercopy(const char *name, mutex_lock(&slab_mutex); - err = kmem_cache_sanity_check(name, size); + err = kmem_cache_sanity_check(name, object_size); if (err) { goto out_unlock; } @@ -296,12 +306,14 @@ do_kmem_cache_create_usercopy(const char *name, /* Fail closed on bad usersize of useroffset values. 
*/ if (!IS_ENABLED(CONFIG_HARDENED_USERCOPY) || - WARN_ON(!usersize && useroffset) || - WARN_ON(size < usersize || size - usersize < useroffset)) - usersize = useroffset = 0; + WARN_ON(!args->usersize && args->useroffset) || + WARN_ON(object_size < args->usersize || + object_size - args->usersize < args->useroffset)) + args->usersize = args->useroffset = 0; - if (!usersize) - s = __kmem_cache_alias(name, size, align, flags, ctor); + if (!args->usersize) + s = __kmem_cache_alias(name, object_size, args->align, flags, + args->ctor); if (s) goto out_unlock; @@ -311,9 +323,11 @@ do_kmem_cache_create_usercopy(const char *name, goto out_unlock; } - s = create_cache(cache_name, size, freeptr_offset, - calculate_alignment(flags, align, size), - flags, useroffset, usersize, ctor); + if (args->use_freeptr_offset) + freeptr_offset = args->freeptr_offset; + s = create_cache(cache_name, object_size, freeptr_offset, + calculate_alignment(flags, args->align, object_size), + flags, args->useroffset, args->usersize, args->ctor); if (IS_ERR(s)) { err = PTR_ERR(s); kfree_const(cache_name); @@ -335,6 +349,27 @@ out_unlock: } return s; } +EXPORT_SYMBOL(__kmem_cache_create_args); + +static struct kmem_cache * +do_kmem_cache_create_usercopy(const char *name, + unsigned int size, unsigned int freeptr_offset, + unsigned int align, slab_flags_t flags, + unsigned int useroffset, unsigned int usersize, + void (*ctor)(void *)) +{ + struct kmem_cache_args kmem_args = { + .align = align, + .use_freeptr_offset = freeptr_offset != UINT_MAX, + .freeptr_offset = freeptr_offset, + .useroffset = useroffset, + .usersize = usersize, + .ctor = ctor, + }; + + return __kmem_cache_create_args(name, size, &kmem_args, flags); +} + /** * kmem_cache_create_usercopy - Create a cache with a region suitable From f6cd98c9407f8b8118464c633b27765e751e2750 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:46 +0200 Subject: [PATCH 10/25] slab: port kmem_cache_create() to struct kmem_cache_args Port kmem_cache_create() to struct kmem_cache_args. Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 0f13c045b8d1..ac0832dac01e 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -439,8 +439,12 @@ struct kmem_cache * kmem_cache_create(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)) { - return do_kmem_cache_create_usercopy(name, size, UINT_MAX, align, flags, - 0, 0, ctor); + struct kmem_cache_args kmem_args = { + .align = align, + .ctor = ctor, + }; + + return __kmem_cache_create_args(name, size, &kmem_args, flags); } EXPORT_SYMBOL(kmem_cache_create); From 9816c3c4e778b2477fda1966bf7047a2d9772d14 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:47 +0200 Subject: [PATCH 11/25] slab: port kmem_cache_create_rcu() to struct kmem_cache_args Port kmem_cache_create_rcu() to struct kmem_cache_args. 
Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index ac0832dac01e..da62ed30f95d 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -481,9 +481,13 @@ struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size, unsigned int freeptr_offset, slab_flags_t flags) { - return do_kmem_cache_create_usercopy(name, size, freeptr_offset, 0, - flags | SLAB_TYPESAFE_BY_RCU, 0, 0, - NULL); + struct kmem_cache_args kmem_args = { + .freeptr_offset = freeptr_offset, + .use_freeptr_offset = true, + }; + + return __kmem_cache_create_args(name, size, &kmem_args, + flags | SLAB_TYPESAFE_BY_RCU); } EXPORT_SYMBOL(kmem_cache_create_rcu); From 1d3d7645d789c9ce8fec48b16c0040b4a3dc6604 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:48 +0200 Subject: [PATCH 12/25] slab: port kmem_cache_create_usercopy() to struct kmem_cache_args Port kmem_cache_create_usercopy() to struct kmem_cache_args and remove the now unused do_kmem_cache_create_usercopy() helper. Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index da62ed30f95d..16c36a946135 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -351,26 +351,6 @@ out_unlock: } EXPORT_SYMBOL(__kmem_cache_create_args); -static struct kmem_cache * -do_kmem_cache_create_usercopy(const char *name, - unsigned int size, unsigned int freeptr_offset, - unsigned int align, slab_flags_t flags, - unsigned int useroffset, unsigned int usersize, - void (*ctor)(void *)) -{ - struct kmem_cache_args kmem_args = { - .align = align, - .use_freeptr_offset = freeptr_offset != UINT_MAX, - .freeptr_offset = freeptr_offset, - .useroffset = useroffset, - .usersize = usersize, - .ctor = ctor, - }; - - return __kmem_cache_create_args(name, size, &kmem_args, flags); -} - - /** * kmem_cache_create_usercopy - Create a cache with a region suitable * for copying to userspace @@ -405,8 +385,14 @@ kmem_cache_create_usercopy(const char *name, unsigned int size, unsigned int useroffset, unsigned int usersize, void (*ctor)(void *)) { - return do_kmem_cache_create_usercopy(name, size, UINT_MAX, align, flags, - useroffset, usersize, ctor); + struct kmem_cache_args kmem_args = { + .align = align, + .ctor = ctor, + .useroffset = useroffset, + .usersize = usersize, + }; + + return __kmem_cache_create_args(name, size, &kmem_args, flags); } EXPORT_SYMBOL(kmem_cache_create_usercopy); From 34410a906080e7f669330c45c8e38215fe09f481 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:49 +0200 Subject: [PATCH 13/25] slab: pass struct kmem_cache_args to create_cache() Pass struct kmem_cache_args to create_cache() so that we can later simplify further helpers. 
Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 16c36a946135..9baa61c9c670 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -202,22 +202,22 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align, } static struct kmem_cache *create_cache(const char *name, - unsigned int object_size, unsigned int freeptr_offset, - unsigned int align, slab_flags_t flags, - unsigned int useroffset, unsigned int usersize, - void (*ctor)(void *)) + unsigned int object_size, + struct kmem_cache_args *args, + slab_flags_t flags) { struct kmem_cache *s; int err; - if (WARN_ON(useroffset + usersize > object_size)) - useroffset = usersize = 0; + if (WARN_ON(args->useroffset + args->usersize > object_size)) + args->useroffset = args->usersize = 0; /* If a custom freelist pointer is requested make sure it's sane. */ err = -EINVAL; - if (freeptr_offset != UINT_MAX && - (freeptr_offset >= object_size || !(flags & SLAB_TYPESAFE_BY_RCU) || - !IS_ALIGNED(freeptr_offset, sizeof(freeptr_t)))) + if (args->use_freeptr_offset && + (args->freeptr_offset >= object_size || + !(flags & SLAB_TYPESAFE_BY_RCU) || + !IS_ALIGNED(args->freeptr_offset, sizeof(freeptr_t)))) goto out; err = -ENOMEM; @@ -227,12 +227,15 @@ static struct kmem_cache *create_cache(const char *name, s->name = name; s->size = s->object_size = object_size; - s->rcu_freeptr_offset = freeptr_offset; - s->align = align; - s->ctor = ctor; + if (args->use_freeptr_offset) + s->rcu_freeptr_offset = args->freeptr_offset; + else + s->rcu_freeptr_offset = UINT_MAX; + s->align = args->align; + s->ctor = args->ctor; #ifdef CONFIG_HARDENED_USERCOPY - s->useroffset = useroffset; - s->usersize = usersize; + s->useroffset = args->useroffset; + s->usersize = args->usersize; #endif err = do_kmem_cache_create(s, flags); if (err) @@ -265,7 +268,6 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, slab_flags_t flags) { struct kmem_cache *s = NULL; - unsigned int freeptr_offset = UINT_MAX; const char *cache_name; int err; @@ -323,11 +325,8 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, goto out_unlock; } - if (args->use_freeptr_offset) - freeptr_offset = args->freeptr_offset; - s = create_cache(cache_name, object_size, freeptr_offset, - calculate_alignment(flags, args->align, object_size), - flags, args->useroffset, args->usersize, args->ctor); + args->align = calculate_alignment(flags, args->align, object_size); + s = create_cache(cache_name, object_size, args, flags); if (IS_ERR(s)) { err = PTR_ERR(s); kfree_const(cache_name); From fc0eac57d08cad87b478a3be944899c81c950d43 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:50 +0200 Subject: [PATCH 14/25] slab: pull kmem_cache_open() into do_kmem_cache_create() do_kmem_cache_create() is the only caller and we're going to pass down struct kmem_cache_args in a follow-up patch. 
Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- mm/slub.c | 132 +++++++++++++++++++++++++----------------------------- 1 file changed, 62 insertions(+), 70 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 23d9d783ff26..30f4ca6335c7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5290,65 +5290,6 @@ static int calculate_sizes(struct kmem_cache *s) return !!oo_objects(s->oo); } -static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) -{ - s->flags = kmem_cache_flags(flags, s->name); -#ifdef CONFIG_SLAB_FREELIST_HARDENED - s->random = get_random_long(); -#endif - - if (!calculate_sizes(s)) - goto error; - if (disable_higher_order_debug) { - /* - * Disable debugging flags that store metadata if the min slab - * order increased. - */ - if (get_order(s->size) > get_order(s->object_size)) { - s->flags &= ~DEBUG_METADATA_FLAGS; - s->offset = 0; - if (!calculate_sizes(s)) - goto error; - } - } - -#ifdef system_has_freelist_aba - if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) { - /* Enable fast mode */ - s->flags |= __CMPXCHG_DOUBLE; - } -#endif - - /* - * The larger the object size is, the more slabs we want on the partial - * list to avoid pounding the page allocator excessively. - */ - s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2); - s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial); - - set_cpu_partial(s); - -#ifdef CONFIG_NUMA - s->remote_node_defrag_ratio = 1000; -#endif - - /* Initialize the pre-computed randomized freelist if slab is up */ - if (slab_state >= UP) { - if (init_cache_random_seq(s)) - goto error; - } - - if (!init_kmem_cache_nodes(s)) - goto error; - - if (alloc_kmem_cache_cpus(s)) - return 0; - -error: - __kmem_cache_release(s); - return -EINVAL; -} - static void list_slab_objects(struct kmem_cache *s, struct slab *slab, const char *text) { @@ -5904,26 +5845,77 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, int do_kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) { - int err; + int err = -EINVAL; - err = kmem_cache_open(s, flags); - if (err) - return err; + s->flags = kmem_cache_flags(flags, s->name); +#ifdef CONFIG_SLAB_FREELIST_HARDENED + s->random = get_random_long(); +#endif + + if (!calculate_sizes(s)) + goto out; + if (disable_higher_order_debug) { + /* + * Disable debugging flags that store metadata if the min slab + * order increased. + */ + if (get_order(s->size) > get_order(s->object_size)) { + s->flags &= ~DEBUG_METADATA_FLAGS; + s->offset = 0; + if (!calculate_sizes(s)) + goto out; + } + } + +#ifdef system_has_freelist_aba + if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) { + /* Enable fast mode */ + s->flags |= __CMPXCHG_DOUBLE; + } +#endif + + /* + * The larger the object size is, the more slabs we want on the partial + * list to avoid pounding the page allocator excessively. 
+ */ + s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2); + s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial); + + set_cpu_partial(s); + +#ifdef CONFIG_NUMA + s->remote_node_defrag_ratio = 1000; +#endif + + /* Initialize the pre-computed randomized freelist if slab is up */ + if (slab_state >= UP) { + if (init_cache_random_seq(s)) + goto out; + } + + if (!init_kmem_cache_nodes(s)) + goto out; + + if (!alloc_kmem_cache_cpus(s)) + goto out; /* Mutex is not taken during early boot */ - if (slab_state <= UP) - return 0; + if (slab_state <= UP) { + err = 0; + goto out; + } err = sysfs_slab_add(s); - if (err) { - __kmem_cache_release(s); - return err; - } + if (err) + goto out; if (s->flags & SLAB_STORE_USER) debugfs_slab_add(s); - return 0; +out: + if (err) + __kmem_cache_release(s); + return err; } #ifdef SLAB_SUPPORTS_SYSFS From 3dbe2bad5785e4a9f62d99186e7d31410a8f1e2d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:51 +0200 Subject: [PATCH 15/25] slab: pass struct kmem_cache_args to do_kmem_cache_create() and initialize most things in do_kmem_cache_create(). In a follow-up patch we'll remove rcu_freeptr_offset from struct kmem_cache. Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- mm/slab.h | 4 +++- mm/slab_common.c | 27 ++++++--------------------- mm/slub.c | 17 ++++++++++++++++- 3 files changed, 25 insertions(+), 23 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 684bb48c4f39..c7a4e0fc3cf1 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -424,7 +424,9 @@ kmalloc_slab(size_t size, kmem_buckets *b, gfp_t flags, unsigned long caller) gfp_t kmalloc_fix_flags(gfp_t flags); /* Functions provided by the slab allocators */ -int do_kmem_cache_create(struct kmem_cache *, slab_flags_t flags); +int do_kmem_cache_create(struct kmem_cache *s, const char *name, + unsigned int size, struct kmem_cache_args *args, + slab_flags_t flags); void __init kmem_cache_init(void); extern void create_boot_cache(struct kmem_cache *, const char *name, diff --git a/mm/slab_common.c b/mm/slab_common.c index 9baa61c9c670..19ae3dd6e36f 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -224,20 +224,7 @@ static struct kmem_cache *create_cache(const char *name, s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); if (!s) goto out; - - s->name = name; - s->size = s->object_size = object_size; - if (args->use_freeptr_offset) - s->rcu_freeptr_offset = args->freeptr_offset; - else - s->rcu_freeptr_offset = UINT_MAX; - s->align = args->align; - s->ctor = args->ctor; -#ifdef CONFIG_HARDENED_USERCOPY - s->useroffset = args->useroffset; - s->usersize = args->usersize; -#endif - err = do_kmem_cache_create(s, flags); + err = do_kmem_cache_create(s, name, object_size, args, flags); if (err) goto out_free_cache; @@ -788,9 +775,7 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, { int err; unsigned int align = ARCH_KMALLOC_MINALIGN; - - s->name = name; - s->size = s->object_size = size; + struct kmem_cache_args kmem_args = {}; /* * kmalloc caches guarantee alignment of at least the largest @@ -799,14 +784,14 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, */ if (flags & SLAB_KMALLOC) align = max(align, 1U << (ffs(size) - 1)); - s->align = calculate_alignment(flags, align, size); + kmem_args.align = calculate_alignment(flags, align, size); #ifdef 
CONFIG_HARDENED_USERCOPY - s->useroffset = useroffset; - s->usersize = usersize; + kmem_args.useroffset = useroffset; + kmem_args.usersize = usersize; #endif - err = do_kmem_cache_create(s, flags); + err = do_kmem_cache_create(s, name, size, &kmem_args, flags); if (err) panic("Creation of kmalloc slab %s size=%u failed. Reason %d\n", diff --git a/mm/slub.c b/mm/slub.c index 30f4ca6335c7..4719b60215b8 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5843,14 +5843,29 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, return s; } -int do_kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) +int do_kmem_cache_create(struct kmem_cache *s, const char *name, + unsigned int size, struct kmem_cache_args *args, + slab_flags_t flags) { int err = -EINVAL; + s->name = name; + s->size = s->object_size = size; + s->flags = kmem_cache_flags(flags, s->name); #ifdef CONFIG_SLAB_FREELIST_HARDENED s->random = get_random_long(); #endif + if (args->use_freeptr_offset) + s->rcu_freeptr_offset = args->freeptr_offset; + else + s->rcu_freeptr_offset = UINT_MAX; + s->align = args->align; + s->ctor = args->ctor; +#ifdef CONFIG_HARDENED_USERCOPY + s->useroffset = args->useroffset; + s->usersize = args->usersize; +#endif if (!calculate_sizes(s)) goto out; From dacf472bcdfa4f3b08cd2c1beef034e3e5ab4bd0 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:52 +0200 Subject: [PATCH 16/25] slab: remove rcu_freeptr_offset from struct kmem_cache Pass down struct kmem_cache_args to calculate_sizes() so we can use args->{use}_freeptr_offset directly. This allows us to remove ->rcu_freeptr_offset from struct kmem_cache. Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- mm/slab.h | 2 -- mm/slub.c | 25 +++++++------------------ 2 files changed, 7 insertions(+), 20 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index c7a4e0fc3cf1..36ac38e21fcb 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -261,8 +261,6 @@ struct kmem_cache { unsigned int object_size; /* Object size without metadata */ struct reciprocal_value reciprocal_size; unsigned int offset; /* Free pointer offset */ - /* Specific free pointer requested (if not UINT_MAX) */ - unsigned int rcu_freeptr_offset; #ifdef CONFIG_SLUB_CPU_PARTIAL /* Number of per cpu partial objects to keep around */ unsigned int cpu_partial; diff --git a/mm/slub.c b/mm/slub.c index 4719b60215b8..a23c7036cd61 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3916,8 +3916,7 @@ static void *__slab_alloc_node(struct kmem_cache *s, * If the object has been wiped upon free, make sure it's fully initialized by * zeroing out freelist pointer. * - * Note that we also wipe custom freelist pointers specified via - * s->rcu_freeptr_offset. + * Note that we also wipe custom freelist pointers. */ static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, void *obj) @@ -5141,17 +5140,11 @@ static void set_cpu_partial(struct kmem_cache *s) #endif } -/* Was a valid freeptr offset requested? */ -static inline bool has_freeptr_offset(const struct kmem_cache *s) -{ - return s->rcu_freeptr_offset != UINT_MAX; -} - /* * calculate_sizes() determines the order and the distribution of data within * a slab object. 
*/ -static int calculate_sizes(struct kmem_cache *s) +static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) { slab_flags_t flags = s->flags; unsigned int size = s->object_size; @@ -5192,7 +5185,7 @@ static int calculate_sizes(struct kmem_cache *s) */ s->inuse = size; - if (((flags & SLAB_TYPESAFE_BY_RCU) && !has_freeptr_offset(s)) || + if (((flags & SLAB_TYPESAFE_BY_RCU) && !args->use_freeptr_offset) || (flags & SLAB_POISON) || s->ctor || ((flags & SLAB_RED_ZONE) && (s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) { @@ -5214,8 +5207,8 @@ static int calculate_sizes(struct kmem_cache *s) */ s->offset = size; size += sizeof(void *); - } else if ((flags & SLAB_TYPESAFE_BY_RCU) && has_freeptr_offset(s)) { - s->offset = s->rcu_freeptr_offset; + } else if ((flags & SLAB_TYPESAFE_BY_RCU) && args->use_freeptr_offset) { + s->offset = args->freeptr_offset; } else { /* * Store freelist pointer near middle of object to keep @@ -5856,10 +5849,6 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name, #ifdef CONFIG_SLAB_FREELIST_HARDENED s->random = get_random_long(); #endif - if (args->use_freeptr_offset) - s->rcu_freeptr_offset = args->freeptr_offset; - else - s->rcu_freeptr_offset = UINT_MAX; s->align = args->align; s->ctor = args->ctor; #ifdef CONFIG_HARDENED_USERCOPY @@ -5867,7 +5856,7 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name, s->usersize = args->usersize; #endif - if (!calculate_sizes(s)) + if (!calculate_sizes(args, s)) goto out; if (disable_higher_order_debug) { /* @@ -5877,7 +5866,7 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name, if (get_order(s->size) > get_order(s->object_size)) { s->flags &= ~DEBUG_METADATA_FLAGS; s->offset = 0; - if (!calculate_sizes(s)) + if (!calculate_sizes(args, s)) goto out; } } From 052d67b46bcd91b6785e8e6e047241814f6142a3 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:53 +0200 Subject: [PATCH 17/25] slab: port KMEM_CACHE() to struct kmem_cache_args Make KMEM_CACHE() use struct kmem_cache_args. Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 2b8eeca7fd2c..1f38d94387cc 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -284,9 +284,11 @@ int kmem_cache_shrink(struct kmem_cache *s); * f.e. add ____cacheline_aligned_in_smp to the struct declaration * then the objects will be properly aligned in SMP configurations. */ -#define KMEM_CACHE(__struct, __flags) \ - kmem_cache_create(#__struct, sizeof(struct __struct), \ - __alignof__(struct __struct), (__flags), NULL) +#define KMEM_CACHE(__struct, __flags) \ + __kmem_cache_create_args(#__struct, sizeof(struct __struct), \ + &(struct kmem_cache_args) { \ + .align = __alignof__(struct __struct), \ + }, (__flags)) /* * To whitelist a single field for copying to/from usercopy, use this From 199cd13a745eb44fb4828bca373155293cdcfa5c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:54 +0200 Subject: [PATCH 18/25] slab: port KMEM_CACHE_USERCOPY() to struct kmem_cache_args Make KMEM_CACHE_USERCOPY() use struct kmem_cache_args. 
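As a sketch of the conversion (struct my_obj and its payload field are hypothetical), an invocation such as

	KMEM_CACHE_USERCOPY(my_obj, SLAB_ACCOUNT, payload);

expands, per the hunk below, to roughly:

	__kmem_cache_create_args("my_obj", sizeof(struct my_obj),
				 &(struct kmem_cache_args) {
					.align      = __alignof__(struct my_obj),
					.useroffset = offsetof(struct my_obj, payload),
					.usersize   = sizeof_field(struct my_obj, payload),
				 }, (SLAB_ACCOUNT));

The compound literal keeps the macro usable as an expression, so existing call sites need no changes.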
Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 1f38d94387cc..7e15a3a3edb1 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -294,12 +294,13 @@ int kmem_cache_shrink(struct kmem_cache *s); * To whitelist a single field for copying to/from usercopy, use this * macro instead for KMEM_CACHE() above. */ -#define KMEM_CACHE_USERCOPY(__struct, __flags, __field) \ - kmem_cache_create_usercopy(#__struct, \ - sizeof(struct __struct), \ - __alignof__(struct __struct), (__flags), \ - offsetof(struct __struct, __field), \ - sizeof_field(struct __struct, __field), NULL) +#define KMEM_CACHE_USERCOPY(__struct, __flags, __field) \ + __kmem_cache_create_args(#__struct, sizeof(struct __struct), \ + &(struct kmem_cache_args) { \ + .align = __alignof__(struct __struct), \ + .useroffset = offsetof(struct __struct, __field), \ + .usersize = sizeof_field(struct __struct, __field), \ + }, (__flags)) /* * Common kmalloc functions provided by all allocators From b2e7456b5c25c41eda7a8a15f7ccaa4e7579949f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:55 +0200 Subject: [PATCH 19/25] slab: create kmem_cache_create() compatibility layer Use _Generic() to create a compatibility layer that type switches on the third argument to either call __kmem_cache_create() or __kmem_cache_create_args(). If NULL is passed for the struct kmem_cache_args argument, use default args, making porting easy for callers that don't care about additional arguments. Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 29 ++++++++++++++++++++++++++--- mm/slab_common.c | 10 +++++----- 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 7e15a3a3edb1..15eb0a0defd6 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -262,9 +262,10 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, unsigned int object_size, struct kmem_cache_args *args, slab_flags_t flags); -struct kmem_cache *kmem_cache_create(const char *name, unsigned int size, - unsigned int align, slab_flags_t flags, - void (*ctor)(void *)); + +struct kmem_cache *__kmem_cache_create(const char *name, unsigned int size, + unsigned int align, slab_flags_t flags, + void (*ctor)(void *)); struct kmem_cache *kmem_cache_create_usercopy(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, @@ -273,6 +274,28 @@ struct kmem_cache *kmem_cache_create_usercopy(const char *name, struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size, unsigned int freeptr_offset, slab_flags_t flags); + +/* If NULL is passed for @args, use this variant with default arguments. */ +static inline struct kmem_cache * +__kmem_cache_default_args(const char *name, unsigned int size, + struct kmem_cache_args *args, + slab_flags_t flags) +{ + struct kmem_cache_args kmem_default_args = {}; + + /* Make sure we don't get passed garbage.
*/ + if (WARN_ON_ONCE(args)) + return ERR_PTR(-EINVAL); + + return __kmem_cache_create_args(name, size, &kmem_default_args, flags); +} + +#define kmem_cache_create(__name, __object_size, __args, ...) \ + _Generic((__args), \ + struct kmem_cache_args *: __kmem_cache_create_args, \ + void *: __kmem_cache_default_args, \ + default: __kmem_cache_create)(__name, __object_size, __args, __VA_ARGS__) + void kmem_cache_destroy(struct kmem_cache *s); int kmem_cache_shrink(struct kmem_cache *s); diff --git a/mm/slab_common.c b/mm/slab_common.c index 19ae3dd6e36f..418459927670 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -383,7 +383,7 @@ kmem_cache_create_usercopy(const char *name, unsigned int size, EXPORT_SYMBOL(kmem_cache_create_usercopy); /** - * kmem_cache_create - Create a cache. + * __kmem_cache_create - Create a cache. * @name: A string which is used in /proc/slabinfo to identify this cache. * @size: The size of objects to be created in this cache. * @align: The required alignment for the objects. @@ -407,9 +407,9 @@ EXPORT_SYMBOL(kmem_cache_create_usercopy); * * Return: a pointer to the cache on success, NULL on failure. */ -struct kmem_cache * -kmem_cache_create(const char *name, unsigned int size, unsigned int align, - slab_flags_t flags, void (*ctor)(void *)) +struct kmem_cache *__kmem_cache_create(const char *name, unsigned int size, + unsigned int align, slab_flags_t flags, + void (*ctor)(void *)) { struct kmem_cache_args kmem_args = { .align = align, @@ -418,7 +418,7 @@ kmem_cache_create(const char *name, unsigned int size, unsigned int align, return __kmem_cache_create_args(name, size, &kmem_args, flags); } -EXPORT_SYMBOL(kmem_cache_create); +EXPORT_SYMBOL(__kmem_cache_create); /** * kmem_cache_create_rcu - Create a SLAB_TYPESAFE_BY_RCU cache. From 5f7d25668217bb7841bc5784d2185618a01e336b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:56 +0200 Subject: [PATCH 20/25] file: port to struct kmem_cache_args Port filp_cache to struct kmem_cache_args. Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- fs/file_table.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/file_table.c b/fs/file_table.c index 3ef558f27a1c..861c03608e83 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -511,9 +511,14 @@ EXPORT_SYMBOL(__fput_sync); void __init files_init(void) { - filp_cachep = kmem_cache_create_rcu("filp", sizeof(struct file), - offsetof(struct file, f_freeptr), - SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); + struct kmem_cache_args args = { + .use_freeptr_offset = true, + .freeptr_offset = offsetof(struct file, f_freeptr), + }; + + filp_cachep = kmem_cache_create("filp", sizeof(struct file), &args, + SLAB_HWCACHE_ALIGN | SLAB_PANIC | + SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); percpu_counter_init(&nr_files, 0, GFP_KERNEL); } From 3d453e60f1a9c377d670d5fe306d3b9eed477bc0 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:57 +0200 Subject: [PATCH 21/25] slab: remove kmem_cache_create_rcu() Now that we have ported all users of kmem_cache_create_rcu() to struct kmem_cache_args the function is unused and can be removed. 
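A minimal before/after sketch of such a port (cachep, struct my_obj, and its freeptr field are hypothetical), mirroring the filp_cachep conversion above; note that the caller now passes SLAB_TYPESAFE_BY_RCU explicitly, since kmem_cache_create_rcu() used to OR it in internally:

	/* before */
	cachep = kmem_cache_create_rcu("my_obj", sizeof(struct my_obj),
				       offsetof(struct my_obj, freeptr),
				       SLAB_ACCOUNT);

	/* after */
	struct kmem_cache_args args = {
		.use_freeptr_offset = true,
		.freeptr_offset     = offsetof(struct my_obj, freeptr),
	};

	cachep = kmem_cache_create("my_obj", sizeof(struct my_obj), &args,
				   SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);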
Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 3 --- mm/slab_common.c | 43 ------------------------------------------- 2 files changed, 46 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 15eb0a0defd6..cb82a6414a28 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -271,9 +271,6 @@ struct kmem_cache *kmem_cache_create_usercopy(const char *name, slab_flags_t flags, unsigned int useroffset, unsigned int usersize, void (*ctor)(void *)); -struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size, - unsigned int freeptr_offset, - slab_flags_t flags); /* If NULL is passed for @args, use this variant with default arguments. */ static inline struct kmem_cache * diff --git a/mm/slab_common.c b/mm/slab_common.c index 418459927670..9133b9fafcb1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -420,49 +420,6 @@ struct kmem_cache *__kmem_cache_create(const char *name, unsigned int size, } EXPORT_SYMBOL(__kmem_cache_create); -/** - * kmem_cache_create_rcu - Create a SLAB_TYPESAFE_BY_RCU cache. - * @name: A string which is used in /proc/slabinfo to identify this cache. - * @size: The size of objects to be created in this cache. - * @freeptr_offset: The offset into the memory to the free pointer - * @flags: SLAB flags - * - * Cannot be called within an interrupt, but can be interrupted. - * - * See kmem_cache_create() for an explanation of possible @flags. - * - * By default SLAB_TYPESAFE_BY_RCU caches place the free pointer outside - * of the object. This might cause the object to grow in size. Callers - * that have a reason to avoid this can specify a custom free pointer - * offset in their struct where the free pointer will be placed. - * - * Note that placing the free pointer inside the object requires the - * caller to ensure that no fields are invalidated that are required to - * guard against object recycling (See SLAB_TYPESAFE_BY_RCU for - * details.). - * - * Using zero as a value for @freeptr_offset is valid. To request no - * offset UINT_MAX must be specified. - * - * Note that @ctor isn't supported with custom free pointers as a @ctor - * requires an external free pointer. - * - * Return: a pointer to the cache on success, NULL on failure. - */ -struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size, - unsigned int freeptr_offset, - slab_flags_t flags) -{ - struct kmem_cache_args kmem_args = { - .freeptr_offset = freeptr_offset, - .use_freeptr_offset = true, - }; - - return __kmem_cache_create_args(name, size, &kmem_args, - flags | SLAB_TYPESAFE_BY_RCU); -} -EXPORT_SYMBOL(kmem_cache_create_rcu); - static struct kmem_cache *kmem_buckets_cache __ro_after_init; /** From 0c9050b09cfb1ddb3fe4b2d401dca1fb674c825a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:58 +0200 Subject: [PATCH 22/25] slab: make kmem_cache_create_usercopy() static inline Make kmem_cache_create_usercopy() a static inline function. 
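Since kmem_cache_create() is now the _Generic() wrapper from the compatibility-layer patch earlier in the series, turning the legacy helpers into static inlines keeps every historical call form compiling unchanged. A sketch, with a hypothetical struct my_obj and an args struct as above:

	struct kmem_cache *c;

	/* legacy 5-parameter form: an integer align selects the default
	 * _Generic() branch, dispatching to __kmem_cache_create() */
	c = kmem_cache_create("my_obj", sizeof(struct my_obj), 0,
			      SLAB_ACCOUNT, NULL);

	/* new 4-parameter form: a struct kmem_cache_args * dispatches
	 * to __kmem_cache_create_args() */
	c = kmem_cache_create("my_obj", sizeof(struct my_obj), &args,
			      SLAB_ACCOUNT);

	/* defaults: NULL matches the void * branch, dispatching to
	 * __kmem_cache_default_args() */
	c = kmem_cache_create("my_obj", sizeof(struct my_obj), NULL,
			      SLAB_ACCOUNT);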
Signed-off-by: Christian Brauner Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 49 +++++++++++++++++++++++++++++++++++++++----- mm/slab_common.c | 45 ---------------------------------------- 2 files changed, 44 insertions(+), 50 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index cb82a6414a28..634717c9f73b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -266,11 +266,50 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, struct kmem_cache *__kmem_cache_create(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)); -struct kmem_cache *kmem_cache_create_usercopy(const char *name, - unsigned int size, unsigned int align, - slab_flags_t flags, - unsigned int useroffset, unsigned int usersize, - void (*ctor)(void *)); + +/** + * kmem_cache_create_usercopy - Create a cache with a region suitable + * for copying to userspace + * @name: A string which is used in /proc/slabinfo to identify this cache. + * @size: The size of objects to be created in this cache. + * @align: The required alignment for the objects. + * @flags: SLAB flags + * @useroffset: Usercopy region offset + * @usersize: Usercopy region size + * @ctor: A constructor for the objects. + * + * Cannot be called within an interrupt, but can be interrupted. + * The @ctor is run when new pages are allocated by the cache. + * + * The flags are + * + * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) + * to catch references to uninitialised memory. + * + * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check + * for buffer overruns. + * + * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware + * cacheline. This can be beneficial if you're counting cycles as closely + * as davem. + * + * Return: a pointer to the cache on success, NULL on failure. + */ +static inline struct kmem_cache * +kmem_cache_create_usercopy(const char *name, unsigned int size, + unsigned int align, slab_flags_t flags, + unsigned int useroffset, unsigned int usersize, + void (*ctor)(void *)) +{ + struct kmem_cache_args kmem_args = { + .align = align, + .ctor = ctor, + .useroffset = useroffset, + .usersize = usersize, + }; + + return __kmem_cache_create_args(name, size, &kmem_args, flags); +} /* If NULL is passed for @args, use this variant with default arguments. */ static inline struct kmem_cache * diff --git a/mm/slab_common.c b/mm/slab_common.c index 9133b9fafcb1..3477a3918afd 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -337,51 +337,6 @@ out_unlock: } EXPORT_SYMBOL(__kmem_cache_create_args); -/** - * kmem_cache_create_usercopy - Create a cache with a region suitable - * for copying to userspace - * @name: A string which is used in /proc/slabinfo to identify this cache. - * @size: The size of objects to be created in this cache. - * @align: The required alignment for the objects. - * @flags: SLAB flags - * @useroffset: Usercopy region offset - * @usersize: Usercopy region size - * @ctor: A constructor for the objects. - * - * Cannot be called within a interrupt, but can be interrupted. - * The @ctor is run when new pages are allocated by the cache. - * - * The flags are - * - * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) - * to catch references to uninitialised memory. - * - * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check - * for buffer overruns.
- * - * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware - * cacheline. This can be beneficial if you're counting cycles as closely - * as davem. - * - * Return: a pointer to the cache on success, NULL on failure. - */ -struct kmem_cache * -kmem_cache_create_usercopy(const char *name, unsigned int size, - unsigned int align, slab_flags_t flags, - unsigned int useroffset, unsigned int usersize, - void (*ctor)(void *)) -{ - struct kmem_cache_args kmem_args = { - .align = align, - .ctor = ctor, - .useroffset = useroffset, - .usersize = usersize, - }; - - return __kmem_cache_create_args(name, size, &kmem_args, flags); -} -EXPORT_SYMBOL(kmem_cache_create_usercopy); - /** * __kmem_cache_create - Create a cache. * @name: A string which is used in /proc/slabinfo to identify this cache. From 781aee755638dbb6dbfe022bdd2f732621ec9534 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:56:59 +0200 Subject: [PATCH 23/25] slab: make __kmem_cache_create() static inline Make __kmem_cache_create() a static inline function. Signed-off-by: Christian Brauner Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Roman Gushchin Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 13 ++++++++++--- mm/slab_common.c | 38 -------------------------------------- 2 files changed, 10 insertions(+), 41 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 634717c9f73b..331412a9f4f2 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -262,10 +262,17 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, unsigned int object_size, struct kmem_cache_args *args, slab_flags_t flags); +static inline struct kmem_cache * +__kmem_cache_create(const char *name, unsigned int size, unsigned int align, + slab_flags_t flags, void (*ctor)(void *)) +{ + struct kmem_cache_args kmem_args = { + .align = align, + .ctor = ctor, + }; -struct kmem_cache *__kmem_cache_create(const char *name, unsigned int size, - unsigned int align, slab_flags_t flags, - void (*ctor)(void *)); + return __kmem_cache_create_args(name, size, &kmem_args, flags); +} /** * kmem_cache_create_usercopy - Create a cache with a region suitable diff --git a/mm/slab_common.c b/mm/slab_common.c index 3477a3918afd..30000dcf0736 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -337,44 +337,6 @@ out_unlock: } EXPORT_SYMBOL(__kmem_cache_create_args); -/** - * __kmem_cache_create - Create a cache. - * @name: A string which is used in /proc/slabinfo to identify this cache. - * @size: The size of objects to be created in this cache. - * @align: The required alignment for the objects. - * @flags: SLAB flags - * @ctor: A constructor for the objects. - * - * Cannot be called within a interrupt, but can be interrupted. - * The @ctor is run when new pages are allocated by the cache. - * - * The flags are - * - * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) - * to catch references to uninitialised memory. - * - * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check - * for buffer overruns. - * - * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware - * cacheline. This can be beneficial if you're counting cycles as closely - * as davem. - * - * Return: a pointer to the cache on success, NULL on failure. 
- */ -struct kmem_cache *__kmem_cache_create(const char *name, unsigned int size, - unsigned int align, slab_flags_t flags, - void (*ctor)(void *)) -{ - struct kmem_cache_args kmem_args = { - .align = align, - .ctor = ctor, - }; - - return __kmem_cache_create_args(name, size, &kmem_args, flags); -} -EXPORT_SYMBOL(__kmem_cache_create); - static struct kmem_cache *kmem_buckets_cache __ro_after_init; /** From a6711d1cd4e291f334ae900065e18f585732acfa Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 5 Sep 2024 09:57:00 +0200 Subject: [PATCH 24/25] io_uring: port to struct kmem_cache_args Port req_cachep to struct kmem_cache_args. Reviewed-by: Kees Cook Reviewed-by: Jens Axboe Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner Signed-off-by: Vlastimil Babka --- io_uring/io_uring.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3942db160f18..d9d721d1424e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3638,6 +3638,11 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries, static int __init io_uring_init(void) { + struct kmem_cache_args kmem_args = { + .useroffset = offsetof(struct io_kiocb, cmd.data), + .usersize = sizeof_field(struct io_kiocb, cmd.data), + }; + #define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \ BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \ BUILD_BUG_ON(sizeof_field(stype, ename) != esize); \ @@ -3722,12 +3727,9 @@ static int __init io_uring_init(void) * range, and HARDENED_USERCOPY will complain if we haven't * correctly annotated this range. */ - req_cachep = kmem_cache_create_usercopy("io_kiocb", - sizeof(struct io_kiocb), 0, - SLAB_HWCACHE_ALIGN | SLAB_PANIC | - SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU, - offsetof(struct io_kiocb, cmd.data), - sizeof_field(struct io_kiocb, cmd.data), NULL); + req_cachep = kmem_cache_create("io_kiocb", sizeof(struct io_kiocb), &kmem_args, + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT | + SLAB_TYPESAFE_BY_RCU); io_buf_cachep = KMEM_CACHE(io_buffer, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); From 4b7ff9ab98af11a477d50f08382bcc4c2f899926 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 13 Sep 2024 10:15:56 +0200 Subject: [PATCH 25/25] mm, slab: restore kerneldoc for kmem_cache_create() As kmem_cache_create() became a _Generic() wrapper macro, it currently has no kerneldoc despite being the main API to use. Add it. Also adjust kmem_cache_create_usercopy() kerneldoc to indicate it is now a legacy wrapper. Also expand the kerneldoc for struct kmem_cache_args, especially for the freeptr_offset field, where important details were removed with the removal of kmem_cache_create_rcu(). Signed-off-by: Vlastimil Babka Reviewed-by: Christian Brauner --- include/linux/slab.h | 114 ++++++++++++++++++++++++++++++++++--------- mm/slab_common.c | 10 ++-- 2 files changed, 98 insertions(+), 26 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 331412a9f4f2..6a8ab7ef3af7 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -242,19 +242,72 @@ bool slab_is_available(void); /** * struct kmem_cache_args - Less common arguments for kmem_cache_create() - * @align: The required alignment for the objects. - * @useroffset: Usercopy region offset - * @usersize: Usercopy region size - * @freeptr_offset: Custom offset for the free pointer in RCU caches - * @use_freeptr_offset: Whether a @freeptr_offset is used - * @ctor: A constructor for the objects. 
+ * + * Any uninitialized fields of the structure are interpreted as unused. The + * exception is @freeptr_offset where %0 is a valid value, so + * @use_freeptr_offset must also be set to %true in order to interpret the field + * as used. For @useroffset %0 is also valid, but only with non-%0 + * @usersize. + * + * When %NULL args is passed to kmem_cache_create(), it is equivalent to all + * fields unused. */ struct kmem_cache_args { + /** + * @align: The required alignment for the objects. + * + * %0 means no specific alignment is requested. + */ unsigned int align; + /** + * @useroffset: Usercopy region offset. + * + * %0 is a valid offset when @usersize is non-%0. + */ unsigned int useroffset; + /** + * @usersize: Usercopy region size. + * + * %0 means no usercopy region is specified. + */ unsigned int usersize; + /** + * @freeptr_offset: Custom offset for the free pointer + * in &SLAB_TYPESAFE_BY_RCU caches + * + * By default &SLAB_TYPESAFE_BY_RCU caches place the free pointer + * outside of the object. This might cause the object to grow in size. + * Cache creators that have a reason to avoid this can specify a custom + * free pointer offset in their struct where the free pointer will be + * placed. + * + * Note that placing the free pointer inside the object requires the + * caller to ensure that no fields are invalidated that are required to + * guard against object recycling (See &SLAB_TYPESAFE_BY_RCU for + * details). + * + * Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset + * is specified, @use_freeptr_offset must be set to %true. + * + * Note that @ctor currently isn't supported with custom free pointers + * as a @ctor requires an external free pointer. + */ unsigned int freeptr_offset; + /** + * @use_freeptr_offset: Whether a @freeptr_offset is used. + */ bool use_freeptr_offset; + /** + * @ctor: A constructor for the objects. + * + * The constructor is invoked for each object in a newly allocated slab + * page. It is the cache user's responsibility to free the object in the + * same state as after calling the constructor, or deal appropriately + * with any differences between a freshly constructed and a reallocated + * object. + * + * %NULL means no constructor. + */ void (*ctor)(void *); }; @@ -275,30 +328,20 @@ __kmem_cache_create(const char *name, unsigned int size, unsigned int align, } /** - * kmem_cache_create_usercopy - Create a cache with a region suitable - * for copying to userspace + * kmem_cache_create_usercopy - Create a kmem cache with a region suitable + * for copying to userspace. * @name: A string which is used in /proc/slabinfo to identify this cache. * @size: The size of objects to be created in this cache. * @align: The required alignment for the objects. * @flags: SLAB flags * @useroffset: Usercopy region offset * @usersize: Usercopy region size - * @ctor: A constructor for the objects. + * @ctor: A constructor for the objects, or %NULL. * - * Cannot be called within a interrupt, but can be interrupted. - * The @ctor is run when new pages are allocated by the cache. - * - * The flags are - * - * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) - * to catch references to uninitialised memory. - * - * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check - * for buffer overruns. - * - * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware - * cacheline. This can be beneficial if you're counting cycles as closely - * as davem.
+ * This is a legacy wrapper; new code should use either KMEM_CACHE_USERCOPY() + * if whitelisting a single field is sufficient, or kmem_cache_create() with + * the necessary parameters passed via the args parameter (see + * &struct kmem_cache_args). * * Return: a pointer to the cache on success, NULL on failure. */ @@ -333,6 +376,31 @@ __kmem_cache_default_args(const char *name, unsigned int size, return __kmem_cache_create_args(name, size, &kmem_default_args, flags); } +/** + * kmem_cache_create - Create a kmem cache. + * @__name: A string which is used in /proc/slabinfo to identify this cache. + * @__object_size: The size of objects to be created in this cache. + * @__args: Optional arguments, see &struct kmem_cache_args. Passing %NULL + * means defaults will be used for all the arguments. + * + * This is currently implemented as a macro using ``_Generic()`` to call + * either the new variant of the function, or a legacy one. + * + * The new variant has 4 parameters: + * ``kmem_cache_create(name, object_size, args, flags)`` + * + * See __kmem_cache_create_args() which implements this. + * + * The legacy variant has 5 parameters: + * ``kmem_cache_create(name, object_size, align, flags, ctor)`` + * + * The align and ctor parameters map to the respective fields of + * &struct kmem_cache_args. + * + * Context: Cannot be called within an interrupt, but can be interrupted. + * + * Return: a pointer to the cache on success, NULL on failure. + */ #define kmem_cache_create(__name, __object_size, __args, ...) \ _Generic((__args), \ struct kmem_cache_args *: __kmem_cache_create_args, \ void *: __kmem_cache_default_args, \ default: __kmem_cache_create)(__name, __object_size, __args, __VA_ARGS__) diff --git a/mm/slab_common.c b/mm/slab_common.c index 30000dcf0736..86c2e6f4a1ce 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -239,13 +239,17 @@ out: } /** - * __kmem_cache_create_args - Create a kmem cache + * __kmem_cache_create_args - Create a kmem cache. * @name: A string which is used in /proc/slabinfo to identify this cache. * @object_size: The size of objects to be created in this cache. - * @args: Arguments for the cache creation (see struct kmem_cache_args). + * @args: Additional arguments for the cache creation (see + * &struct kmem_cache_args). * @flags: See %SLAB_* flags for an explanation of individual @flags. * - * Cannot be called within a interrupt, but can be interrupted. + * Not to be called directly; use the kmem_cache_create() wrapper with the same + * parameters. + * + * Context: Cannot be called within an interrupt, but can be interrupted. * * Return: a pointer to the cache on success, NULL on failure. */
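To close, a sketch of the subtle case the expanded kerneldoc above calls out: %0 is a perfectly valid @freeptr_offset, but only together with @use_freeptr_offset set to %true, and without a @ctor since constructors aren't supported with custom free pointers. The struct layout and names are hypothetical:

	struct my_rcu_obj {
		void *freelist_slot;	/* offset 0, reused by the allocator once freed */
		unsigned long payload;
	};

	static struct kmem_cache *cachep;

	struct kmem_cache_args args = {
		.freeptr_offset     = offsetof(struct my_rcu_obj, freelist_slot), /* 0 */
		.use_freeptr_offset = true,	/* required so the 0 offset reads as "set" */
	};

	cachep = kmem_cache_create("my_rcu_obj", sizeof(struct my_rcu_obj),
				   &args, SLAB_TYPESAFE_BY_RCU);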