linux/block/ioprio.c
Jan Kara 7ad47f414b block: fix default IO priority handling again
commit e589f46445 upstream.

Commit e70344c059 ("block: fix default IO priority handling")
introduced an inconsistency in get_current_ioprio() that tasks without
IO context return IOPRIO_DEFAULT priority while tasks with freshly
allocated IO context will return 0 (IOPRIO_CLASS_NONE/0) IO priority.
Tasks without IO context used to be rare before 5a9d041ba2 ("block:
move io_context creation into where it's needed") but after this commit
they became common because now only BFQ IO scheduler setups task's IO
context. Similar inconsistency is there for get_task_ioprio() so this
inconsistency is now exposed to userspace and userspace will see
different IO priority for tasks operating on devices with BFQ compared
to devices without BFQ. Furthemore the changes done by commit
e70344c059 change the behavior when no IO priority is set for BFQ IO
scheduler which is also documented in ioprio_set(2) manpage:

"If no I/O scheduler has been set for a thread, then by default the I/O
priority will follow the CPU nice value (setpriority(2)).  In Linux
kernels before version 2.6.24, once an I/O priority had been set using
ioprio_set(), there was no way to reset the I/O scheduling behavior to
the default. Since Linux 2.6.24, specifying ioprio as 0 can be used to
reset to the default I/O scheduling behavior."

So make sure we default to IOPRIO_CLASS_NONE as used to be the case
before commit e70344c059. Also cleanup alloc_io_context() to
explicitely set this IO priority for the allocated IO context to avoid
future surprises. Note that we tweak ioprio_best() to maintain
ioprio_get(2) behavior and make this commit easily backportable.

CC: stable@vger.kernel.org
Fixes: e70344c059 ("block: fix default IO priority handling")
Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Tested-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20220623074840.5960-1-jack@suse.cz
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2022-08-11 13:07:50 +02:00

269 lines
5.5 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* fs/ioprio.c
*
* Copyright (C) 2004 Jens Axboe <axboe@kernel.dk>
*
* Helper functions for setting/querying io priorities of processes. The
* system calls closely mimmick getpriority/setpriority, see the man page for
* those. The prio argument is a composite of prio class and prio data, where
* the data argument has meaning within that class. The standard scheduling
* classes have 8 distinct prio levels, with 0 being the highest prio and 7
* being the lowest.
*
* IOW, setting BE scheduling class with prio 2 is done ala:
*
* unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2;
*
* ioprio_set(PRIO_PROCESS, pid, prio);
*
* See also Documentation/block/ioprio.rst
*
*/
#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/ioprio.h>
#include <linux/cred.h>
#include <linux/blkdev.h>
#include <linux/capability.h>
#include <linux/sched/user.h>
#include <linux/sched/task.h>
#include <linux/syscalls.h>
#include <linux/security.h>
#include <linux/pid_namespace.h>
int set_task_ioprio(struct task_struct *task, int ioprio)
{
int err;
struct io_context *ioc;
const struct cred *cred = current_cred(), *tcred;
rcu_read_lock();
tcred = __task_cred(task);
if (!uid_eq(tcred->uid, cred->euid) &&
!uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) {
rcu_read_unlock();
return -EPERM;
}
rcu_read_unlock();
err = security_task_setioprio(task, ioprio);
if (err)
return err;
ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
if (ioc) {
ioc->ioprio = ioprio;
put_io_context(ioc);
}
return err;
}
EXPORT_SYMBOL_GPL(set_task_ioprio);
int ioprio_check_cap(int ioprio)
{
int class = IOPRIO_PRIO_CLASS(ioprio);
int data = IOPRIO_PRIO_DATA(ioprio);
switch (class) {
case IOPRIO_CLASS_RT:
/*
* Originally this only checked for CAP_SYS_ADMIN,
* which was implicitly allowed for pid 0 by security
* modules such as SELinux. Make sure we check
* CAP_SYS_ADMIN first to avoid a denial/avc for
* possibly missing CAP_SYS_NICE permission.
*/
if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE))
return -EPERM;
fallthrough;
/* rt has prio field too */
case IOPRIO_CLASS_BE:
if (data >= IOPRIO_NR_LEVELS || data < 0)
return -EINVAL;
break;
case IOPRIO_CLASS_IDLE:
break;
case IOPRIO_CLASS_NONE:
if (data)
return -EINVAL;
break;
default:
return -EINVAL;
}
return 0;
}
SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
{
struct task_struct *p, *g;
struct user_struct *user;
struct pid *pgrp;
kuid_t uid;
int ret;
ret = ioprio_check_cap(ioprio);
if (ret)
return ret;
ret = -ESRCH;
rcu_read_lock();
switch (which) {
case IOPRIO_WHO_PROCESS:
if (!who)
p = current;
else
p = find_task_by_vpid(who);
if (p)
ret = set_task_ioprio(p, ioprio);
break;
case IOPRIO_WHO_PGRP:
if (!who)
pgrp = task_pgrp(current);
else
pgrp = find_vpid(who);
read_lock(&tasklist_lock);
do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
ret = set_task_ioprio(p, ioprio);
if (ret) {
read_unlock(&tasklist_lock);
goto out;
}
} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
read_unlock(&tasklist_lock);
break;
case IOPRIO_WHO_USER:
uid = make_kuid(current_user_ns(), who);
if (!uid_valid(uid))
break;
if (!who)
user = current_user();
else
user = find_user(uid);
if (!user)
break;
for_each_process_thread(g, p) {
if (!uid_eq(task_uid(p), uid) ||
!task_pid_vnr(p))
continue;
ret = set_task_ioprio(p, ioprio);
if (ret)
goto free_uid;
}
free_uid:
if (who)
free_uid(user);
break;
default:
ret = -EINVAL;
}
out:
rcu_read_unlock();
return ret;
}
static int get_task_ioprio(struct task_struct *p)
{
int ret;
ret = security_task_getioprio(p);
if (ret)
goto out;
ret = IOPRIO_DEFAULT;
task_lock(p);
if (p->io_context)
ret = p->io_context->ioprio;
task_unlock(p);
out:
return ret;
}
int ioprio_best(unsigned short aprio, unsigned short bprio)
{
if (!ioprio_valid(aprio))
aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_BE_NORM);
if (!ioprio_valid(bprio))
bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_BE_NORM);
return min(aprio, bprio);
}
SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
{
struct task_struct *g, *p;
struct user_struct *user;
struct pid *pgrp;
kuid_t uid;
int ret = -ESRCH;
int tmpio;
rcu_read_lock();
switch (which) {
case IOPRIO_WHO_PROCESS:
if (!who)
p = current;
else
p = find_task_by_vpid(who);
if (p)
ret = get_task_ioprio(p);
break;
case IOPRIO_WHO_PGRP:
if (!who)
pgrp = task_pgrp(current);
else
pgrp = find_vpid(who);
read_lock(&tasklist_lock);
do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
tmpio = get_task_ioprio(p);
if (tmpio < 0)
continue;
if (ret == -ESRCH)
ret = tmpio;
else
ret = ioprio_best(ret, tmpio);
} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
read_unlock(&tasklist_lock);
break;
case IOPRIO_WHO_USER:
uid = make_kuid(current_user_ns(), who);
if (!who)
user = current_user();
else
user = find_user(uid);
if (!user)
break;
for_each_process_thread(g, p) {
if (!uid_eq(task_uid(p), user->uid) ||
!task_pid_vnr(p))
continue;
tmpio = get_task_ioprio(p);
if (tmpio < 0)
continue;
if (ret == -ESRCH)
ret = tmpio;
else
ret = ioprio_best(ret, tmpio);
}
if (who)
free_uid(user);
break;
default:
ret = -EINVAL;
}
rcu_read_unlock();
return ret;
}