From 92fb1db26eefc11554820f11ce8e92007da2fbf4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 7 Jun 2020 21:40:04 -0700 Subject: [PATCH 01/52] mm/page_idle.c: skip offline pages 'Idle page tracking' users can pass random pfn that might be mapped to an offline page. To avoid accessing such pages, this commit modifies the 'page_idle_get_page()' to use 'pfn_to_online_page()' instead of 'pfn_valid()' and 'pfn_to_page()' combination, so that the pfn mapped to an offline page can be skipped. Reported-by: David Hildenbrand Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Reviewed-by: Pankaj Gupta Link: http://lkml.kernel.org/r/20200605092502.18018-2-sjpark@amazon.com Signed-off-by: Linus Torvalds --- mm/page_idle.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/mm/page_idle.c b/mm/page_idle.c index 295512465065..057c61df12db 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -30,13 +31,9 @@ */ static struct page *page_idle_get_page(unsigned long pfn) { - struct page *page; + struct page *page = pfn_to_online_page(pfn); pg_data_t *pgdat; - if (!pfn_valid(pfn)) - return NULL; - - page = pfn_to_page(pfn); if (!page || !PageLRU(page) || !get_page_unless_zero(page)) return NULL; From 4b78e2013a374f379457ceba7dc860d9fc28156b Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Sun, 7 Jun 2020 21:40:07 -0700 Subject: [PATCH 02/52] ipc/msg: add missing annotation for freeque() Sparse reports a warning at freeque() warning: context imbalance in freeque() - unexpected unlock The root cause is the missing annotation at freeque() Add the missing __releases(RCU) annotation Add the missing __releases(&msq->q_perm) annotation Signed-off-by: Jules Irenge Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Cc: Boqun Feng Cc: Lu Shuaibing Cc: Nathan Chancellor Cc: Manfred Spraul Cc: Davidlohr Bueso Link: http://lkml.kernel.org/r/20200403160505.2832-2-jbi.octave@gmail.com Signed-off-by: Linus Torvalds --- ipc/msg.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ipc/msg.c b/ipc/msg.c index caca67368cb5..acd1bc7af55a 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -268,6 +268,8 @@ static void expunge_all(struct msg_queue *msq, int res, * before freeque() is called. msg_ids.rwsem remains locked on exit. */ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) + __releases(RCU) + __releases(&msq->q_perm) { struct msg_msg *msg, *t; struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm); From e1eb26fa62d04ec0955432be1aa8722a97cb52e7 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Sun, 7 Jun 2020 21:40:10 -0700 Subject: [PATCH 03/52] ipc/namespace.c: use a work queue to free_ipc the reason is to avoid a delay caused by the synchronize_rcu() call in kern_umount() when the mqueue mount is freed. the code: #define _GNU_SOURCE #include #include #include #include int main() { int i; for (i = 0; i < 1000; i++) if (unshare(CLONE_NEWIPC) < 0) error(EXIT_FAILURE, errno, "unshare"); } goes from Command being timed: "./ipc-namespace" User time (seconds): 0.00 System time (seconds): 0.06 Percent of CPU this job got: 0% Elapsed (wall clock) time (h:mm:ss or m:ss): 0:08.05 to Command being timed: "./ipc-namespace" User time (seconds): 0.00 System time (seconds): 0.02 Percent of CPU this job got: 96% Elapsed (wall clock) time (h:mm:ss or m:ss): 0:00.03 Signed-off-by: Giuseppe Scrivano Signed-off-by: Andrew Morton Reviewed-by: Paul E. McKenney Reviewed-by: Waiman Long Cc: Davidlohr Bueso Cc: Manfred Spraul Link: http://lkml.kernel.org/r/20200225145419.527994-1-gscrivan@redhat.com Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 2 ++ ipc/namespace.c | 24 ++++++++++++++++++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index c309f43bde45..a06a78c67f19 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -68,6 +68,8 @@ struct ipc_namespace { struct user_namespace *user_ns; struct ucounts *ucounts; + struct llist_node mnt_llist; + struct ns_common ns; } __randomize_layout; diff --git a/ipc/namespace.c b/ipc/namespace.c index fdc3b5f3f53a..24e7b45320f7 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -117,6 +117,10 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, static void free_ipc_ns(struct ipc_namespace *ns) { + /* mq_put_mnt() waits for a grace period as kern_unmount() + * uses synchronize_rcu(). + */ + mq_put_mnt(ns); sem_exit_ns(ns); msg_exit_ns(ns); shm_exit_ns(ns); @@ -127,6 +131,21 @@ static void free_ipc_ns(struct ipc_namespace *ns) kfree(ns); } +static LLIST_HEAD(free_ipc_list); +static void free_ipc(struct work_struct *unused) +{ + struct llist_node *node = llist_del_all(&free_ipc_list); + struct ipc_namespace *n, *t; + + llist_for_each_entry_safe(n, t, node, mnt_llist) + free_ipc_ns(n); +} + +/* + * The work queue is used to avoid the cost of synchronize_rcu in kern_unmount. + */ +static DECLARE_WORK(free_ipc_work, free_ipc); + /* * put_ipc_ns - drop a reference to an ipc namespace. * @ns: the namespace to put @@ -148,8 +167,9 @@ void put_ipc_ns(struct ipc_namespace *ns) if (refcount_dec_and_lock(&ns->count, &mq_lock)) { mq_clear_sbinfo(ns); spin_unlock(&mq_lock); - mq_put_mnt(ns); - free_ipc_ns(ns); + + if (llist_add(&ns->mnt_llist, &free_ipc_list)) + schedule_work(&free_ipc_work); } } From ceabef7dd71720aef58bd182943352c9c307a3de Mon Sep 17 00:00:00 2001 From: Orson Zhai Date: Sun, 7 Jun 2020 21:40:14 -0700 Subject: [PATCH 04/52] dynamic_debug: add an option to enable dynamic debug for modules only Instead of enabling dynamic debug globally with CONFIG_DYNAMIC_DEBUG, CONFIG_DYNAMIC_DEBUG_CORE will only enable core function of dynamic debug. With the DYNAMIC_DEBUG_MODULE defined for any modules, dynamic debug will be tied to them. This is useful for people who only want to enable dynamic debug for kernel modules without worrying about kernel image size and memory consumption is increasing too much. [orson.zhai@unisoc.com: v2] Link: http://lkml.kernel.org/r/1587408228-10861-1-git-send-email-orson.unisoc@gmail.com Signed-off-by: Orson Zhai Signed-off-by: Andrew Morton Acked-by: Greg Kroah-Hartman Acked-by: Petr Mladek Cc: Jonathan Corbet Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Jason Baron Cc: Randy Dunlap Link: http://lkml.kernel.org/r/1586521984-5890-1-git-send-email-orson.unisoc@gmail.com Signed-off-by: Linus Torvalds --- Documentation/admin-guide/dynamic-debug-howto.rst | 5 +++++ include/linux/dev_printk.h | 6 ++++-- include/linux/dynamic_debug.h | 2 +- include/linux/net.h | 3 ++- include/linux/netdevice.h | 6 ++++-- include/linux/printk.h | 9 ++++++--- include/rdma/ib_verbs.h | 6 ++++-- lib/Kconfig.debug | 12 ++++++++++++ lib/Makefile | 2 +- lib/dynamic_debug.c | 9 +++++++-- 10 files changed, 46 insertions(+), 14 deletions(-) diff --git a/Documentation/admin-guide/dynamic-debug-howto.rst b/Documentation/admin-guide/dynamic-debug-howto.rst index 0dc2eb8e44e5..1012bd9305e9 100644 --- a/Documentation/admin-guide/dynamic-debug-howto.rst +++ b/Documentation/admin-guide/dynamic-debug-howto.rst @@ -13,6 +13,11 @@ kernel code to obtain additional kernel information. Currently, if ``print_hex_dump_debug()``/``print_hex_dump_bytes()`` calls can be dynamically enabled per-callsite. +If you do not want to enable dynamic debug globally (i.e. in some embedded +system), you may set ``CONFIG_DYNAMIC_DEBUG_CORE`` as basic support of dynamic +debug and add ``ccflags := -DDYNAMIC_DEBUG_MODULE`` into the Makefile of any +modules which you'd like to dynamically debug later. + If ``CONFIG_DYNAMIC_DEBUG`` is not set, ``print_hex_dump_debug()`` is just shortcut for ``print_hex_dump(KERN_DEBUG)``. diff --git a/include/linux/dev_printk.h b/include/linux/dev_printk.h index 5aad06b4ca7b..3028b644b4fb 100644 --- a/include/linux/dev_printk.h +++ b/include/linux/dev_printk.h @@ -109,7 +109,8 @@ void _dev_info(const struct device *dev, const char *fmt, ...) #define dev_info(dev, fmt, ...) \ _dev_info(dev, dev_fmt(fmt), ##__VA_ARGS__) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define dev_dbg(dev, fmt, ...) \ dynamic_dev_dbg(dev, dev_fmt(fmt), ##__VA_ARGS__) #elif defined(DEBUG) @@ -181,7 +182,8 @@ do { \ dev_level_ratelimited(dev_notice, dev, fmt, ##__VA_ARGS__) #define dev_info_ratelimited(dev, fmt, ...) \ dev_level_ratelimited(dev_info, dev, fmt, ##__VA_ARGS__) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define dev_dbg_ratelimited(dev, fmt, ...) \ do { \ diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 4cf02ecd67de..abcd5fde30eb 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -48,7 +48,7 @@ struct _ddebug { -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG_CORE) int ddebug_add_module(struct _ddebug *tab, unsigned int n, const char *modname); extern int ddebug_remove_module(const char *mod_name); diff --git a/include/linux/net.h b/include/linux/net.h index e10f378194a5..016a9c5faa34 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -264,7 +264,8 @@ do { \ net_ratelimited_function(pr_warn, fmt, ##__VA_ARGS__) #define net_info_ratelimited(fmt, ...) \ net_ratelimited_function(pr_info, fmt, ##__VA_ARGS__) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define net_dbg_ratelimited(fmt, ...) \ do { \ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1a96e9c4ec36..5b364a2e0006 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4942,7 +4942,8 @@ do { \ #define MODULE_ALIAS_NETDEV(device) \ MODULE_ALIAS("netdev-" device) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define netdev_dbg(__dev, format, args...) \ do { \ dynamic_netdev_dbg(__dev, format, ##args); \ @@ -5012,7 +5013,8 @@ do { \ #define netif_info(priv, type, dev, fmt, args...) \ netif_level(info, priv, type, dev, fmt, ##args) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define netif_dbg(priv, type, netdev, format, args...) \ do { \ if (netif_msg_##type(priv)) \ diff --git a/include/linux/printk.h b/include/linux/printk.h index 3cc2f178bf06..fc8f03c54543 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -399,7 +399,8 @@ extern int kptr_restrict; /* If you are writing a driver, please use dev_dbg instead */ -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #include /** @@ -535,7 +536,8 @@ extern int kptr_restrict; #endif /* If you are writing a driver, please use dev_dbg instead */ -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define pr_debug_ratelimited(fmt, ...) \ do { \ @@ -582,7 +584,8 @@ static inline void print_hex_dump_bytes(const char *prefix_str, int prefix_type, #endif -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define print_hex_dump_debug(prefix_str, prefix_type, rowsize, \ groupsize, buf, len, ascii) \ dynamic_hex_dump(prefix_str, prefix_type, rowsize, \ diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 033e7044f29c..ef2f3986c493 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -100,7 +100,8 @@ void ibdev_notice(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_info(const struct ib_device *ibdev, const char *format, ...); -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define ibdev_dbg(__dev, format, args...) \ dynamic_ibdev_dbg(__dev, format, ##args) #else @@ -133,7 +134,8 @@ do { \ #define ibdev_info_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_info, ibdev, fmt, ##__VA_ARGS__) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define ibdev_dbg_ratelimited(ibdev, fmt, ...) \ do { \ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 9bd4eb7f5ec1..333e878d8af9 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -99,6 +99,7 @@ config DYNAMIC_DEBUG default n depends on PRINTK depends on (DEBUG_FS || PROC_FS) + select DYNAMIC_DEBUG_CORE help Compiles debug level messages into the kernel, which would not @@ -165,6 +166,17 @@ config DYNAMIC_DEBUG See Documentation/admin-guide/dynamic-debug-howto.rst for additional information. +config DYNAMIC_DEBUG_CORE + bool "Enable core function of dynamic debug support" + depends on PRINTK + depends on (DEBUG_FS || PROC_FS) + help + Enable core functional support of dynamic debug. It is useful + when you want to tie dynamic debug to your kernel modules with + DYNAMIC_DEBUG_MODULE defined for each of them, especially for + the case of embedded system where the kernel image size is + sensitive for people. + config SYMBOLIC_ERRNAME bool "Support symbolic error names in printf" default y if PRINTK diff --git a/lib/Makefile b/lib/Makefile index 32f19b4d1d2a..315516fa4ef4 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -190,7 +190,7 @@ lib-$(CONFIG_GENERIC_BUG) += bug.o obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o -obj-$(CONFIG_DYNAMIC_DEBUG) += dynamic_debug.o +obj-$(CONFIG_DYNAMIC_DEBUG_CORE) += dynamic_debug.o obj-$(CONFIG_SYMBOLIC_ERRNAME) += errname.o obj-$(CONFIG_NLATTR) += nlattr.o diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 8f199f403ab5..321437bbf87d 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -1032,8 +1032,13 @@ static int __init dynamic_debug_init(void) int verbose_bytes = 0; if (&__start___verbose == &__stop___verbose) { - pr_warn("_ddebug table is empty in a CONFIG_DYNAMIC_DEBUG build\n"); - return 1; + if (IS_ENABLED(CONFIG_DYNAMIC_DEBUG)) { + pr_warn("_ddebug table is empty in a CONFIG_DYNAMIC_DEBUG build\n"); + return 1; + } + pr_info("Ignore empty _ddebug table in a CONFIG_DYNAMIC_DEBUG_CORE build\n"); + ddebug_init_success = 1; + return 0; } iter = __start___verbose; modname = iter->modname; From db38d5c106dfdd7cb7207c83267d82fdf4950b61 Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Sun, 7 Jun 2020 21:40:17 -0700 Subject: [PATCH 05/52] kernel: add panic_on_taint Analogously to the introduction of panic_on_warn, this patch introduces a kernel option named panic_on_taint in order to provide a simple and generic way to stop execution and catch a coredump when the kernel gets tainted by any given flag. This is useful for debugging sessions as it avoids having to rebuild the kernel to explicitly add calls to panic() into the code sites that introduce the taint flags of interest. For instance, if one is interested in proceeding with a post-mortem analysis at the point a given code path is hitting a bad page (i.e. unaccount_page_cache_page(), or slab_bug()), a coredump can be collected by rebooting the kernel with 'panic_on_taint=0x20' amended to the command line. Another, perhaps less frequent, use for this option would be as a means for assuring a security policy case where only a subset of taints, or no single taint (in paranoid mode), is allowed for the running system. The optional switch 'nousertaint' is handy in this particular scenario, as it will avoid userspace induced crashes by writes to sysctl interface /proc/sys/kernel/tainted causing false positive hits for such policies. [akpm@linux-foundation.org: tweak kernel-parameters.txt wording] Suggested-by: Qian Cai Signed-off-by: Rafael Aquini Signed-off-by: Andrew Morton Reviewed-by: Luis Chamberlain Cc: Dave Young Cc: Baoquan He Cc: Jonathan Corbet Cc: Kees Cook Cc: Randy Dunlap Cc: "Theodore Ts'o" Cc: Adrian Bunk Cc: Greg Kroah-Hartman Cc: Laura Abbott Cc: Jeff Mahoney Cc: Jiri Kosina Cc: Takashi Iwai Link: http://lkml.kernel.org/r/20200515175502.146720-1-aquini@redhat.com Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kdump/kdump.rst | 8 +++++ .../admin-guide/kernel-parameters.txt | 13 +++++++ Documentation/admin-guide/sysctl/kernel.rst | 7 ++++ include/linux/kernel.h | 3 ++ kernel/panic.c | 34 +++++++++++++++++++ kernel/sysctl.c | 11 +++++- 6 files changed, 75 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst index ac7e131d2935..2da65fef2a1c 100644 --- a/Documentation/admin-guide/kdump/kdump.rst +++ b/Documentation/admin-guide/kdump/kdump.rst @@ -521,6 +521,14 @@ will cause a kdump to occur at the panic() call. In cases where a user wants to specify this during runtime, /proc/sys/kernel/panic_on_warn can be set to 1 to achieve the same behaviour. +Trigger Kdump on add_taint() +============================ + +The kernel parameter panic_on_taint facilitates a conditional call to panic() +from within add_taint() whenever the value set in this bitmask matches with the +bit flag being set by add_taint(). +This will cause a kdump to occur at the add_taint()->panic() call. + Contact ======= diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f3eeecbb3f63..df9b0fe2ed60 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3447,6 +3447,19 @@ bit 4: print ftrace buffer bit 5: print all printk messages in buffer + panic_on_taint= Bitmask for conditionally calling panic() in add_taint() + Format: [,nousertaint] + Hexadecimal bitmask representing the set of TAINT flags + that will cause the kernel to panic when add_taint() is + called with any of the flags in this set. + The optional switch "nousertaint" can be utilized to + prevent userspace forced crashes by writing to sysctl + /proc/sys/kernel/tainted any flagset matching with the + bitmask set on panic_on_taint. + See Documentation/admin-guide/tainted-kernels.rst for + extra details on the taint flags that users can pick + to compose the bitmask to assign to panic_on_taint. + panic_on_warn panic() instead of WARN(). Useful to cause kdump on a WARN(). diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 1ebf68d01141..3b00b9223157 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -1239,6 +1239,13 @@ ORed together. The letters are seen in "Tainted" line of Oops reports. See :doc:`/admin-guide/tainted-kernels` for more information. +Note: + writes to this sysctl interface will fail with ``EINVAL`` if the kernel is + booted with the command line option ``panic_on_taint=,nousertaint`` + and any of the ORed together values being written to ``tainted`` match with + the bitmask declared on panic_on_taint. + See :doc:`/admin-guide/kernel-parameters` for more details on that particular + kernel command line option and its optional ``nousertaint`` switch. threads-max =========== diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 9b7a8d74a9d6..f7835db7102e 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -528,6 +528,8 @@ extern int panic_on_oops; extern int panic_on_unrecovered_nmi; extern int panic_on_io_nmi; extern int panic_on_warn; +extern unsigned long panic_on_taint; +extern bool panic_on_taint_nousertaint; extern int sysctl_panic_on_rcu_stall; extern int sysctl_panic_on_stackoverflow; @@ -596,6 +598,7 @@ extern enum system_states { #define TAINT_AUX 16 #define TAINT_RANDSTRUCT 17 #define TAINT_FLAGS_COUNT 18 +#define TAINT_FLAGS_MAX ((1UL << TAINT_FLAGS_COUNT) - 1) struct taint_flag { char c_true; /* character printed when tainted */ diff --git a/kernel/panic.c b/kernel/panic.c index b69ee9e76cb2..94b5c973770c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -44,6 +44,8 @@ static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); bool crash_kexec_post_notifiers; int panic_on_warn __read_mostly; +unsigned long panic_on_taint; +bool panic_on_taint_nousertaint = false; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); @@ -434,6 +436,11 @@ void add_taint(unsigned flag, enum lockdep_ok lockdep_ok) pr_warn("Disabling lock debugging due to kernel taint\n"); set_bit(flag, &tainted_mask); + + if (tainted_mask & panic_on_taint) { + panic_on_taint = 0; + panic("panic_on_taint set ..."); + } } EXPORT_SYMBOL(add_taint); @@ -686,3 +693,30 @@ static int __init oops_setup(char *s) return 0; } early_param("oops", oops_setup); + +static int __init panic_on_taint_setup(char *s) +{ + char *taint_str; + + if (!s) + return -EINVAL; + + taint_str = strsep(&s, ","); + if (kstrtoul(taint_str, 16, &panic_on_taint)) + return -EINVAL; + + /* make sure panic_on_taint doesn't hold out-of-range TAINT flags */ + panic_on_taint &= TAINT_FLAGS_MAX; + + if (!panic_on_taint) + return -EINVAL; + + if (s && !strcmp(s, "nousertaint")) + panic_on_taint_nousertaint = true; + + pr_info("panic_on_taint: bitmask=0x%lx nousertaint_mode=%sabled\n", + panic_on_taint, panic_on_taint_nousertaint ? "en" : "dis"); + + return 0; +} +early_param("panic_on_taint", panic_on_taint_setup); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 715774d8c55f..587ed0494f2f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -866,11 +866,20 @@ static int proc_taint(struct ctl_table *table, int write, return err; if (write) { + int i; + + /* + * If we are relying on panic_on_taint not producing + * false positives due to userspace input, bail out + * before setting the requested taint flags. + */ + if (panic_on_taint_nousertaint && (tmptaint & panic_on_taint)) + return -EINVAL; + /* * Poor man's atomic or. Not worth adding a primitive * to everyone's atomic.h for this */ - int i; for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { if ((tmptaint >> i) & 1) add_taint(i, LOCKDEP_STILL_OK); From 01f39c1c11ee5bf44a1df49e47eb53a86515b9dc Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Sun, 7 Jun 2020 21:40:20 -0700 Subject: [PATCH 06/52] xarray.h: correct return code documentation for xa_store_{bh,irq}() __xa_store() and xa_store() document that the functions can fail, and that the return code can be an xa_err() encoded error code. xa_store_bh() and xa_store_irq() do not document that the functions can fail and that they can also return xa_err() encoded error codes. Thus: Update the documentation. Signed-off-by: Manfred Spraul Signed-off-by: Andrew Morton Cc: Matthew Wilcox Link: http://lkml.kernel.org/r/20200430111424.16634-1-manfred@colorfullife.com Signed-off-by: Linus Torvalds --- include/linux/xarray.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 14c893433139..b4d70e7568b2 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -576,7 +576,7 @@ void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. - * Return: The entry which used to be at this index. + * Return: The old entry at this index or xa_err() if an error happened. */ static inline void *xa_store_bh(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) @@ -602,7 +602,7 @@ static inline void *xa_store_bh(struct xarray *xa, unsigned long index, * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. - * Return: The entry which used to be at this index. + * Return: The old entry at this index or xa_err() if an error happened. */ static inline void *xa_store_irq(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) From 3db978d480e2843979a2b56f2f7da726f2b295b2 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Sun, 7 Jun 2020 21:40:24 -0700 Subject: [PATCH 07/52] kernel/sysctl: support setting sysctl parameters from kernel command line Patch series "support setting sysctl parameters from kernel command line", v3. This series adds support for something that seems like many people always wanted but nobody added it yet, so here's the ability to set sysctl parameters via kernel command line options in the form of sysctl.vm.something=1 The important part is Patch 1. The second, not so important part is an attempt to clean up legacy one-off parameters that do the same thing as a sysctl. I don't want to remove them completely for compatibility reasons, but with generic sysctl support the idea is to remove the one-off param handlers and treat the parameters as aliases for the sysctl variants. I have identified several parameters that mention sysctl counterparts in Documentation/admin-guide/kernel-parameters.txt but there might be more. The conversion also has varying level of success: - numa_zonelist_order is converted in Patch 2 together with adding the necessary infrastructure. It's easy as it doesn't really do anything but warn on deprecated value these days. - hung_task_panic is converted in Patch 3, but there's a downside that now it only accepts 0 and 1, while previously it was any integer value - nmi_watchdog maps to two sysctls nmi_watchdog and hardlockup_panic, so there's no straighforward conversion possible - traceoff_on_warning is a flag without value and it would be required to handle that somehow in the conversion infractructure, which seems pointless for a single flag This patch (of 5): A recently proposed patch to add vm_swappiness command line parameter in addition to existing sysctl [1] made me wonder why we don't have a general support for passing sysctl parameters via command line. Googling found only somebody else wondering the same [2], but I haven't found any prior discussion with reasons why not to do this. Settings the vm_swappiness issue aside (the underlying issue might be solved in a different way), quick search of kernel-parameters.txt shows there are already some that exist as both sysctl and kernel parameter - hung_task_panic, nmi_watchdog, numa_zonelist_order, traceoff_on_warning. A general mechanism would remove the need to add more of those one-offs and might be handy in situations where configuration by e.g. /etc/sysctl.d/ is impractical. Hence, this patch adds a new parse_args() pass that looks for parameters prefixed by 'sysctl.' and tries to interpret them as writes to the corresponding sys/ files using an temporary in-kernel procfs mount. This mechanism was suggested by Eric W. Biederman [3], as it handles all dynamically registered sysctl tables, even though we don't handle modular sysctls. Errors due to e.g. invalid parameter name or value are reported in the kernel log. The processing is hooked right before the init process is loaded, as some handlers might be more complicated than simple setters and might need some subsystems to be initialized. At the moment the init process can be started and eventually execute a process writing to /proc/sys/ then it should be also fine to do that from the kernel. Sysctls registered later on module load time are not set by this mechanism - it's expected that in such scenarios, setting sysctl values from userspace is practical enough. [1] https://lore.kernel.org/r/BL0PR02MB560167492CA4094C91589930E9FC0@BL0PR02MB5601.namprd02.prod.outlook.com/ [2] https://unix.stackexchange.com/questions/558802/how-to-set-sysctl-using-kernel-command-line-parameter [3] https://lore.kernel.org/r/87bloj2skm.fsf@x220.int.ebiederm.org/ Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Reviewed-by: Luis Chamberlain Reviewed-by: Masami Hiramatsu Acked-by: Kees Cook Acked-by: Michal Hocko Cc: Iurii Zaikin Cc: Ivan Teterevkov Cc: Michal Hocko Cc: David Rientjes Cc: Matthew Wilcox Cc: "Eric W . Biederman" Cc: "Guilherme G . Piccoli" Cc: Alexey Dobriyan Cc: Thomas Gleixner Cc: Greg Kroah-Hartman Cc: Christian Brauner Link: http://lkml.kernel.org/r/20200427180433.7029-1-vbabka@suse.cz Link: http://lkml.kernel.org/r/20200427180433.7029-2-vbabka@suse.cz Signed-off-by: Linus Torvalds --- .../admin-guide/kernel-parameters.txt | 9 ++ fs/proc/proc_sysctl.c | 107 ++++++++++++++++++ include/linux/sysctl.h | 4 + init/main.c | 2 + 4 files changed, 122 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index df9b0fe2ed60..d02b9d023660 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4969,6 +4969,15 @@ switches= [HW,M68k] + sysctl.*= [KNL] + Set a sysctl parameter, right before loading the init + process, as if the value was written to the respective + /proc/sys/... file. Both '.' and '/' are recognized as + separators. Unrecognized parameters and invalid values + are reported in the kernel log. Sysctls registered + later by a loaded module cannot be set this way. + Example: sysctl.vm.swappiness=40 + sysfs.deprecated=0|1 [KNL] Enable/disable old style sysfs layout for old udev on older distributions. When this option is enabled diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index df2143e05c57..973acf96f37c 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "internal.h" static const struct dentry_operations proc_sys_dentry_operations; @@ -1703,3 +1704,109 @@ int __init proc_sys_init(void) return sysctl_init(); } + +/* Set sysctl value passed on kernel command line. */ +static int process_sysctl_arg(char *param, char *val, + const char *unused, void *arg) +{ + char *path; + struct vfsmount **proc_mnt = arg; + struct file_system_type *proc_fs_type; + struct file *file; + int len; + int err; + loff_t pos = 0; + ssize_t wret; + + if (strncmp(param, "sysctl", sizeof("sysctl") - 1)) + return 0; + + param += sizeof("sysctl") - 1; + + if (param[0] != '/' && param[0] != '.') + return 0; + + param++; + + /* + * To set sysctl options, we use a temporary mount of proc, look up the + * respective sys/ file and write to it. To avoid mounting it when no + * options were given, we mount it only when the first sysctl option is + * found. Why not a persistent mount? There are problems with a + * persistent mount of proc in that it forces userspace not to use any + * proc mount options. + */ + if (!*proc_mnt) { + proc_fs_type = get_fs_type("proc"); + if (!proc_fs_type) { + pr_err("Failed to find procfs to set sysctl from command line\n"); + return 0; + } + *proc_mnt = kern_mount(proc_fs_type); + put_filesystem(proc_fs_type); + if (IS_ERR(*proc_mnt)) { + pr_err("Failed to mount procfs to set sysctl from command line\n"); + return 0; + } + } + + path = kasprintf(GFP_KERNEL, "sys/%s", param); + if (!path) + panic("%s: Failed to allocate path for %s\n", __func__, param); + strreplace(path, '.', '/'); + + file = file_open_root((*proc_mnt)->mnt_root, *proc_mnt, path, O_WRONLY, 0); + if (IS_ERR(file)) { + err = PTR_ERR(file); + if (err == -ENOENT) + pr_err("Failed to set sysctl parameter '%s=%s': parameter not found\n", + param, val); + else if (err == -EACCES) + pr_err("Failed to set sysctl parameter '%s=%s': permission denied (read-only?)\n", + param, val); + else + pr_err("Error %pe opening proc file to set sysctl parameter '%s=%s'\n", + file, param, val); + goto out; + } + len = strlen(val); + wret = kernel_write(file, val, len, &pos); + if (wret < 0) { + err = wret; + if (err == -EINVAL) + pr_err("Failed to set sysctl parameter '%s=%s': invalid value\n", + param, val); + else + pr_err("Error %pe writing to proc file to set sysctl parameter '%s=%s'\n", + ERR_PTR(err), param, val); + } else if (wret != len) { + pr_err("Wrote only %zd bytes of %d writing to proc file %s to set sysctl parameter '%s=%s\n", + wret, len, path, param, val); + } + + err = filp_close(file, NULL); + if (err) + pr_err("Error %pe closing proc file to set sysctl parameter '%s=%s\n", + ERR_PTR(err), param, val); +out: + kfree(path); + return 0; +} + +void do_sysctl_args(void) +{ + char *command_line; + struct vfsmount *proc_mnt = NULL; + + command_line = kstrdup(saved_command_line, GFP_KERNEL); + if (!command_line) + panic("%s: Failed to allocate copy of command line\n", __func__); + + parse_args("Setting sysctl args", command_line, + NULL, 0, -1, -1, &proc_mnt, process_sysctl_arg); + + if (proc_mnt) + kern_unmount(proc_mnt); + + kfree(command_line); +} diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index f2401e45a3c2..50bb7f383a1b 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -197,6 +197,7 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, void unregister_sysctl_table(struct ctl_table_header * table); extern int sysctl_init(void); +void do_sysctl_args(void); extern int pwrsw_enabled; extern int unaligned_enabled; @@ -235,6 +236,9 @@ static inline void setup_sysctl_set(struct ctl_table_set *p, { } +static inline void do_sysctl_args(void) +{ +} #endif /* CONFIG_SYSCTL */ int sysctl_max_threads(struct ctl_table *table, int write, void *buffer, diff --git a/init/main.c b/init/main.c index 76df62fc3e2c..b59e09353881 100644 --- a/init/main.c +++ b/init/main.c @@ -1412,6 +1412,8 @@ static int __ref kernel_init(void *unused) rcu_end_inkernel_boot(); + do_sysctl_args(); + if (ramdisk_execute_command) { ret = run_init_process(ramdisk_execute_command); if (!ret) From 0a477e1ae21b28267b9bd8599f75c115291b1666 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Sun, 7 Jun 2020 21:40:27 -0700 Subject: [PATCH 08/52] kernel/sysctl: support handling command line aliases We can now handle sysctl parameters on kernel command line, but historically some parameters introduced their own command line equivalent, which we don't want to remove for compatibility reasons. We can, however, convert them to the generic infrastructure with a table translating the legacy command line parameters to their sysctl names, and removing the one-off param handlers. This patch adds the support and makes the first conversion to demonstrate it, on the (deprecated) numa_zonelist_order parameter. Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Reviewed-by: Luis Chamberlain Acked-by: Kees Cook Acked-by: Michal Hocko Cc: Alexey Dobriyan Cc: Christian Brauner Cc: David Rientjes Cc: "Eric W . Biederman" Cc: Greg Kroah-Hartman Cc: "Guilherme G . Piccoli" Cc: Iurii Zaikin Cc: Ivan Teterevkov Cc: Masami Hiramatsu Cc: Matthew Wilcox Cc: Michal Hocko Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200427180433.7029-3-vbabka@suse.cz Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 48 ++++++++++++++++++++++++++++++++++++------- mm/page_alloc.c | 9 -------- 2 files changed, 41 insertions(+), 16 deletions(-) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 973acf96f37c..124298168f8b 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1705,6 +1705,37 @@ int __init proc_sys_init(void) return sysctl_init(); } +struct sysctl_alias { + const char *kernel_param; + const char *sysctl_param; +}; + +/* + * Historically some settings had both sysctl and a command line parameter. + * With the generic sysctl. parameter support, we can handle them at a single + * place and only keep the historical name for compatibility. This is not meant + * to add brand new aliases. When adding existing aliases, consider whether + * the possibly different moment of changing the value (e.g. from early_param + * to the moment do_sysctl_args() is called) is an issue for the specific + * parameter. + */ +static const struct sysctl_alias sysctl_aliases[] = { + {"numa_zonelist_order", "vm.numa_zonelist_order" }, + { } +}; + +static const char *sysctl_find_alias(char *param) +{ + const struct sysctl_alias *alias; + + for (alias = &sysctl_aliases[0]; alias->kernel_param != NULL; alias++) { + if (strcmp(alias->kernel_param, param) == 0) + return alias->sysctl_param; + } + + return NULL; +} + /* Set sysctl value passed on kernel command line. */ static int process_sysctl_arg(char *param, char *val, const char *unused, void *arg) @@ -1718,15 +1749,18 @@ static int process_sysctl_arg(char *param, char *val, loff_t pos = 0; ssize_t wret; - if (strncmp(param, "sysctl", sizeof("sysctl") - 1)) - return 0; + if (strncmp(param, "sysctl", sizeof("sysctl") - 1) == 0) { + param += sizeof("sysctl") - 1; - param += sizeof("sysctl") - 1; + if (param[0] != '/' && param[0] != '.') + return 0; - if (param[0] != '/' && param[0] != '.') - return 0; - - param++; + param++; + } else { + param = (char *) sysctl_find_alias(param); + if (!param) + return 0; + } /* * To set sysctl options, we use a temporary mount of proc, look up the diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 07ae77d97952..727751219003 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5575,15 +5575,6 @@ static int __parse_numa_zonelist_order(char *s) return 0; } -static __init int setup_numa_zonelist_order(char *s) -{ - if (!s) - return 0; - - return __parse_numa_zonelist_order(s); -} -early_param("numa_zonelist_order", setup_numa_zonelist_order); - char numa_zonelist_order[] = "Node"; /* From b467f3ef3c50c4fa8926ca07f7db9a33a645e13a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Sun, 7 Jun 2020 21:40:31 -0700 Subject: [PATCH 09/52] kernel/hung_task convert hung_task_panic boot parameter to sysctl We can now handle sysctl parameters on kernel command line and have infrastructure to convert legacy command line options that duplicate sysctl to become a sysctl alias. This patch converts the hung_task_panic parameter. Note that the sysctl handler is more strict and allows only 0 and 1, while the legacy parameter allowed any non-zero value. But there is little reason anyone would not be using 1. Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Reviewed-by: Kees Cook Acked-by: Michal Hocko Cc: Alexey Dobriyan Cc: Christian Brauner Cc: David Rientjes Cc: "Eric W . Biederman" Cc: Greg Kroah-Hartman Cc: "Guilherme G . Piccoli" Cc: Iurii Zaikin Cc: Ivan Teterevkov Cc: Luis Chamberlain Cc: Masami Hiramatsu Cc: Matthew Wilcox Cc: Michal Hocko Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200427180433.7029-4-vbabka@suse.cz Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 2 +- fs/proc/proc_sysctl.c | 1 + kernel/hung_task.c | 10 ---------- 3 files changed, 2 insertions(+), 11 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index d02b9d023660..0c72b5c22416 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1515,7 +1515,7 @@ [KNL] Should the hung task detector generate panics. Format: - A nonzero value instructs the kernel to panic when a + A value of 1 instructs the kernel to panic when a hung task is detected. The default value is controlled by the CONFIG_BOOTPARAM_HUNG_TASK_PANIC build-time option. The value selected by this boot parameter can diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 124298168f8b..15030784566c 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1721,6 +1721,7 @@ struct sysctl_alias { */ static const struct sysctl_alias sysctl_aliases[] = { {"numa_zonelist_order", "vm.numa_zonelist_order" }, + {"hung_task_panic", "kernel.hung_task_panic" }, { } }; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 14a625c16cb3..b22b5eeab3cb 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -63,16 +63,6 @@ static struct task_struct *watchdog_task; unsigned int __read_mostly sysctl_hung_task_panic = CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE; -static int __init hung_task_panic_setup(char *str) -{ - int rc = kstrtouint(str, 0, &sysctl_hung_task_panic); - - if (rc) - return rc; - return 1; -} -__setup("hung_task_panic=", hung_task_panic_setup); - static int hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) { From 4546cde96f9fbef8f59c645f2e0677b5a390ed0d Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Sun, 7 Jun 2020 21:40:35 -0700 Subject: [PATCH 10/52] tools/testing/selftests/sysctl/sysctl.sh: support CONFIG_TEST_SYSCTL=y The testing script recommends CONFIG_TEST_SYSCTL=y, but actually only works with CONFIG_TEST_SYSCTL=m. Testing of sysctl setting via boot param however requires the test to be built-in, so make sure the test script supports it. Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Acked-by: Luis Chamberlain Cc: Alexey Dobriyan Cc: Christian Brauner Cc: David Rientjes Cc: "Eric W . Biederman" Cc: Greg Kroah-Hartman Cc: "Guilherme G . Piccoli" Cc: Iurii Zaikin Cc: Ivan Teterevkov Cc: Kees Cook Cc: Masami Hiramatsu Cc: Matthew Wilcox Cc: Michal Hocko Cc: Michal Hocko Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200427180433.7029-5-vbabka@suse.cz Signed-off-by: Linus Torvalds --- tools/testing/selftests/sysctl/sysctl.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/sysctl/sysctl.sh b/tools/testing/selftests/sysctl/sysctl.sh index 6a970b127c9b..a4262955d30f 100755 --- a/tools/testing/selftests/sysctl/sysctl.sh +++ b/tools/testing/selftests/sysctl/sysctl.sh @@ -122,7 +122,7 @@ test_reqs() function load_req_mod() { - if [ ! -d $DIR ]; then + if [ ! -d $SYSCTL ]; then if ! modprobe -q -n $TEST_DRIVER; then echo "$0: module $TEST_DRIVER not found [SKIP]" exit $ksft_skip From 4f2f682d89d83fb6194562321d875253282d8496 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Sun, 7 Jun 2020 21:40:38 -0700 Subject: [PATCH 11/52] lib/test_sysctl: support testing of sysctl. boot parameter Testing is done by a new parameter debug.test_sysctl.boot_int which defaults to 0 and it's expected that the tester passes a boot parameter that sets it to 1. The test checks if it's set to 1. To distinguish true failure from parameter not being set, the test checks /proc/cmdline for the expected parameter, and whether test_sysctl is built-in and not a module. [vbabka@suse.cz: skip the new test if boot_int sysctl is not present] Link: http://lkml.kernel.org/r/305af605-1e60-cf84-fada-6ce1ca37c102@suse.cz Signed-off-by: Vlastimil Babka Signed-off-by: Andrew Morton Cc: Alexey Dobriyan Cc: Christian Brauner Cc: David Rientjes Cc: "Eric W . Biederman" Cc: Greg Kroah-Hartman Cc: "Guilherme G . Piccoli" Cc: Iurii Zaikin Cc: Ivan Teterevkov Cc: Kees Cook Cc: Luis Chamberlain Cc: Masami Hiramatsu Cc: Matthew Wilcox Cc: Michal Hocko Cc: Michal Hocko Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200427180433.7029-6-vbabka@suse.cz Signed-off-by: Linus Torvalds --- lib/test_sysctl.c | 13 ++++++++ tools/testing/selftests/sysctl/sysctl.sh | 42 ++++++++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/lib/test_sysctl.c b/lib/test_sysctl.c index 566dad3f4196..84eaae22d3a6 100644 --- a/lib/test_sysctl.c +++ b/lib/test_sysctl.c @@ -44,6 +44,8 @@ struct test_sysctl_data { int int_0002; int int_0003[4]; + int boot_int; + unsigned int uint_0001; char string_0001[65]; @@ -61,6 +63,8 @@ static struct test_sysctl_data test_data = { .int_0003[2] = 2, .int_0003[3] = 3, + .boot_int = 0, + .uint_0001 = 314, .string_0001 = "(none)", @@ -91,6 +95,15 @@ static struct ctl_table test_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "boot_int", + .data = &test_data.boot_int, + .maxlen = sizeof(test_data.boot_int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, { .procname = "uint_0001", .data = &test_data.uint_0001, diff --git a/tools/testing/selftests/sysctl/sysctl.sh b/tools/testing/selftests/sysctl/sysctl.sh index a4262955d30f..0aa8a86140e9 100755 --- a/tools/testing/selftests/sysctl/sysctl.sh +++ b/tools/testing/selftests/sysctl/sysctl.sh @@ -39,6 +39,7 @@ ALL_TESTS="$ALL_TESTS 0003:1:1:int_0002" ALL_TESTS="$ALL_TESTS 0004:1:1:uint_0001" ALL_TESTS="$ALL_TESTS 0005:3:1:int_0003" ALL_TESTS="$ALL_TESTS 0006:50:1:bitmap_0001" +ALL_TESTS="$ALL_TESTS 0007:1:1:boot_int" test_modprobe() { @@ -752,6 +753,46 @@ sysctl_test_0006() run_bitmaptest } +sysctl_test_0007() +{ + TARGET="${SYSCTL}/boot_int" + if [ ! -f $TARGET ]; then + echo "Skipping test for $TARGET as it is not present ..." + return $ksft_skip + fi + + if [ -d $DIR ]; then + echo "Boot param test only possible sysctl_test is built-in, not module:" + cat $TEST_DIR/config >&2 + return $ksft_skip + fi + + echo -n "Testing if $TARGET is set to 1 ..." + ORIG=$(cat "${TARGET}") + + if [ x$ORIG = "x1" ]; then + echo "ok" + return 0 + fi + echo "FAIL" + echo "Checking if /proc/cmdline contains setting of the expected parameter ..." + if [ ! -f /proc/cmdline ]; then + echo "/proc/cmdline does not exist, test inconclusive" + return 0 + fi + + FOUND=$(grep -c "sysctl[./]debug[./]test_sysctl[./]boot_int=1" /proc/cmdline) + if [ $FOUND = "1" ]; then + echo "Kernel param found but $TARGET is not 1, TEST FAILED" + rc=1 + test_rc + fi + + echo "Skipping test, expected kernel parameter missing." + echo "To perform this test, make sure kernel is booted with parameter: sysctl.debug.test_sysctl.boot_int=1" + return $ksft_skip +} + list_tests() { echo "Test ID list:" @@ -766,6 +807,7 @@ list_tests() echo "0004 x $(get_test_count 0004) - tests proc_douintvec()" echo "0005 x $(get_test_count 0005) - tests proc_douintvec() array" echo "0006 x $(get_test_count 0006) - tests proc_do_large_bitmap()" + echo "0007 x $(get_test_count 0007) - tests setting sysctl from kernel boot param" } usage() From f117955a2255721a6a0e9cecf6cad3a6eb43cbc3 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Sun, 7 Jun 2020 21:40:42 -0700 Subject: [PATCH 12/52] kernel/watchdog.c: convert {soft/hard}lockup boot parameters to sysctl aliases After a recent change introduced by Vlastimil's series [0], kernel is able now to handle sysctl parameters on kernel command line; also, the series introduced a simple infrastructure to convert legacy boot parameters (that duplicate sysctls) into sysctl aliases. This patch converts the watchdog parameters softlockup_panic and {hard,soft}lockup_all_cpu_backtrace to use the new alias infrastructure. It fixes the documentation too, since the alias only accepts values 0 or 1, not the full range of integers. We also took the opportunity here to improve the documentation of the previously converted hung_task_panic (see the patch series [0]) and put the alias table in alphabetical order. [0] http://lkml.kernel.org/r/20200427180433.7029-1-vbabka@suse.cz Signed-off-by: Guilherme G. Piccoli Signed-off-by: Andrew Morton Acked-by: Vlastimil Babka Cc: Kees Cook Cc: Iurii Zaikin Cc: Luis Chamberlain Link: http://lkml.kernel.org/r/20200507214624.21911-1-gpiccoli@canonical.com Signed-off-by: Linus Torvalds --- .../admin-guide/kernel-parameters.txt | 10 ++--- fs/proc/proc_sysctl.c | 7 +++- kernel/watchdog.c | 37 +++++-------------- 3 files changed, 19 insertions(+), 35 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 0c72b5c22416..ed9597e37415 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1445,7 +1445,7 @@ hardlockup_all_cpu_backtrace= [KNL] Should the hard-lockup detector generate backtraces on all cpus. - Format: + Format: 0 | 1 hashdist= [KNL,NUMA] Large hashes allocated during boot are distributed across NUMA nodes. Defaults on @@ -1513,7 +1513,7 @@ hung_task_panic= [KNL] Should the hung task detector generate panics. - Format: + Format: 0 | 1 A value of 1 instructs the kernel to panic when a hung task is detected. The default value is controlled @@ -4665,9 +4665,9 @@ softlockup_panic= [KNL] Should the soft-lockup detector generate panics. - Format: + Format: 0 | 1 - A nonzero value instructs the soft-lockup detector + A value of 1 instructs the soft-lockup detector to panic the machine when a soft-lockup occurs. It is also controlled by the kernel.softlockup_panic sysctl and CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC, which is the @@ -4676,7 +4676,7 @@ softlockup_all_cpu_backtrace= [KNL] Should the soft-lockup detector generate backtraces on all cpus. - Format: + Format: 0 | 1 sonypi.*= [HW] Sony Programmable I/O Control Device driver See Documentation/admin-guide/laptops/sonypi.rst diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 15030784566c..5b405f32971d 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1720,8 +1720,11 @@ struct sysctl_alias { * parameter. */ static const struct sysctl_alias sysctl_aliases[] = { - {"numa_zonelist_order", "vm.numa_zonelist_order" }, - {"hung_task_panic", "kernel.hung_task_panic" }, + {"hardlockup_all_cpu_backtrace", "kernel.hardlockup_all_cpu_backtrace" }, + {"hung_task_panic", "kernel.hung_task_panic" }, + {"numa_zonelist_order", "vm.numa_zonelist_order" }, + {"softlockup_all_cpu_backtrace", "kernel.softlockup_all_cpu_backtrace" }, + {"softlockup_panic", "kernel.softlockup_panic" }, { } }; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 53ff2c81b084..5abb5b22ad13 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -50,6 +50,11 @@ struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); #ifdef CONFIG_HARDLOCKUP_DETECTOR + +# ifdef CONFIG_SMP +int __read_mostly sysctl_hardlockup_all_cpu_backtrace; +# endif /* CONFIG_SMP */ + /* * Should we panic when a soft-lockup or hard-lockup occurs: */ @@ -82,16 +87,6 @@ static int __init hardlockup_panic_setup(char *str) } __setup("nmi_watchdog=", hardlockup_panic_setup); -# ifdef CONFIG_SMP -int __read_mostly sysctl_hardlockup_all_cpu_backtrace; - -static int __init hardlockup_all_cpu_backtrace_setup(char *str) -{ - sysctl_hardlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0); - return 1; -} -__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); -# endif /* CONFIG_SMP */ #endif /* CONFIG_HARDLOCKUP_DETECTOR */ /* @@ -163,6 +158,10 @@ static void lockup_detector_update_enable(void) #define SOFTLOCKUP_RESET ULONG_MAX +#ifdef CONFIG_SMP +int __read_mostly sysctl_softlockup_all_cpu_backtrace; +#endif + /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; @@ -178,13 +177,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static unsigned long soft_lockup_nmi_warn; -static int __init softlockup_panic_setup(char *str) -{ - softlockup_panic = simple_strtoul(str, NULL, 0); - return 1; -} -__setup("softlockup_panic=", softlockup_panic_setup); - static int __init nowatchdog_setup(char *str) { watchdog_user_enabled = 0; @@ -206,17 +198,6 @@ static int __init watchdog_thresh_setup(char *str) } __setup("watchdog_thresh=", watchdog_thresh_setup); -#ifdef CONFIG_SMP -int __read_mostly sysctl_softlockup_all_cpu_backtrace; - -static int __init softlockup_all_cpu_backtrace_setup(char *str) -{ - sysctl_softlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0); - return 1; -} -__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); -#endif - static void __lockup_detector_cleanup(void); /* From 0ec9dc9bcba0a62b0844e54c1caf6b8b0bf6b5b4 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Sun, 7 Jun 2020 21:40:45 -0700 Subject: [PATCH 13/52] kernel/hung_task.c: introduce sysctl to print all traces when a hung task is detected Commit 401c636a0eeb ("kernel/hung_task.c: show all hung tasks before panic") introduced a change in that we started to show all CPUs backtraces when a hung task is detected _and_ the sysctl/kernel parameter "hung_task_panic" is set. The idea is good, because usually when observing deadlocks (that may lead to hung tasks), the culprit is another task holding a lock and not necessarily the task detected as hung. The problem with this approach is that dumping backtraces is a slightly expensive task, specially printing that on console (and specially in many CPU machines, as servers commonly found nowadays). So, users that plan to collect a kdump to investigate the hung tasks and narrow down the deadlock definitely don't need the CPUs backtrace on dmesg/console, which will delay the panic and pollute the log (crash tool would easily grab all CPUs traces with 'bt -a' command). Also, there's the reciprocal scenario: some users may be interested in seeing the CPUs backtraces but not have the system panic when a hung task is detected. The current approach hence is almost as embedding a policy in the kernel, by forcing the CPUs backtraces' dump (only) on hung_task_panic. This patch decouples the panic event on hung task from the CPUs backtraces dump, by creating (and documenting) a new sysctl called "hung_task_all_cpu_backtrace", analog to the approach taken on soft/hard lockups, that have both a panic and an "all_cpu_backtrace" sysctl to allow individual control. The new mechanism for dumping the CPUs backtraces on hung task detection respects "hung_task_warnings" by not dumping the traces in case there's no warnings left. Signed-off-by: Guilherme G. Piccoli Signed-off-by: Andrew Morton Reviewed-by: Kees Cook Cc: Tetsuo Handa Link: http://lkml.kernel.org/r/20200327223646.20779-1-gpiccoli@canonical.com Signed-off-by: Linus Torvalds --- Documentation/admin-guide/sysctl/kernel.rst | 14 ++++++++++++++ include/linux/sched/sysctl.h | 7 +++++++ kernel/hung_task.c | 20 ++++++++++++++++++-- kernel/sysctl.c | 11 +++++++++++ 4 files changed, 50 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 3b00b9223157..861820d27c19 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -335,6 +335,20 @@ Path for the hotplug policy agent. Default value is "``/sbin/hotplug``". +hung_task_all_cpu_backtrace: +================ + +If this option is set, the kernel will send an NMI to all CPUs to dump +their backtraces when a hung task is detected. This file shows up if +CONFIG_DETECT_HUNG_TASK and CONFIG_SMP are enabled. + +0: Won't show all CPUs backtraces when a hung task is detected. +This is the default behavior. + +1: Will non-maskably interrupt all CPUs and dump their backtraces when +a hung task is detected. + + hung_task_panic =============== diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 7b4d3a49b6c5..660ac49f2b53 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -7,6 +7,13 @@ struct ctl_table; #ifdef CONFIG_DETECT_HUNG_TASK + +#ifdef CONFIG_SMP +extern unsigned int sysctl_hung_task_all_cpu_backtrace; +#else +#define sysctl_hung_task_all_cpu_backtrace 0 +#endif /* CONFIG_SMP */ + extern int sysctl_hung_task_check_count; extern unsigned int sysctl_hung_task_panic; extern unsigned long sysctl_hung_task_timeout_secs; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index b22b5eeab3cb..ce76f490126c 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -53,9 +53,18 @@ int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; static bool hung_task_show_lock; static bool hung_task_call_panic; +static bool hung_task_show_all_bt; static struct task_struct *watchdog_task; +#ifdef CONFIG_SMP +/* + * Should we dump all CPUs backtraces in a hung task event? + * Defaults to 0, can be changed via sysctl. + */ +unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; +#endif /* CONFIG_SMP */ + /* * Should we panic (and reboot, if panic_timeout= is set) when a * hung task is detected: @@ -127,6 +136,9 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) " disables this message.\n"); sched_show_task(t); hung_task_show_lock = true; + + if (sysctl_hung_task_all_cpu_backtrace) + hung_task_show_all_bt = true; } touch_nmi_watchdog(); @@ -191,10 +203,14 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) rcu_read_unlock(); if (hung_task_show_lock) debug_show_all_locks(); - if (hung_task_call_panic) { + + if (hung_task_show_all_bt) { + hung_task_show_all_bt = false; trigger_all_cpu_backtrace(); - panic("hung_task: blocked tasks"); } + + if (hung_task_call_panic) + panic("hung_task: blocked tasks"); } static long hung_timeout_jiffies(unsigned long last_checked, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 587ed0494f2f..34c1278951b9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2437,6 +2437,17 @@ static struct ctl_table kern_table[] = { }, #endif #ifdef CONFIG_DETECT_HUNG_TASK +#ifdef CONFIG_SMP + { + .procname = "hung_task_all_cpu_backtrace", + .data = &sysctl_hung_task_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ { .procname = "hung_task_panic", .data = &sysctl_hung_task_panic, From 60c958d8df9cfc40b745d6cd583cfbfa7525ead6 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Sun, 7 Jun 2020 21:40:48 -0700 Subject: [PATCH 14/52] panic: add sysctl to dump all CPUs backtraces on oops event Usually when the kernel reaches an oops condition, it's a point of no return; in case not enough debug information is available in the kernel splat, one of the last resorts would be to collect a kernel crash dump and analyze it. The problem with this approach is that in order to collect the dump, a panic is required (to kexec-load the crash kernel). When in an environment of multiple virtual machines, users may prefer to try living with the oops, at least until being able to properly shutdown their VMs / finish their important tasks. This patch implements a way to collect a bit more debug details when an oops event is reached, by printing all the CPUs backtraces through the usage of NMIs (on architectures that support that). The sysctl added (and documented) here was called "oops_all_cpu_backtrace", and when set will (as the name suggests) dump all CPUs backtraces. Far from ideal, this may be the last option though for users that for some reason cannot panic on oops. Most of times oopses are clear enough to indicate the kernel portion that must be investigated, but in virtual environments it's possible to observe hypervisor/KVM issues that could lead to oopses shown in other guests CPUs (like virtual APIC crashes). This patch hence aims to help debug such complex issues without resorting to kdump. Signed-off-by: Guilherme G. Piccoli Signed-off-by: Andrew Morton Reviewed-by: Kees Cook Cc: Luis Chamberlain Cc: Iurii Zaikin Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Randy Dunlap Cc: Matthew Wilcox Link: http://lkml.kernel.org/r/20200327224116.21030-1-gpiccoli@canonical.com Signed-off-by: Linus Torvalds --- Documentation/admin-guide/sysctl/kernel.rst | 16 ++++++++++++++++ include/linux/kernel.h | 6 ++++++ kernel/panic.c | 11 +++++++++++ kernel/sysctl.c | 11 +++++++++++ 4 files changed, 44 insertions(+) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 861820d27c19..83acf5025488 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -646,6 +646,22 @@ rate for each task. scanned for a given scan. +oops_all_cpu_backtrace: +================ + +If this option is set, the kernel will send an NMI to all CPUs to dump +their backtraces when an oops event occurs. It should be used as a last +resort in case a panic cannot be triggered (to protect VMs running, for +example) or kdump can't be collected. This file shows up if CONFIG_SMP +is enabled. + +0: Won't show all CPUs backtraces when an oops is detected. +This is the default behavior. + +1: Will non-maskably interrupt all CPUs and dump their backtraces when +an oops event is detected. + + osrelease, ostype & version =========================== diff --git a/include/linux/kernel.h b/include/linux/kernel.h index f7835db7102e..82d91547d122 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -520,6 +520,12 @@ static inline u32 int_sqrt64(u64 x) } #endif +#ifdef CONFIG_SMP +extern unsigned int sysctl_oops_all_cpu_backtrace; +#else +#define sysctl_oops_all_cpu_backtrace 0 +#endif /* CONFIG_SMP */ + extern void bust_spinlocks(int yes); extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ extern int panic_timeout; diff --git a/kernel/panic.c b/kernel/panic.c index 94b5c973770c..85568bbfb12b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -36,6 +36,14 @@ #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 +#ifdef CONFIG_SMP +/* + * Should we dump all CPUs backtraces in an oops event? + * Defaults to 0, can be changed via sysctl. + */ +unsigned int __read_mostly sysctl_oops_all_cpu_backtrace; +#endif /* CONFIG_SMP */ + int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; static unsigned long tainted_mask = IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0; @@ -522,6 +530,9 @@ void oops_enter(void) /* can't trust the integrity of the kernel anymore: */ debug_locks_off(); do_oops_enter_exit(); + + if (sysctl_oops_all_cpu_backtrace) + trigger_all_cpu_backtrace(); } /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 34c1278951b9..f69d581d39c3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2150,6 +2150,17 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif +#ifdef CONFIG_SMP + { + .procname = "oops_all_cpu_backtrace", + .data = &sysctl_oops_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ { .procname = "pid_max", .data = &pid_max, From e77132e75845470065768e2205727e6be52cb7f4 Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Sun, 7 Jun 2020 21:40:51 -0700 Subject: [PATCH 15/52] kernel/sysctl.c: ignore out-of-range taint bits introduced via kernel.tainted Users with SYS_ADMIN capability can add arbitrary taint flags to the running kernel by writing to /proc/sys/kernel/tainted or issuing the command 'sysctl -w kernel.tainted=...'. This interface, however, is open for any integer value and this might cause an invalid set of flags being committed to the tainted_mask bitset. This patch introduces a simple way for proc_taint() to ignore any eventual invalid bit coming from the user input before committing those bits to the kernel tainted_mask. Signed-off-by: Rafael Aquini Signed-off-by: Andrew Morton Reviewed-by: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: "Theodore Ts'o" Link: http://lkml.kernel.org/r/20200512223946.888020-1-aquini@redhat.com Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f69d581d39c3..db1ce7af2563 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -880,10 +880,9 @@ static int proc_taint(struct ctl_table *table, int write, * Poor man's atomic or. Not worth adding a primitive * to everyone's atomic.h for this */ - for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { - if ((tmptaint >> i) & 1) + for (i = 0; i < TAINT_FLAGS_COUNT; i++) + if ((1UL << i) & tmptaint) add_taint(i, LOCKDEP_STILL_OK); - } } return err; From dadbb612f6e50bbf9101c2f5d82690ce9ea4d66b Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Sun, 7 Jun 2020 21:40:55 -0700 Subject: [PATCH 16/52] mm/gup.c: convert to use get_user_{page|pages}_fast_only() API __get_user_pages_fast() renamed to get_user_pages_fast_only() to align with pin_user_pages_fast_only(). As part of this we will get rid of write parameter. Instead caller will pass FOLL_WRITE to get_user_pages_fast_only(). This will not change any existing functionality of the API. All the callers are changed to pass FOLL_WRITE. Also introduce get_user_page_fast_only(), and use it in a few places that hard-code nr_pages to 1. Updated the documentation of the API. Signed-off-by: Souptick Joarder Signed-off-by: Andrew Morton Reviewed-by: John Hubbard Reviewed-by: Paul Mackerras [arch/powerpc/kvm] Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Mark Rutland Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Paolo Bonzini Cc: Stephen Rothwell Cc: Mike Rapoport Cc: Aneesh Kumar K.V Cc: Michal Suchanek Link: http://lkml.kernel.org/r/1590396812-31277-1-git-send-email-jrdr.linux@gmail.com Signed-off-by: Linus Torvalds --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 2 +- arch/powerpc/kvm/book3s_64_mmu_radix.c | 2 +- arch/powerpc/perf/callchain_64.c | 4 +--- include/linux/mm.h | 10 +++++++-- kernel/events/core.c | 4 ++-- mm/gup.c | 29 ++++++++++++++------------ virt/kvm/kvm_main.c | 8 +++---- 7 files changed, 32 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 18aed9775a3c..ddfc4c90ebb6 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -581,7 +581,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, * We always ask for write permission since the common case * is that the page is writable. */ - if (__get_user_pages_fast(hva, 1, 1, &page) == 1) { + if (get_user_page_fast_only(hva, FOLL_WRITE, &page)) { write_ok = true; } else { /* Call KVM generic code to do the slow-path check */ diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 02219e28b1e4..a47fa8b4d0f0 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -795,7 +795,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, * is that the page is writable. */ hva = gfn_to_hva_memslot(memslot, gfn); - if (!kvm_ro && __get_user_pages_fast(hva, 1, 1, &page) == 1) { + if (!kvm_ro && get_user_page_fast_only(hva, FOLL_WRITE, &page)) { upgrade_write = true; } else { unsigned long pfn; diff --git a/arch/powerpc/perf/callchain_64.c b/arch/powerpc/perf/callchain_64.c index b63086b663ef..9cc1a129737e 100644 --- a/arch/powerpc/perf/callchain_64.c +++ b/arch/powerpc/perf/callchain_64.c @@ -30,11 +30,9 @@ int read_user_stack_slow(void __user *ptr, void *buf, int nb) unsigned long addr = (unsigned long) ptr; unsigned long offset; struct page *page; - int nrpages; void *kaddr; - nrpages = __get_user_pages_fast(addr, 1, 1, &page); - if (nrpages == 1) { + if (get_user_page_fast_only(addr, FOLL_WRITE, &page)) { kaddr = page_address(page); /* align address to page boundary */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 86adc71a972f..22010c4175f2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1824,10 +1824,16 @@ extern int mprotect_fixup(struct vm_area_struct *vma, /* * doesn't attempt to fault and will return short. */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages); +int get_user_pages_fast_only(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages); int pin_user_pages_fast_only(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); + +static inline bool get_user_page_fast_only(unsigned long addr, + unsigned int gup_flags, struct page **pagep) +{ + return get_user_pages_fast_only(addr, 1, gup_flags, pagep) == 1; +} /* * per-process(per-mm_struct) statistics. */ diff --git a/kernel/events/core.c b/kernel/events/core.c index fcfadecd3a08..63d66bbebbd5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6934,12 +6934,12 @@ static u64 perf_virt_to_phys(u64 virt) * Walking the pages tables for user address. * Interrupts are disabled, so it prevents any tear down * of the page tables. - * Try IRQ-safe __get_user_pages_fast first. + * Try IRQ-safe get_user_page_fast_only first. * If failed, leave phys_addr as 0. */ if (current->mm != NULL) { pagefault_disable(); - if (__get_user_pages_fast(virt, 1, 0, &p) == 1) + if (get_user_page_fast_only(virt, 0, &p)) phys_addr = page_to_phys(p) + virt % PAGE_SIZE; pagefault_enable(); } diff --git a/mm/gup.c b/mm/gup.c index e19ff770eb4c..9d1bf3ec8e39 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2294,7 +2294,7 @@ pte_unmap: * to be special. * * For a futex to be placed on a THP tail page, get_futex_key requires a - * __get_user_pages_fast implementation that can pin pages. Thus it's still + * get_user_pages_fast_only implementation that can pin pages. Thus it's still * useful to have gup_huge_pmd even if we can't operate on ptes. */ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, @@ -2699,7 +2699,7 @@ static inline void gup_pgd_range(unsigned long addr, unsigned long end, #ifndef gup_fast_permitted /* - * Check if it's allowed to use __get_user_pages_fast() for the range, or + * Check if it's allowed to use get_user_pages_fast_only() for the range, or * we need to fall back to the slow version: */ static bool gup_fast_permitted(unsigned long start, unsigned long end) @@ -2811,8 +2811,14 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, return ret; } - -/* +/** + * get_user_pages_fast_only() - pin user pages in memory + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying pin behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. + * * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to * the regular GUP. * Note a difference with get_user_pages_fast: this always returns the @@ -2825,8 +2831,8 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, * access can get ambiguous page results. If you call this function without * 'write' set, you'd better be sure that you're ok with that ambiguity. */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) +int get_user_pages_fast_only(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) { int nr_pinned; /* @@ -2836,10 +2842,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * FOLL_FAST_ONLY is required in order to match the API description of * this routine: no fall back to regular ("slow") GUP. */ - unsigned int gup_flags = FOLL_GET | FOLL_FAST_ONLY; - - if (write) - gup_flags |= FOLL_WRITE; + gup_flags |= FOLL_GET | FOLL_FAST_ONLY; nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); @@ -2855,7 +2858,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, return nr_pinned; } -EXPORT_SYMBOL_GPL(__get_user_pages_fast); +EXPORT_SYMBOL_GPL(get_user_pages_fast_only); /** * get_user_pages_fast() - pin user pages in memory @@ -2926,8 +2929,8 @@ int pin_user_pages_fast(unsigned long start, int nr_pages, EXPORT_SYMBOL_GPL(pin_user_pages_fast); /* - * This is the FOLL_PIN equivalent of __get_user_pages_fast(). Behavior is the - * same, except that this one sets FOLL_PIN instead of FOLL_GET. + * This is the FOLL_PIN equivalent of get_user_pages_fast_only(). Behavior + * is the same, except that this one sets FOLL_PIN instead of FOLL_GET. * * The API rules are the same, too: no negative values may be returned. */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7fa1e38e1659..7b0da1c28e51 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1740,7 +1740,6 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, bool *writable, kvm_pfn_t *pfn) { struct page *page[1]; - int npages; /* * Fast pin a writable pfn only if it is a write fault request @@ -1750,8 +1749,7 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, if (!(write_fault || writable)) return false; - npages = __get_user_pages_fast(addr, 1, 1, page); - if (npages == 1) { + if (get_user_page_fast_only(addr, FOLL_WRITE, page)) { *pfn = page_to_pfn(page[0]); if (writable) @@ -1791,7 +1789,7 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, if (unlikely(!write_fault) && writable) { struct page *wpage; - if (__get_user_pages_fast(addr, 1, 1, &wpage) == 1) { + if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) { *writable = true; put_page(page); page = wpage; @@ -2003,7 +2001,7 @@ int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, if (entry < nr_pages) return 0; - return __get_user_pages_fast(addr, nr_pages, 1, pages); + return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages); } EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); From a8f80f53fb5c367e1a160adfb3b092a788faeb29 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Sun, 7 Jun 2020 21:40:59 -0700 Subject: [PATCH 17/52] mm/gup: update pin_user_pages.rst for "case 3" (mmu notifiers) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update case 3 so that it covers the use of mmu notifiers, for hardware that does, or does not have replayable page faults. Also, elaborate case 4 slightly, as it was quite cryptic. Signed-off-by: John Hubbard Signed-off-by: Andrew Morton Cc: Daniel Vetter Cc: Jérôme Glisse Cc: Vlastimil Babka Cc: Jan Kara Cc: Dave Chinner Cc: Jonathan Corbet Link: http://lkml.kernel.org/r/20200527194953.11130-1-jhubbard@nvidia.com Signed-off-by: Linus Torvalds --- Documentation/core-api/pin_user_pages.rst | 29 +++++++++++++---------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst index 2e939ff10b86..4675b04e8829 100644 --- a/Documentation/core-api/pin_user_pages.rst +++ b/Documentation/core-api/pin_user_pages.rst @@ -148,23 +148,28 @@ NOTE: Some pages, such as DAX pages, cannot be pinned with longterm pins. That's because DAX pages do not have a separate page cache, and so "pinning" implies locking down file system blocks, which is not (yet) supported in that way. -CASE 3: Hardware with page faulting support -------------------------------------------- -Here, a well-written driver doesn't normally need to pin pages at all. However, -if the driver does choose to do so, it can register MMU notifiers for the range, -and will be called back upon invalidation. Either way (avoiding page pinning, or -using MMU notifiers to unpin upon request), there is proper synchronization with -both filesystem and mm (page_mkclean(), munmap(), etc). +CASE 3: MMU notifier registration, with or without page faulting hardware +------------------------------------------------------------------------- +Device drivers can pin pages via get_user_pages*(), and register for mmu +notifier callbacks for the memory range. Then, upon receiving a notifier +"invalidate range" callback , stop the device from using the range, and unpin +the pages. There may be other possible schemes, such as for example explicitly +synchronizing against pending IO, that accomplish approximately the same thing. -Therefore, neither flag needs to be set. +Or, if the hardware supports replayable page faults, then the device driver can +avoid pinning entirely (this is ideal), as follows: register for mmu notifier +callbacks as above, but instead of stopping the device and unpinning in the +callback, simply remove the range from the device's page tables. -In this case, ideally, neither get_user_pages() nor pin_user_pages() should be -called. Instead, the software should be written so that it does not pin pages. -This allows mm and filesystems to operate more efficiently and reliably. +Either way, as long as the driver unpins the pages upon mmu notifier callback, +then there is proper synchronization with both filesystem and mm +(page_mkclean(), munmap(), etc). Therefore, neither flag needs to be set. CASE 4: Pinning for struct page manipulation only ------------------------------------------------- -Here, normal GUP calls are sufficient, so neither flag needs to be set. +If only struct page data (as opposed to the actual memory contents that a page +is tracking) is affected, then normal GUP calls are sufficient, and neither flag +needs to be set. page_maybe_dma_pinned(): the whole point of pinning =================================================== From 420c2091b65a95b1daea4c36d27df4ac5f38033d Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Sun, 7 Jun 2020 21:41:02 -0700 Subject: [PATCH 18/52] mm/gup: introduce pin_user_pages_locked() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm/gup: introduce pin_user_pages_locked(), use it in frame_vector.c", v2. This adds yet one more pin_user_pages*() variant, and uses that to convert mm/frame_vector.c. With this, along with maybe 20 or 30 other recent patches in various trees, we are close to having the relevant gup call sites converted--with the notable exception of the bio/block layer. This patch (of 2): Introduce pin_user_pages_locked(), which is nearly identical to get_user_pages_locked() except that it sets FOLL_PIN and rejects FOLL_GET. As with other pairs of get_user_pages*() and pin_user_pages() API calls, it's prudent to assert that FOLL_PIN is *not* set in the get_user_pages*() call, so add that as part of this. [jhubbard@nvidia.com: v2] Link: http://lkml.kernel.org/r/20200531234131.770697-2-jhubbard@nvidia.com Signed-off-by: John Hubbard Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Acked-by: Pankaj Gupta Cc: Daniel Vetter Cc: Jérôme Glisse Cc: Vlastimil Babka Cc: Jan Kara Cc: Dave Chinner Cc: Souptick Joarder Link: http://lkml.kernel.org/r/20200531234131.770697-1-jhubbard@nvidia.com Link: http://lkml.kernel.org/r/20200527223243.884385-1-jhubbard@nvidia.com Link: http://lkml.kernel.org/r/20200527223243.884385-2-jhubbard@nvidia.com Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ mm/gup.c | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 22010c4175f2..9d6042178ca7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1706,6 +1706,8 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages, struct vm_area_struct **vmas); long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); +long pin_user_pages_locked(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, int *locked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, diff --git a/mm/gup.c b/mm/gup.c index 9d1bf3ec8e39..5ce0a99dd8f3 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2035,6 +2035,12 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, */ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) return -EINVAL; + /* + * FOLL_PIN must only be set internally by the pin_user_pages*() APIs, + * never directly by the caller, so enforce that: + */ + if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) + return -EINVAL; return __get_user_pages_locked(current, current->mm, start, nr_pages, pages, NULL, locked, @@ -3058,3 +3064,32 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, return get_user_pages_unlocked(start, nr_pages, pages, gup_flags); } EXPORT_SYMBOL(pin_user_pages_unlocked); + +/* + * pin_user_pages_locked() is the FOLL_PIN variant of get_user_pages_locked(). + * Behavior is the same, except that this one sets FOLL_PIN and rejects + * FOLL_GET. + */ +long pin_user_pages_locked(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + int *locked) +{ + /* + * FIXME: Current FOLL_LONGTERM behavior is incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. As there are no users of this flag in this call we simply + * disallow this option for now. + */ + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return -EINVAL; + + /* FOLL_GET and FOLL_PIN are mutually exclusive. */ + if (WARN_ON_ONCE(gup_flags & FOLL_GET)) + return -EINVAL; + + gup_flags |= FOLL_PIN; + return __get_user_pages_locked(current, current->mm, start, nr_pages, + pages, NULL, locked, + gup_flags | FOLL_TOUCH); +} +EXPORT_SYMBOL(pin_user_pages_locked); From 55a650c35fea7cfdf989647f37960dfaf76da737 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Sun, 7 Jun 2020 21:41:05 -0700 Subject: [PATCH 19/52] mm/gup: frame_vector: convert get_user_pages() --> pin_user_pages() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This code was using get_user_pages*(), and all of the callers so far were in a "Case 2" scenario (DMA/RDMA), using the categorization from [1]. That means that it's time to convert the get_user_pages*() + put_page() calls to pin_user_pages*() + unpin_user_pages() calls. There is some helpful background in [2]: basically, this is a small part of fixing a long-standing disconnect between pinning pages, and file systems' use of those pages. [1] Documentation/core-api/pin_user_pages.rst [2] "Explicit pinning of user-space pages": https://lwn.net/Articles/807108/ Signed-off-by: John Hubbard Signed-off-by: Andrew Morton Acked-by: David Hildenbrand Cc: Daniel Vetter Cc: Jérôme Glisse Cc: Vlastimil Babka Cc: Jan Kara Cc: Dave Chinner Cc: Pankaj Gupta Cc: Souptick Joarder Link: http://lkml.kernel.org/r/20200527223243.884385-3-jhubbard@nvidia.com Signed-off-by: Linus Torvalds --- mm/frame_vector.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/frame_vector.c b/mm/frame_vector.c index c431ca81dad5..4107dbca0056 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -72,7 +72,7 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) { vec->got_ref = true; vec->is_pfns = false; - ret = get_user_pages_locked(start, nr_frames, + ret = pin_user_pages_locked(start, nr_frames, gup_flags, (struct page **)(vec->ptrs), &locked); goto out; } @@ -122,7 +122,6 @@ EXPORT_SYMBOL(get_vaddr_frames); */ void put_vaddr_frames(struct frame_vector *vec) { - int i; struct page **pages; if (!vec->got_ref) @@ -135,8 +134,8 @@ void put_vaddr_frames(struct frame_vector *vec) */ if (WARN_ON(IS_ERR(pages))) goto out; - for (i = 0; i < vec->nr_frames; i++) - put_page(pages[i]); + + unpin_user_pages(pages, vec->nr_frames); vec->got_ref = false; out: vec->nr_frames = 0; From 6a005645edd6de2b535783c96e66e08cccc5e067 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Sun, 7 Jun 2020 21:41:08 -0700 Subject: [PATCH 20/52] mm/gup: documentation fix for pin_user_pages*() APIs All of the pin_user_pages*() API calls will cause pages to be dma-pinned. As such, they are all suitable for either DMA, RDMA, and/or Direct IO. The documentation should say so, but it was instead saying that three of the API calls were only suitable for Direct IO. This was discovered when a reviewer wondered why an API call that specifically recommended against Case 2 (DMA/RDMA) was being used in a DMA situation [1]. Fix this by simply deleting those claims. The gup.c comments already refer to the more extensive Documentation/core-api/pin_user_pages.rst, which does have the correct guidance. So let's just write it once, there. [1] https://lore.kernel.org/r/20200529074658.GM30374@kadam Signed-off-by: John Hubbard Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Acked-by: Pankaj Gupta Acked-by: Souptick Joarder Cc: Dan Carpenter Cc: Jan Kara Cc: Vlastimil Babka Link: http://lkml.kernel.org/r/20200529084515.46259-1-jhubbard@nvidia.com Signed-off-by: Linus Torvalds --- mm/gup.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 5ce0a99dd8f3..8bd090b36d1d 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2918,9 +2918,6 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast); * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for further details. - * - * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It - * is NOT intended for Case 2 (RDMA: long-term pins). */ int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) @@ -2994,9 +2991,6 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast_only); * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. - * - * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It - * is NOT intended for Case 2 (RDMA: long-term pins). */ long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, @@ -3030,9 +3024,6 @@ EXPORT_SYMBOL(pin_user_pages_remote); * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. - * - * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It - * is NOT intended for Case 2 (RDMA: long-term pins). */ long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, From eaf4d22a9ea419bc4c85ad8f825a331bcc1ae340 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Sun, 7 Jun 2020 21:41:11 -0700 Subject: [PATCH 21/52] docs: mm/gup: pin_user_pages.rst: add a "case 5" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "vhost, docs: convert to pin_user_pages(), new "case 5"" It recently became clear to me that there are some get_user_pages*() callers that don't fit neatly into any of the four cases that are so far listed in pin_user_pages.rst. vhost.c is one of those. Add a Case 5 to the documentation, and refer to that when converting vhost.c. Thanks to Jan Kara for helping me (again) in understanding the interaction between get_user_pages() and page writeback [1]. This is based on today's mmotm, which has a nearby patch to pin_user_pages.rst that rewords cases 3 and 4. Note that I have only compile-tested the vhost.c patch, although that does also include cross-compiling for a few other arches. Any run-time testing would be greatly appreciated. [1] https://lore.kernel.org/r/20200529070343.GL14550@quack2.suse.cz This patch (of 2): There are four cases listed in pin_user_pages.rst. These are intended to help developers figure out whether to use get_user_pages*(), or pin_user_pages*(). However, the four cases do not cover all the situations. For example, drivers/vhost/vhost.c has a "pin, write to page, set page dirty, unpin" case. Add a fifth case, to help explain that there is a general pattern that requires pin_user_pages*() API calls. [jhubbard@nvidia.com: v2] Link: http://lkml.kernel.org/r/20200601052633.853874-2-jhubbard@nvidia.com Signed-off-by: John Hubbard Signed-off-by: Andrew Morton Cc: Vlastimil Babka Cc: Jan Kara Cc: Jérôme Glisse Cc: Dave Chinner Cc: Jonathan Corbet Cc: Souptick Joarder Cc: "Michael S . Tsirkin" Cc: Jason Wang Link: http://lkml.kernel.org/r/20200529234309.484480-1-jhubbard@nvidia.com Link: http://lkml.kernel.org/r/20200529234309.484480-2-jhubbard@nvidia.com Signed-off-by: Linus Torvalds --- Documentation/core-api/pin_user_pages.rst | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst index 4675b04e8829..6068266dd303 100644 --- a/Documentation/core-api/pin_user_pages.rst +++ b/Documentation/core-api/pin_user_pages.rst @@ -171,6 +171,24 @@ If only struct page data (as opposed to the actual memory contents that a page is tracking) is affected, then normal GUP calls are sufficient, and neither flag needs to be set. +CASE 5: Pinning in order to write to the data within the page +------------------------------------------------------------- +Even though neither DMA nor Direct IO is involved, just a simple case of "pin, +write to a page's data, unpin" can cause a problem. Case 5 may be considered a +superset of Case 1, plus Case 2, plus anything that invokes that pattern. In +other words, if the code is neither Case 1 nor Case 2, it may still require +FOLL_PIN, for patterns like this: + +Correct (uses FOLL_PIN calls): + pin_user_pages() + write to the data within the pages + unpin_user_pages() + +INCORRECT (uses FOLL_GET calls): + get_user_pages() + write to the data within the pages + put_page() + page_maybe_dma_pinned(): the whole point of pinning =================================================== From 690623e1b49603bdd3df4532b7911a2ce26fb4c4 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Sun, 7 Jun 2020 21:41:15 -0700 Subject: [PATCH 22/52] vhost: convert get_user_pages() --> pin_user_pages() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This code was using get_user_pages*(), in approximately a "Case 5" scenario (accessing the data within a page), using the categorization from [1]. That means that it's time to convert the get_user_pages*() + put_page() calls to pin_user_pages*() + unpin_user_pages() calls. There is some helpful background in [2]: basically, this is a small part of fixing a long-standing disconnect between pinning pages, and file systems' use of those pages. [1] Documentation/core-api/pin_user_pages.rst [2] "Explicit pinning of user-space pages": https://lwn.net/Articles/807108/ Signed-off-by: John Hubbard Signed-off-by: Andrew Morton Reviewed-by: Jan Kara Acked-by: Michael S. Tsirkin Acked-by: Pankaj Gupta Cc: Jason Wang Cc: Dave Chinner Cc: Jérôme Glisse Cc: Jonathan Corbet Cc: Souptick Joarder Cc: Vlastimil Babka Link: http://lkml.kernel.org/r/20200529234309.484480-3-jhubbard@nvidia.com Signed-off-by: Linus Torvalds --- drivers/vhost/vhost.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 21a59b598ed8..596132a96cd5 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1762,15 +1762,14 @@ static int set_bit_to_user(int nr, void __user *addr) int bit = nr + (log % PAGE_SIZE) * 8; int r; - r = get_user_pages_fast(log, 1, FOLL_WRITE, &page); + r = pin_user_pages_fast(log, 1, FOLL_WRITE, &page); if (r < 0) return r; BUG_ON(r != 1); base = kmap_atomic(page); set_bit(bit, base); kunmap_atomic(base); - set_page_dirty_lock(page); - put_page(page); + unpin_user_pages_dirty_lock(&page, 1, true); return 0; } From ce450ebf6179acf6e90dcc090e90face215faec4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:41:25 -0700 Subject: [PATCH 23/52] arm: fix the flush_icache_range arguments in set_fiq_handler Patch series "sort out the flush_icache_range mess", v2. flush_icache_range is mostly used for kernel address, except for the following cases: - the nommu brk and mmap implementations - the read_code helper that is only used for binfmt_flat, binfmt_elf_fdpic, and binfmt_aout including the broken ia32 compat version - binfmt_flat itself none of which really are used by a typical MMU enabled kernel, as a.out can only be build for alpha and m68k to start with. But strangely enough commit ae92ef8a4424 ("PATCH] flush icache in correct context") added a "set_fs(KERNEL_DS)" around the flush_icache_range call in the module loader, because apparently m68k assumed user pointers. This series first cleans up the cacheflush implementations, largely by switching as much as possible to the asm-generic version after a few preparations, then moves the misnamed current flush_icache_user_range to a new name, to finally introduce a real flush_icache_user_range to be used for the above use cases to flush the instruction cache for a userspace address range. The last patch then drops the set_fs in the module code and moves it into the m68k implementation. This patch (of 29): The arguments passed look bogus, try to fix them to something that seems to make sense. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Roman Zippel Cc: Jessica Yu Cc: Michal Simek Cc: Albert Ou Cc: Alexander Shishkin Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Anton Ivanov Cc: Arnaldo Carvalho de Melo Cc: Aurelien Jacquiot Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Brian Cain Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Chris Zankel Cc: Daniel Borkmann Cc: Dan Williams Cc: Dave Jiang Cc: "David S. Miller" Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Ungerer Cc: Guan Xuetao Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Ira Weiny Cc: Ivan Kokshaysky Cc: Jeff Dike Cc: Jiri Olsa Cc: Jonas Bonn Cc: Keith Busch Cc: Mark Rutland Cc: Mark Salter Cc: Martin KaFai Lau Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Namhyung Kim Cc: Nick Piggin Cc: Palmer Dabbelt Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Richard Henderson Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Song Liu Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Thomas Gleixner Cc: Tony Luck Cc: Vincent Chen Cc: Vishal Verma Cc: Will Deacon Cc: Yonghong Song Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200515143646.3857579-1-hch@lst.de Link: http://lkml.kernel.org/r/20200515143646.3857579-2-hch@lst.de Signed-off-by: Linus Torvalds --- arch/arm/kernel/fiq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/kernel/fiq.c b/arch/arm/kernel/fiq.c index cd1234c103fc..98ca3e3fa847 100644 --- a/arch/arm/kernel/fiq.c +++ b/arch/arm/kernel/fiq.c @@ -98,8 +98,8 @@ void set_fiq_handler(void *start, unsigned int length) memcpy(base + offset, start, length); if (!cache_is_vipt_nonaliasing()) - flush_icache_range((unsigned long)base + offset, offset + - length); + flush_icache_range((unsigned long)base + offset, + (unsigned long)base + offset + length); flush_icache_range(0xffff0000 + offset, 0xffff0000 + offset + length); } From e7c1fa11b0b007e5b74c39be38d507d06aa19e8d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:41:29 -0700 Subject: [PATCH 24/52] nds32: unexport flush_icache_page flush_icache_page is only used by mm/memory.c. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Greentime Hu Cc: Vincent Chen Link: http://lkml.kernel.org/r/20200515143646.3857579-3-hch@lst.de Signed-off-by: Linus Torvalds --- arch/nds32/mm/cacheflush.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/nds32/mm/cacheflush.c b/arch/nds32/mm/cacheflush.c index 254703653b6f..8f168b33065f 100644 --- a/arch/nds32/mm/cacheflush.c +++ b/arch/nds32/mm/cacheflush.c @@ -35,7 +35,6 @@ void flush_icache_page(struct vm_area_struct *vma, struct page *page) kunmap_atomic((void *)kaddr); local_irq_restore(flags); } -EXPORT_SYMBOL(flush_icache_page); void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len) From e292e7403e3fe048abec1a5fd83b3181748dcb1d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:41:32 -0700 Subject: [PATCH 25/52] powerpc: unexport flush_icache_user_range flush_icache_user_range is only used by copy_to_user_page, which is only used by core VM code. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Link: http://lkml.kernel.org/r/20200515143646.3857579-4-hch@lst.de Signed-off-by: Linus Torvalds --- arch/powerpc/mm/mem.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 5f7fe13211e9..3f1a70c6781b 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -586,7 +586,6 @@ void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, flush_icache_range(maddr, maddr + len); kunmap(page); } -EXPORT_SYMBOL(flush_icache_user_range); /* * System memory should not be in /proc/iomem but various tools expect it From 7c95fda549bdd6308dc833e13263227a7238c50d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:41:36 -0700 Subject: [PATCH 26/52] unicore32: remove flush_cache_user_range flush_cache_user_range is an ARMism not used by any generic or unicore32 specific code. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Guan Xuetao Link: http://lkml.kernel.org/r/20200515143646.3857579-5-hch@lst.de Signed-off-by: Linus Torvalds --- arch/unicore32/include/asm/cacheflush.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/unicore32/include/asm/cacheflush.h b/arch/unicore32/include/asm/cacheflush.h index dc8c0b41538f..9393ca4047e9 100644 --- a/arch/unicore32/include/asm/cacheflush.h +++ b/arch/unicore32/include/asm/cacheflush.h @@ -132,14 +132,6 @@ extern void flush_cache_page(struct vm_area_struct *vma, #define flush_cache_dup_mm(mm) flush_cache_mm(mm) -/* - * flush_cache_user_range is used when we want to ensure that the - * Harvard caches are synchronised for the user space address range. - * This is used for the UniCore private sys_cacheflush system call. - */ -#define flush_cache_user_range(vma, start, end) \ - __cpuc_coherent_user_range((start) & PAGE_MASK, PAGE_ALIGN(end)) - /* * Perform necessary cache operations to ensure that data previously * stored within this range of addresses can be executed by the CPU. From 92a73bd29ad0a31cfa9241e86a557252ac77b217 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:41:39 -0700 Subject: [PATCH 27/52] asm-generic: fix the inclusion guards for cacheflush.h cacheflush.h uses a somewhat to generic include guard name that clashes with various arch files. Use a more specific one. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Arnd Bergmann Link: http://lkml.kernel.org/r/20200515143646.3857579-6-hch@lst.de Signed-off-by: Linus Torvalds --- include/asm-generic/cacheflush.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h index cac7404b2bdd..906277492ec5 100644 --- a/include/asm-generic/cacheflush.h +++ b/include/asm-generic/cacheflush.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ASM_CACHEFLUSH_H -#define __ASM_CACHEFLUSH_H +#ifndef _ASM_GENERIC_CACHEFLUSH_H +#define _ASM_GENERIC_CACHEFLUSH_H /* Keep includes the same across arches. */ #include @@ -109,4 +109,4 @@ static inline void flush_cache_vunmap(unsigned long start, unsigned long end) memcpy(dst, src, len) #endif -#endif /* __ASM_CACHEFLUSH_H */ +#endif /* _ASM_GENERIC_CACHEFLUSH_H */ From e0cf615d725cb3b69f0bdf1d8afdd4d4c31b4fd1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:41:42 -0700 Subject: [PATCH 28/52] asm-generic: don't include in cacheflush.h This seems to lead to some crazy include loops when using asm-generic/cacheflush.h on more architectures, so leave it to the arch header for now. [hch@lst.de: fix warning] Link: http://lkml.kernel.org/r/20200520173520.GA11199@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Will Deacon Cc: Nick Piggin Cc: Peter Zijlstra Cc: Jeff Dike Cc: Richard Weinberger Cc: Anton Ivanov Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Dan Williams Cc: Vishal Verma Cc: Dave Jiang Cc: Keith Busch Cc: Ira Weiny Cc: Arnd Bergmann Link: http://lkml.kernel.org/r/20200515143646.3857579-7-hch@lst.de Signed-off-by: Linus Torvalds --- arch/um/include/asm/tlb.h | 2 ++ arch/x86/include/asm/cacheflush.h | 2 ++ drivers/media/platform/omap3isp/ispvideo.c | 2 +- drivers/nvdimm/pmem.c | 3 ++- include/asm-generic/cacheflush.h | 3 --- 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h index 70ee60383900..ff9c62828962 100644 --- a/arch/um/include/asm/tlb.h +++ b/arch/um/include/asm/tlb.h @@ -2,6 +2,8 @@ #ifndef __UM_TLB_H #define __UM_TLB_H +#include + #include #include #include diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 63feaf2a5f93..b192d917a6d0 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_CACHEFLUSH_H #define _ASM_X86_CACHEFLUSH_H +#include + /* Caches aren't brain-dead on the intel. */ #include #include diff --git a/drivers/media/platform/omap3isp/ispvideo.c b/drivers/media/platform/omap3isp/ispvideo.c index 6f769c527fae..10c214bd0903 100644 --- a/drivers/media/platform/omap3isp/ispvideo.c +++ b/drivers/media/platform/omap3isp/ispvideo.c @@ -10,7 +10,6 @@ * Sakari Ailus */ -#include #include #include #include @@ -19,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 97f948f8f4e6..d1ecd6da11a2 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -7,7 +7,6 @@ * Copyright (c) 2015, Boaz Harrosh . */ -#include #include #include #include @@ -25,6 +24,8 @@ #include #include #include +#include +#include #include "pmem.h" #include "pfn.h" #include "nd.h" diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h index 906277492ec5..bf9bb83e9fc8 100644 --- a/include/asm-generic/cacheflush.h +++ b/include/asm-generic/cacheflush.h @@ -2,9 +2,6 @@ #ifndef _ASM_GENERIC_CACHEFLUSH_H #define _ASM_GENERIC_CACHEFLUSH_H -/* Keep includes the same across arches. */ -#include - #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 /* From 76b3b58fac350c45e44a3868946ff5e454e8d367 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:41:45 -0700 Subject: [PATCH 29/52] asm-generic: improve the flush_dcache_page stub There is a magic ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE cpp symbol that guards non-stub availability of flush_dcache_pagge. Use that to check if flush_dcache_pagg is implemented. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Arnd Bergmann Link: http://lkml.kernel.org/r/20200515143646.3857579-8-hch@lst.de Signed-off-by: Linus Torvalds --- include/asm-generic/cacheflush.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h index bf9bb83e9fc8..bbbb4d4ef651 100644 --- a/include/asm-generic/cacheflush.h +++ b/include/asm-generic/cacheflush.h @@ -2,8 +2,6 @@ #ifndef _ASM_GENERIC_CACHEFLUSH_H #define _ASM_GENERIC_CACHEFLUSH_H -#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 - /* * The cache doesn't need to be flushed when TLB entries change when * the cache is mapped to physical memory, not virtual memory @@ -42,12 +40,14 @@ static inline void flush_cache_page(struct vm_area_struct *vma, } #endif -#ifndef flush_dcache_page +#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE static inline void flush_dcache_page(struct page *page) { } +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 #endif + #ifndef flush_dcache_mmap_lock static inline void flush_dcache_mmap_lock(struct address_space *mapping) { From 43c74ca337ce623a5f0cb93e2903f4a88d3e98a7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:41:48 -0700 Subject: [PATCH 30/52] alpha: use asm-generic/cacheflush.h Alpha needs almost no cache flushing routines of its own. Rely on asm-generic/cacheflush.h for the defaults. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Link: http://lkml.kernel.org/r/20200515143646.3857579-9-hch@lst.de Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/cacheflush.h | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/arch/alpha/include/asm/cacheflush.h b/arch/alpha/include/asm/cacheflush.h index 89128489cb59..636d7ca0d05f 100644 --- a/arch/alpha/include/asm/cacheflush.h +++ b/arch/alpha/include/asm/cacheflush.h @@ -4,19 +4,6 @@ #include -/* Caches aren't brain-dead on the Alpha. */ -#define flush_cache_all() do { } while (0) -#define flush_cache_mm(mm) do { } while (0) -#define flush_cache_dup_mm(mm) do { } while (0) -#define flush_cache_range(vma, start, end) do { } while (0) -#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) -#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 -#define flush_dcache_page(page) do { } while (0) -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) -#define flush_cache_vmap(start, end) do { } while (0) -#define flush_cache_vunmap(start, end) do { } while (0) - /* Note that the following two definitions are _highly_ dependent on the contexts in which they are used in the kernel. I personally think it is criminal how loosely defined these macros are. */ @@ -59,20 +46,17 @@ flush_icache_user_range(struct vm_area_struct *vma, struct page *page, mm->context[smp_processor_id()] = 0; } } -#else +#define flush_icache_user_range flush_icache_user_range +#else /* CONFIG_SMP */ extern void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len); -#endif +#define flush_icache_user_range flush_icache_user_range +#endif /* CONFIG_SMP */ /* This is used only in __do_fault and do_swap_page. */ #define flush_icache_page(vma, page) \ - flush_icache_user_range((vma), (page), 0, 0) + flush_icache_user_range((vma), (page), 0, 0) -#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ -do { memcpy(dst, src, len); \ - flush_icache_user_range(vma, page, vaddr, len); \ -} while (0) -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - memcpy(dst, src, len) +#include #endif /* _ALPHA_CACHEFLUSH_H */ From a7ba121215faad444f94db5e89eb5d7dfc58ecb2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:41:51 -0700 Subject: [PATCH 31/52] arm64: use asm-generic/cacheflush.h ARM64 needs almost no cache flushing routines of its own. Rely on asm-generic/cacheflush.h for the defaults. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Catalin Marinas Cc: Will Deacon Link: http://lkml.kernel.org/r/20200515143646.3857579-10-hch@lst.de Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/cacheflush.h | 46 ++++------------------------- 1 file changed, 5 insertions(+), 41 deletions(-) diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index ce50c1f1f1ea..9384fd8fc13c 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -94,20 +94,7 @@ static inline void flush_icache_range(unsigned long start, unsigned long end) kick_all_cpus_sync(); } - -static inline void flush_cache_mm(struct mm_struct *mm) -{ -} - -static inline void flush_cache_page(struct vm_area_struct *vma, - unsigned long user_addr, unsigned long pfn) -{ -} - -static inline void flush_cache_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ -} +#define flush_icache_range flush_icache_range /* * Cache maintenance functions used by the DMA API. No to be used directly. @@ -123,12 +110,7 @@ extern void __dma_flush_area(const void *, size_t); */ extern void copy_to_user_page(struct vm_area_struct *, struct page *, unsigned long, void *, const void *, unsigned long); -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - do { \ - memcpy(dst, src, len); \ - } while (0) - -#define flush_cache_dup_mm(mm) flush_cache_mm(mm) +#define copy_to_user_page copy_to_user_page /* * flush_dcache_page is used when the kernel has written to the page @@ -154,29 +136,11 @@ static __always_inline void __flush_icache_all(void) dsb(ish); } -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) - -/* - * We don't appear to need to do anything here. In fact, if we did, we'd - * duplicate cache flushing elsewhere performed by flush_dcache_page(). - */ -#define flush_icache_page(vma,page) do { } while (0) - -/* - * Not required on AArch64 (PIPT or VIPT non-aliasing D-cache). - */ -static inline void flush_cache_vmap(unsigned long start, unsigned long end) -{ -} - -static inline void flush_cache_vunmap(unsigned long start, unsigned long end) -{ -} - int set_memory_valid(unsigned long addr, int numpages, int enable); int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); -#endif +#include + +#endif /* __ASM_CACHEFLUSH_H */ From 2d49d89c73fe9b76b02799a71e768b312ad65039 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:41:55 -0700 Subject: [PATCH 32/52] c6x: use asm-generic/cacheflush.h C6x needs almost no cache flushing routines of its own. Rely on asm-generic/cacheflush.h for the defaults. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Mark Salter Cc: Aurelien Jacquiot Link: http://lkml.kernel.org/r/20200515143646.3857579-11-hch@lst.de Signed-off-by: Linus Torvalds --- arch/c6x/include/asm/cacheflush.h | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/arch/c6x/include/asm/cacheflush.h b/arch/c6x/include/asm/cacheflush.h index 4540b40475e6..10922d528de6 100644 --- a/arch/c6x/include/asm/cacheflush.h +++ b/arch/c6x/include/asm/cacheflush.h @@ -16,21 +16,6 @@ #include #include -/* - * virtually-indexed cache management (our cache is physically indexed) - */ -#define flush_cache_all() do {} while (0) -#define flush_cache_mm(mm) do {} while (0) -#define flush_cache_dup_mm(mm) do {} while (0) -#define flush_cache_range(mm, start, end) do {} while (0) -#define flush_cache_page(vma, vmaddr, pfn) do {} while (0) -#define flush_cache_vmap(start, end) do {} while (0) -#define flush_cache_vunmap(start, end) do {} while (0) -#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 -#define flush_dcache_page(page) do {} while (0) -#define flush_dcache_mmap_lock(mapping) do {} while (0) -#define flush_dcache_mmap_unlock(mapping) do {} while (0) - /* * physically-indexed cache management */ @@ -49,14 +34,12 @@ do { \ (unsigned long) page_address(page) + PAGE_SIZE)); \ } while (0) - #define copy_to_user_page(vma, page, vaddr, dst, src, len) \ do { \ memcpy(dst, src, len); \ flush_icache_range((unsigned) (dst), (unsigned) (dst) + (len)); \ } while (0) -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - memcpy(dst, src, len) +#include #endif /* _ASM_C6X_CACHEFLUSH_H */ From af23eea561961108b22422750f07025d2fd91240 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:41:58 -0700 Subject: [PATCH 33/52] hexagon: use asm-generic/cacheflush.h Hexagon needs almost no cache flushing routines of its own. Rely on asm-generic/cacheflush.h for the defaults. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Brian Cain Link: http://lkml.kernel.org/r/20200515143646.3857579-12-hch@lst.de Signed-off-by: Linus Torvalds --- arch/hexagon/include/asm/cacheflush.h | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/arch/hexagon/include/asm/cacheflush.h b/arch/hexagon/include/asm/cacheflush.h index fb447de45d54..6eff0730e6ef 100644 --- a/arch/hexagon/include/asm/cacheflush.h +++ b/arch/hexagon/include/asm/cacheflush.h @@ -25,29 +25,17 @@ #define LINESIZE 32 #define LINEBITS 5 -#define flush_cache_all() do { } while (0) -#define flush_cache_mm(mm) do { } while (0) -#define flush_cache_dup_mm(mm) do { } while (0) -#define flush_cache_range(vma, start, end) do { } while (0) -#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) -#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 -#define flush_dcache_page(page) do { } while (0) -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) -#define flush_icache_page(vma, pg) do { } while (0) -#define flush_icache_user_range(vma, pg, adr, len) do { } while (0) -#define flush_cache_vmap(start, end) do { } while (0) -#define flush_cache_vunmap(start, end) do { } while (0) - /* * Flush Dcache range through current map. */ extern void flush_dcache_range(unsigned long start, unsigned long end); +#define flush_dcache_range flush_dcache_range /* * Flush Icache range through current map. */ extern void flush_icache_range(unsigned long start, unsigned long end); +#define flush_icache_range flush_icache_range /* * Memory-management related flushes are there to ensure in non-physically @@ -78,6 +66,7 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr, void *dst, void *src, int len); +#define copy_to_user_page copy_to_user_page #define copy_from_user_page(vma, page, vaddr, dst, src, len) \ memcpy(dst, src, len) @@ -85,4 +74,6 @@ void copy_to_user_page(struct vm_area_struct *vma, struct page *page, extern void hexagon_inv_dcache_range(unsigned long start, unsigned long end); extern void hexagon_clean_dcache_range(unsigned long start, unsigned long end); +#include + #endif From 57b94ff59796bfbf7debebb2293b66d96f726a12 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:01 -0700 Subject: [PATCH 34/52] ia64: use asm-generic/cacheflush.h IA64 needs almost no cache flushing routines of its own. Rely on asm-generic/cacheflush.h for the defaults. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Tony Luck Cc: Fenghua Yu Link: http://lkml.kernel.org/r/20200515143646.3857579-13-hch@lst.de Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/cacheflush.h | 28 +++------------------------- 1 file changed, 3 insertions(+), 25 deletions(-) diff --git a/arch/ia64/include/asm/cacheflush.h b/arch/ia64/include/asm/cacheflush.h index 6d3478f8abc8..a8f1c86ac242 100644 --- a/arch/ia64/include/asm/cacheflush.h +++ b/arch/ia64/include/asm/cacheflush.h @@ -12,44 +12,22 @@ #include -/* - * Cache flushing routines. This is the kind of stuff that can be very expensive, so try - * to avoid them whenever possible. - */ - -#define flush_cache_all() do { } while (0) -#define flush_cache_mm(mm) do { } while (0) -#define flush_cache_dup_mm(mm) do { } while (0) -#define flush_cache_range(vma, start, end) do { } while (0) -#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) -#define flush_icache_page(vma,page) do { } while (0) -#define flush_cache_vmap(start, end) do { } while (0) -#define flush_cache_vunmap(start, end) do { } while (0) - #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 #define flush_dcache_page(page) \ do { \ clear_bit(PG_arch_1, &(page)->flags); \ } while (0) -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) - -extern void flush_icache_range (unsigned long start, unsigned long end); +extern void flush_icache_range(unsigned long start, unsigned long end); +#define flush_icache_range flush_icache_range extern void clflush_cache_range(void *addr, int size); - #define flush_icache_user_range(vma, page, user_addr, len) \ do { \ unsigned long _addr = (unsigned long) page_address(page) + ((user_addr) & ~PAGE_MASK); \ flush_icache_range(_addr, _addr + (len)); \ } while (0) -#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ -do { memcpy(dst, src, len); \ - flush_icache_user_range(vma, page, vaddr, len); \ -} while (0) -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - memcpy(dst, src, len) +#include #endif /* _ASM_IA64_CACHEFLUSH_H */ From 03518c82b4b33592fed428d38925ce59df91b1c9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:04 -0700 Subject: [PATCH 35/52] microblaze: use asm-generic/cacheflush.h Microblaze needs almost no cache flushing routines of its own. Rely on asm-generic/cacheflush.h for the defaults. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Michal Simek Link: http://lkml.kernel.org/r/20200515143646.3857579-14-hch@lst.de Signed-off-by: Linus Torvalds --- arch/microblaze/include/asm/cacheflush.h | 29 ++---------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/arch/microblaze/include/asm/cacheflush.h b/arch/microblaze/include/asm/cacheflush.h index 11f56c85056b..39f8fb6768d8 100644 --- a/arch/microblaze/include/asm/cacheflush.h +++ b/arch/microblaze/include/asm/cacheflush.h @@ -57,9 +57,6 @@ void microblaze_cache_init(void); #define invalidate_icache() mbc->iin(); #define invalidate_icache_range(start, end) mbc->iinr(start, end); -#define flush_icache_user_range(vma, pg, adr, len) flush_icache(); -#define flush_icache_page(vma, pg) do { } while (0) - #define enable_dcache() mbc->de(); #define disable_dcache() mbc->dd(); /* FIXME for LL-temac driver */ @@ -77,27 +74,9 @@ do { \ flush_dcache_range((unsigned) (addr), (unsigned) (addr) + PAGE_SIZE); \ } while (0); -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) - -#define flush_cache_dup_mm(mm) do { } while (0) -#define flush_cache_vmap(start, end) do { } while (0) -#define flush_cache_vunmap(start, end) do { } while (0) -#define flush_cache_mm(mm) do { } while (0) - #define flush_cache_page(vma, vmaddr, pfn) \ flush_dcache_range(pfn << PAGE_SHIFT, (pfn << PAGE_SHIFT) + PAGE_SIZE); -/* MS: kgdb code use this macro, wrong len with FLASH */ -#if 0 -#define flush_cache_range(vma, start, len) { \ - flush_icache_range((unsigned) (start), (unsigned) (start) + (len)); \ - flush_dcache_range((unsigned) (start), (unsigned) (start) + (len)); \ -} -#endif - -#define flush_cache_range(vma, start, len) do { } while (0) - static inline void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr, void *dst, void *src, int len) @@ -109,12 +88,8 @@ static inline void copy_to_user_page(struct vm_area_struct *vma, flush_dcache_range(addr, addr + PAGE_SIZE); } } +#define copy_to_user_page copy_to_user_page -static inline void copy_from_user_page(struct vm_area_struct *vma, - struct page *page, unsigned long vaddr, - void *dst, void *src, int len) -{ - memcpy(dst, src, len); -} +#include #endif /* _ASM_MICROBLAZE_CACHEFLUSH_H */ From 9e730ffac148127bc0cbc45097fe23d7e29a8130 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:06 -0700 Subject: [PATCH 36/52] m68knommu: use asm-generic/cacheflush.h m68knommu needs almost no cache flushing routines of its own. Rely on asm-generic/cacheflush.h for the defaults. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Greg Ungerer Cc: Greg Ungerer Cc: Geert Uytterhoeven Link: http://lkml.kernel.org/r/20200515143646.3857579-15-hch@lst.de Signed-off-by: Linus Torvalds --- arch/m68k/include/asm/cacheflush_no.h | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/arch/m68k/include/asm/cacheflush_no.h b/arch/m68k/include/asm/cacheflush_no.h index 11e9a9dcbfb2..2731f07e7be8 100644 --- a/arch/m68k/include/asm/cacheflush_no.h +++ b/arch/m68k/include/asm/cacheflush_no.h @@ -9,25 +9,8 @@ #include #define flush_cache_all() __flush_cache_all() -#define flush_cache_mm(mm) do { } while (0) -#define flush_cache_dup_mm(mm) do { } while (0) -#define flush_cache_range(vma, start, end) do { } while (0) -#define flush_cache_page(vma, vmaddr) do { } while (0) #define flush_dcache_range(start, len) __flush_dcache_all() -#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 -#define flush_dcache_page(page) do { } while (0) -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) #define flush_icache_range(start, len) __flush_icache_all() -#define flush_icache_page(vma,pg) do { } while (0) -#define flush_icache_user_range(vma,pg,adr,len) do { } while (0) -#define flush_cache_vmap(start, end) do { } while (0) -#define flush_cache_vunmap(start, end) do { } while (0) - -#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ - memcpy(dst, src, len) -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - memcpy(dst, src, len) void mcf_cache_push(void); @@ -98,4 +81,6 @@ static inline void cache_clear(unsigned long paddr, int len) __clear_cache_all(); } +#include + #endif /* _M68KNOMMU_CACHEFLUSH_H */ From e050945123065b1804446cf68cdd8dc2a6a2b9f2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:09 -0700 Subject: [PATCH 37/52] openrisc: use asm-generic/cacheflush.h OpenRISC needs almost no cache flushing routines of its own. Rely on asm-generic/cacheflush.h for the defaults. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Jonas Bonn Cc: Stefan Kristiansson Cc: Stafford Horne Link: http://lkml.kernel.org/r/20200515143646.3857579-16-hch@lst.de Signed-off-by: Linus Torvalds --- arch/openrisc/include/asm/cacheflush.h | 31 +++++--------------------- 1 file changed, 6 insertions(+), 25 deletions(-) diff --git a/arch/openrisc/include/asm/cacheflush.h b/arch/openrisc/include/asm/cacheflush.h index 79d5d7753fe4..74d1fce4e883 100644 --- a/arch/openrisc/include/asm/cacheflush.h +++ b/arch/openrisc/include/asm/cacheflush.h @@ -62,31 +62,12 @@ static inline void flush_dcache_page(struct page *page) clear_bit(PG_dc_clean, &page->flags); } -/* - * Other interfaces are not required since we do not have virtually - * indexed or tagged caches. So we can use the default here. - */ -#define flush_cache_all() do { } while (0) -#define flush_cache_mm(mm) do { } while (0) -#define flush_cache_dup_mm(mm) do { } while (0) -#define flush_cache_range(vma, start, end) do { } while (0) -#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) -#define flush_icache_range(start, end) do { } while (0) -#define flush_icache_page(vma, pg) do { } while (0) -#define flush_icache_user_range(vma, pg, adr, len) do { } while (0) -#define flush_cache_vmap(start, end) do { } while (0) -#define flush_cache_vunmap(start, end) do { } while (0) +#define flush_icache_user_range(vma, page, addr, len) \ +do { \ + if (vma->vm_flags & VM_EXEC) \ + sync_icache_dcache(page); \ +} while (0) -#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ - do { \ - memcpy(dst, src, len); \ - if (vma->vm_flags & VM_EXEC) \ - sync_icache_dcache(page); \ - } while (0) - -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - memcpy(dst, src, len) +#include #endif /* __ASM_CACHEFLUSH_H */ From 5019f76019e5780bad60248638891d13fd905e6e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:12 -0700 Subject: [PATCH 38/52] powerpc: use asm-generic/cacheflush.h Power needs almost no cache flushing routines of its own. Rely on asm-generic/cacheflush.h for the defaults. Also remove the pointless __KERNEL__ ifdef while we're at it. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Link: http://lkml.kernel.org/r/20200515143646.3857579-17-hch@lst.de Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/cacheflush.h | 42 +++++++-------------------- 1 file changed, 10 insertions(+), 32 deletions(-) diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h index e92191b390f3..e682c8e10e90 100644 --- a/arch/powerpc/include/asm/cacheflush.h +++ b/arch/powerpc/include/asm/cacheflush.h @@ -4,23 +4,9 @@ #ifndef _ASM_POWERPC_CACHEFLUSH_H #define _ASM_POWERPC_CACHEFLUSH_H -#ifdef __KERNEL__ - #include #include -/* - * No cache flushing is required when address mappings are changed, - * because the caches on PowerPCs are physically addressed. - */ -#define flush_cache_all() do { } while (0) -#define flush_cache_mm(mm) do { } while (0) -#define flush_cache_dup_mm(mm) do { } while (0) -#define flush_cache_range(vma, start, end) do { } while (0) -#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) -#define flush_icache_page(vma, page) do { } while (0) -#define flush_cache_vunmap(start, end) do { } while (0) - #ifdef CONFIG_PPC_BOOK3S_64 /* * Book3s has no ptesync after setting a pte, so without this ptesync it's @@ -33,20 +19,20 @@ static inline void flush_cache_vmap(unsigned long start, unsigned long end) { asm volatile("ptesync" ::: "memory"); } -#else -static inline void flush_cache_vmap(unsigned long start, unsigned long end) { } -#endif +#define flush_cache_vmap flush_cache_vmap +#endif /* CONFIG_PPC_BOOK3S_64 */ #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *page); -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) void flush_icache_range(unsigned long start, unsigned long stop); -extern void flush_icache_user_range(struct vm_area_struct *vma, - struct page *page, unsigned long addr, - int len); -extern void flush_dcache_icache_page(struct page *page); +#define flush_icache_range flush_icache_range + +void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, + unsigned long addr, int len); +#define flush_icache_user_range flush_icache_user_range + +void flush_dcache_icache_page(struct page *page); void __flush_dcache_icache(void *page); /** @@ -111,14 +97,6 @@ static inline void invalidate_dcache_range(unsigned long start, mb(); /* sync */ } -#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ - do { \ - memcpy(dst, src, len); \ - flush_icache_user_range(vma, page, vaddr, len); \ - } while (0) -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - memcpy(dst, src, len) - -#endif /* __KERNEL__ */ +#include #endif /* _ASM_POWERPC_CACHEFLUSH_H */ From 396eb69c6e3914c4aa7b2bdeaa7476cc48265df6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:15 -0700 Subject: [PATCH 39/52] riscv: use asm-generic/cacheflush.h RISC-V needs almost no cache flushing routines of its own. Rely on asm-generic/cacheflush.h for the defaults. Also remove the pointless __KERNEL__ ifdef while we're at it. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Reviewed-by: Palmer Dabbelt Acked-by: Palmer Dabbelt Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Link: http://lkml.kernel.org/r/20200515143646.3857579-18-hch@lst.de Signed-off-by: Linus Torvalds --- arch/riscv/include/asm/cacheflush.h | 62 ++--------------------------- 1 file changed, 3 insertions(+), 59 deletions(-) diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h index c8677c75f82c..a167b4fbdf00 100644 --- a/arch/riscv/include/asm/cacheflush.h +++ b/arch/riscv/include/asm/cacheflush.h @@ -8,65 +8,6 @@ #include -#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 - -/* - * The cache doesn't need to be flushed when TLB entries change when - * the cache is mapped to physical memory, not virtual memory - */ -static inline void flush_cache_all(void) -{ -} - -static inline void flush_cache_mm(struct mm_struct *mm) -{ -} - -static inline void flush_cache_dup_mm(struct mm_struct *mm) -{ -} - -static inline void flush_cache_range(struct vm_area_struct *vma, - unsigned long start, - unsigned long end) -{ -} - -static inline void flush_cache_page(struct vm_area_struct *vma, - unsigned long vmaddr, - unsigned long pfn) -{ -} - -static inline void flush_dcache_mmap_lock(struct address_space *mapping) -{ -} - -static inline void flush_dcache_mmap_unlock(struct address_space *mapping) -{ -} - -static inline void flush_icache_page(struct vm_area_struct *vma, - struct page *page) -{ -} - -static inline void flush_cache_vmap(unsigned long start, unsigned long end) -{ -} - -static inline void flush_cache_vunmap(unsigned long start, unsigned long end) -{ -} - -#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ - do { \ - memcpy(dst, src, len); \ - flush_icache_user_range(vma, page, vaddr, len); \ - } while (0) -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - memcpy(dst, src, len) - static inline void local_flush_icache_all(void) { asm volatile ("fence.i" ::: "memory"); @@ -79,6 +20,7 @@ static inline void flush_dcache_page(struct page *page) if (test_bit(PG_dcache_clean, &page->flags)) clear_bit(PG_dcache_clean, &page->flags); } +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 /* * RISC-V doesn't have an instruction to flush parts of the instruction cache, @@ -105,4 +47,6 @@ void flush_icache_mm(struct mm_struct *mm, bool local); #define SYS_RISCV_FLUSH_ICACHE_LOCAL 1UL #define SYS_RISCV_FLUSH_ICACHE_ALL (SYS_RISCV_FLUSH_ICACHE_LOCAL) +#include + #endif /* _ASM_RISCV_CACHEFLUSH_H */ From 97f52c1536f041fd9f0b62667864930c92ac93cf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:19 -0700 Subject: [PATCH 40/52] arm,sparc,unicore32: remove flush_icache_user_range flush_icache_user_range is only used by , so remove it from the architectures that implement it, but don't use . Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Russell King Cc: "David S. Miller" Cc: Guan Xuetao Link: http://lkml.kernel.org/r/20200515143646.3857579-19-hch@lst.de Signed-off-by: Linus Torvalds --- arch/arm/include/asm/cacheflush.h | 3 --- arch/sparc/include/asm/cacheflush_32.h | 2 -- arch/sparc/include/asm/cacheflush_64.h | 1 - arch/unicore32/include/asm/cacheflush.h | 3 --- 4 files changed, 9 deletions(-) diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index 7114b9aa46b8..c78e14fcfb5d 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -318,9 +318,6 @@ extern void flush_kernel_dcache_page(struct page *); #define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) #define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) -#define flush_icache_user_range(vma,page,addr,len) \ - flush_dcache_page(page) - /* * We don't appear to need to do anything here. In fact, if we did, we'd * duplicate cache flushing elsewhere performed by flush_dcache_page(). diff --git a/arch/sparc/include/asm/cacheflush_32.h b/arch/sparc/include/asm/cacheflush_32.h index fb66094a2c30..41c6d734a474 100644 --- a/arch/sparc/include/asm/cacheflush_32.h +++ b/arch/sparc/include/asm/cacheflush_32.h @@ -17,8 +17,6 @@ #define flush_icache_range(start, end) do { } while (0) #define flush_icache_page(vma, pg) do { } while (0) -#define flush_icache_user_range(vma,pg,adr,len) do { } while (0) - #define copy_to_user_page(vma, page, vaddr, dst, src, len) \ do { \ flush_cache_page(vma, vaddr, page_to_pfn(page));\ diff --git a/arch/sparc/include/asm/cacheflush_64.h b/arch/sparc/include/asm/cacheflush_64.h index e7517434d1fa..b9341836597e 100644 --- a/arch/sparc/include/asm/cacheflush_64.h +++ b/arch/sparc/include/asm/cacheflush_64.h @@ -49,7 +49,6 @@ void __flush_dcache_range(unsigned long start, unsigned long end); void flush_dcache_page(struct page *page); #define flush_icache_page(vma, pg) do { } while(0) -#define flush_icache_user_range(vma,pg,adr,len) do { } while (0) void flush_ptrace_access(struct vm_area_struct *, struct page *, unsigned long uaddr, void *kaddr, diff --git a/arch/unicore32/include/asm/cacheflush.h b/arch/unicore32/include/asm/cacheflush.h index 9393ca4047e9..ff0be92ebc32 100644 --- a/arch/unicore32/include/asm/cacheflush.h +++ b/arch/unicore32/include/asm/cacheflush.h @@ -162,9 +162,6 @@ extern void flush_dcache_page(struct page *); #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) -#define flush_icache_user_range(vma, page, addr, len) \ - flush_dcache_page(page) - /* * We don't appear to need to do anything here. In fact, if we did, we'd * duplicate cache flushing elsewhere performed by flush_dcache_page(). From 885f7f8e30468705926b3de53d32925bd7a51a03 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:22 -0700 Subject: [PATCH 41/52] mm: rename flush_icache_user_range to flush_icache_user_page The function currently known as flush_icache_user_range only operates on a single page. Rename it to flush_icache_user_page as we'll need the name flush_icache_user_range for something else soon. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Geert Uytterhoeven Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Tony Luck Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Vincent Chen Cc: Jonas Bonn Cc: Stefan Kristiansson Cc: Stafford Horne Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Arnd Bergmann Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Mark Rutland Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20200515143646.3857579-20-hch@lst.de Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/cacheflush.h | 10 +++++----- arch/alpha/kernel/smp.c | 2 +- arch/ia64/include/asm/cacheflush.h | 2 +- arch/m68k/include/asm/cacheflush_mm.h | 4 ++-- arch/m68k/mm/cache.c | 2 +- arch/nds32/include/asm/cacheflush.h | 4 ++-- arch/nds32/mm/cacheflush.c | 2 +- arch/openrisc/include/asm/cacheflush.h | 2 +- arch/powerpc/include/asm/cacheflush.h | 4 ++-- arch/powerpc/mm/mem.c | 2 +- arch/riscv/include/asm/cacheflush.h | 3 ++- include/asm-generic/cacheflush.h | 6 +++--- kernel/events/uprobes.c | 2 +- 13 files changed, 23 insertions(+), 22 deletions(-) diff --git a/arch/alpha/include/asm/cacheflush.h b/arch/alpha/include/asm/cacheflush.h index 636d7ca0d05f..9945ff483eaf 100644 --- a/arch/alpha/include/asm/cacheflush.h +++ b/arch/alpha/include/asm/cacheflush.h @@ -35,7 +35,7 @@ extern void smp_imb(void); extern void __load_new_mm_context(struct mm_struct *); static inline void -flush_icache_user_range(struct vm_area_struct *vma, struct page *page, +flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len) { if (vma->vm_flags & VM_EXEC) { @@ -46,16 +46,16 @@ flush_icache_user_range(struct vm_area_struct *vma, struct page *page, mm->context[smp_processor_id()] = 0; } } -#define flush_icache_user_range flush_icache_user_range +#define flush_icache_user_page flush_icache_user_page #else /* CONFIG_SMP */ -extern void flush_icache_user_range(struct vm_area_struct *vma, +extern void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len); -#define flush_icache_user_range flush_icache_user_range +#define flush_icache_user_page flush_icache_user_page #endif /* CONFIG_SMP */ /* This is used only in __do_fault and do_swap_page. */ #define flush_icache_page(vma, page) \ - flush_icache_user_range((vma), (page), 0, 0) + flush_icache_user_page((vma), (page), 0, 0) #include diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 5f90df30be20..52995bf413fe 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -740,7 +740,7 @@ ipi_flush_icache_page(void *x) } void -flush_icache_user_range(struct vm_area_struct *vma, struct page *page, +flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len) { struct mm_struct *mm = vma->vm_mm; diff --git a/arch/ia64/include/asm/cacheflush.h b/arch/ia64/include/asm/cacheflush.h index a8f1c86ac242..708c0fa5d975 100644 --- a/arch/ia64/include/asm/cacheflush.h +++ b/arch/ia64/include/asm/cacheflush.h @@ -22,7 +22,7 @@ extern void flush_icache_range(unsigned long start, unsigned long end); #define flush_icache_range flush_icache_range extern void clflush_cache_range(void *addr, int size); -#define flush_icache_user_range(vma, page, user_addr, len) \ +#define flush_icache_user_page(vma, page, user_addr, len) \ do { \ unsigned long _addr = (unsigned long) page_address(page) + ((user_addr) & ~PAGE_MASK); \ flush_icache_range(_addr, _addr + (len)); \ diff --git a/arch/m68k/include/asm/cacheflush_mm.h b/arch/m68k/include/asm/cacheflush_mm.h index 1e2544ecaf88..95376bf84faa 100644 --- a/arch/m68k/include/asm/cacheflush_mm.h +++ b/arch/m68k/include/asm/cacheflush_mm.h @@ -254,7 +254,7 @@ static inline void __flush_page_to_ram(void *vaddr) #define flush_dcache_mmap_unlock(mapping) do { } while (0) #define flush_icache_page(vma, page) __flush_page_to_ram(page_address(page)) -extern void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, +extern void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len); extern void flush_icache_range(unsigned long address, unsigned long endaddr); @@ -264,7 +264,7 @@ static inline void copy_to_user_page(struct vm_area_struct *vma, { flush_cache_page(vma, vaddr, page_to_pfn(page)); memcpy(dst, src, len); - flush_icache_user_range(vma, page, vaddr, len); + flush_icache_user_page(vma, page, vaddr, len); } static inline void copy_from_user_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr, diff --git a/arch/m68k/mm/cache.c b/arch/m68k/mm/cache.c index 079e64898e6a..99057cd5ff7f 100644 --- a/arch/m68k/mm/cache.c +++ b/arch/m68k/mm/cache.c @@ -106,7 +106,7 @@ void flush_icache_range(unsigned long address, unsigned long endaddr) } EXPORT_SYMBOL(flush_icache_range); -void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, +void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len) { if (CPU_IS_COLDFIRE) { diff --git a/arch/nds32/include/asm/cacheflush.h b/arch/nds32/include/asm/cacheflush.h index caddded56e77..7d6824f7c0e8 100644 --- a/arch/nds32/include/asm/cacheflush.h +++ b/arch/nds32/include/asm/cacheflush.h @@ -44,9 +44,9 @@ void invalidate_kernel_vmap_range(void *addr, int size); #define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&(mapping)->i_pages) #else -void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, +void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len); -#define flush_icache_user_range flush_icache_user_range +#define flush_icache_user_page flush_icache_user_page #include #endif diff --git a/arch/nds32/mm/cacheflush.c b/arch/nds32/mm/cacheflush.c index 8f168b33065f..6eb98a7ad27d 100644 --- a/arch/nds32/mm/cacheflush.c +++ b/arch/nds32/mm/cacheflush.c @@ -36,7 +36,7 @@ void flush_icache_page(struct vm_area_struct *vma, struct page *page) local_irq_restore(flags); } -void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, +void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len) { unsigned long kaddr; diff --git a/arch/openrisc/include/asm/cacheflush.h b/arch/openrisc/include/asm/cacheflush.h index 74d1fce4e883..eeac40d4a854 100644 --- a/arch/openrisc/include/asm/cacheflush.h +++ b/arch/openrisc/include/asm/cacheflush.h @@ -62,7 +62,7 @@ static inline void flush_dcache_page(struct page *page) clear_bit(PG_dc_clean, &page->flags); } -#define flush_icache_user_range(vma, page, addr, len) \ +#define flush_icache_user_page(vma, page, addr, len) \ do { \ if (vma->vm_flags & VM_EXEC) \ sync_icache_dcache(page); \ diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h index e682c8e10e90..de600b915a3c 100644 --- a/arch/powerpc/include/asm/cacheflush.h +++ b/arch/powerpc/include/asm/cacheflush.h @@ -28,9 +28,9 @@ extern void flush_dcache_page(struct page *page); void flush_icache_range(unsigned long start, unsigned long stop); #define flush_icache_range flush_icache_range -void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, +void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len); -#define flush_icache_user_range flush_icache_user_range +#define flush_icache_user_page flush_icache_user_page void flush_dcache_icache_page(struct page *page); void __flush_dcache_icache(void *page); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 3f1a70c6781b..e2d6a6236aa7 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -577,7 +577,7 @@ void copy_user_page(void *vto, void *vfrom, unsigned long vaddr, flush_dcache_page(pg); } -void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, +void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len) { unsigned long maddr; diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h index a167b4fbdf00..23ff70350992 100644 --- a/arch/riscv/include/asm/cacheflush.h +++ b/arch/riscv/include/asm/cacheflush.h @@ -27,7 +27,8 @@ static inline void flush_dcache_page(struct page *page) * so instead we just flush the whole thing. */ #define flush_icache_range(start, end) flush_icache_all() -#define flush_icache_user_range(vma, pg, addr, len) flush_icache_mm(vma->vm_mm, 0) +#define flush_icache_user_page(vma, pg, addr, len) \ + flush_icache_mm(vma->vm_mm, 0) #ifndef CONFIG_SMP diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h index bbbb4d4ef651..2c9686fefb71 100644 --- a/include/asm-generic/cacheflush.h +++ b/include/asm-generic/cacheflush.h @@ -73,8 +73,8 @@ static inline void flush_icache_page(struct vm_area_struct *vma, } #endif -#ifndef flush_icache_user_range -static inline void flush_icache_user_range(struct vm_area_struct *vma, +#ifndef flush_icache_user_page +static inline void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len) { @@ -97,7 +97,7 @@ static inline void flush_cache_vunmap(unsigned long start, unsigned long end) #define copy_to_user_page(vma, page, vaddr, dst, src, len) \ do { \ memcpy(dst, src, len); \ - flush_icache_user_range(vma, page, vaddr, len); \ + flush_icache_user_page(vma, page, vaddr, len); \ } while (0) #endif diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index eddc8db96027..e51ec844c87c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1668,7 +1668,7 @@ void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, copy_to_page(page, vaddr, src, len); /* - * We probably need flush_icache_user_range() but it needs vma. + * We probably need flush_icache_user_page() but it needs vma. * This should work on most of architectures by default. If * architecture needs to do something different it can define * its own version of the function. From 1268c33382facefd9c389d4c425e8f906f279c7b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:26 -0700 Subject: [PATCH 42/52] asm-generic: add a flush_icache_user_range stub Define flush_icache_user_range to flush_icache_range unless the architecture provides its own implementation. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Arnd Bergmann Link: http://lkml.kernel.org/r/20200515143646.3857579-21-hch@lst.de Signed-off-by: Linus Torvalds --- include/asm-generic/cacheflush.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h index 2c9686fefb71..907fa5d16494 100644 --- a/include/asm-generic/cacheflush.h +++ b/include/asm-generic/cacheflush.h @@ -66,6 +66,10 @@ static inline void flush_icache_range(unsigned long start, unsigned long end) } #endif +#ifndef flush_icache_user_range +#define flush_icache_user_range flush_icache_range +#endif + #ifndef flush_icache_page static inline void flush_icache_page(struct vm_area_struct *vma, struct page *page) From 952ec41c44a4e9802f2c991912c117d2347832af Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:28 -0700 Subject: [PATCH 43/52] sh: implement flush_icache_user_range The SuperH implementation of flush_icache_range seems to be able to cope with user addresses. Just define flush_icache_user_range to flush_icache_range. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Yoshinori Sato Cc: Rich Felker Link: http://lkml.kernel.org/r/20200515143646.3857579-22-hch@lst.de Signed-off-by: Linus Torvalds --- arch/sh/include/asm/cacheflush.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/sh/include/asm/cacheflush.h b/arch/sh/include/asm/cacheflush.h index b932e42ef028..fe7400079b97 100644 --- a/arch/sh/include/asm/cacheflush.h +++ b/arch/sh/include/asm/cacheflush.h @@ -46,6 +46,7 @@ extern void flush_cache_range(struct vm_area_struct *vma, #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *page); extern void flush_icache_range(unsigned long start, unsigned long end); +#define flush_icache_user_range flush_icache_range extern void flush_icache_page(struct vm_area_struct *vma, struct page *page); extern void flush_cache_sigtramp(unsigned long address); From 70cd3444c17d80d227c9c6ff2287f10b7114e479 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:32 -0700 Subject: [PATCH 44/52] xtensa: implement flush_icache_user_range The Xtensa implementation of flush_icache_range seems to be able to cope with user addresses. Just define flush_icache_user_range to flush_icache_range. [jcmvbkbc@gmail.com: fix flush_icache_user_range in noMMU configs] Link: http://lkml.kernel.org/r/20200525221556.4270-1-jcmvbkbc@gmail.com Signed-off-by: Christoph Hellwig Signed-off-by: Max Filippov Signed-off-by: Andrew Morton Cc: Chris Zankel Cc: Max Filippov Link: http://lkml.kernel.org/r/20200515143646.3857579-23-hch@lst.de Signed-off-by: Linus Torvalds --- arch/xtensa/include/asm/cacheflush.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/xtensa/include/asm/cacheflush.h b/arch/xtensa/include/asm/cacheflush.h index a0d50be5a8cb..cf907e5bf2f2 100644 --- a/arch/xtensa/include/asm/cacheflush.h +++ b/arch/xtensa/include/asm/cacheflush.h @@ -145,6 +145,8 @@ void local_flush_cache_page(struct vm_area_struct *vma, #endif +#define flush_icache_user_range flush_icache_range + /* Ensure consistency between data and instruction cache. */ #define local_flush_icache_range(start, end) \ do { \ From fca7f8e6fdeb716420a348acf6ca9bc7e609f5c0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:34 -0700 Subject: [PATCH 45/52] arm: rename flush_cache_user_range to flush_icache_user_range flush_icache_user_range will be the name for a generic primitive. Move the arm name so that arm already has an implementation. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Russell King Link: http://lkml.kernel.org/r/20200515143646.3857579-24-hch@lst.de Signed-off-by: Linus Torvalds --- arch/arm/include/asm/cacheflush.h | 4 ++-- arch/arm/kernel/traps.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index c78e14fcfb5d..2e24e765e6d3 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -258,11 +258,11 @@ extern void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr #define flush_cache_dup_mm(mm) flush_cache_mm(mm) /* - * flush_cache_user_range is used when we want to ensure that the + * flush_icache_user_range is used when we want to ensure that the * Harvard caches are synchronised for the user space address range. * This is used for the ARM private sys_cacheflush system call. */ -#define flush_cache_user_range(s,e) __cpuc_coherent_user_range(s,e) +#define flush_icache_user_range(s,e) __cpuc_coherent_user_range(s,e) /* * Perform necessary cache operations to ensure that data previously diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 1e70e7227f0f..316a7687f813 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -566,7 +566,7 @@ __do_cache_op(unsigned long start, unsigned long end) if (fatal_signal_pending(current)) return 0; - ret = flush_cache_user_range(start, start + chunk); + ret = flush_icache_user_range(start, start + chunk); if (ret) return ret; From a1e81f9654eef650d3ee35c94a8cab00b5cd379c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:37 -0700 Subject: [PATCH 46/52] m68k: implement flush_icache_user_range Rename the current flush_icache_range to flush_icache_user_range as per commit ae92ef8a4424 ("PATCH] flush icache in correct context") there seems to be an assumption that it operates on user addresses. Add a flush_icache_range around it that for now is a no-op. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Geert Uytterhoeven Cc: Geert Uytterhoeven Link: http://lkml.kernel.org/r/20200515143646.3857579-25-hch@lst.de Signed-off-by: Linus Torvalds --- arch/m68k/include/asm/cacheflush_mm.h | 2 ++ arch/m68k/mm/cache.c | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/m68k/include/asm/cacheflush_mm.h b/arch/m68k/include/asm/cacheflush_mm.h index 95376bf84faa..1ac55e7b47f0 100644 --- a/arch/m68k/include/asm/cacheflush_mm.h +++ b/arch/m68k/include/asm/cacheflush_mm.h @@ -257,6 +257,8 @@ static inline void __flush_page_to_ram(void *vaddr) extern void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len); extern void flush_icache_range(unsigned long address, unsigned long endaddr); +extern void flush_icache_user_range(unsigned long address, + unsigned long endaddr); static inline void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr, diff --git a/arch/m68k/mm/cache.c b/arch/m68k/mm/cache.c index 99057cd5ff7f..7915be3a0971 100644 --- a/arch/m68k/mm/cache.c +++ b/arch/m68k/mm/cache.c @@ -73,7 +73,7 @@ static unsigned long virt_to_phys_slow(unsigned long vaddr) /* Push n pages at kernel virtual address and clear the icache */ /* RZ: use cpush %bc instead of cpush %dc, cinv %ic */ -void flush_icache_range(unsigned long address, unsigned long endaddr) +void flush_icache_user_range(unsigned long address, unsigned long endaddr) { if (CPU_IS_COLDFIRE) { unsigned long start, end; @@ -104,6 +104,11 @@ void flush_icache_range(unsigned long address, unsigned long endaddr) : "di" (FLUSH_I)); } } + +void flush_icache_range(unsigned long address, unsigned long endaddr) +{ + flush_icache_user_range(address, endaddr); +} EXPORT_SYMBOL(flush_icache_range); void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, From 48304f7994d74ef8fddde3ff27ce41aa61453d39 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:40 -0700 Subject: [PATCH 47/52] exec: only build read_code when needed Only build read_code when binary formats that use it are built into the kernel. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Alexander Viro Link: http://lkml.kernel.org/r/20200515143646.3857579-26-hch@lst.de Signed-off-by: Linus Torvalds --- fs/exec.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/exec.c b/fs/exec.c index 93ff1c4c7ebb..88260ec0ac34 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1053,6 +1053,8 @@ out: } EXPORT_SYMBOL_GPL(kernel_read_file_from_fd); +#if defined(CONFIG_HAVE_AOUT) || defined(CONFIG_BINFMT_FLAT) || \ + defined(CONFIG_BINFMT_ELF_FDPIC) ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) { ssize_t res = vfs_read(file, (void __user *)addr, len, &pos); @@ -1061,6 +1063,7 @@ ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) return res; } EXPORT_SYMBOL(read_code); +#endif /* * Maps the mm_struct mm into the current task struct. From bce2b68b891af3dd02fdc1603786c1c28db8bfad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:43 -0700 Subject: [PATCH 48/52] exec: use flush_icache_user_range in read_code read_code operates on user addresses. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Alexander Viro Link: http://lkml.kernel.org/r/20200515143646.3857579-27-hch@lst.de Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/exec.c b/fs/exec.c index 88260ec0ac34..02d0c5d19be5 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1059,7 +1059,7 @@ ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) { ssize_t res = vfs_read(file, (void __user *)addr, len, &pos); if (res > 0) - flush_icache_range(addr, addr + len); + flush_icache_user_range(addr, addr + len); return res; } EXPORT_SYMBOL(read_code); From 79ef1e1fffebcfcb2c93463ca8d0f4a03eceb8f1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:46 -0700 Subject: [PATCH 49/52] binfmt_flat: use flush_icache_user_range load_flat_file works on user addresses. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Greg Ungerer Cc: Alexander Viro Link: http://lkml.kernel.org/r/20200515143646.3857579-28-hch@lst.de Signed-off-by: Linus Torvalds --- fs/binfmt_flat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 9b82bc111d0a..87ce229c63cf 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -854,7 +854,7 @@ static int load_flat_file(struct linux_binprm *bprm, #endif /* CONFIG_BINFMT_FLAT_OLD */ } - flush_icache_range(start_code, end_code); + flush_icache_user_range(start_code, end_code); /* zero the BSS, BRK and stack areas */ if (clear_user((void __user *)(datapos + data_len), bss_len + From a75a2df68f87c4924c614204a85b29d82d5c7dc4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:49 -0700 Subject: [PATCH 50/52] nommu: use flush_icache_user_range in brk and mmap These obviously operate on user addresses. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Greg Ungerer Cc: Geert Uytterhoeven Link: http://lkml.kernel.org/r/20200515143646.3857579-29-hch@lst.de Signed-off-by: Linus Torvalds --- mm/nommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/nommu.c b/mm/nommu.c index dfae55f41901..062dc1c90d21 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -433,7 +433,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* * Ok, looks good - let it rip. */ - flush_icache_range(mm->brk, brk); + flush_icache_user_range(mm->brk, brk); return mm->brk = brk; } @@ -1277,7 +1277,7 @@ share: /* we flush the region from the icache only when the first executable * mapping of it is made */ if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) { - flush_icache_range(region->vm_start, region->vm_end); + flush_icache_user_range(region->vm_start, region->vm_end); region->vm_icache_flushed = true; } From 490741ab1beb9c9f6580e30c7cd91b17a7560369 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:42:52 -0700 Subject: [PATCH 51/52] module: move the set_fs hack for flush_icache_range to m68k flush_icache_range generally operates on kernel addresses, but for some reason m68k needed a set_fs override. Move that into the m68k code insted of keeping it in the module loader. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Reviewed-by: Geert Uytterhoeven Acked-by: Geert Uytterhoeven Acked-by: Jessica Yu Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Martin KaFai Lau Cc: Song Liu Cc: Yonghong Song Link: http://lkml.kernel.org/r/20200515143646.3857579-30-hch@lst.de Signed-off-by: Linus Torvalds --- arch/m68k/mm/cache.c | 4 ++++ kernel/module.c | 8 -------- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/m68k/mm/cache.c b/arch/m68k/mm/cache.c index 7915be3a0971..5ecb3310e874 100644 --- a/arch/m68k/mm/cache.c +++ b/arch/m68k/mm/cache.c @@ -107,7 +107,11 @@ void flush_icache_user_range(unsigned long address, unsigned long endaddr) void flush_icache_range(unsigned long address, unsigned long endaddr) { + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); flush_icache_user_range(address, endaddr); + set_fs(old_fs); } EXPORT_SYMBOL(flush_icache_range); diff --git a/kernel/module.c b/kernel/module.c index ef400c389f49..e8a198588f26 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3344,12 +3344,6 @@ static int check_module_license_and_versions(struct module *mod) static void flush_module_icache(const struct module *mod) { - mm_segment_t old_fs; - - /* flush the icache in correct context */ - old_fs = get_fs(); - set_fs(KERNEL_DS); - /* * Flush the instruction cache, since we've played with text. * Do it before processing of module parameters, so the module @@ -3361,8 +3355,6 @@ static void flush_module_icache(const struct module *mod) + mod->init_layout.size); flush_icache_range((unsigned long)mod->core_layout.base, (unsigned long)mod->core_layout.base + mod->core_layout.size); - - set_fs(old_fs); } int __weak module_frob_arch_sections(Elf_Ehdr *hdr, From db33ec371be8e45956e8cebb5b0fe641f008430b Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Sun, 7 Jun 2020 21:42:55 -0700 Subject: [PATCH 52/52] doc: cgroup: update note about conditions when oom killer is invoked Starting from v4.19 commit 29ef680ae7c2 ("memcg, oom: move out_of_memory back to the charge path") cgroup oom killer is no longer invoked only from page faults. Now it implements the same semantics as global OOM killer: allocation context invokes OOM killer and keeps retrying until success. [akpm@linux-foundation.org: fixes per Randy] Signed-off-by: Konstantin Khlebnikov Signed-off-by: Andrew Morton Acked-by: Michal Hocko Cc: Roman Gushchin Cc: Randy Dunlap Link: http://lkml.kernel.org/r/158894738928.208854.5244393925922074518.stgit@buzz Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index c2a4b652bd1a..ce3e05e41724 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1170,6 +1170,13 @@ PAGE_SIZE multiple when read back. Under certain circumstances, the usage may go over the limit temporarily. + In default configuration regular 0-order allocations always + succeed unless OOM killer chooses current task as a victim. + + Some kinds of allocations don't invoke the OOM killer. + Caller could retry them differently, return into userspace + as -ENOMEM or silently ignore in cases like disk readahead. + This is the ultimate protection mechanism. As long as the high limit is used and monitored properly, this limit's utility is limited to providing the final safety net. @@ -1226,17 +1233,9 @@ PAGE_SIZE multiple when read back. The number of time the cgroup's memory usage was reached the limit and allocation was about to fail. - Depending on context result could be invocation of OOM - killer and retrying allocation or failing allocation. - - Failed allocation in its turn could be returned into - userspace as -ENOMEM or silently ignored in cases like - disk readahead. For now OOM in memory cgroup kills - tasks iff shortage has happened inside page fault. - This event is not raised if the OOM killer is not considered as an option, e.g. for failed high-order - allocations. + allocations or if caller asked to not retry attempts. oom_kill The number of processes belonging to this cgroup