From c0b942a76361e08fc9fb17989e0f266e64ff0688 Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Mon, 12 Dec 2016 16:40:39 -0800 Subject: [PATCH 001/123] kthread: add __printf attributes When commit fbae2d44aa1d ("kthread: add kthread_create_worker*()") introduced some kthread_create_...() functions which were taking printf-like parametter, it introduced __printf attributes to some functions (e.g. kthread_create_worker()). Nevertheless some new functions were forgotten (they have been detected thanks to -Wmissing-format-attribute warning flag). Add the missing __printf attributes to the newly-introduced functions in order to detect formatting issues at build-time with -Wformat flag. Link: http://lkml.kernel.org/r/20161126193543.22672-1-nicolas.iooss_linux@m4x.org Signed-off-by: Nicolas Iooss Reviewed-by: Petr Mladek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kthread.h | 2 +- kernel/kthread.c | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/linux/kthread.h b/include/linux/kthread.h index c1c3e63d52c1..4fec8b775895 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -175,7 +175,7 @@ __printf(2, 3) struct kthread_worker * kthread_create_worker(unsigned int flags, const char namefmt[], ...); -struct kthread_worker * +__printf(3, 4) struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, const char namefmt[], ...); diff --git a/kernel/kthread.c b/kernel/kthread.c index 956495f0efaf..2318fba86277 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -261,7 +261,8 @@ static void create_kthread(struct kthread_create_info *create) } } -static struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), +static __printf(4, 0) +struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], va_list args) @@ -635,7 +636,7 @@ repeat: } EXPORT_SYMBOL_GPL(kthread_worker_fn); -static struct kthread_worker * +static __printf(3, 0) struct kthread_worker * __kthread_create_worker(int cpu, unsigned int flags, const char namefmt[], va_list args) { From 3fb4afd9a504c2386b8435028d43283216bf588e Mon Sep 17 00:00:00 2001 From: Stanislav Kinsburskiy Date: Mon, 12 Dec 2016 16:40:42 -0800 Subject: [PATCH 002/123] prctl: remove one-shot limitation for changing exe link This limitation came with the reason to remove "another way for malicious code to obscure a compromised program and masquerade as a benign process" by allowing "security-concious program can use this prctl once during its early initialization to ensure the prctl cannot later be abused for this purpose": http://marc.info/?l=linux-kernel&m=133160684517468&w=2 This explanation doesn't look sufficient. The only thing "exe" link is indicating is the file, used to execve, which is basically nothing and not reliable immediately after process has returned from execve system call. Moreover, to use this feture, all the mappings to previous exe file have to be unmapped and all the new exe file permissions must be satisfied. Which means, that changing exe link is very similar to calling execve on the binary. The need to remove this limitations comes from migration of NFS mount point, which is not accessible during restore and replaced by other file system. Because of this exe link has to be changed twice. [akpm@linux-foundation.org: fix up comment] Link: http://lkml.kernel.org/r/20160927153755.9337.69650.stgit@localhost.localdomain Signed-off-by: Stanislav Kinsburskiy Acked-by: Oleg Nesterov Acked-by: Cyrill Gorcunov Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Michal Hocko Cc: Kees Cook Cc: Andy Lutomirski Cc: John Stultz Cc: Matt Helsley Cc: Pavel Emelyanov Cc: Vlastimil Babka Cc: Eric W. Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 6 +++++- kernel/sys.c | 10 ---------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 7551d3e2ab70..0e90f2973719 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -540,7 +540,11 @@ static inline int get_dumpable(struct mm_struct *mm) /* leave room for more dump flags */ #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ #define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ -#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ +/* + * This one-shot flag is dropped due to necessity of changing exe once again + * on NFS restore + */ +//#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ #define MMF_HAS_UPROBES 19 /* has uprobes */ #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ diff --git a/kernel/sys.c b/kernel/sys.c index 89d5be418157..fd6f50809b6e 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1696,16 +1696,6 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) fput(exe_file); } - /* - * The symlink can be changed only once, just to disallow arbitrary - * transitions malicious software might bring in. This means one - * could make a snapshot over all processes running and monitor - * /proc/pid/exe changes to notice unusual activity if needed. - */ - err = -EPERM; - if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) - goto exit; - err = 0; /* set the new file, lockless */ get_file(exe.file); From 3af06fd96aae18561830745880ca6e289053edae Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 12 Dec 2016 16:40:45 -0800 Subject: [PATCH 003/123] scripts/bloat-o-meter: don't use readlines() readlines() conses whole list before doing anything which is slower for big object files. Use per line iterator. Speed up is ~2% on "allyesconfig" type of kernel. $ perf stat -r 16 taskset -c 15 ./scripts/bloat-o-meter ../vmlinux-000 ../obj/vmlinux >/dev/null ... Before: 7.247708646 seconds time elapsed ( +- 0.28% ) After: 7.091202853 seconds time elapsed ( +- 0.15% ) Link: http://lkml.kernel.org/r/20161119004143.GA1200@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/bloat-o-meter | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/scripts/bloat-o-meter b/scripts/bloat-o-meter index d9ff038c1b28..378dfd7ff81d 100755 --- a/scripts/bloat-o-meter +++ b/scripts/bloat-o-meter @@ -18,17 +18,18 @@ if len(sys.argv) != 3: def getsizes(file): sym = {} - for l in os.popen("nm --size-sort " + file).readlines(): - size, type, name = l[:-1].split() - if type in "tTdDbBrR": - # strip generated symbols - if name.startswith("__mod_"): continue - if name.startswith("SyS_"): continue - if name.startswith("compat_SyS_"): continue - if name == "linux_banner": continue - # statics and some other optimizations adds random .NUMBER - name = re.sub(r'\.[0-9]+', '', name) - sym[name] = sym.get(name, 0) + int(size, 16) + with os.popen("nm --size-sort " + file) as f: + for line in f: + size, type, name = line.split() + if type in "tTdDbBrR": + # strip generated symbols + if name.startswith("__mod_"): continue + if name.startswith("SyS_"): continue + if name.startswith("compat_SyS_"): continue + if name == "linux_banner": continue + # statics and some other optimizations adds random .NUMBER + name = re.sub(r'\.[0-9]+', '', name) + sym[name] = sym.get(name, 0) + int(size, 16) return sym old = getsizes(sys.argv[1]) From 0d7bbb43641c35390378e951785f2351bc36650a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 12 Dec 2016 16:40:48 -0800 Subject: [PATCH 004/123] scripts/bloat-o-meter: compile .NUMBER regex Every often used regex is better be compiled in Python. Speedup is about ~9.8% (whee!) $ perf stat -r 16 taskset -c 15 ./scripts/bloat-o-meter ../vmlinux-000 ../obj/vmlinux >/dev/null 7.091202853 seconds time elapsed ( +- 0.15% ) +re.compile 6.397564973 seconds time elapsed ( +- 0.34% ) Link: http://lkml.kernel.org/r/20161119004417.GB1200@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/bloat-o-meter | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/bloat-o-meter b/scripts/bloat-o-meter index 378dfd7ff81d..a27677146410 100755 --- a/scripts/bloat-o-meter +++ b/scripts/bloat-o-meter @@ -16,6 +16,8 @@ if len(sys.argv) != 3: sys.stderr.write("usage: %s file1 file2\n" % sys.argv[0]) sys.exit(-1) +re_NUMBER = re.compile(r'\.[0-9]+') + def getsizes(file): sym = {} with os.popen("nm --size-sort " + file) as f: @@ -28,7 +30,7 @@ def getsizes(file): if name.startswith("compat_SyS_"): continue if name == "linux_banner": continue # statics and some other optimizations adds random .NUMBER - name = re.sub(r'\.[0-9]+', '', name) + name = re_NUMBER.sub('', name) sym[name] = sym.get(name, 0) + int(size, 16) return sym From 779d5eb375f8f3aa9b83d24cc8e3e56fe5dc9864 Mon Sep 17 00:00:00 2001 From: Sam Protsenko Date: Mon, 12 Dec 2016 16:40:51 -0800 Subject: [PATCH 005/123] scripts/tags.sh: handle OMAP platforms properly When SUBARCH is "omap1" or "omap2", plat-omap/ directory must be indexed. Handle this special case properly. While at it, check if mach- directory exists at all. Link: http://lkml.kernel.org/r/20161202122148.15001-1-joe.skb7@gmail.com Signed-off-by: Sam Protsenko Cc: Michal Marek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/tags.sh | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/scripts/tags.sh b/scripts/tags.sh index a2ff3388e5ea..df5fa777d300 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh @@ -304,11 +304,26 @@ if [ "${ARCH}" = "um" ]; then elif [ "${SRCARCH}" = "arm" -a "${SUBARCH}" != "" ]; then subarchdir=$(find ${tree}arch/$SRCARCH/ -name "mach-*" -type d -o \ -name "plat-*" -type d); + mach_suffix=$SUBARCH + plat_suffix=$SUBARCH + + # Special cases when $plat_suffix != $mach_suffix + case $mach_suffix in + "omap1" | "omap2") + plat_suffix="omap" + ;; + esac + + if [ ! -d ${tree}arch/$SRCARCH/mach-$mach_suffix ]; then + echo "Warning: arch/arm/mach-$mach_suffix/ not found." >&2 + echo " Fix your \$SUBARCH appropriately" >&2 + fi + for i in $subarchdir; do case "$i" in - *"mach-"${SUBARCH}) + *"mach-"${mach_suffix}) ;; - *"plat-"${SUBARCH}) + *"plat-"${plat_suffix}) ;; *) subarchprune="$subarchprune \ From eb17726b00b327b3c0544f6970738204f09676a4 Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Mon, 12 Dec 2016 16:40:54 -0800 Subject: [PATCH 006/123] m32r: add simple dma Some builds of m32r were failing as it tried to build few drivers which needed dma but m32r is not having dma support. Objections were raised when it was tried to make those drivers depend on HAS_DMA. So the next best thing is to add dma support to m32r. dma_noop is a very simple dma with 1:1 memory mapping. Link: http://lkml.kernel.org/r/1475949198-31623-1-git-send-email-sudipm.mukherjee@gmail.com Signed-off-by: Sudip Mukherjee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/m32r/Kconfig | 2 +- arch/m32r/include/asm/device.h | 6 +++++- arch/m32r/include/asm/dma-mapping.h | 32 +++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 2 deletions(-) create mode 100644 arch/m32r/include/asm/dma-mapping.h diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index 3cc8498fe0fe..d227a6988d6b 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -34,7 +34,7 @@ config NO_IOPORT_MAP def_bool y config NO_DMA - def_bool y + def_bool n config HZ int diff --git a/arch/m32r/include/asm/device.h b/arch/m32r/include/asm/device.h index d8f9872b0e2d..4a9f35e0973f 100644 --- a/arch/m32r/include/asm/device.h +++ b/arch/m32r/include/asm/device.h @@ -3,5 +3,9 @@ * * This file is released under the GPLv2 */ -#include +struct dev_archdata { + struct dma_map_ops *dma_ops; +}; +struct pdev_archdata { +}; diff --git a/arch/m32r/include/asm/dma-mapping.h b/arch/m32r/include/asm/dma-mapping.h new file mode 100644 index 000000000000..2c43a77fe942 --- /dev/null +++ b/arch/m32r/include/asm/dma-mapping.h @@ -0,0 +1,32 @@ +#ifndef _ASM_M32R_DMA_MAPPING_H +#define _ASM_M32R_DMA_MAPPING_H + +#include +#include +#include +#include +#include +#include + +#define DMA_ERROR_CODE (~(dma_addr_t)0x0) + +static inline struct dma_map_ops *get_dma_ops(struct device *dev) +{ + if (dev && dev->archdata.dma_ops) + return dev->archdata.dma_ops; + return &dma_noop_ops; +} + +static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, + enum dma_data_direction direction) +{ +} + +static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) +{ + if (!dev->dma_mask) + return false; + return addr + size - 1 <= *dev->dma_mask; +} + +#endif /* _ASM_M32R_DMA_MAPPING_H */ From 17e96230d95cc2b7eb7d039415d3506340257b5e Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Mon, 12 Dec 2016 16:40:57 -0800 Subject: [PATCH 007/123] m32r: fix build warning While building m32r defconfig we got warnings: arch/m32r/platforms/m32700ut/setup.c:249:24: warning: 'm32700ut_lcdpld_irq_type' defined but not used [-Wunused-variable] m32700ut_lcdpld_irq_type is only used when CONFIG_USB is enabled. Modify the code to declare the related variables and functions only when CONFIG_USB is enabled. Link: http://lkml.kernel.org/r/1479244406-7507-1-git-send-email-sudipm.mukherjee@gmail.com Signed-off-by: Sudip Mukherjee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/m32r/platforms/m32700ut/setup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/m32r/platforms/m32700ut/setup.c b/arch/m32r/platforms/m32700ut/setup.c index 9a4ba8a8589d..349eb341752c 100644 --- a/arch/m32r/platforms/m32700ut/setup.c +++ b/arch/m32r/platforms/m32700ut/setup.c @@ -201,6 +201,7 @@ static struct irq_chip m32700ut_lanpld_irq_type = #define lcdpldirq2port(x) (unsigned long)((int)M32700UT_LCD_ICUCR1 + \ (((x) - 1) * sizeof(unsigned short))) +#ifdef CONFIG_USB static pld_icu_data_t lcdpld_icu_data[M32700UT_NUM_LCD_PLD_IRQ]; static void disable_m32700ut_lcdpld_irq(unsigned int irq) @@ -253,6 +254,7 @@ static struct irq_chip m32700ut_lcdpld_irq_type = .irq_mask = mask_m32700ut_lcdpld, .irq_unmask = unmask_m32700ut_lcdpld, }; +#endif void __init init_IRQ(void) { From 4170a20f21e734b14317a65baaccc4078eef5198 Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Mon, 12 Dec 2016 16:40:59 -0800 Subject: [PATCH 008/123] drivers/pcmcia/m32r_pcc.c: check return from request_irq While building m32r allmodconfig we were getting warning: drivers/pcmcia/m32r_pcc.c:331:2: warning: ignoring return value of 'request_irq', declared with attribute warn_unused_result request_irq() can fail and we should always be checking the result from it. Check the result and return it to the caller. Link: http://lkml.kernel.org/r/1474237304-897-1-git-send-email-sudipm.mukherjee@gmail.com Signed-off-by: Sudip Mukherjee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/pcmcia/m32r_pcc.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/pcmcia/m32r_pcc.c b/drivers/pcmcia/m32r_pcc.c index eb126b98ed8a..fad4455665eb 100644 --- a/drivers/pcmcia/m32r_pcc.c +++ b/drivers/pcmcia/m32r_pcc.c @@ -296,10 +296,11 @@ static int __init is_alive(u_short sock) return 0; } -static void add_pcc_socket(ulong base, int irq, ulong mapaddr, - unsigned int ioaddr) +static int add_pcc_socket(ulong base, int irq, ulong mapaddr, + unsigned int ioaddr) { pcc_socket_t *t = &socket[pcc_sockets]; + int err; /* add sockets */ t->ioaddr = ioaddr; @@ -328,11 +329,16 @@ static void add_pcc_socket(ulong base, int irq, ulong mapaddr, t->socket.irq_mask = 0; t->socket.pci_irq = 2 + pcc_sockets; /* XXX */ - request_irq(irq, pcc_interrupt, 0, "m32r-pcc", pcc_interrupt); + err = request_irq(irq, pcc_interrupt, 0, "m32r-pcc", pcc_interrupt); + if (err) { + if (t->base > 0) + release_region(t->base, 0x20); + return err; + } pcc_sockets++; - return; + return 0; } From c795cf4f1865659b40a286a9997e64f4c15bbc9b Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Mon, 12 Dec 2016 16:41:02 -0800 Subject: [PATCH 009/123] drivers/pcmcia/m32r_pcc.c: use common error path Use a common error path for the failure. Link: http://lkml.kernel.org/r/1474237304-897-2-git-send-email-sudipm.mukherjee@gmail.com Signed-off-by: Sudip Mukherjee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/pcmcia/m32r_pcc.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/pcmcia/m32r_pcc.c b/drivers/pcmcia/m32r_pcc.c index fad4455665eb..56bf38804e93 100644 --- a/drivers/pcmcia/m32r_pcc.c +++ b/drivers/pcmcia/m32r_pcc.c @@ -689,10 +689,8 @@ static int __init init_m32r_pcc(void) return ret; ret = platform_device_register(&pcc_device); - if (ret){ - platform_driver_unregister(&pcc_driver); - return ret; - } + if (ret) + goto unreg_driv; printk(KERN_INFO "m32r PCC probe:\n"); @@ -706,9 +704,8 @@ static int __init init_m32r_pcc(void) if (pcc_sockets == 0) { printk("socket is not found.\n"); - platform_device_unregister(&pcc_device); - platform_driver_unregister(&pcc_driver); - return -ENODEV; + ret = -ENODEV; + goto unreg_dev; } /* Set up interrupt handler(s) */ @@ -734,6 +731,12 @@ static int __init init_m32r_pcc(void) } return 0; + +unreg_dev: + platform_device_unregister(&pcc_device); +unreg_driv: + platform_driver_unregister(&pcc_driver); + return ret; } /* init_m32r_pcc */ static void __exit exit_m32r_pcc(void) From 3da82065f1e59fd280742cd792cbf2fb2052958f Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Mon, 12 Dec 2016 16:41:05 -0800 Subject: [PATCH 010/123] drivers/pcmcia/m32r_pcc.c: check return from add_pcc_socket If request_irq() fails it passes the error to the caller. The caller now checks it and jumps to the common error path on failure. Link: http://lkml.kernel.org/r/1474237304-897-3-git-send-email-sudipm.mukherjee@gmail.com Signed-off-by: Sudip Mukherjee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/pcmcia/m32r_pcc.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/pcmcia/m32r_pcc.c b/drivers/pcmcia/m32r_pcc.c index 56bf38804e93..e50bbf826188 100644 --- a/drivers/pcmcia/m32r_pcc.c +++ b/drivers/pcmcia/m32r_pcc.c @@ -696,10 +696,16 @@ static int __init init_m32r_pcc(void) pcc_sockets = 0; - add_pcc_socket(M32R_PCC0_BASE, PCC0_IRQ, M32R_PCC0_MAPBASE, 0x1000); + ret = add_pcc_socket(M32R_PCC0_BASE, PCC0_IRQ, M32R_PCC0_MAPBASE, + 0x1000); + if (ret) + goto unreg_dev; #ifdef CONFIG_M32RPCC_SLOT2 - add_pcc_socket(M32R_PCC1_BASE, PCC1_IRQ, M32R_PCC1_MAPBASE, 0x2000); + ret = add_pcc_socket(M32R_PCC1_BASE, PCC1_IRQ, M32R_PCC1_MAPBASE, + 0x2000); + if (ret) + goto unreg_dev; #endif if (pcc_sockets == 0) { From 46832b2de5fa42519c4924a9d0751d9297012ca9 Mon Sep 17 00:00:00 2001 From: piaojun Date: Mon, 12 Dec 2016 16:41:08 -0800 Subject: [PATCH 011/123] ocfs2/dlm: clean up useless BUG_ON default case in dlm_finalize_reco_handler() The value of 'stage' must be between 1 and 2, so the switch can't reach the default case. Link: http://lkml.kernel.org/r/57FB5EB2.7050002@huawei.com Signed-off-by: Jun Piao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmrecovery.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index dd5cb8bcefd1..74407c6dd592 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2966,8 +2966,6 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data, spin_unlock(&dlm->spinlock); dlm_kick_recovery_thread(dlm); break; - default: - BUG(); } mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n", From aa7b58597f48b93e9d08b4bac90e41690f586e46 Mon Sep 17 00:00:00 2001 From: Guozhonghua Date: Mon, 12 Dec 2016 16:41:11 -0800 Subject: [PATCH 012/123] ocfs2: delete redundant code and set the node bit into maybe_map directly The variable `set_maybe' is redundant when the mle has been found in the map. So it is ok to set the node_idx into mle's maybe_map directly. Link: http://lkml.kernel.org/r/71604351584F6A4EBAE558C676F37CA4A3D490DD@H3CMLB12-EX.srv.huawei-3com.com Signed-off-by: Guozhonghua Reviewed-by: Mark Fasheh Reviewed-by: Joseph Qi Cc: Joel Becker Cc: Junxiao Bi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmmaster.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 3f828a187049..0487dbcd17e0 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -1609,8 +1609,6 @@ way_up_top: __dlm_insert_mle(dlm, mle); response = DLM_MASTER_RESP_NO; } else { - // mlog(0, "mle was found\n"); - set_maybe = 1; spin_lock(&tmpmle->spinlock); if (tmpmle->master == dlm->node_num) { mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n"); @@ -1625,8 +1623,7 @@ way_up_top: response = DLM_MASTER_RESP_NO; } else response = DLM_MASTER_RESP_MAYBE; - if (set_maybe) - set_bit(request->node_idx, tmpmle->maybe_map); + set_bit(request->node_idx, tmpmle->maybe_map); spin_unlock(&tmpmle->spinlock); } spin_unlock(&dlm->master_lock); From 28bb5ef485d3e96da056124e9b60df4486d38265 Mon Sep 17 00:00:00 2001 From: piaojun Date: Mon, 12 Dec 2016 16:41:14 -0800 Subject: [PATCH 013/123] ocfs2/dlm: clean up deadcode in dlm_master_request_handler() When 'dispatch_assert' is set, 'response' must be DLM_MASTER_RESP_YES, and 'res' won't be null, so execution can't reach these two branch. Link: http://lkml.kernel.org/r/58174C91.3040004@huawei.com Signed-off-by: Jun Piao Reviewed-by: Joseph Qi Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmmaster.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 0487dbcd17e0..a464c8088170 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -1641,12 +1641,6 @@ send_response: * dlm_assert_master_worker() isn't called, we drop it here. */ if (dispatch_assert) { - if (response != DLM_MASTER_RESP_YES) - mlog(ML_ERROR, "invalid response %d\n", response); - if (!res) { - mlog(ML_ERROR, "bad lockres while trying to assert!\n"); - BUG(); - } mlog(0, "%u is the owner of %.*s, cleaning everyone else\n", dlm->node_num, res->lockname.len, res->lockname.name); spin_lock(&res->spinlock); From 07f38d971cd92d06adeaa50240f0235a2479d543 Mon Sep 17 00:00:00 2001 From: piaojun Date: Mon, 12 Dec 2016 16:41:17 -0800 Subject: [PATCH 014/123] ocfs2: clean up unused 'page' parameter in ocfs2_write_end_nolock() 'page' parameter in ocfs2_write_end_nolock() is never used. Link: http://lkml.kernel.org/r/582FD91A.5000902@huawei.com Signed-off-by: Jun Piao Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 7 +++---- fs/ocfs2/aops.h | 3 +-- fs/ocfs2/mmap.c | 3 +-- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index c5c5b9748ea3..9a88984f9f6f 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1950,8 +1950,7 @@ static void ocfs2_write_end_inline(struct inode *inode, loff_t pos, } int ocfs2_write_end_nolock(struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + loff_t pos, unsigned len, unsigned copied, void *fsdata) { int i, ret; unsigned from, to, start = pos & (PAGE_SIZE - 1); @@ -2064,7 +2063,7 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping, int ret; struct inode *inode = mapping->host; - ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata); + ret = ocfs2_write_end_nolock(mapping, pos, len, copied, fsdata); up_write(&OCFS2_I(inode)->ip_alloc_sem); ocfs2_inode_unlock(inode, 1); @@ -2241,7 +2240,7 @@ static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock, dwc->dw_zero_count++; } - ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, NULL, wc); + ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc); BUG_ON(ret != len); ret = 0; unlock: diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index b1c9f28a57b1..8614ff069d99 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -44,8 +44,7 @@ int walk_page_buffers( handle_t *handle, struct buffer_head *bh)); int ocfs2_write_end_nolock(struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata); + loff_t pos, unsigned len, unsigned copied, void *fsdata); typedef enum { OCFS2_WRITE_BUFFER = 0, diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 71545ad4628c..429088786e93 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -120,8 +120,7 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, ret = VM_FAULT_NOPAGE; goto out; } - ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, - fsdata); + ret = ocfs2_write_end_nolock(mapping, pos, len, len, fsdata); BUG_ON(ret != len); ret = VM_FAULT_LOCKED; out: From 4131d53810681e4f0a2ff00f5c137478a7f0ef69 Mon Sep 17 00:00:00 2001 From: Ashish Samant Date: Mon, 12 Dec 2016 16:41:20 -0800 Subject: [PATCH 015/123] ocfs2: fix double put of recount tree in ocfs2_lock_refcount_tree() In ocfs2_lock_refcount_tree, if ocfs2_read_refcount_block() returns an error, we do ocfs2_refcount_tree_put twice (once in ocfs2_unlock_refcount_tree and once outside it), thereby reducing the refcount of the refcount tree twice, but we dont delete the tree in this case. This will make refcnt of the tree = 0 and the ocfs2_refcount_tree_put will eventually call ocfs2_mark_lockres_freeing, setting OCFS2_LOCK_FREEING for the refcount_tree->rf_lockres. The error returned by ocfs2_read_refcount_block is propagated all the way back and for next iteration of write, ocfs2_lock_refcount_tree gets the same tree back from ocfs2_get_refcount_tree because we havent deleted the tree. Now we have the same tree, but OCFS2_LOCK_FREEING is set for rf_lockres and eventually, when _ocfs2_lock_refcount_tree is called in this iteration, BUG_ON( __ocfs2_cluster_lock:1395 ERROR: Cluster lock called on freeing lockres T00000000000000000386019775b08d! flags 0x81) is triggerred. Call stack: (loop16,11155,0):ocfs2_lock_refcount_tree:482 ERROR: status = -5 (loop16,11155,0):ocfs2_refcount_cow_hunk:3497 ERROR: status = -5 (loop16,11155,0):ocfs2_refcount_cow:3560 ERROR: status = -5 (loop16,11155,0):ocfs2_prepare_inode_for_refcount:2111 ERROR: status = -5 (loop16,11155,0):ocfs2_prepare_inode_for_write:2190 ERROR: status = -5 (loop16,11155,0):ocfs2_file_write_iter:2331 ERROR: status = -5 (loop16,11155,0):__ocfs2_cluster_lock:1395 ERROR: bug expression: lockres->l_flags & OCFS2_LOCK_FREEING (loop16,11155,0):__ocfs2_cluster_lock:1395 ERROR: Cluster lock called on freeing lockres T00000000000000000386019775b08d! flags 0x81 kernel BUG at fs/ocfs2/dlmglue.c:1395! invalid opcode: 0000 [#1] SMP CPU 0 Modules linked in: tun ocfs2 jbd2 xen_blkback xen_netback xen_gntdev .. sd_mod crc_t10dif ext3 jbd mbcache RIP: __ocfs2_cluster_lock+0x31c/0x740 [ocfs2] RSP: e02b:ffff88017c0138a0 EFLAGS: 00010086 Process loop16 (pid: 11155, threadinfo ffff88017c010000, task ffff8801b5374300) Call Trace: ocfs2_refcount_lock+0xae/0x130 [ocfs2] __ocfs2_lock_refcount_tree+0x29/0xe0 [ocfs2] ocfs2_lock_refcount_tree+0xdd/0x320 [ocfs2] ocfs2_refcount_cow_hunk+0x1cb/0x440 [ocfs2] ocfs2_refcount_cow+0xa9/0x1d0 [ocfs2] ocfs2_prepare_inode_for_refcount+0x115/0x200 [ocfs2] ocfs2_prepare_inode_for_write+0x33b/0x470 [ocfs2] ocfs2_file_write_iter+0x220/0x8c0 [ocfs2] aio_write_iter+0x2e/0x30 Fix this by avoiding the second call to ocfs2_refcount_tree_put() Link: http://lkml.kernel.org/r/1473984404-32011-1-git-send-email-ashish.samant@oracle.com Signed-off-by: Ashish Samant Reviewed-by: Eric Ren Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/refcounttree.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 19238512a324..738b4ea8e990 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -478,7 +478,6 @@ again: if (ret) { mlog_errno(ret); ocfs2_unlock_refcount_tree(osb, tree, rw); - ocfs2_refcount_tree_put(tree); goto out; } From 395627b0718b6d4252c451c766cfc00ec155ddaf Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Mon, 12 Dec 2016 16:41:23 -0800 Subject: [PATCH 016/123] ocfs2: use time64_t to represent orphan scan times struct timespec is not y2038 safe. Use time64_t which is y2038 safe to represent orphan scan times. time64_t is sufficient here as only the seconds delta times are relevant. Also use appropriate time functions that return time in time64_t format. Time functions now return monotonic time instead of real time as only delta scan times are relevant and these values are not persistent across reboots. The format string for the debug print is still using long as this is only the time elapsed since the last scan and long is sufficient to represent this value. Link: http://lkml.kernel.org/r/1475365138-20567-1-git-send-email-deepa.kernel@gmail.com Signed-off-by: Deepa Dinamani Reviewed-by: Arnd Bergmann Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/journal.c | 4 ++-- fs/ocfs2/ocfs2.h | 2 +- fs/ocfs2/super.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index a244f14c6b87..d5e5fa7f0743 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1947,7 +1947,7 @@ static void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) */ seqno++; os->os_count++; - os->os_scantime = CURRENT_TIME; + os->os_scantime = ktime_get_seconds(); unlock: ocfs2_orphan_scan_unlock(osb, seqno); out: @@ -2004,7 +2004,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb) struct ocfs2_orphan_scan *os; os = &osb->osb_orphan_scan; - os->os_scantime = CURRENT_TIME; + os->os_scantime = ktime_get_seconds(); if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb)) atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); else { diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index e63af7ddfe68..7e5958b0be6b 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -224,7 +224,7 @@ struct ocfs2_orphan_scan { struct ocfs2_super *os_osb; struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */ struct delayed_work os_orphan_scan_work; - struct timespec os_scantime; /* time this node ran the scan */ + time64_t os_scantime; /* time this node ran the scan */ u32 os_count; /* tracks node specific scans */ u32 os_seqno; /* tracks cluster wide scans */ atomic_t os_state; /* ACTIVE or INACTIVE */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index f56fe39fab04..c894d945b084 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -337,7 +337,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) out += snprintf(buf + out, len - out, "Disabled\n"); else out += snprintf(buf + out, len - out, "%lu seconds ago\n", - (get_seconds() - os->os_scantime.tv_sec)); + (unsigned long)(ktime_get_seconds() - os->os_scantime)); out += snprintf(buf + out, len - out, "%10s => %3s %10s\n", "Slots", "Num", "RecoGen"); From c62c38f6b91b87a013bccd3637c2a1850d8e590c Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Mon, 12 Dec 2016 16:41:26 -0800 Subject: [PATCH 017/123] ocfs2: replace CURRENT_TIME macro CURRENT_TIME is not y2038 safe. Use y2038 safe ktime_get_real_seconds() here for timestamps. struct heartbeat_block's hb_seq and deletetion time are already 64 bits wide and accommodate times beyond y2038. Also use y2038 safe ktime_get_real_ts64() for on disk inode timestamps. These are also wide enough to accommodate time64_t. Link: http://lkml.kernel.org/r/1475365298-29236-1-git-send-email-deepa.kernel@gmail.com Signed-off-by: Deepa Dinamani Reviewed-by: Arnd Bergmann Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/heartbeat.c | 2 +- fs/ocfs2/inode.c | 2 +- fs/ocfs2/namei.c | 6 ++++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 636abcbd4650..9158c9825094 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -741,7 +741,7 @@ static inline void o2hb_prepare_block(struct o2hb_region *reg, hb_block = (struct o2hb_disk_heartbeat_block *)slot->ds_raw_block; memset(hb_block, 0, reg->hr_block_bytes); /* TODO: time stuff */ - cputime = CURRENT_TIME.tv_sec; + cputime = ktime_get_real_seconds(); if (!cputime) cputime = 1; diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index c56a7679df93..382401d3e88f 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -703,7 +703,7 @@ static int ocfs2_remove_inode(struct inode *inode, goto bail_commit; } - di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec); + di->i_dtime = cpu_to_le64(ktime_get_real_seconds()); di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 8d887c75765c..3b0a10d9b36f 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -516,6 +516,7 @@ static int __ocfs2_mknod_locked(struct inode *dir, struct ocfs2_extent_list *fel; u16 feat; struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct timespec64 ts; *new_fe_bh = NULL; @@ -564,10 +565,11 @@ static int __ocfs2_mknod_locked(struct inode *dir, fe->i_last_eb_blk = 0; strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL); + ktime_get_real_ts64(&ts); fe->i_atime = fe->i_ctime = fe->i_mtime = - cpu_to_le64(CURRENT_TIME.tv_sec); + cpu_to_le64(ts.tv_sec); fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec = - cpu_to_le32(CURRENT_TIME.tv_nsec); + cpu_to_le32(ts.tv_nsec); fe->i_dtime = 0; /* From 13583c3d3224508582ec03d881d0b68dd3ee8e10 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Mon, 12 Dec 2016 16:41:29 -0800 Subject: [PATCH 018/123] mm: memcontrol: use special workqueue for creating per-memcg caches Creating a lot of cgroups at the same time might stall all worker threads with kmem cache creation works, because kmem cache creation is done with the slab_mutex held. The problem was amplified by commits 801faf0db894 ("mm/slab: lockless decision to grow cache") in case of SLAB and 81ae6d03952c ("mm/slub.c: replace kick_all_cpus_sync() with synchronize_sched() in kmem_cache_shrink()") in case of SLUB, which increased the maximal time the slab_mutex can be held. To prevent that from happening, let's use a special ordered single threaded workqueue for kmem cache creation. This shouldn't introduce any functional changes regarding how kmem caches are created, as the work function holds the global slab_mutex during its whole runtime anyway, making it impossible to run more than one work at a time. By using a single threaded workqueue, we just avoid creating a thread per each work. Ordering is required to avoid a situation when a cgroup's work is put off indefinitely because there are other cgroups to serve, in other words to guarantee fairness. Link: https://bugzilla.kernel.org/show_bug.cgi?id=172981 Link: http://lkml.kernel.org/r/20161004131417.GC1862@esperanza Signed-off-by: Vladimir Davydov Reported-by: Doug Smythies Acked-by: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0f870ba43942..91dfc7c5ce8f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2154,6 +2154,8 @@ struct memcg_kmem_cache_create_work { struct work_struct work; }; +static struct workqueue_struct *memcg_kmem_cache_create_wq; + static void memcg_kmem_cache_create_func(struct work_struct *w) { struct memcg_kmem_cache_create_work *cw = @@ -2185,7 +2187,7 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, cw->cachep = cachep; INIT_WORK(&cw->work, memcg_kmem_cache_create_func); - schedule_work(&cw->work); + queue_work(memcg_kmem_cache_create_wq, &cw->work); } static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, @@ -5783,6 +5785,17 @@ static int __init mem_cgroup_init(void) { int cpu, node; +#ifndef CONFIG_SLOB + /* + * Kmem cache creation is mostly done with the slab_mutex held, + * so use a special workqueue to avoid stalling all worker + * threads in case lots of cgroups are created simultaneously. + */ + memcg_kmem_cache_create_wq = + alloc_ordered_workqueue("memcg_kmem_cache_create", 0); + BUG_ON(!memcg_kmem_cache_create_wq); +#endif + hotcpu_notifier(memcg_cpu_hotplug_callback, 0); for_each_possible_cpu(cpu) From 89e364db71fb5e7fc8d93228152abfa67daf35fa Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Mon, 12 Dec 2016 16:41:32 -0800 Subject: [PATCH 019/123] slub: move synchronize_sched out of slab_mutex on shrink synchronize_sched() is a heavy operation and calling it per each cache owned by a memory cgroup being destroyed may take quite some time. What is worse, it's currently called under the slab_mutex, stalling all works doing cache creation/destruction. Actually, there isn't much point in calling synchronize_sched() for each cache - it's enough to call it just once - after setting cpu_partial for all caches and before shrinking them. This way, we can also move it out of the slab_mutex, which we have to hold for iterating over the slab cache list. Link: https://bugzilla.kernel.org/show_bug.cgi?id=172991 Link: http://lkml.kernel.org/r/0a10d71ecae3db00fb4421bcd3f82bcc911f4be4.1475329751.git.vdavydov.dev@gmail.com Signed-off-by: Vladimir Davydov Reported-by: Doug Smythies Acked-by: Joonsoo Kim Cc: Christoph Lameter Cc: David Rientjes Cc: Johannes Weiner Cc: Michal Hocko Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 4 ++-- mm/slab.h | 2 +- mm/slab_common.c | 27 +++++++++++++++++++++++++-- mm/slob.c | 2 +- mm/slub.c | 19 ++----------------- 5 files changed, 31 insertions(+), 23 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 0b0550ca85b4..7ea765cd7e93 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2332,7 +2332,7 @@ out: return nr_freed; } -int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate) +int __kmem_cache_shrink(struct kmem_cache *cachep) { int ret = 0; int node; @@ -2352,7 +2352,7 @@ int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate) int __kmem_cache_shutdown(struct kmem_cache *cachep) { - return __kmem_cache_shrink(cachep, false); + return __kmem_cache_shrink(cachep); } void __kmem_cache_release(struct kmem_cache *cachep) diff --git a/mm/slab.h b/mm/slab.h index bc05fdc3edce..ceb7d70cdb76 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -146,7 +146,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, int __kmem_cache_shutdown(struct kmem_cache *); void __kmem_cache_release(struct kmem_cache *); -int __kmem_cache_shrink(struct kmem_cache *, bool); +int __kmem_cache_shrink(struct kmem_cache *); void slab_kmem_cache_release(struct kmem_cache *); struct seq_file; diff --git a/mm/slab_common.c b/mm/slab_common.c index 329b03843863..5d2f24fbafc5 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -573,6 +573,29 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) get_online_cpus(); get_online_mems(); +#ifdef CONFIG_SLUB + /* + * In case of SLUB, we need to disable empty slab caching to + * avoid pinning the offline memory cgroup by freeable kmem + * pages charged to it. SLAB doesn't need this, as it + * periodically purges unused slabs. + */ + mutex_lock(&slab_mutex); + list_for_each_entry(s, &slab_caches, list) { + c = is_root_cache(s) ? cache_from_memcg_idx(s, idx) : NULL; + if (c) { + c->cpu_partial = 0; + c->min_partial = 0; + } + } + mutex_unlock(&slab_mutex); + /* + * kmem_cache->cpu_partial is checked locklessly (see + * put_cpu_partial()). Make sure the change is visible. + */ + synchronize_sched(); +#endif + mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { if (!is_root_cache(s)) @@ -584,7 +607,7 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) if (!c) continue; - __kmem_cache_shrink(c, true); + __kmem_cache_shrink(c); arr->entries[idx] = NULL; } mutex_unlock(&slab_mutex); @@ -755,7 +778,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep) get_online_cpus(); get_online_mems(); kasan_cache_shrink(cachep); - ret = __kmem_cache_shrink(cachep, false); + ret = __kmem_cache_shrink(cachep); put_online_mems(); put_online_cpus(); return ret; diff --git a/mm/slob.c b/mm/slob.c index 5ec158054ffe..eac04d4357ec 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -634,7 +634,7 @@ void __kmem_cache_release(struct kmem_cache *c) { } -int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate) +int __kmem_cache_shrink(struct kmem_cache *d) { return 0; } diff --git a/mm/slub.c b/mm/slub.c index 2b3e740609e9..4a861f265cd7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3883,7 +3883,7 @@ EXPORT_SYMBOL(kfree); * being allocated from last increasing the chance that the last objects * are freed in them. */ -int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate) +int __kmem_cache_shrink(struct kmem_cache *s) { int node; int i; @@ -3895,21 +3895,6 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate) unsigned long flags; int ret = 0; - if (deactivate) { - /* - * Disable empty slabs caching. Used to avoid pinning offline - * memory cgroups by kmem pages that can be freed. - */ - s->cpu_partial = 0; - s->min_partial = 0; - - /* - * s->cpu_partial is checked locklessly (see put_cpu_partial), - * so we have to make sure the change is visible. - */ - synchronize_sched(); - } - flush_all(s); for_each_kmem_cache_node(s, node, n) { INIT_LIST_HEAD(&discard); @@ -3966,7 +3951,7 @@ static int slab_mem_going_offline_callback(void *arg) mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) - __kmem_cache_shrink(s, false); + __kmem_cache_shrink(s); mutex_unlock(&slab_mutex); return 0; From 84582c8ab9479ffa4532afa95ab8d8f96b5478dc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 12 Dec 2016 16:41:35 -0800 Subject: [PATCH 020/123] slub: avoid false-postive warning The slub allocator gives us some incorrect warnings when CONFIG_PROFILE_ANNOTATED_BRANCHES is set, as the unlikely() macro prevents it from seeing that the return code matches what it was before: mm/slub.c: In function `kmem_cache_free_bulk': mm/slub.c:262:23: error: `df.s' may be used uninitialized in this function [-Werror=maybe-uninitialized] mm/slub.c:2943:3: error: `df.cnt' may be used uninitialized in this function [-Werror=maybe-uninitialized] mm/slub.c:2933:4470: error: `df.freelist' may be used uninitialized in this function [-Werror=maybe-uninitialized] mm/slub.c:2943:3: error: `df.tail' may be used uninitialized in this function [-Werror=maybe-uninitialized] I have not been able to come up with a perfect way for dealing with this, the three options I see are: - add a bogus initialization, which would increase the runtime overhead - replace unlikely() with unlikely_notrace() - remove the unlikely() annotation completely I checked the object code for a typical x86 configuration and the last two cases produce the same result, so I went for the last one, which is the simplest. Link: http://lkml.kernel.org/r/20161024155704.3114445-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Acked-by: Jesper Dangaard Brouer Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vladimir Davydov Cc: Johannes Weiner Cc: Laura Abbott Cc: Alexander Potapenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 4a861f265cd7..067598a00849 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3076,7 +3076,7 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) struct detached_freelist df; size = build_detached_freelist(s, size, p, &df); - if (unlikely(!df.page)) + if (!df.page) continue; slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_); From e70954fd6d4b469517fd906ef1c33310e90ef9f0 Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Mon, 12 Dec 2016 16:41:38 -0800 Subject: [PATCH 021/123] mm/slab_common.c: check kmem_create_cache flags are common Verify that kmem_create_cache flags are not allocator specific. It is done before removing flags that are not available with the current configuration. The current kmem_cache_create removes incorrect flags but do not validate the callers are using them right. This change will ensure that callers are not trying to create caches with flags that won't be used because allocator specific. Link: http://lkml.kernel.org/r/1478553075-120242-2-git-send-email-thgarnie@google.com Signed-off-by: Thomas Garnier Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.h | 15 +++++++++++++++ mm/slab_common.c | 6 ++++++ 2 files changed, 21 insertions(+) diff --git a/mm/slab.h b/mm/slab.h index ceb7d70cdb76..699b072dc46e 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -142,8 +142,23 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, #define SLAB_CACHE_FLAGS (0) #endif +/* Common flags available with current configuration */ #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) +/* Common flags permitted for kmem_cache_create */ +#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \ + SLAB_RED_ZONE | \ + SLAB_POISON | \ + SLAB_STORE_USER | \ + SLAB_TRACE | \ + SLAB_CONSISTENCY_CHECKS | \ + SLAB_MEM_SPREAD | \ + SLAB_NOLEAKTRACE | \ + SLAB_RECLAIM_ACCOUNT | \ + SLAB_TEMPORARY | \ + SLAB_NOTRACK | \ + SLAB_ACCOUNT) + int __kmem_cache_shutdown(struct kmem_cache *); void __kmem_cache_release(struct kmem_cache *); int __kmem_cache_shrink(struct kmem_cache *); diff --git a/mm/slab_common.c b/mm/slab_common.c index 5d2f24fbafc5..ae323841adb1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -404,6 +404,12 @@ kmem_cache_create(const char *name, size_t size, size_t align, goto out_unlock; } + /* Refuse requests with allocator specific flags */ + if (flags & ~SLAB_FLAGS_PERMITTED) { + err = -EINVAL; + goto out_unlock; + } + /* * Some allocators will constraint the set of valid flags to a subset * of all flags. We expect them to define CACHE_CREATE_MASK in this From f728b0a5d72ae99c446f933912914a61254c03b6 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Mon, 12 Dec 2016 16:41:41 -0800 Subject: [PATCH 022/123] mm, slab: faster active and free stats Reading /proc/slabinfo or monitoring slabtop(1) can become very expensive if there are many slab caches and if there are very lengthy per-node partial and/or free lists. Commit 07a63c41fa1f ("mm/slab: improve performance of gathering slabinfo stats") addressed the per-node full lists which showed a significant improvement when no objects were freed. This patch has the same motivation and optimizes the remainder of the usecases where there are very lengthy partial and free lists. This patch maintains per-node active_slabs (full and partial) and free_slabs rather than iterating the lists at runtime when reading /proc/slabinfo. When allocating 100GB of slab from a test cache where every slab page is on the partial list, reading /proc/slabinfo (includes all other slab caches on the system) takes ~247ms on average with 48 samples. As a result of this patch, the same read takes ~0.856ms on average. [rientjes@google.com: changelog] Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1611081505240.13403@chino.kir.corp.google.com Signed-off-by: Greg Thelen Signed-off-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 117 ++++++++++++++++++++++-------------------------------- mm/slab.h | 3 +- 2 files changed, 49 insertions(+), 71 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 7ea765cd7e93..e06da6ceaf73 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -227,13 +227,14 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) INIT_LIST_HEAD(&parent->slabs_full); INIT_LIST_HEAD(&parent->slabs_partial); INIT_LIST_HEAD(&parent->slabs_free); + parent->active_slabs = 0; + parent->free_slabs = 0; parent->shared = NULL; parent->alien = NULL; parent->colour_next = 0; spin_lock_init(&parent->list_lock); parent->free_objects = 0; parent->free_touched = 0; - parent->num_slabs = 0; } #define MAKE_LIST(cachep, listp, slab, nodeid) \ @@ -1366,7 +1367,6 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) { #if DEBUG struct kmem_cache_node *n; - struct page *page; unsigned long flags; int node; static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL, @@ -1381,32 +1381,20 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) cachep->name, cachep->size, cachep->gfporder); for_each_kmem_cache_node(cachep, node, n) { - unsigned long active_objs = 0, num_objs = 0, free_objects = 0; - unsigned long active_slabs = 0, num_slabs = 0; - unsigned long num_slabs_partial = 0, num_slabs_free = 0; - unsigned long num_slabs_full; + unsigned long active_objs = 0, free_objs = 0; + unsigned long active_slabs, num_slabs; spin_lock_irqsave(&n->list_lock, flags); - num_slabs = n->num_slabs; - list_for_each_entry(page, &n->slabs_partial, lru) { - active_objs += page->active; - num_slabs_partial++; - } - list_for_each_entry(page, &n->slabs_free, lru) - num_slabs_free++; + active_slabs = n->active_slabs; + num_slabs = active_slabs + n->free_slabs; - free_objects += n->free_objects; + active_objs += (num_slabs * cachep->num) - n->free_objects; + free_objs += n->free_objects; spin_unlock_irqrestore(&n->list_lock, flags); - num_objs = num_slabs * cachep->num; - active_slabs = num_slabs - num_slabs_free; - num_slabs_full = num_slabs - - (num_slabs_partial + num_slabs_free); - active_objs += (num_slabs_full * cachep->num); - pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", - node, active_slabs, num_slabs, active_objs, num_objs, - free_objects); + node, active_slabs, num_slabs, active_objs, + num_slabs * cachep->num, free_objs); } #endif } @@ -2318,7 +2306,7 @@ static int drain_freelist(struct kmem_cache *cache, page = list_entry(p, struct page, lru); list_del(&page->lru); - n->num_slabs--; + n->free_slabs--; /* * Safe to drop the lock. The slab is no longer linked * to the cache. @@ -2753,12 +2741,14 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page) n = get_node(cachep, page_to_nid(page)); spin_lock(&n->list_lock); - if (!page->active) + if (!page->active) { list_add_tail(&page->lru, &(n->slabs_free)); - else + n->free_slabs++; + } else { fixup_slab_list(cachep, n, page, &list); + n->active_slabs++; + } - n->num_slabs++; STATS_INC_GROWN(cachep); n->free_objects += cachep->num - page->active; spin_unlock(&n->list_lock); @@ -2884,7 +2874,7 @@ static inline void fixup_slab_list(struct kmem_cache *cachep, /* Try to find non-pfmemalloc slab if needed */ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, - struct page *page, bool pfmemalloc) + struct page *page, bool *page_is_free, bool pfmemalloc) { if (!page) return NULL; @@ -2903,9 +2893,11 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, /* Move pfmemalloc slab to the end of list to speed up next search */ list_del(&page->lru); - if (!page->active) + if (*page_is_free) { + WARN_ON(page->active); list_add_tail(&page->lru, &n->slabs_free); - else + *page_is_free = false; + } else list_add_tail(&page->lru, &n->slabs_partial); list_for_each_entry(page, &n->slabs_partial, lru) { @@ -2913,9 +2905,12 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, return page; } + n->free_touched = 1; list_for_each_entry(page, &n->slabs_free, lru) { - if (!PageSlabPfmemalloc(page)) + if (!PageSlabPfmemalloc(page)) { + *page_is_free = true; return page; + } } return NULL; @@ -2924,17 +2919,26 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc) { struct page *page; + bool page_is_free = false; + assert_spin_locked(&n->list_lock); page = list_first_entry_or_null(&n->slabs_partial, struct page, lru); if (!page) { n->free_touched = 1; page = list_first_entry_or_null(&n->slabs_free, struct page, lru); + if (page) + page_is_free = true; } if (sk_memalloc_socks()) - return get_valid_first_slab(n, page, pfmemalloc); + page = get_valid_first_slab(n, page, &page_is_free, pfmemalloc); + + if (page && page_is_free) { + n->active_slabs++; + n->free_slabs--; + } return page; } @@ -3434,9 +3438,11 @@ static void free_block(struct kmem_cache *cachep, void **objpp, STATS_DEC_ACTIVE(cachep); /* fixup slab chains */ - if (page->active == 0) + if (page->active == 0) { list_add(&page->lru, &n->slabs_free); - else { + n->free_slabs++; + n->active_slabs--; + } else { /* Unconditionally move a slab to the end of the * partial list on free - maximum time for the * other objects to be freed, too. @@ -3450,7 +3456,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, page = list_last_entry(&n->slabs_free, struct page, lru); list_move(&page->lru, list); - n->num_slabs--; + n->free_slabs--; } } @@ -4102,43 +4108,21 @@ out: #ifdef CONFIG_SLABINFO void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) { - struct page *page; - unsigned long active_objs; - unsigned long num_objs; - unsigned long active_slabs = 0; - unsigned long num_slabs, free_objects = 0, shared_avail = 0; - unsigned long num_slabs_partial = 0, num_slabs_free = 0; - unsigned long num_slabs_full = 0; - const char *name; - char *error = NULL; + unsigned long active_objs, num_objs, active_slabs; + unsigned long num_slabs = 0, free_objs = 0, shared_avail = 0; + unsigned long num_slabs_free = 0; int node; struct kmem_cache_node *n; - active_objs = 0; - num_slabs = 0; for_each_kmem_cache_node(cachep, node, n) { - check_irq_on(); spin_lock_irq(&n->list_lock); - num_slabs += n->num_slabs; + num_slabs += n->active_slabs + n->free_slabs; + num_slabs_free += n->free_slabs; - list_for_each_entry(page, &n->slabs_partial, lru) { - if (page->active == cachep->num && !error) - error = "slabs_partial accounting error"; - if (!page->active && !error) - error = "slabs_partial accounting error"; - active_objs += page->active; - num_slabs_partial++; - } + free_objs += n->free_objects; - list_for_each_entry(page, &n->slabs_free, lru) { - if (page->active && !error) - error = "slabs_free accounting error"; - num_slabs_free++; - } - - free_objects += n->free_objects; if (n->shared) shared_avail += n->shared->avail; @@ -4146,15 +4130,8 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) } num_objs = num_slabs * cachep->num; active_slabs = num_slabs - num_slabs_free; - num_slabs_full = num_slabs - (num_slabs_partial + num_slabs_free); - active_objs += (num_slabs_full * cachep->num); - if (num_objs - active_objs != free_objects && !error) - error = "free_objects accounting error"; - - name = cachep->name; - if (error) - pr_err("slab: cache %s error: %s\n", name, error); + active_objs = num_objs - free_objs; sinfo->active_objs = active_objs; sinfo->num_objs = num_objs; diff --git a/mm/slab.h b/mm/slab.h index 699b072dc46e..26123c512fee 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -447,7 +447,8 @@ struct kmem_cache_node { struct list_head slabs_partial; /* partial list first, better asm code */ struct list_head slabs_full; struct list_head slabs_free; - unsigned long num_slabs; + unsigned long active_slabs; /* length of slabs_partial+slabs_full */ + unsigned long free_slabs; /* length of slabs_free */ unsigned long free_objects; unsigned int free_limit; unsigned int colour_next; /* Per-node cache coloring */ From bf00bd3458041c4643a13d80fb349d29cb66eb63 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 12 Dec 2016 16:41:44 -0800 Subject: [PATCH 023/123] mm, slab: maintain total slab count instead of active count Rather than tracking the number of active slabs for each node, track the total number of slabs. This is a minor improvement that avoids active slab tracking when a slab goes from free to partial or partial to free. For slab debugging, this also removes an explicit free count since it can easily be inferred by the difference in number of total objects and number of active objects. Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1612042020110.115755@chino.kir.corp.google.com Signed-off-by: David Rientjes Suggested-by: Joonsoo Kim Cc: Greg Thelen Cc: Aruna Ramakrishna Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 70 +++++++++++++++++++++++-------------------------------- mm/slab.h | 4 ++-- 2 files changed, 31 insertions(+), 43 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index e06da6ceaf73..87b29e76cafd 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -227,7 +227,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) INIT_LIST_HEAD(&parent->slabs_full); INIT_LIST_HEAD(&parent->slabs_partial); INIT_LIST_HEAD(&parent->slabs_free); - parent->active_slabs = 0; + parent->total_slabs = 0; parent->free_slabs = 0; parent->shared = NULL; parent->alien = NULL; @@ -1381,20 +1381,18 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) cachep->name, cachep->size, cachep->gfporder); for_each_kmem_cache_node(cachep, node, n) { - unsigned long active_objs = 0, free_objs = 0; - unsigned long active_slabs, num_slabs; + unsigned long total_slabs, free_slabs, free_objs; spin_lock_irqsave(&n->list_lock, flags); - active_slabs = n->active_slabs; - num_slabs = active_slabs + n->free_slabs; - - active_objs += (num_slabs * cachep->num) - n->free_objects; - free_objs += n->free_objects; + total_slabs = n->total_slabs; + free_slabs = n->free_slabs; + free_objs = n->free_objects; spin_unlock_irqrestore(&n->list_lock, flags); - pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", - node, active_slabs, num_slabs, active_objs, - num_slabs * cachep->num, free_objs); + pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n", + node, total_slabs - free_slabs, total_slabs, + (total_slabs * cachep->num) - free_objs, + total_slabs * cachep->num); } #endif } @@ -2307,6 +2305,7 @@ static int drain_freelist(struct kmem_cache *cache, page = list_entry(p, struct page, lru); list_del(&page->lru); n->free_slabs--; + n->total_slabs--; /* * Safe to drop the lock. The slab is no longer linked * to the cache. @@ -2741,13 +2740,12 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page) n = get_node(cachep, page_to_nid(page)); spin_lock(&n->list_lock); + n->total_slabs++; if (!page->active) { list_add_tail(&page->lru, &(n->slabs_free)); n->free_slabs++; - } else { + } else fixup_slab_list(cachep, n, page, &list); - n->active_slabs++; - } STATS_INC_GROWN(cachep); n->free_objects += cachep->num - page->active; @@ -2874,7 +2872,7 @@ static inline void fixup_slab_list(struct kmem_cache *cachep, /* Try to find non-pfmemalloc slab if needed */ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, - struct page *page, bool *page_is_free, bool pfmemalloc) + struct page *page, bool pfmemalloc) { if (!page) return NULL; @@ -2893,10 +2891,9 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, /* Move pfmemalloc slab to the end of list to speed up next search */ list_del(&page->lru); - if (*page_is_free) { - WARN_ON(page->active); + if (!page->active) { list_add_tail(&page->lru, &n->slabs_free); - *page_is_free = false; + n->free_slabs++; } else list_add_tail(&page->lru, &n->slabs_partial); @@ -2908,7 +2905,7 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, n->free_touched = 1; list_for_each_entry(page, &n->slabs_free, lru) { if (!PageSlabPfmemalloc(page)) { - *page_is_free = true; + n->free_slabs--; return page; } } @@ -2919,26 +2916,19 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc) { struct page *page; - bool page_is_free = false; assert_spin_locked(&n->list_lock); - page = list_first_entry_or_null(&n->slabs_partial, - struct page, lru); + page = list_first_entry_or_null(&n->slabs_partial, struct page, lru); if (!page) { n->free_touched = 1; - page = list_first_entry_or_null(&n->slabs_free, - struct page, lru); + page = list_first_entry_or_null(&n->slabs_free, struct page, + lru); if (page) - page_is_free = true; + n->free_slabs--; } if (sk_memalloc_socks()) - page = get_valid_first_slab(n, page, &page_is_free, pfmemalloc); - - if (page && page_is_free) { - n->active_slabs++; - n->free_slabs--; - } + page = get_valid_first_slab(n, page, pfmemalloc); return page; } @@ -3441,7 +3431,6 @@ static void free_block(struct kmem_cache *cachep, void **objpp, if (page->active == 0) { list_add(&page->lru, &n->slabs_free); n->free_slabs++; - n->active_slabs--; } else { /* Unconditionally move a slab to the end of the * partial list on free - maximum time for the @@ -3457,6 +3446,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, page = list_last_entry(&n->slabs_free, struct page, lru); list_move(&page->lru, list); n->free_slabs--; + n->total_slabs--; } } @@ -4109,8 +4099,8 @@ out: void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) { unsigned long active_objs, num_objs, active_slabs; - unsigned long num_slabs = 0, free_objs = 0, shared_avail = 0; - unsigned long num_slabs_free = 0; + unsigned long total_slabs = 0, free_objs = 0, shared_avail = 0; + unsigned long free_slabs = 0; int node; struct kmem_cache_node *n; @@ -4118,9 +4108,8 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) check_irq_on(); spin_lock_irq(&n->list_lock); - num_slabs += n->active_slabs + n->free_slabs; - num_slabs_free += n->free_slabs; - + total_slabs += n->total_slabs; + free_slabs += n->free_slabs; free_objs += n->free_objects; if (n->shared) @@ -4128,15 +4117,14 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) spin_unlock_irq(&n->list_lock); } - num_objs = num_slabs * cachep->num; - active_slabs = num_slabs - num_slabs_free; - + num_objs = total_slabs * cachep->num; + active_slabs = total_slabs - free_slabs; active_objs = num_objs - free_objs; sinfo->active_objs = active_objs; sinfo->num_objs = num_objs; sinfo->active_slabs = active_slabs; - sinfo->num_slabs = num_slabs; + sinfo->num_slabs = total_slabs; sinfo->shared_avail = shared_avail; sinfo->limit = cachep->limit; sinfo->batchcount = cachep->batchcount; diff --git a/mm/slab.h b/mm/slab.h index 26123c512fee..de6579dc362c 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -447,8 +447,8 @@ struct kmem_cache_node { struct list_head slabs_partial; /* partial list first, better asm code */ struct list_head slabs_full; struct list_head slabs_free; - unsigned long active_slabs; /* length of slabs_partial+slabs_full */ - unsigned long free_slabs; /* length of slabs_free */ + unsigned long total_slabs; /* length of all slab lists */ + unsigned long free_slabs; /* length of free slab list only */ unsigned long free_objects; unsigned int free_limit; unsigned int colour_next; /* Per-node cache coloring */ From 3e32158767b04db60b83f760e1722fd15a715d7a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 12 Dec 2016 16:41:47 -0800 Subject: [PATCH 024/123] mm/mprotect.c: don't touch single threaded PTEs which are on the right node We had some problems with pages getting unmapped in single threaded affinitized processes. It was tracked down to NUMA scanning. In this case it doesn't make any sense to unmap pages if the process is single threaded and the page is already on the node the process is running on. Add a check for this case into the numa protection code, and skip unmapping if true. In theory the process could be migrated later, but we will eventually rescan and unmap and migrate then. In theory this could be made more fancy: remembering this state per process or even whole mm. However that would need extra tracking and be more complicated, and the simple check seems to work fine so far. [ak@linux.intel.com: v3: Minor updates from Mel. Change code layout] Link: http://lkml.kernel.org/r/1476382117-5440-1-git-send-email-andi@firstfloor.org Link: http://lkml.kernel.org/r/1476288949-20970-1-git-send-email-andi@firstfloor.org Signed-off-by: Andi Kleen Acked-by: Mel Gorman Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mprotect.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/mm/mprotect.c b/mm/mprotect.c index 11936526b08b..05a02b72c98d 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -69,11 +69,17 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, pte_t *pte, oldpte; spinlock_t *ptl; unsigned long pages = 0; + int target_node = NUMA_NO_NODE; pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl); if (!pte) return 0; + /* Get target node for single threaded private VMAs */ + if (prot_numa && !(vma->vm_flags & VM_SHARED) && + atomic_read(&vma->vm_mm->mm_users) == 1) + target_node = numa_node_id(); + arch_enter_lazy_mmu_mode(); do { oldpte = *pte; @@ -95,6 +101,13 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, /* Avoid TLB flush if possible */ if (pte_protnone(oldpte)) continue; + + /* + * Don't mess with PTEs if page is already on the node + * a single-threaded process is running on. + */ + if (target_node == page_to_nid(page)) + continue; } ptent = ptep_modify_prot_start(mm, addr, pte); From 5f33a0803bbd781de916f5c7448cbbbbc763d911 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 12 Dec 2016 16:41:50 -0800 Subject: [PATCH 025/123] mm/vmscan.c: set correct defer count for shrinker Our system uses significantly more slab memory with memcg enabled with the latest kernel. With 3.10 kernel, slab uses 2G memory, while with 4.6 kernel, 6G memory is used. The shrinker has problem. Let's see we have two memcg for one shrinker. In do_shrink_slab: 1. Check cg1. nr_deferred = 0, assume total_scan = 700. batch size is 1024, then no memory is freed. nr_deferred = 700 2. Check cg2. nr_deferred = 700. Assume freeable = 20, then total_scan = 10 or 40. Let's assume it's 10. No memory is freed. nr_deferred = 10. The deferred share of cg1 is lost in this case. kswapd will free no memory even run above steps again and again. The fix makes sure one memcg's deferred share isn't lost. Link: http://lkml.kernel.org/r/2414be961b5d25892060315fbb56bb19d81d0c07.1476227351.git.shli@fb.com Signed-off-by: Shaohua Li Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: [4.0+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index d75cdf360730..c4abf08861d2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -291,6 +291,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, int nid = shrinkctl->nid; long batch_size = shrinker->batch ? shrinker->batch : SHRINK_BATCH; + long scanned = 0, next_deferred; freeable = shrinker->count_objects(shrinker, shrinkctl); if (freeable == 0) @@ -312,7 +313,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", shrinker->scan_objects, total_scan); total_scan = freeable; - } + next_deferred = nr; + } else + next_deferred = total_scan; /* * We need to avoid excessive windup on filesystem shrinkers @@ -369,17 +372,22 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, count_vm_events(SLABS_SCANNED, nr_to_scan); total_scan -= nr_to_scan; + scanned += nr_to_scan; cond_resched(); } + if (next_deferred >= scanned) + next_deferred -= scanned; + else + next_deferred = 0; /* * move the unused scan count back into the shrinker in a * manner that handles concurrent updates. If we exhausted the * scan, there is no need to do an update. */ - if (total_scan > 0) - new_nr = atomic_long_add_return(total_scan, + if (next_deferred > 0) + new_nr = atomic_long_add_return(next_deferred, &shrinker->nr_deferred[nid]); else new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); From 771ab4302c592d1de9e6b73f58979e9e5c424f4c Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Mon, 12 Dec 2016 16:41:53 -0800 Subject: [PATCH 026/123] mm/gup.c: make unnecessarily global vma_permits_fault() static Make vma_permits_fault() static as it is only used in mm/gup.c This fixes a sparse warning. Link: http://lkml.kernel.org/r/20161017122353.31598-1-tklauser@distanz.ch Signed-off-by: Tobias Klauser Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/gup.c b/mm/gup.c index ec4f82704b6f..fc04f1c3cf08 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -632,7 +632,8 @@ next_page: return i; } -bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags) +static bool vma_permits_fault(struct vm_area_struct *vma, + unsigned int fault_flags) { bool write = !!(fault_flags & FAULT_FLAG_WRITE); bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE); From 3999f52e3198e76607446ab1a4610c1ddc406c56 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 12 Dec 2016 16:41:56 -0800 Subject: [PATCH 027/123] mm/hugetlb.c: use the right pte val for compare in hugetlb_cow We cannot use the pte value used in set_pte_at for pte_same comparison, because archs like ppc64, filter/add new pte flag in set_pte_at. Instead fetch the pte value inside hugetlb_cow. We are comparing pte value to make sure the pte didn't change since we dropped the page table lock. hugetlb_cow get called with page table lock held, and we can take a copy of the pte value before we drop the page table lock. With hugetlbfs, we optimize the MAP_PRIVATE write fault path with no previous mapping (huge_pte_none entries), by forcing a cow in the fault path. This avoid take an addition fault to covert a read-only mapping to read/write. Here we were comparing a recently instantiated pte (via set_pte_at) to the pte values from linux page table. As explained above on ppc64 such pte_same check returned wrong result, resulting in us taking an additional fault on ppc64. Fixes: 6a119eae942c ("powerpc/mm: Add a _PAGE_PTE bit") Link: http://lkml.kernel.org/r/20161018154245.18023-1-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Reported-by: Jan Stancek Acked-by: Hillf Danton Cc: Mike Kravetz Cc: Scott Wood Cc: Michael Ellerman Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 418bf01a50ed..23aec01836aa 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3450,15 +3450,17 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * Keep the pte_same checks anyway to make transition from the mutex easier. */ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, pte_t pte, - struct page *pagecache_page, spinlock_t *ptl) + unsigned long address, pte_t *ptep, + struct page *pagecache_page, spinlock_t *ptl) { + pte_t pte; struct hstate *h = hstate_vma(vma); struct page *old_page, *new_page; int ret = 0, outside_reserve = 0; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ + pte = huge_ptep_get(ptep); old_page = pte_page(pte); retry_avoidcopy: @@ -3733,7 +3735,7 @@ retry: hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl); + ret = hugetlb_cow(mm, vma, address, ptep, page, ptl); } spin_unlock(ptl); @@ -3888,8 +3890,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & FAULT_FLAG_WRITE) { if (!huge_pte_write(entry)) { - ret = hugetlb_cow(mm, vma, address, ptep, entry, - pagecache_page, ptl); + ret = hugetlb_cow(mm, vma, address, ptep, + pagecache_page, ptl); goto out_put_page; } entry = huge_pte_mkdirty(entry); From 8bea805207500068b70778b707299a9b5920ca72 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 12 Dec 2016 16:41:59 -0800 Subject: [PATCH 028/123] mm/hugetlb.c: use huge_pte_lock instead of opencoding the lock No functional change by this patch. Link: http://lkml.kernel.org/r/20161018090234.22574-1-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 23aec01836aa..c12296f62e8d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3713,8 +3713,7 @@ retry: vma_end_reservation(h, vma, address); } - ptl = huge_pte_lockptr(h, mm, ptep); - spin_lock(ptl); + ptl = huge_pte_lock(h, mm, ptep); size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto backout; @@ -4332,8 +4331,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (!spte) goto out; - ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); - spin_lock(ptl); + ptl = huge_pte_lock(hstate_vma(vma), mm, spte); if (pud_none(*pud)) { pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); From 22901c6c9f93058c3803d343db02c14e870e3545 Mon Sep 17 00:00:00 2001 From: Andreas Platschek Date: Mon, 12 Dec 2016 16:42:01 -0800 Subject: [PATCH 029/123] kmemleak: fix reference to Documentation Documentation/kmemleak.txt was moved to Documentation/dev-tools/kmemleak.rst, this fixes the reference to the new location. Link: http://lkml.kernel.org/r/1476544946-18804-1-git-send-email-andreas.platschek@opentech.at Signed-off-by: Andreas Platschek Acked-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index d1380ed93fdf..da3436953022 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -19,7 +19,7 @@ * * * For more information on the algorithm and kmemleak usage, please see - * Documentation/kmemleak.txt. + * Documentation/dev-tools/kmemleak.rst. * * Notes on locking * ---------------- From 88ed365ea227aa28841a8d6e196c9a261c76fffd Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 12 Dec 2016 16:42:05 -0800 Subject: [PATCH 030/123] mm: don't steal highatomic pageblock Patch series "use up highorder free pages before OOM", v3. I got OOM report from production team with v4.4 kernel. It had enough free memory but failed to allocate GFP_KERNEL order-0 page and finally encountered OOM kill. It occured during QA process which launches several apps, switching and so on. It happned rarely. IOW, In normal situation, it was not a problem but if we are unluck so that several apps uses peak memory at the same time, it can happen. If we manage to pass the phase, the system can go working well. I could reproduce it with my test(memory spike easily. Look at below. The reason is free pages(19M) of DMA32 zone are reserved for HIGHORDERATOMIC and doesn't unreserved before the OOM. balloon invoked oom-killer: gfp_mask=0x24280ca(GFP_HIGHUSER_MOVABLE|__GFP_ZERO), order=0, oom_score_adj=0 balloon cpuset=/ mems_allowed=0 CPU: 1 PID: 8473 Comm: balloon Tainted: G W OE 4.8.0-rc7-00219-g3f74c9559583-dirty #3161 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 Call Trace: dump_stack+0x63/0x90 dump_header+0x5c/0x1ce oom_kill_process+0x22e/0x400 out_of_memory+0x1ac/0x210 __alloc_pages_nodemask+0x101e/0x1040 handle_mm_fault+0xa0a/0xbf0 __do_page_fault+0x1dd/0x4d0 trace_do_page_fault+0x43/0x130 do_async_page_fault+0x1a/0xa0 async_page_fault+0x28/0x30 Mem-Info: active_anon:383949 inactive_anon:106724 isolated_anon:0 active_file:15 inactive_file:44 isolated_file:0 unevictable:0 dirty:0 writeback:24 unstable:0 slab_reclaimable:2483 slab_unreclaimable:3326 mapped:0 shmem:0 pagetables:1906 bounce:0 free:6898 free_pcp:291 free_cma:0 Node 0 active_anon:1535796kB inactive_anon:426896kB active_file:60kB inactive_file:176kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:0kB dirty:0kB writeback:96kB shmem:0kB writeback_tmp:0kB unstable:0kB pages_scanned:1418 all_unreclaimable? no DMA free:8188kB min:44kB low:56kB high:68kB active_anon:7648kB inactive_anon:0kB active_file:0kB inactive_file:4kB unevictable:0kB writepending:0kB present:15992kB managed:15908kB mlocked:0kB slab_reclaimable:0kB slab_unreclaimable:20kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB lowmem_reserve[]: 0 1952 1952 1952 DMA32 free:19404kB min:5628kB low:7624kB high:9620kB active_anon:1528148kB inactive_anon:426896kB active_file:60kB inactive_file:420kB unevictable:0kB writepending:96kB present:2080640kB managed:2030092kB mlocked:0kB slab_reclaimable:9932kB slab_unreclaimable:13284kB kernel_stack:2496kB pagetables:7624kB bounce:0kB free_pcp:900kB local_pcp:112kB free_cma:0kB lowmem_reserve[]: 0 0 0 0 DMA: 0*4kB 0*8kB 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 2*4096kB (H) = 8192kB DMA32: 7*4kB (H) 8*8kB (H) 30*16kB (H) 31*32kB (H) 14*64kB (H) 9*128kB (H) 2*256kB (H) 2*512kB (H) 4*1024kB (H) 5*2048kB (H) 0*4096kB = 19484kB 51131 total pagecache pages 50795 pages in swap cache Swap cache stats: add 3532405601, delete 3532354806, find 124289150/1822712228 Free swap = 8kB Total swap = 255996kB 524158 pages RAM 0 pages HighMem/MovableOnly 12658 pages reserved 0 pages cma reserved 0 pages hwpoisoned Another example exceeded the limit by the race is in:imklog: page allocation failure: order:0, mode:0x2280020(GFP_ATOMIC|__GFP_NOTRACK) CPU: 0 PID: 476 Comm: in:imklog Tainted: G E 4.8.0-rc7-00217-g266ef83c51e5-dirty #3135 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 Call Trace: dump_stack+0x63/0x90 warn_alloc_failed+0xdb/0x130 __alloc_pages_nodemask+0x4d6/0xdb0 new_slab+0x339/0x490 ___slab_alloc.constprop.74+0x367/0x480 __slab_alloc.constprop.73+0x20/0x40 __kmalloc+0x1a4/0x1e0 alloc_indirect.isra.14+0x1d/0x50 virtqueue_add_sgs+0x1c4/0x470 __virtblk_add_req+0xae/0x1f0 virtio_queue_rq+0x12d/0x290 __blk_mq_run_hw_queue+0x239/0x370 blk_mq_run_hw_queue+0x8f/0xb0 blk_mq_insert_requests+0x18c/0x1a0 blk_mq_flush_plug_list+0x125/0x140 blk_flush_plug_list+0xc7/0x220 blk_finish_plug+0x2c/0x40 __do_page_cache_readahead+0x196/0x230 filemap_fault+0x448/0x4f0 ext4_filemap_fault+0x36/0x50 __do_fault+0x75/0x140 handle_mm_fault+0x84d/0xbe0 __do_page_fault+0x1dd/0x4d0 trace_do_page_fault+0x43/0x130 do_async_page_fault+0x1a/0xa0 async_page_fault+0x28/0x30 Mem-Info: active_anon:363826 inactive_anon:121283 isolated_anon:32 active_file:65 inactive_file:152 isolated_file:0 unevictable:0 dirty:0 writeback:46 unstable:0 slab_reclaimable:2778 slab_unreclaimable:3070 mapped:112 shmem:0 pagetables:1822 bounce:0 free:9469 free_pcp:231 free_cma:0 Node 0 active_anon:1455304kB inactive_anon:485132kB active_file:260kB inactive_file:608kB unevictable:0kB isolated(anon):128kB isolated(file):0kB mapped:448kB dirty:0kB writeback:184kB shmem:0kB writeback_tmp:0kB unstable:0kB pages_scanned:13641 all_unreclaimable? no DMA free:7748kB min:44kB low:56kB high:68kB active_anon:7944kB inactive_anon:104kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:15992kB managed:15908kB mlocked:0kB slab_reclaimable:0kB slab_unreclaimable:108kB kernel_stack:0kB pagetables:4kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB lowmem_reserve[]: 0 1952 1952 1952 DMA32 free:30128kB min:5628kB low:7624kB high:9620kB active_anon:1447360kB inactive_anon:485028kB active_file:260kB inactive_file:608kB unevictable:0kB writepending:184kB present:2080640kB managed:2030132kB mlocked:0kB slab_reclaimable:11112kB slab_unreclaimable:12172kB kernel_stack:2400kB pagetables:7284kB bounce:0kB free_pcp:924kB local_pcp:72kB free_cma:0kB lowmem_reserve[]: 0 0 0 0 DMA: 7*4kB (UE) 3*8kB (UH) 1*16kB (M) 0*32kB 2*64kB (U) 1*128kB (M) 1*256kB (U) 0*512kB 1*1024kB (U) 1*2048kB (U) 1*4096kB (H) = 7748kB DMA32: 10*4kB (H) 3*8kB (H) 47*16kB (H) 38*32kB (H) 5*64kB (H) 1*128kB (H) 2*256kB (H) 3*512kB (H) 3*1024kB (H) 3*2048kB (H) 4*4096kB (H) = 30128kB 2775 total pagecache pages 2536 pages in swap cache Swap cache stats: add 206786828, delete 206784292, find 7323106/106686077 Free swap = 108744kB Total swap = 255996kB 524158 pages RAM 0 pages HighMem/MovableOnly 12648 pages reserved 0 pages cma reserved 0 pages hwpoisoned During the investigation, I found some problems with highatomic so this patch aims to solve the problems and the final goal is to unreserve every highatomic free pages before the OOM kill. This patch (of 4): In page freeing path, migratetype is racy so that a highorderatomic page could free into non-highorderatomic free list. If that page is allocated, VM can change the pageblock from higorderatomic to something. In that case, highatomic pageblock accounting is broken so it doesn't work(e.g., VM cannot reserve highorderatomic pageblocks any more although it doesn't reach 1% limit). So, this patch prohibits the changing from highatomic to other type. It's no problem because MIGRATE_HIGHATOMIC is not listed in fallback array so stealing will only happen due to unexpected races which is really rare. Also, such prohibiting keeps highatomic pageblock more longer so it would be better for highorderatomic page allocation. Link: http://lkml.kernel.org/r/1476259429-18279-2-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Joonsoo Kim Cc: Sangseok Lee Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6de9440e3ae2..97170131f2ab 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2133,7 +2133,8 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) page = list_first_entry(&area->free_list[fallback_mt], struct page, lru); - if (can_steal) + if (can_steal && + get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC) steal_suitable_fallback(zone, page, start_migratetype); /* Remove the page from the freelists */ @@ -2534,7 +2535,8 @@ int __isolate_free_page(struct page *page, unsigned int order) struct page *endpage = page + (1 << order) - 1; for (; page < endpage; page += pageblock_nr_pages) { int mt = get_pageblock_migratetype(page); - if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)) + if (!is_migrate_isolate(mt) && !is_migrate_cma(mt) + && mt != MIGRATE_HIGHATOMIC) set_pageblock_migratetype(page, MIGRATE_MOVABLE); } From 4855e4a7f29d6d10b0b9c84e189c770c9a94e91e Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 12 Dec 2016 16:42:08 -0800 Subject: [PATCH 031/123] mm: prevent double decrease of nr_reserved_highatomic There is race between page freeing and unreserved highatomic. CPU 0 CPU 1 free_hot_cold_page mt = get_pfnblock_migratetype set_pcppage_migratetype(page, mt) unreserve_highatomic_pageblock spin_lock_irqsave(&zone->lock) move_freepages_block set_pageblock_migratetype(page) spin_unlock_irqrestore(&zone->lock) free_pcppages_bulk __free_one_page(mt) <- mt is stale By above race, a page on CPU 0 could go non-highorderatomic free list since the pageblock's type is changed. By that, unreserve logic of highorderatomic can decrease reserved count on a same pageblock severak times and then it will make mismatch between nr_reserved_highatomic and the number of reserved pageblock. So, this patch verifies whether the pageblock is highatomic or not and decrease the count only if the pageblock is highatomic. Link: http://lkml.kernel.org/r/1476259429-18279-3-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Joonsoo Kim Cc: Sangseok Lee Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 97170131f2ab..8cbc38f923aa 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2085,13 +2085,25 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac) continue; /* - * It should never happen but changes to locking could - * inadvertently allow a per-cpu drain to add pages - * to MIGRATE_HIGHATOMIC while unreserving so be safe - * and watch for underflows. + * In page freeing path, migratetype change is racy so + * we can counter several free pages in a pageblock + * in this loop althoug we changed the pageblock type + * from highatomic to ac->migratetype. So we should + * adjust the count once. */ - zone->nr_reserved_highatomic -= min(pageblock_nr_pages, - zone->nr_reserved_highatomic); + if (get_pageblock_migratetype(page) == + MIGRATE_HIGHATOMIC) { + /* + * It should never happen but changes to + * locking could inadvertently allow a per-cpu + * drain to add pages to MIGRATE_HIGHATOMIC + * while unreserving so be safe and watch for + * underflows. + */ + zone->nr_reserved_highatomic -= min( + pageblock_nr_pages, + zone->nr_reserved_highatomic); + } /* * Convert to ac->migratetype and avoid the normal From 04c8716f7b0075def05dc05646e2408f318167d2 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 12 Dec 2016 16:42:11 -0800 Subject: [PATCH 032/123] mm: try to exhaust highatomic reserve before the OOM I got OOM report from production team with v4.4 kernel. It had enough free memory but failed to allocate GFP_KERNEL order-0 page and finally encountered OOM kill. It occured during QA process which launches several apps, switching and so on. It happned rarely. IOW, In normal situation, it was not a problem but if we are unluck so that several apps uses peak memory at the same time, it can happen. If we manage to pass the phase, the system can go working well. I could reproduce it with my test(memory spike easily. Look at below. The reason is free pages(19M) of DMA32 zone are reserved for HIGHORDERATOMIC and doesn't unreserved before the OOM. balloon invoked oom-killer: gfp_mask=0x24280ca(GFP_HIGHUSER_MOVABLE|__GFP_ZERO), order=0, oom_score_adj=0 balloon cpuset=/ mems_allowed=0 CPU: 1 PID: 8473 Comm: balloon Tainted: G W OE 4.8.0-rc7-00219-g3f74c9559583-dirty #3161 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 Call Trace: dump_stack+0x63/0x90 dump_header+0x5c/0x1ce oom_kill_process+0x22e/0x400 out_of_memory+0x1ac/0x210 __alloc_pages_nodemask+0x101e/0x1040 handle_mm_fault+0xa0a/0xbf0 __do_page_fault+0x1dd/0x4d0 trace_do_page_fault+0x43/0x130 do_async_page_fault+0x1a/0xa0 async_page_fault+0x28/0x30 Mem-Info: active_anon:383949 inactive_anon:106724 isolated_anon:0 active_file:15 inactive_file:44 isolated_file:0 unevictable:0 dirty:0 writeback:24 unstable:0 slab_reclaimable:2483 slab_unreclaimable:3326 mapped:0 shmem:0 pagetables:1906 bounce:0 free:6898 free_pcp:291 free_cma:0 Node 0 active_anon:1535796kB inactive_anon:426896kB active_file:60kB inactive_file:176kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:0kB dirty:0kB writeback:96kB shmem:0kB writeback_tmp:0kB unstable:0kB pages_scanned:1418 all_unreclaimable? no DMA free:8188kB min:44kB low:56kB high:68kB active_anon:7648kB inactive_anon:0kB active_file:0kB inactive_file:4kB unevictable:0kB writepending:0kB present:15992kB managed:15908kB mlocked:0kB slab_reclaimable:0kB slab_unreclaimable:20kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB lowmem_reserve[]: 0 1952 1952 1952 DMA32 free:19404kB min:5628kB low:7624kB high:9620kB active_anon:1528148kB inactive_anon:426896kB active_file:60kB inactive_file:420kB unevictable:0kB writepending:96kB present:2080640kB managed:2030092kB mlocked:0kB slab_reclaimable:9932kB slab_unreclaimable:13284kB kernel_stack:2496kB pagetables:7624kB bounce:0kB free_pcp:900kB local_pcp:112kB free_cma:0kB lowmem_reserve[]: 0 0 0 0 DMA: 0*4kB 0*8kB 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 2*4096kB (H) = 8192kB DMA32: 7*4kB (H) 8*8kB (H) 30*16kB (H) 31*32kB (H) 14*64kB (H) 9*128kB (H) 2*256kB (H) 2*512kB (H) 4*1024kB (H) 5*2048kB (H) 0*4096kB = 19484kB 51131 total pagecache pages 50795 pages in swap cache Swap cache stats: add 3532405601, delete 3532354806, find 124289150/1822712228 Free swap = 8kB Total swap = 255996kB 524158 pages RAM 0 pages HighMem/MovableOnly 12658 pages reserved 0 pages cma reserved 0 pages hwpoisoned Another example exceeded the limit by the race is in:imklog: page allocation failure: order:0, mode:0x2280020(GFP_ATOMIC|__GFP_NOTRACK) CPU: 0 PID: 476 Comm: in:imklog Tainted: G E 4.8.0-rc7-00217-g266ef83c51e5-dirty #3135 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 Call Trace: dump_stack+0x63/0x90 warn_alloc_failed+0xdb/0x130 __alloc_pages_nodemask+0x4d6/0xdb0 new_slab+0x339/0x490 ___slab_alloc.constprop.74+0x367/0x480 __slab_alloc.constprop.73+0x20/0x40 __kmalloc+0x1a4/0x1e0 alloc_indirect.isra.14+0x1d/0x50 virtqueue_add_sgs+0x1c4/0x470 __virtblk_add_req+0xae/0x1f0 virtio_queue_rq+0x12d/0x290 __blk_mq_run_hw_queue+0x239/0x370 blk_mq_run_hw_queue+0x8f/0xb0 blk_mq_insert_requests+0x18c/0x1a0 blk_mq_flush_plug_list+0x125/0x140 blk_flush_plug_list+0xc7/0x220 blk_finish_plug+0x2c/0x40 __do_page_cache_readahead+0x196/0x230 filemap_fault+0x448/0x4f0 ext4_filemap_fault+0x36/0x50 __do_fault+0x75/0x140 handle_mm_fault+0x84d/0xbe0 __do_page_fault+0x1dd/0x4d0 trace_do_page_fault+0x43/0x130 do_async_page_fault+0x1a/0xa0 async_page_fault+0x28/0x30 Mem-Info: active_anon:363826 inactive_anon:121283 isolated_anon:32 active_file:65 inactive_file:152 isolated_file:0 unevictable:0 dirty:0 writeback:46 unstable:0 slab_reclaimable:2778 slab_unreclaimable:3070 mapped:112 shmem:0 pagetables:1822 bounce:0 free:9469 free_pcp:231 free_cma:0 Node 0 active_anon:1455304kB inactive_anon:485132kB active_file:260kB inactive_file:608kB unevictable:0kB isolated(anon):128kB isolated(file):0kB mapped:448kB dirty:0kB writeback:184kB shmem:0kB writeback_tmp:0kB unstable:0kB pages_scanned:13641 all_unreclaimable? no DMA free:7748kB min:44kB low:56kB high:68kB active_anon:7944kB inactive_anon:104kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:15992kB managed:15908kB mlocked:0kB slab_reclaimable:0kB slab_unreclaimable:108kB kernel_stack:0kB pagetables:4kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB lowmem_reserve[]: 0 1952 1952 1952 DMA32 free:30128kB min:5628kB low:7624kB high:9620kB active_anon:1447360kB inactive_anon:485028kB active_file:260kB inactive_file:608kB unevictable:0kB writepending:184kB present:2080640kB managed:2030132kB mlocked:0kB slab_reclaimable:11112kB slab_unreclaimable:12172kB kernel_stack:2400kB pagetables:7284kB bounce:0kB free_pcp:924kB local_pcp:72kB free_cma:0kB lowmem_reserve[]: 0 0 0 0 DMA: 7*4kB (UE) 3*8kB (UH) 1*16kB (M) 0*32kB 2*64kB (U) 1*128kB (M) 1*256kB (U) 0*512kB 1*1024kB (U) 1*2048kB (U) 1*4096kB (H) = 7748kB DMA32: 10*4kB (H) 3*8kB (H) 47*16kB (H) 38*32kB (H) 5*64kB (H) 1*128kB (H) 2*256kB (H) 3*512kB (H) 3*1024kB (H) 3*2048kB (H) 4*4096kB (H) = 30128kB 2775 total pagecache pages 2536 pages in swap cache Swap cache stats: add 206786828, delete 206784292, find 7323106/106686077 Free swap = 108744kB Total swap = 255996kB 524158 pages RAM 0 pages HighMem/MovableOnly 12648 pages reserved 0 pages cma reserved 0 pages hwpoisoned It's weird to show that zone has enough free memory above min watermark but OOMed with 4K GFP_KERNEL allocation due to reserved highatomic pages. As last resort, try to unreserve highatomic pages again and if it has moved pages to non-highatmoc free list, retry reclaim once more. Link: http://lkml.kernel.org/r/1476259429-18279-4-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Joonsoo Kim Cc: Sangseok Lee Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8cbc38f923aa..085de0442dd4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2059,7 +2059,7 @@ out_unlock: * intense memory pressure but failed atomic allocations should be easier * to recover from than an OOM. */ -static void unreserve_highatomic_pageblock(const struct alloc_context *ac) +static bool unreserve_highatomic_pageblock(const struct alloc_context *ac) { struct zonelist *zonelist = ac->zonelist; unsigned long flags; @@ -2067,6 +2067,7 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac) struct zone *zone; struct page *page; int order; + bool ret; for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, ac->nodemask) { @@ -2115,12 +2116,14 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac) * may increase. */ set_pageblock_migratetype(page, ac->migratetype); - move_freepages_block(zone, page, ac->migratetype); + ret = move_freepages_block(zone, page, ac->migratetype); spin_unlock_irqrestore(&zone->lock, flags); - return; + return ret; } spin_unlock_irqrestore(&zone->lock, flags); } + + return false; } /* Remove an element from the buddy allocator from the fallback list */ @@ -3436,8 +3439,10 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, * Make sure we converge to OOM if we cannot make any progress * several times in the row. */ - if (*no_progress_loops > MAX_RECLAIM_RETRIES) - return false; + if (*no_progress_loops > MAX_RECLAIM_RETRIES) { + /* Before OOM, exhaust highatomic_reserve */ + return unreserve_highatomic_pageblock(ac); + } /* * Keep reclaiming pages while there is a chance this will lead From 29fac03bef729ef6f9fba5be56f8554093813c39 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 12 Dec 2016 16:42:14 -0800 Subject: [PATCH 033/123] mm: make unreserve highatomic functions reliable Currently, unreserve_highatomic_pageblock bails out if it found highatomic pageblock regardless of really moving free pages from the one so that it could mitigate unreserve logic's goal which saves OOM of a process. This patch makes unreserve functions bail out only if it moves some pages out of !highatomic free list to avoid such false positive. Another potential problem is that by race between page freeing and reserve highatomic function, pages could be in highatomic free list even though the pageblock is !high atomic migratetype. In that case, unreserve_highatomic_pageblock can be void if count of highatomic reserve is less than pageblock_nr_pages. We could solve it simply via draining all of reserved pages before the OOM. It would have a safeguard role to exhuast reserved pages before converging to OOM. Link: http://lkml.kernel.org/r/1476259429-18279-5-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Joonsoo Kim Cc: Sangseok Lee Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 085de0442dd4..2b69e28706b1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2058,8 +2058,12 @@ out_unlock: * potentially hurts the reliability of high-order allocations when under * intense memory pressure but failed atomic allocations should be easier * to recover from than an OOM. + * + * If @force is true, try to unreserve a pageblock even though highatomic + * pageblock is exhausted. */ -static bool unreserve_highatomic_pageblock(const struct alloc_context *ac) +static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + bool force) { struct zonelist *zonelist = ac->zonelist; unsigned long flags; @@ -2071,8 +2075,12 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac) for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, ac->nodemask) { - /* Preserve at least one pageblock */ - if (zone->nr_reserved_highatomic <= pageblock_nr_pages) + /* + * Preserve at least one pageblock unless memory pressure + * is really high. + */ + if (!force && zone->nr_reserved_highatomic <= + pageblock_nr_pages) continue; spin_lock_irqsave(&zone->lock, flags); @@ -2117,8 +2125,10 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac) */ set_pageblock_migratetype(page, ac->migratetype); ret = move_freepages_block(zone, page, ac->migratetype); - spin_unlock_irqrestore(&zone->lock, flags); - return ret; + if (ret) { + spin_unlock_irqrestore(&zone->lock, flags); + return ret; + } } spin_unlock_irqrestore(&zone->lock, flags); } @@ -3322,7 +3332,7 @@ retry: * Shrink them them and try again */ if (!page && !drained) { - unreserve_highatomic_pageblock(ac); + unreserve_highatomic_pageblock(ac, false); drain_all_pages(NULL); drained = true; goto retry; @@ -3441,7 +3451,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, */ if (*no_progress_loops > MAX_RECLAIM_RETRIES) { /* Before OOM, exhaust highatomic_reserve */ - return unreserve_highatomic_pageblock(ac); + return unreserve_highatomic_pageblock(ac, true); } /* From 3f5000693f80e014fa577b67b93a0de945a4338d Mon Sep 17 00:00:00 2001 From: zijun_hu Date: Mon, 12 Dec 2016 16:42:17 -0800 Subject: [PATCH 034/123] mm/vmalloc.c: simplify /proc/vmallocinfo implementation Many seq_file helpers exist for simplifying implementation of virtual files especially, for /proc nodes. however, the helpers for iteration over list_head are available but aren't adopted to implement /proc/vmallocinfo currently. Simplify /proc/vmallocinfo implementation by using existing seq_file helpers. Link: http://lkml.kernel.org/r/57FDF2E5.1000201@zoho.com Signed-off-by: zijun_hu Acked-by: Michal Hocko Cc: David Rientjes Cc: Tejun Heo Cc: Ingo Molnar Cc: Joonsoo Kim Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 27 +++++---------------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f2481cb4e6b2..e73948afac70 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2574,32 +2574,13 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) static void *s_start(struct seq_file *m, loff_t *pos) __acquires(&vmap_area_lock) { - loff_t n = *pos; - struct vmap_area *va; - spin_lock(&vmap_area_lock); - va = list_first_entry(&vmap_area_list, typeof(*va), list); - while (n > 0 && &va->list != &vmap_area_list) { - n--; - va = list_next_entry(va, list); - } - if (!n && &va->list != &vmap_area_list) - return va; - - return NULL; - + return seq_list_start(&vmap_area_list, *pos); } static void *s_next(struct seq_file *m, void *p, loff_t *pos) { - struct vmap_area *va = p, *next; - - ++*pos; - next = list_next_entry(va, list); - if (&next->list != &vmap_area_list) - return next; - - return NULL; + return seq_list_next(p, &vmap_area_list, pos); } static void s_stop(struct seq_file *m, void *p) @@ -2634,9 +2615,11 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) static int s_show(struct seq_file *m, void *p) { - struct vmap_area *va = p; + struct vmap_area *va; struct vm_struct *v; + va = list_entry(p, struct vmap_area, list); + /* * s_show can encounter race with remove_vm_area, !VM_VM_AREA on * behalf of vmap area is being tear down or vm_map_ram allocation. From fd60775aea802beef444881ddfa111a4b73b1bbc Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 12 Dec 2016 16:42:20 -0800 Subject: [PATCH 035/123] mm, thp: avoid unlikely branches for split_huge_pmd While doing MADV_DONTNEED on a large area of thp memory, I noticed we encountered many unlikely() branches in profiles for each backing hugepage. This is because zap_pmd_range() would call split_huge_pmd(), which rechecked the conditions that were already validated, but as part of an unlikely() branch. Avoid the unlikely() branch when in a context where pmd is known to be good for __split_huge_pmd() directly. Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1610181600300.84525@chino.kir.corp.google.com Signed-off-by: David Rientjes Acked-by: Vlastimil Babka Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 2 ++ mm/memory.c | 4 ++-- mm/mempolicy.c | 2 +- mm/mprotect.c | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index e35e6de633b9..1f782aa1d8e6 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -189,6 +189,8 @@ static inline void deferred_split_huge_page(struct page *page) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) +static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long address, bool freeze, struct page *page) {} static inline void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct page *page) {} diff --git a/mm/memory.c b/mm/memory.c index 33f45edf8272..d86b7b4afd7d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1240,7 +1240,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, if (next - addr != HPAGE_PMD_SIZE) { VM_BUG_ON_VMA(vma_is_anonymous(vma) && !rwsem_is_locked(&tlb->mm->mmap_sem), vma); - split_huge_pmd(vma, pmd, addr); + __split_huge_pmd(vma, pmd, addr, false, NULL); } else if (zap_huge_pmd(tlb, vma, pmd, addr)) goto next; /* fall through */ @@ -3454,7 +3454,7 @@ static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd) /* COW handled on pte level: split pmd */ VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma); - split_huge_pmd(fe->vma, fe->pmd, fe->address); + __split_huge_pmd(fe->vma, fe->pmd, fe->address, false, NULL); return VM_FAULT_FALLBACK; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0b859af06b87..a6a27e5d6b14 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -496,7 +496,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, page = pmd_page(*pmd); if (is_huge_zero_page(page)) { spin_unlock(ptl); - split_huge_pmd(vma, pmd, addr); + __split_huge_pmd(vma, pmd, addr, false, NULL); } else { get_page(page); spin_unlock(ptl); diff --git a/mm/mprotect.c b/mm/mprotect.c index 05a02b72c98d..c5ba2aae0f54 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -176,7 +176,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { - split_huge_pmd(vma, pmd, addr); + __split_huge_pmd(vma, pmd, addr, false, NULL); if (pmd_trans_unstable(pmd)) continue; } else { From 6d8409580bee356ce418dcb94260b24dda639934 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 12 Dec 2016 16:42:23 -0800 Subject: [PATCH 036/123] mm, mempolicy: clean up __GFP_THISNODE confusion in policy_zonelist __GFP_THISNODE is documented to enforce the allocation to be satisified from the requested node with no fallbacks or placement policy enforcements. policy_zonelist seemingly breaks this semantic if the current policy is MPOL_MBIND and instead of taking the node it will fallback to the first node in the mask if the requested one is not in the mask. This is confusing to say the least because it fact we shouldn't ever go that path. First tasks shouldn't be scheduled on CPUs with nodes outside of their mempolicy binding. And secondly policy_zonelist is called only from 3 places: - huge_zonelist - never should do __GFP_THISNODE when going this path - alloc_pages_vma - which shouldn't depend on __GFP_THISNODE either - alloc_pages_current - which uses default_policy id __GFP_THISNODE is used So we shouldn't even need to care about this possibility and can drop the confusing code. Let's keep a WARN_ON_ONCE in place to catch potential users and fix them up properly (aka use a different allocation function which ignores mempolicy). [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20161013125958.32155-1-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: David Rientjes Cc: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a6a27e5d6b14..4d58021dba34 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1679,25 +1679,17 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, int nd) { - switch (policy->mode) { - case MPOL_PREFERRED: - if (!(policy->flags & MPOL_F_LOCAL)) - nd = policy->v.preferred_node; - break; - case MPOL_BIND: + if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL)) + nd = policy->v.preferred_node; + else { /* - * Normally, MPOL_BIND allocations are node-local within the - * allowed nodemask. However, if __GFP_THISNODE is set and the - * current node isn't part of the mask, we use the zonelist for - * the first node in the mask instead. + * __GFP_THISNODE shouldn't even be used with the bind policy + * because we might easily break the expectation to stay on the + * requested node and not break the policy. */ - if (unlikely(gfp & __GFP_THISNODE) && - unlikely(!node_isset(nd, policy->v.nodes))) - nd = first_node(policy->v.nodes); - break; - default: - BUG(); + WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE)); } + return node_zonelist(nd, gfp); } From 6afcf8ef0ca0a69d014f8edb613d94821f0ae700 Mon Sep 17 00:00:00 2001 From: Ming Ling Date: Mon, 12 Dec 2016 16:42:26 -0800 Subject: [PATCH 037/123] mm, compaction: fix NR_ISOLATED_* stats for pfn based migration Since commit bda807d44454 ("mm: migrate: support non-lru movable page migration") isolate_migratepages_block) can isolate !PageLRU pages which would acct_isolated account as NR_ISOLATED_*. Accounting these non-lru pages NR_ISOLATED_{ANON,FILE} doesn't make any sense and it can misguide heuristics based on those counters such as pgdat_reclaimable_pages resp. too_many_isolated which would lead to unexpected stalls during the direct reclaim without any good reason. Note that __alloc_contig_migrate_range can isolate a lot of pages at once. On mobile devices such as 512M ram android Phone, it may use a big zram swap. In some cases zram(zsmalloc) uses too many non-lru but migratedable pages, such as: MemTotal: 468148 kB Normal free:5620kB Free swap:4736kB Total swap:409596kB ZRAM: 164616kB(zsmalloc non-lru pages) active_anon:60700kB inactive_anon:60744kB active_file:34420kB inactive_file:37532kB Fix this by only accounting lru pages to NR_ISOLATED_* in isolate_migratepages_block right after they were isolated and we still know they were on LRU. Drop acct_isolated because it is called after the fact and we've lost that information. Batching per-cpu counter doesn't make much improvement anyway. Also make sure that we uncharge only LRU pages when putting them back on the LRU in putback_movable_pages resp. when unmap_and_move migrates the page. [mhocko@suse.com: replace acct_isolated() with direct counting] Fixes: bda807d44454 ("mm: migrate: support non-lru movable page migration") Link: http://lkml.kernel.org/r/20161019080240.9682-1-mhocko@kernel.org Signed-off-by: Ming Ling Signed-off-by: Michal Hocko Acked-by: Minchan Kim Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 25 +++---------------------- mm/migrate.c | 15 +++++++++++---- 2 files changed, 14 insertions(+), 26 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 0409a4ad6ea1..70e6bec46dc2 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -634,22 +634,6 @@ isolate_freepages_range(struct compact_control *cc, return pfn; } -/* Update the number of anon and file isolated pages in the zone */ -static void acct_isolated(struct zone *zone, struct compact_control *cc) -{ - struct page *page; - unsigned int count[2] = { 0, }; - - if (list_empty(&cc->migratepages)) - return; - - list_for_each_entry(page, &cc->migratepages, lru) - count[!!page_is_file_cache(page)]++; - - mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, count[0]); - mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, count[1]); -} - /* Similar to reclaim, but different enough that they don't share logic */ static bool too_many_isolated(struct zone *zone) { @@ -866,6 +850,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* Successfully isolated */ del_page_from_lru_list(page, lruvec, page_lru(page)); + inc_node_page_state(page, + NR_ISOLATED_ANON + page_is_file_cache(page)); isolate_success: list_add(&page->lru, &cc->migratepages); @@ -902,7 +888,6 @@ isolate_fail: spin_unlock_irqrestore(zone_lru_lock(zone), flags); locked = false; } - acct_isolated(zone, cc); putback_movable_pages(&cc->migratepages); cc->nr_migratepages = 0; cc->last_migrated_pfn = 0; @@ -988,7 +973,6 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) break; } - acct_isolated(cc->zone, cc); return pfn; } @@ -1258,10 +1242,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, low_pfn = isolate_migratepages_block(cc, low_pfn, block_end_pfn, isolate_mode); - if (!low_pfn || cc->contended) { - acct_isolated(zone, cc); + if (!low_pfn || cc->contended) return ISOLATE_ABORT; - } /* * Either we isolated something and proceed with migration. Or @@ -1271,7 +1253,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, break; } - acct_isolated(zone, cc); /* Record where migration scanner will be restarted. */ cc->migrate_pfn = low_pfn; diff --git a/mm/migrate.c b/mm/migrate.c index 99250aee1ac1..66ce6b490b13 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -168,8 +168,6 @@ void putback_movable_pages(struct list_head *l) continue; } list_del(&page->lru); - dec_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); /* * We isolated non-lru movable page so here we can use * __PageMovable because LRU page's mapping cannot have @@ -186,6 +184,8 @@ void putback_movable_pages(struct list_head *l) put_page(page); } else { putback_lru_page(page); + dec_node_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); } } } @@ -1121,8 +1121,15 @@ out: * restored. */ list_del(&page->lru); - dec_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + + /* + * Compaction can migrate also non-LRU pages which are + * not accounted to NR_ISOLATED_*. They can be recognized + * as __PageMovable + */ + if (likely(!__PageMovable(page))) + dec_node_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); } /* From 23f919d4ad0eb325595f10f55be4301b2965d6d6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 12 Dec 2016 16:42:28 -0800 Subject: [PATCH 038/123] shmem: avoid maybe-uninitialized warning After enabling -Wmaybe-uninitialized warnings, we get a false-postive warning for shmem: mm/shmem.c: In function `shmem_getpage_gfp': include/linux/spinlock.h:332:21: error: `info' may be used uninitialized in this function [-Werror=maybe-uninitialized] This can be easily avoided, since the correct 'info' pointer is known at the time we first enter the function, so we can simply move the initialization up. Moving it before the first label avoids the warning and lets us remove two later initializations. Note that the function is so hard to read that it not only confuses the compiler, but also most readers and without this patch it could\ easily break if one of the 'goto's changed. Link: https://www.spinics.net/lists/kernel/msg2368133.html Link: http://lkml.kernel.org/r/20161024205725.786455-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Acked-by: Michal Hocko Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Cc: Hugh Dickins Cc: Andreas Gruenbacher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 9d32e1cb9f38..ba0d7644ee20 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1539,7 +1539,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct mm_struct *fault_mm, int *fault_type) { struct address_space *mapping = inode->i_mapping; - struct shmem_inode_info *info; + struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo; struct mm_struct *charge_mm; struct mem_cgroup *memcg; @@ -1589,7 +1589,6 @@ repeat: * Fast cache lookup did not find it: * bring it back from swap or allocate. */ - info = SHMEM_I(inode); sbinfo = SHMEM_SB(inode->i_sb); charge_mm = fault_mm ? : current->mm; @@ -1837,7 +1836,6 @@ unlock: put_page(page); } if (error == -ENOSPC && !once++) { - info = SHMEM_I(inode); spin_lock_irq(&info->lock); shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); From c0f2e176f87bd989835bd098a52779df41a9243c Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 12 Dec 2016 16:42:31 -0800 Subject: [PATCH 039/123] mm: use the correct page size when removing the page We are removing a pmd hugepage here. Use the correct page size. Link: http://lkml.kernel.org/r/20161026084839.27299-2-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: "Kirill A. Shutemov" Cc: Dan Williams Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f8e35cc66d32..0103728ffa94 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1399,12 +1399,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (vma_is_dax(vma)) { spin_unlock(ptl); if (is_huge_zero_pmd(orig_pmd)) - tlb_remove_page(tlb, pmd_page(orig_pmd)); + tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); } else if (is_huge_zero_pmd(orig_pmd)) { pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd)); atomic_long_dec(&tlb->mm->nr_ptes); spin_unlock(ptl); - tlb_remove_page(tlb, pmd_page(orig_pmd)); + tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); } else { struct page *page = pmd_page(orig_pmd); page_remove_rmap(page, true); From b5bc66b713108710e341bb164f8ffbc11896706e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 12 Dec 2016 16:42:34 -0800 Subject: [PATCH 040/123] mm: update mmu_gather range correctly We use __tlb_adjust_range to update range convered by mmu_gather struct. We later use the 'start' and 'end' to do a mmu_notifier_invalidate_range in tlb_flush_mmu_tlbonly(). Update the 'end' correctly in __tlb_adjust_range so that we call mmu_notifier_invalidate_range with the correct range values. Wrt tlbflush, this should not have any impact, because a flush with correct start address will flush tlb mapping for the range. Also add comment w.r.t updating the range when we free pagetable pages. For now we don't support a range based page table cache flush. Link: http://lkml.kernel.org/r/20161026084839.27299-3-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: "Kirill A. Shutemov" Cc: Dan Williams Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/tlb.h | 43 ++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index c6d667187608..dba727becd5f 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -125,10 +125,11 @@ extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size); static inline void __tlb_adjust_range(struct mmu_gather *tlb, - unsigned long address) + unsigned long address, + unsigned int range_size) { tlb->start = min(tlb->start, address); - tlb->end = max(tlb->end, address + PAGE_SIZE); + tlb->end = max(tlb->end, address + range_size); /* * Track the last address with which we adjusted the range. This * will be used later to adjust again after a mmu_flush due to @@ -153,7 +154,7 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb, if (__tlb_remove_page_size(tlb, page, page_size)) { tlb_flush_mmu(tlb); tlb->page_size = page_size; - __tlb_adjust_range(tlb, tlb->addr); + __tlb_adjust_range(tlb, tlb->addr, page_size); __tlb_remove_page_size(tlb, page, page_size); } } @@ -177,7 +178,7 @@ static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, struct page *pa /* active->nr should be zero when we call this */ VM_BUG_ON_PAGE(tlb->active->nr, page); tlb->page_size = PAGE_SIZE; - __tlb_adjust_range(tlb, tlb->addr); + __tlb_adjust_range(tlb, tlb->addr, PAGE_SIZE); return __tlb_remove_page(tlb, page); } @@ -215,7 +216,7 @@ static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, struct page *pa */ #define tlb_remove_tlb_entry(tlb, ptep, address) \ do { \ - __tlb_adjust_range(tlb, address); \ + __tlb_adjust_range(tlb, address, PAGE_SIZE); \ __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) @@ -227,29 +228,47 @@ static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, struct page *pa #define __tlb_remove_pmd_tlb_entry(tlb, pmdp, address) do {} while (0) #endif -#define tlb_remove_pmd_tlb_entry(tlb, pmdp, address) \ - do { \ - __tlb_adjust_range(tlb, address); \ - __tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \ +#define tlb_remove_pmd_tlb_entry(tlb, pmdp, address) \ + do { \ + __tlb_adjust_range(tlb, address, HPAGE_PMD_SIZE); \ + __tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \ } while (0) +/* + * For things like page tables caches (ie caching addresses "inside" the + * page tables, like x86 does), for legacy reasons, flushing an + * individual page had better flush the page table caches behind it. This + * is definitely how x86 works, for example. And if you have an + * architected non-legacy page table cache (which I'm not aware of + * anybody actually doing), you're going to have some architecturally + * explicit flushing for that, likely *separate* from a regular TLB entry + * flush, and thus you'd need more than just some range expansion.. + * + * So if we ever find an architecture + * that would want something that odd, I think it is up to that + * architecture to do its own odd thing, not cause pain for others + * http://lkml.kernel.org/r/CA+55aFzBggoXtNXQeng5d_mRoDnaMBE5Y+URs+PHR67nUpMtaw@mail.gmail.com + * + * For now w.r.t page table cache, mark the range_size as PAGE_SIZE + */ + #define pte_free_tlb(tlb, ptep, address) \ do { \ - __tlb_adjust_range(tlb, address); \ + __tlb_adjust_range(tlb, address, PAGE_SIZE); \ __pte_free_tlb(tlb, ptep, address); \ } while (0) #ifndef __ARCH_HAS_4LEVEL_HACK #define pud_free_tlb(tlb, pudp, address) \ do { \ - __tlb_adjust_range(tlb, address); \ + __tlb_adjust_range(tlb, address, PAGE_SIZE); \ __pud_free_tlb(tlb, pudp, address); \ } while (0) #endif #define pmd_free_tlb(tlb, pmdp, address) \ do { \ - __tlb_adjust_range(tlb, address); \ + __tlb_adjust_range(tlb, address, PAGE_SIZE); \ __pmd_free_tlb(tlb, pmdp, address); \ } while (0) From b528e4b6405b9fd656a6a308a7e2aa6afa50e77d Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 12 Dec 2016 16:42:37 -0800 Subject: [PATCH 041/123] mm/hugetlb: add tlb_remove_hugetlb_entry for handling hugetlb pages This add tlb_remove_hugetlb_entry similar to tlb_remove_pmd_tlb_entry. Link: http://lkml.kernel.org/r/20161026084839.27299-4-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: "Kirill A. Shutemov" Cc: Dan Williams Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/tlb.h | 2 ++ arch/ia64/include/asm/tlb.h | 3 +++ arch/s390/include/asm/tlb.h | 2 ++ arch/sh/include/asm/tlb.h | 3 +++ arch/um/include/asm/tlb.h | 3 +++ include/asm-generic/tlb.h | 6 ++++++ mm/hugetlb.c | 2 +- 7 files changed, 20 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index 1e25cd80589e..82841ba1f51f 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -186,6 +186,8 @@ tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long addr) tlb_add_flush(tlb, addr); } +#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ + tlb_remove_tlb_entry(tlb, ptep, address) /* * In the case of tlb vma handling, we can optimise these away in the * case where we're doing a full MM flush. When we're doing a munmap, diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h index 77e541cf0e5d..b3f369ab844d 100644 --- a/arch/ia64/include/asm/tlb.h +++ b/arch/ia64/include/asm/tlb.h @@ -283,6 +283,9 @@ do { \ __tlb_remove_tlb_entry(tlb, ptep, addr); \ } while (0) +#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ + tlb_remove_tlb_entry(tlb, ptep, address) + #define pte_free_tlb(tlb, ptep, address) \ do { \ tlb->need_flush = 1; \ diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 15711de10403..094440b59f9e 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -162,5 +162,7 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, #define tlb_remove_tlb_entry(tlb, ptep, addr) do { } while (0) #define tlb_remove_pmd_tlb_entry(tlb, pmdp, addr) do { } while (0) #define tlb_migrate_finish(mm) do { } while (0) +#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ + tlb_remove_tlb_entry(tlb, ptep, address) #endif /* _S390_TLB_H */ diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h index 025cdb1032f6..e7d15e8c75c1 100644 --- a/arch/sh/include/asm/tlb.h +++ b/arch/sh/include/asm/tlb.h @@ -65,6 +65,9 @@ tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long address) tlb->end = address + PAGE_SIZE; } +#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ + tlb_remove_tlb_entry(tlb, ptep, address) + /* * In the case of tlb vma handling, we can optimise these away in the * case where we're doing a full MM flush. When we're doing a munmap, diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h index 821ff0acfe17..a4427029c3c8 100644 --- a/arch/um/include/asm/tlb.h +++ b/arch/um/include/asm/tlb.h @@ -141,6 +141,9 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb, __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) +#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ + tlb_remove_tlb_entry(tlb, ptep, address) + #define pte_free_tlb(tlb, ptep, addr) __pte_free_tlb(tlb, ptep, addr) #define pud_free_tlb(tlb, pudp, addr) __pud_free_tlb(tlb, pudp, addr) diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index dba727becd5f..38c2b708df6e 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -220,6 +220,12 @@ static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, struct page *pa __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) +#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ + do { \ + __tlb_adjust_range(tlb, address, huge_page_size(h)); \ + __tlb_remove_tlb_entry(tlb, ptep, address); \ + } while (0) + /** * tlb_remove_pmd_tlb_entry - remember a pmd mapping for later tlb invalidation * This is a nop so far, because only x86 needs it. diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c12296f62e8d..8e519da7242d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3336,7 +3336,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, } pte = huge_ptep_get_and_clear(mm, address, ptep); - tlb_remove_tlb_entry(tlb, ptep, address); + tlb_remove_huge_tlb_entry(h, tlb, ptep, address); if (huge_pte_dirty(pte)) set_page_dirty(page); From 07e326610e5634e5038fce32fff370949eb42101 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 12 Dec 2016 16:42:40 -0800 Subject: [PATCH 042/123] mm: add tlb_remove_check_page_size_change to track page size change With commit e77b0852b551 ("mm/mmu_gather: track page size with mmu gather and force flush if page size change") we added the ability to force a tlb flush when the page size change in a mmu_gather loop. We did that by checking for a page size change every time we added a page to mmu_gather for lazy flush/remove. We can improve that by moving the page size change check early and not doing it every time we add a page. This also helps us to do tlb flush when invalidating a range covering dax mapping. Wrt dax mapping we don't have a backing struct page and hence we don't call tlb_remove_page, which earlier forced the tlb flush on page size change. Moving the page size change check earlier means we will do the same even for dax mapping. We also avoid doing this check on architecture other than powerpc. In a later patch we will remove page size check from tlb_remove_page(). Link: http://lkml.kernel.org/r/20161026084839.27299-5-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: "Kirill A. Shutemov" Cc: Dan Williams Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/tlb.h | 6 ++++++ arch/ia64/include/asm/tlb.h | 6 ++++++ arch/powerpc/include/asm/tlb.h | 16 ++++++++++++++++ arch/s390/include/asm/tlb.h | 6 ++++++ arch/sh/include/asm/tlb.h | 6 ++++++ arch/um/include/asm/tlb.h | 6 ++++++ include/asm-generic/tlb.h | 16 ++++++++++++++++ mm/huge_memory.c | 4 ++++ mm/hugetlb.c | 5 +++++ mm/madvise.c | 1 + mm/memory.c | 7 ++++++- 11 files changed, 78 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index 82841ba1f51f..a9d6de4746ea 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -286,5 +286,11 @@ tlb_remove_pmd_tlb_entry(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr #define tlb_migrate_finish(mm) do { } while (0) +#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change +static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, + unsigned int page_size) +{ +} + #endif /* CONFIG_MMU */ #endif diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h index b3f369ab844d..bfe6295aa746 100644 --- a/arch/ia64/include/asm/tlb.h +++ b/arch/ia64/include/asm/tlb.h @@ -286,6 +286,12 @@ do { \ #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ tlb_remove_tlb_entry(tlb, ptep, address) +#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change +static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, + unsigned int page_size) +{ +} + #define pte_free_tlb(tlb, ptep, address) \ do { \ tlb->need_flush = 1; \ diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h index 99e1397b71da..609557569f65 100644 --- a/arch/powerpc/include/asm/tlb.h +++ b/arch/powerpc/include/asm/tlb.h @@ -28,6 +28,7 @@ #define tlb_start_vma(tlb, vma) do { } while (0) #define tlb_end_vma(tlb, vma) do { } while (0) #define __tlb_remove_tlb_entry __tlb_remove_tlb_entry +#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change extern void tlb_flush(struct mmu_gather *tlb); @@ -46,6 +47,21 @@ static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, #endif } +static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, + unsigned int page_size) +{ + if (!tlb->page_size) + tlb->page_size = page_size; + else if (tlb->page_size != page_size) { + tlb_flush_mmu(tlb); + /* + * update the page size after flush for the new + * mmu_gather. + */ + tlb->page_size = page_size; + } +} + #ifdef CONFIG_SMP static inline int mm_is_core_local(struct mm_struct *mm) { diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 094440b59f9e..28b159c87c38 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -165,4 +165,10 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ tlb_remove_tlb_entry(tlb, ptep, address) +#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change +static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, + unsigned int page_size) +{ +} + #endif /* _S390_TLB_H */ diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h index e7d15e8c75c1..0f988b3e484b 100644 --- a/arch/sh/include/asm/tlb.h +++ b/arch/sh/include/asm/tlb.h @@ -130,6 +130,12 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb, return tlb_remove_page(tlb, page); } +#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change +static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, + unsigned int page_size) +{ +} + #define pte_free_tlb(tlb, ptep, addr) pte_free((tlb)->mm, ptep) #define pmd_free_tlb(tlb, pmdp, addr) pmd_free((tlb)->mm, pmdp) #define pud_free_tlb(tlb, pudp, addr) pud_free((tlb)->mm, pudp) diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h index a4427029c3c8..8258dd4bb13c 100644 --- a/arch/um/include/asm/tlb.h +++ b/arch/um/include/asm/tlb.h @@ -144,6 +144,12 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb, #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ tlb_remove_tlb_entry(tlb, ptep, address) +#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change +static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, + unsigned int page_size) +{ +} + #define pte_free_tlb(tlb, ptep, addr) __pte_free_tlb(tlb, ptep, addr) #define pud_free_tlb(tlb, pudp, addr) __pud_free_tlb(tlb, pudp, addr) diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 38c2b708df6e..256c9de71fdb 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -182,6 +182,22 @@ static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, struct page *pa return __tlb_remove_page(tlb, page); } +#ifndef tlb_remove_check_page_size_change +#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change +static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, + unsigned int page_size) +{ + /* + * We don't care about page size change, just update + * mmu_gather page size here so that debug checks + * doesn't throw false warning. + */ +#ifdef CONFIG_DEBUG_VM + tlb->page_size = page_size; +#endif +} +#endif + /* * In the case of tlb vma handling, we can optimise these away in the * case where we're doing a full MM flush. When we're doing a munmap, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0103728ffa94..26fd1161ca85 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1323,6 +1323,8 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, struct mm_struct *mm = tlb->mm; bool ret = false; + tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE); + ptl = pmd_trans_huge_lock(pmd, vma); if (!ptl) goto out_unlocked; @@ -1384,6 +1386,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t orig_pmd; spinlock_t *ptl; + tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE); + ptl = __pmd_trans_huge_lock(pmd, vma); if (!ptl) return 0; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8e519da7242d..3edb759c5c7d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3286,6 +3286,11 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, BUG_ON(start & ~huge_page_mask(h)); BUG_ON(end & ~huge_page_mask(h)); + /* + * This is a hugetlb vma, all the pte entries should point + * to huge page. + */ + tlb_remove_check_page_size_change(tlb, sz); tlb_start_vma(tlb, vma); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); address = start; diff --git a/mm/madvise.c b/mm/madvise.c index 93fb63e88b5e..0e3828eae9f8 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -281,6 +281,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (pmd_trans_unstable(pmd)) return 0; + tlb_remove_check_page_size_change(tlb, PAGE_SIZE); orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); arch_enter_lazy_mmu_mode(); for (; addr != end; pte++, addr += PAGE_SIZE) { diff --git a/mm/memory.c b/mm/memory.c index d86b7b4afd7d..eae20eb66bfc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -528,7 +528,11 @@ void free_pgd_range(struct mmu_gather *tlb, end -= PMD_SIZE; if (addr > end - 1) return; - + /* + * We add page table cache pages with PAGE_SIZE, + * (see pte_free_tlb()), flush the tlb if we need + */ + tlb_remove_check_page_size_change(tlb, PAGE_SIZE); pgd = pgd_offset(tlb->mm, addr); do { next = pgd_addr_end(addr, end); @@ -1120,6 +1124,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, swp_entry_t entry; struct page *pending_page = NULL; + tlb_remove_check_page_size_change(tlb, PAGE_SIZE); again: init_rss_vec(rss); start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); From 692a68c1544d6be4ba7c6e929e9c7b2ba0447b91 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 12 Dec 2016 16:42:43 -0800 Subject: [PATCH 043/123] mm: remove the page size change check in tlb_remove_page Now that we check for page size change early in the loop, we can partially revert e9d55e157034a ("mm: change the interface for __tlb_remove_page"). This simplies the code much, by removing the need to track the last address with which we adjusted the range. We also go back to the older way of filling the mmu_gather array, ie, we add an entry and then check whether the gather batch is full. Link: http://lkml.kernel.org/r/20161026084839.27299-6-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: "Kirill A. Shutemov" Cc: Dan Williams Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/tlb.h | 13 +++---------- arch/ia64/include/asm/tlb.h | 16 ++++------------ arch/s390/include/asm/tlb.h | 6 ------ arch/sh/include/asm/tlb.h | 6 ------ arch/um/include/asm/tlb.h | 6 ------ include/asm-generic/tlb.h | 28 ++-------------------------- mm/memory.c | 21 ++++++--------------- 7 files changed, 15 insertions(+), 81 deletions(-) diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index a9d6de4746ea..3f2eb76243e3 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -213,18 +213,17 @@ tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) { + tlb->pages[tlb->nr++] = page; + VM_WARN_ON(tlb->nr > tlb->max); if (tlb->nr == tlb->max) return true; - tlb->pages[tlb->nr++] = page; return false; } static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) { - if (__tlb_remove_page(tlb, page)) { + if (__tlb_remove_page(tlb, page)) tlb_flush_mmu(tlb); - __tlb_remove_page(tlb, page); - } } static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, @@ -233,12 +232,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, return __tlb_remove_page(tlb, page); } -static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, - struct page *page) -{ - return __tlb_remove_page(tlb, page); -} - static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h index bfe6295aa746..fced197b9626 100644 --- a/arch/ia64/include/asm/tlb.h +++ b/arch/ia64/include/asm/tlb.h @@ -207,15 +207,15 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) */ static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) { - if (tlb->nr == tlb->max) - return true; - tlb->need_flush = 1; if (!tlb->nr && tlb->pages == tlb->local) __tlb_alloc_page(tlb); tlb->pages[tlb->nr++] = page; + VM_WARN_ON(tlb->nr > tlb->max); + if (tlb->nr == tlb->max) + return true; return false; } @@ -236,10 +236,8 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb) static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) { - if (__tlb_remove_page(tlb, page)) { + if (__tlb_remove_page(tlb, page)) tlb_flush_mmu(tlb); - __tlb_remove_page(tlb, page); - } } static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, @@ -248,12 +246,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, return __tlb_remove_page(tlb, page); } -static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, - struct page *page) -{ - return __tlb_remove_page(tlb, page); -} - static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 28b159c87c38..853b2a3d8dee 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -104,12 +104,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, return __tlb_remove_page(tlb, page); } -static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, - struct page *page) -{ - return __tlb_remove_page(tlb, page); -} - static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h index 0f988b3e484b..46e0d635e36f 100644 --- a/arch/sh/include/asm/tlb.h +++ b/arch/sh/include/asm/tlb.h @@ -118,12 +118,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, return __tlb_remove_page(tlb, page); } -static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, - struct page *page) -{ - return __tlb_remove_page(tlb, page); -} - static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h index 8258dd4bb13c..600a2e9bfee2 100644 --- a/arch/um/include/asm/tlb.h +++ b/arch/um/include/asm/tlb.h @@ -116,12 +116,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, return __tlb_remove_page(tlb, page); } -static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, - struct page *page) -{ - return __tlb_remove_page(tlb, page); -} - static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 256c9de71fdb..7eed8cf3130a 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -107,11 +107,6 @@ struct mmu_gather { struct mmu_gather_batch local; struct page *__pages[MMU_GATHER_BUNDLE]; unsigned int batch_count; - /* - * __tlb_adjust_range will track the new addr here, - * that that we can adjust the range after the flush - */ - unsigned long addr; int page_size; }; @@ -130,12 +125,6 @@ static inline void __tlb_adjust_range(struct mmu_gather *tlb, { tlb->start = min(tlb->start, address); tlb->end = max(tlb->end, address + range_size); - /* - * Track the last address with which we adjusted the range. This - * will be used later to adjust again after a mmu_flush due to - * failed __tlb_remove_page - */ - tlb->addr = address; } static inline void __tlb_reset_range(struct mmu_gather *tlb) @@ -151,15 +140,11 @@ static inline void __tlb_reset_range(struct mmu_gather *tlb) static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { - if (__tlb_remove_page_size(tlb, page, page_size)) { + if (__tlb_remove_page_size(tlb, page, page_size)) tlb_flush_mmu(tlb); - tlb->page_size = page_size; - __tlb_adjust_range(tlb, tlb->addr, page_size); - __tlb_remove_page_size(tlb, page, page_size); - } } -static bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) +static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) { return __tlb_remove_page_size(tlb, page, PAGE_SIZE); } @@ -173,15 +158,6 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) return tlb_remove_page_size(tlb, page, PAGE_SIZE); } -static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, struct page *page) -{ - /* active->nr should be zero when we call this */ - VM_BUG_ON_PAGE(tlb->active->nr, page); - tlb->page_size = PAGE_SIZE; - __tlb_adjust_range(tlb, tlb->addr, PAGE_SIZE); - return __tlb_remove_page(tlb, page); -} - #ifndef tlb_remove_check_page_size_change #define tlb_remove_check_page_size_change tlb_remove_check_page_size_change static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, diff --git a/mm/memory.c b/mm/memory.c index eae20eb66bfc..0a72f821ccdc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -300,15 +300,14 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ struct mmu_gather_batch *batch; VM_BUG_ON(!tlb->end); - - if (!tlb->page_size) - tlb->page_size = page_size; - else { - if (page_size != tlb->page_size) - return true; - } + VM_WARN_ON(tlb->page_size != page_size); batch = tlb->active; + /* + * Add the page and check if we are full. If so + * force a flush. + */ + batch->pages[batch->nr++] = page; if (batch->nr == batch->max) { if (!tlb_next_batch(tlb)) return true; @@ -316,7 +315,6 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ } VM_BUG_ON_PAGE(batch->nr > batch->max, page); - batch->pages[batch->nr++] = page; return false; } @@ -1122,7 +1120,6 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, pte_t *start_pte; pte_t *pte; swp_entry_t entry; - struct page *pending_page = NULL; tlb_remove_check_page_size_change(tlb, PAGE_SIZE); again: @@ -1177,7 +1174,6 @@ again: print_bad_pte(vma, addr, ptent, page); if (unlikely(__tlb_remove_page(tlb, page))) { force_flush = 1; - pending_page = page; addr += PAGE_SIZE; break; } @@ -1218,11 +1214,6 @@ again: if (force_flush) { force_flush = 0; tlb_flush_mmu_free(tlb); - if (pending_page) { - /* remove the page with new size */ - __tlb_remove_pte_page(tlb, pending_page); - pending_page = NULL; - } if (addr != end) goto again; } From 80a7951627712180ed2575a20e1e442b851fc27c Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 12 Dec 2016 16:42:46 -0800 Subject: [PATCH 044/123] mm: fix up get_user_pages* comments In the previous round of get_user_pages* changes comments attached to __get_user_pages_unlocked() and get_user_pages_unlocked() were rendered incorrect, this patch corrects them. In addition the get_user_pages_unlocked() comment seems to have already been outdated as it referred to tsk, mm parameters which were removed in c12d2da5 ("mm/gup: Remove the macro overload API migration helpers from the get_user*() APIs"), this patch fixes this also. Link: http://lkml.kernel.org/r/20161025233435.5338-1-lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index fc04f1c3cf08..e50178c58b97 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -858,14 +858,12 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, EXPORT_SYMBOL(get_user_pages_locked); /* - * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows to - * pass additional gup_flags as last parameter (like FOLL_HWPOISON). + * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows for + * tsk, mm to be specified. * * NOTE: here FOLL_TOUCH is not set implicitly and must be set by the - * caller if required (just like with __get_user_pages). "FOLL_GET", - * "FOLL_WRITE" and "FOLL_FORCE" are set implicitly as needed - * according to the parameters "pages", "write", "force" - * respectively. + * caller if required (just like with __get_user_pages). "FOLL_GET" + * is set implicitly if "pages" is non-NULL. */ __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, @@ -895,10 +893,8 @@ EXPORT_SYMBOL(__get_user_pages_unlocked); * get_user_pages_unlocked(tsk, mm, ..., pages); * * It is functionally equivalent to get_user_pages_fast so - * get_user_pages_fast should be used instead, if the two parameters - * "tsk" and "mm" are respectively equal to current and current->mm, - * or if "force" shall be set to 1 (get_user_pages_fast misses the - * "force" parameter). + * get_user_pages_fast should be used instead if specific gup_flags + * (e.g. FOLL_FORCE) are not required. */ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) From 8d303e44e99c2ae5cad31f3dded10a572b0fd4d7 Mon Sep 17 00:00:00 2001 From: Piotr Kwapulinski Date: Mon, 12 Dec 2016 16:42:49 -0800 Subject: [PATCH 045/123] mm/mempolicy.c: forbid static or relative flags for local NUMA mode The MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES flags are irrelevant when setting them for MPOL_LOCAL NUMA memory policy via set_mempolicy or mbind. Return the "invalid argument" from set_mempolicy and mbind whenever any of these flags is passed along with MPOL_LOCAL. It is consistent with MPOL_PREFERRED passed with empty nodemask. It slightly shortens the execution time in paths where these flags are used e.g. when trying to rebind the NUMA nodes for changes in cgroups cpuset mems (mpol_rebind_preferred()) or when just printing the mempolicy structure (/proc/PID/numa_maps). Isolated tests done. Link: http://lkml.kernel.org/r/20161027163037.4089-1-kwapulinski.piotr@gmail.com Signed-off-by: Piotr Kwapulinski Acked-by: David Rientjes Cc: Kirill A. Shutemov Cc: Vlastimil Babka Cc: Michal Hocko Cc: Liang Chen Cc: Mel Gorman Cc: Dave Hansen Cc: Nathan Zimmer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4d58021dba34..6d3639e1f254 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -276,7 +276,9 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, return ERR_PTR(-EINVAL); } } else if (mode == MPOL_LOCAL) { - if (!nodes_empty(*nodes)) + if (!nodes_empty(*nodes) || + (flags & MPOL_F_STATIC_NODES) || + (flags & MPOL_F_RELATIVE_NODES)) return ERR_PTR(-EINVAL); mode = MPOL_PREFERRED; } else if (nodes_empty(*nodes)) From 4a3bac4e3ac212c31edd8b124a1a2c7e8c1767ed Mon Sep 17 00:00:00 2001 From: Reza Arbab Date: Mon, 12 Dec 2016 16:42:52 -0800 Subject: [PATCH 046/123] powerpc/mm: allow memory hotplug into a memoryless node Patch series "enable movable nodes on non-x86 configs", v7. This patchset allows more configs to make use of movable nodes. When CONFIG_MOVABLE_NODE is selected, there are two ways to introduce such nodes into the system: 1. Discover movable nodes at boot. Currently this is only possible on x86, but we will enable configs supporting fdt to do the same. 2. Hotplug and online all of a node's memory using online_movable. This is already possible on any config supporting memory hotplug, not just x86, but the Kconfig doesn't say so. We will fix that. We'll also remove some cruft on power which would prevent (2). This patch (of 5): Remove the check which prevents us from hotplugging into an empty node. The original commit b226e4621245 ("[PATCH] powerpc: don't add memory to empty node/zone"), states that this was intended to be a temporary measure. It is a workaround for an oops which no longer occurs. Link: http://lkml.kernel.org/r/1479160961-25840-2-git-send-email-arbab@linux.vnet.ibm.com Signed-off-by: Reza Arbab Reviewed-by: Aneesh Kumar K.V Acked-by: Balbir Singh Acked-by: Michael Ellerman Cc: "Aneesh Kumar K.V" Cc: "H. Peter Anvin" Cc: Alistair Popple Cc: Benjamin Herrenschmidt Cc: Bharata B Rao Cc: Frank Rowand Cc: Ingo Molnar Cc: Nathan Fontenot Cc: Paul Mackerras Cc: Rob Herring Cc: Stewart Smith Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/numa.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index a51c188b81f3..0cb6bd8bfccf 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1085,7 +1085,7 @@ static int hot_add_node_scn_to_nid(unsigned long scn_addr) int hot_add_scn_to_nid(unsigned long scn_addr) { struct device_node *memory = NULL; - int nid, found = 0; + int nid; if (!numa_enabled || (min_common_depth < 0)) return first_online_node; @@ -1101,17 +1101,6 @@ int hot_add_scn_to_nid(unsigned long scn_addr) if (nid < 0 || !node_online(nid)) nid = first_online_node; - if (NODE_DATA(nid)->node_spanned_pages) - return nid; - - for_each_online_node(nid) { - if (NODE_DATA(nid)->node_spanned_pages) { - found = 1; - break; - } - } - - BUG_ON(!found); return nid; } From 39fa104d5b87655c1c19d4b1990ea63d190c4817 Mon Sep 17 00:00:00 2001 From: Reza Arbab Date: Mon, 12 Dec 2016 16:42:55 -0800 Subject: [PATCH 047/123] mm: remove x86-only restriction of movable_node In commit c5320926e370 ("mem-hotplug: introduce movable_node boot option"), the memblock allocation direction is changed to bottom-up and then back to top-down like this: 1. memblock_set_bottom_up(true), called by cmdline_parse_movable_node(). 2. memblock_set_bottom_up(false), called by x86's numa_init(). Even though (1) occurs in generic mm code, it is wrapped by #ifdef CONFIG_MOVABLE_NODE, which depends on X86_64. This means that when we extend CONFIG_MOVABLE_NODE to non-x86 arches, things will be unbalanced. (1) will happen for them, but (2) will not. This toggle was added in the first place because x86 has a delay between adding memblocks and marking them as hotpluggable. Since other arches do this marking either immediately or not at all, they do not require the bottom-up toggle. So, resolve things by moving (1) from cmdline_parse_movable_node() to x86's setup_arch(), immediately after the movable_node parameter has been parsed. Link: http://lkml.kernel.org/r/1479160961-25840-3-git-send-email-arbab@linux.vnet.ibm.com Signed-off-by: Reza Arbab Acked-by: Balbir Singh Cc: "Aneesh Kumar K.V" Cc: "H. Peter Anvin" Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Bharata B Rao Cc: Frank Rowand Cc: Ingo Molnar Cc: Michael Ellerman Cc: Nathan Fontenot Cc: Paul Mackerras Cc: Rob Herring Cc: Stewart Smith Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 2 +- arch/x86/kernel/setup.c | 24 ++++++++++++++++++++++++ mm/memory_hotplug.c | 20 -------------------- 3 files changed, 25 insertions(+), 21 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 86a31dfc036e..26f0f92206d7 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2406,7 +2406,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. that the amount of memory usable for all allocations is not too small. - movable_node [KNL,X86] Boot-time switch to enable the effects + movable_node [KNL] Boot-time switch to enable the effects of CONFIG_MOVABLE_NODE=y. See mm/Kconfig for details. MTD_Partition= [MTD] diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 9c337b0e8ba7..4cfba947d774 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -985,6 +985,30 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); +#ifdef CONFIG_MEMORY_HOTPLUG + /* + * Memory used by the kernel cannot be hot-removed because Linux + * cannot migrate the kernel pages. When memory hotplug is + * enabled, we should prevent memblock from allocating memory + * for the kernel. + * + * ACPI SRAT records all hotpluggable memory ranges. But before + * SRAT is parsed, we don't know about it. + * + * The kernel image is loaded into memory at very early time. We + * cannot prevent this anyway. So on NUMA system, we set any + * node the kernel resides in as un-hotpluggable. + * + * Since on modern servers, one node could have double-digit + * gigabytes memory, we can assume the memory around the kernel + * image is also un-hotpluggable. So before SRAT is parsed, just + * allocate memory near the kernel image to try the best to keep + * the kernel away from hotpluggable memory. + */ + if (movable_node_is_enabled()) + memblock_set_bottom_up(true); +#endif + x86_report_nx(); /* after early param, so could get panic from serial */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index cad4b9125695..e43142c15631 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1727,26 +1727,6 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) static int __init cmdline_parse_movable_node(char *p) { #ifdef CONFIG_MOVABLE_NODE - /* - * Memory used by the kernel cannot be hot-removed because Linux - * cannot migrate the kernel pages. When memory hotplug is - * enabled, we should prevent memblock from allocating memory - * for the kernel. - * - * ACPI SRAT records all hotpluggable memory ranges. But before - * SRAT is parsed, we don't know about it. - * - * The kernel image is loaded into memory at very early time. We - * cannot prevent this anyway. So on NUMA system, we set any - * node the kernel resides in as un-hotpluggable. - * - * Since on modern servers, one node could have double-digit - * gigabytes memory, we can assume the memory around the kernel - * image is also un-hotpluggable. So before SRAT is parsed, just - * allocate memory near the kernel image to try the best to keep - * the kernel away from hotpluggable memory. - */ - memblock_set_bottom_up(true); movable_node_enabled = true; #else pr_warn("movable_node option not supported\n"); From 114cf3cc55ec00465a59bb89e06b4e4fdcd6412e Mon Sep 17 00:00:00 2001 From: Reza Arbab Date: Mon, 12 Dec 2016 16:42:59 -0800 Subject: [PATCH 048/123] mm: enable CONFIG_MOVABLE_NODE on non-x86 arches To support movable memory nodes (CONFIG_MOVABLE_NODE), at least one of the following must be true: 1. This config has the capability to identify movable nodes at boot. Right now, only x86 can do this. 2. Our config supports memory hotplug, which means that a movable node can be created by hotplugging all of its memory into ZONE_MOVABLE. Fix the Kconfig definition of CONFIG_MOVABLE_NODE, which currently recognizes (1), but not (2). Link: http://lkml.kernel.org/r/1479160961-25840-4-git-send-email-arbab@linux.vnet.ibm.com Signed-off-by: Reza Arbab Reviewed-by: Aneesh Kumar K.V Acked-by: Balbir Singh Cc: "Aneesh Kumar K.V" Cc: "H. Peter Anvin" Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Bharata B Rao Cc: Frank Rowand Cc: Ingo Molnar Cc: Michael Ellerman Cc: Nathan Fontenot Cc: Paul Mackerras Cc: Rob Herring Cc: Stewart Smith Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/Kconfig b/mm/Kconfig index 86e3e0e74d20..061b46b18029 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -153,7 +153,7 @@ config MOVABLE_NODE bool "Enable to assign a node which has only movable memory" depends on HAVE_MEMBLOCK depends on NO_BOOTMEM - depends on X86_64 + depends on X86_64 || MEMORY_HOTPLUG depends on NUMA default n help From 41a9ada3e6b4253f1a3ce42699c6aaeb8584306c Mon Sep 17 00:00:00 2001 From: Reza Arbab Date: Mon, 12 Dec 2016 16:43:02 -0800 Subject: [PATCH 049/123] of/fdt: mark hotpluggable memory When movable nodes are enabled, any node containing only hotpluggable memory is made movable at boot time. On x86, hotpluggable memory is discovered by parsing the ACPI SRAT, making corresponding calls to memblock_mark_hotplug(). If we introduce a dt property to describe memory as hotpluggable, configs supporting early fdt may then also do this marking and use movable nodes. Link: http://lkml.kernel.org/r/1479160961-25840-5-git-send-email-arbab@linux.vnet.ibm.com Signed-off-by: Reza Arbab Tested-by: Balbir Singh Acked-by: Balbir Singh Cc: "Aneesh Kumar K.V" Cc: "H. Peter Anvin" Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Bharata B Rao Cc: Frank Rowand Cc: Ingo Molnar Cc: Michael Ellerman Cc: Nathan Fontenot Cc: Paul Mackerras Cc: Rob Herring Cc: Stewart Smith Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/of/fdt.c | 19 +++++++++++++++++++ include/linux/of_fdt.h | 1 + mm/Kconfig | 2 +- 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index c89d5d231a0e..c9b5cac03b36 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -1015,6 +1015,7 @@ int __init early_init_dt_scan_memory(unsigned long node, const char *uname, const char *type = of_get_flat_dt_prop(node, "device_type", NULL); const __be32 *reg, *endp; int l; + bool hotpluggable; /* We are scanning "memory" nodes only */ if (type == NULL) { @@ -1034,6 +1035,7 @@ int __init early_init_dt_scan_memory(unsigned long node, const char *uname, return 0; endp = reg + (l / sizeof(__be32)); + hotpluggable = of_get_flat_dt_prop(node, "hotpluggable", NULL); pr_debug("memory scan node %s, reg size %d,\n", uname, l); @@ -1049,6 +1051,13 @@ int __init early_init_dt_scan_memory(unsigned long node, const char *uname, (unsigned long long)size); early_init_dt_add_memory_arch(base, size); + + if (!hotpluggable) + continue; + + if (early_init_dt_mark_hotplug_memory_arch(base, size)) + pr_warn("failed to mark hotplug range 0x%llx - 0x%llx\n", + base, base + size); } return 0; @@ -1146,6 +1155,11 @@ void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size) memblock_add(base, size); } +int __init __weak early_init_dt_mark_hotplug_memory_arch(u64 base, u64 size) +{ + return memblock_mark_hotplug(base, size); +} + int __init __weak early_init_dt_reserve_memory_arch(phys_addr_t base, phys_addr_t size, bool nomap) { @@ -1168,6 +1182,11 @@ void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size) WARN_ON(1); } +int __init __weak early_init_dt_mark_hotplug_memory_arch(u64 base, u64 size) +{ + return -ENOSYS; +} + int __init __weak early_init_dt_reserve_memory_arch(phys_addr_t base, phys_addr_t size, bool nomap) { diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index 4341f32516d8..271b3fdf0070 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -71,6 +71,7 @@ extern int early_init_dt_scan_chosen_stdout(void); extern void early_init_fdt_scan_reserved_mem(void); extern void early_init_fdt_reserve_self(void); extern void early_init_dt_add_memory_arch(u64 base, u64 size); +extern int early_init_dt_mark_hotplug_memory_arch(u64 base, u64 size); extern int early_init_dt_reserve_memory_arch(phys_addr_t base, phys_addr_t size, bool no_map); extern void * early_init_dt_alloc_memory_arch(u64 size, u64 align); diff --git a/mm/Kconfig b/mm/Kconfig index 061b46b18029..33a9b06ec618 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -153,7 +153,7 @@ config MOVABLE_NODE bool "Enable to assign a node which has only movable memory" depends on HAVE_MEMBLOCK depends on NO_BOOTMEM - depends on X86_64 || MEMORY_HOTPLUG + depends on X86_64 || OF_EARLY_FLATTREE || MEMORY_HOTPLUG depends on NUMA default n help From c3352cbb1bdf198e81141700eb7003b8e2de1f1a Mon Sep 17 00:00:00 2001 From: Reza Arbab Date: Mon, 12 Dec 2016 16:43:06 -0800 Subject: [PATCH 050/123] dt: add documentation of "hotpluggable" memory property Summarize the "hotpluggable" property of dt memory nodes. Link: http://lkml.kernel.org/r/1479160961-25840-6-git-send-email-arbab@linux.vnet.ibm.com Signed-off-by: Reza Arbab Cc: "Aneesh Kumar K.V" Cc: "H. Peter Anvin" Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Balbir Singh Cc: Benjamin Herrenschmidt Cc: Bharata B Rao Cc: Frank Rowand Cc: Ingo Molnar Cc: Michael Ellerman Cc: Nathan Fontenot Cc: Paul Mackerras Cc: Rob Herring Cc: Stewart Smith Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/devicetree/booting-without-of.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/devicetree/booting-without-of.txt b/Documentation/devicetree/booting-without-of.txt index 3f1437fbca6b..280d283304bb 100644 --- a/Documentation/devicetree/booting-without-of.txt +++ b/Documentation/devicetree/booting-without-of.txt @@ -974,6 +974,13 @@ compatibility. 4Gb. Some vendors prefer splitting those ranges into smaller segments, but the kernel doesn't care. + Additional properties: + + - hotpluggable : The presence of this property provides an explicit + hint to the operating system that this memory may potentially be + removed later. The kernel can take this into consideration when + doing nonmovable allocations and when laying out memory zones. + e) The /chosen node This node is a bit "special". Normally, that's where Open Firmware From c7142aead87aa5026e4b57671c7dbb1706b02606 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 12 Dec 2016 16:43:09 -0800 Subject: [PATCH 051/123] mm/pkeys: generate pkey system call code only if ARCH_HAS_PKEYS is selected Having code for the pkey_mprotect, pkey_alloc and pkey_free system calls makes only sense if ARCH_HAS_PKEYS is selected. If not selected these system calls will always return -ENOSPC or -EINVAL. To simplify things and have less code generate the pkey system call code only if ARCH_HAS_PKEYS is selected. For architectures which have already wired up the system calls, but do not select ARCH_HAS_PKEYS this will result in less generated code and a different return code: the three system calls will now always return -ENOSYS, using the cond_syscall mechanism. For architectures which have not wired up the system calls less unreachable code will be generated. Link: http://lkml.kernel.org/r/20161114111251.70084-1-heiko.carstens@de.ibm.com Signed-off-by: Heiko Carstens Acked-by: Dave Hansen Cc: Mark Rutland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mprotect.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/mprotect.c b/mm/mprotect.c index c5ba2aae0f54..cc2459c57f60 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -497,6 +497,8 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, return do_mprotect_pkey(start, len, prot, -1); } +#ifdef CONFIG_ARCH_HAS_PKEYS + SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len, unsigned long, prot, int, pkey) { @@ -547,3 +549,5 @@ SYSCALL_DEFINE1(pkey_free, int, pkey) */ return ret; } + +#endif /* CONFIG_ARCH_HAS_PKEYS */ From c1ef8e2c0235bffe4b0505c3325bb8a6af954021 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 12 Dec 2016 16:43:12 -0800 Subject: [PATCH 052/123] mm: disable numa migration faults for dax vmas Mark dax vmas as not migratable to exclude them from task_numa_work(). This is especially relevant for device-dax which wants to ensure predictable access latency and not incur periodic faults. [akpm@linux-foundation.org: add comment] Link: http://lkml.kernel.org/r/147892450132.22062.16875659431109209179.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reported-by: Aneesh Kumar K.V Cc: Michal Hocko Cc: Vlastimil Babka Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 5e5b2969d931..5f4d8281832b 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -7,6 +7,7 @@ #include +#include #include #include #include @@ -177,6 +178,13 @@ static inline bool vma_migratable(struct vm_area_struct *vma) if (vma->vm_flags & (VM_IO | VM_PFNMAP)) return false; + /* + * DAX device mappings require predictable access latency, so avoid + * incurring periodic faults. + */ + if (vma_is_dax(vma)) + return false; + #ifndef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION if (vma->vm_flags & VM_HUGETLB) return false; From d5e6eff265fe7537fa494e6ab125747813be76a0 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Mon, 12 Dec 2016 16:43:15 -0800 Subject: [PATCH 053/123] mm: cma: make linux/cma.h standalone includible The header uses types and definitions from the linux/init.h as well as linux/types.h headers without explicitly including them. This causes a failure to compile if they are not implicitly pulled in by includers. Link: http://lkml.kernel.org/r/20161115133235.13387-1-thierry.reding@gmail.com Signed-off-by: Thierry Reding Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cma.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/cma.h b/include/linux/cma.h index 29f9e774ab76..6f0a91b37f68 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -1,6 +1,9 @@ #ifndef __CMA_H__ #define __CMA_H__ +#include +#include + /* * There is always at least global CMA area and a few optional * areas configured in kernel .config. From c70b647d381cba1899c953b0016b7dc185892f90 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 12 Dec 2016 16:43:17 -0800 Subject: [PATCH 054/123] mm/filemap.c: add comment for confusing logic in page_cache_tree_insert() Unlike THP, hugetlb pages are represented by one entry in the radix-tree. [akpm@linux-foundation.org: tweak comment] Link: http://lkml.kernel.org/r/20161110163640.126124-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: Johannes Weiner Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index 50b52fe51937..caa779f8797f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -169,7 +169,10 @@ static int page_cache_tree_insert(struct address_space *mapping, static void page_cache_tree_delete(struct address_space *mapping, struct page *page, void *shadow) { - int i, nr = PageHuge(page) ? 1 : hpage_nr_pages(page); + int i, nr; + + /* hugetlb pages are represented by one entry in the radix tree */ + nr = PageHuge(page) ? 1 : hpage_nr_pages(page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageTail(page), page); From bace9248188f64d7490ebe59fc0733db8b6f0e57 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Mon, 12 Dec 2016 16:43:20 -0800 Subject: [PATCH 055/123] fs/fs-writeback.c: remove redundant if check b_more_io non-empty check is already preceded by an opposite check. Link: http://lkml.kernel.org/r/1478591249-30641-1-git-send-email-tahsin@google.com Signed-off-by: Tahsin Erdogan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 05713a5da083..ef600591d96f 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1769,15 +1769,13 @@ static long wb_writeback(struct bdi_writeback *wb, * become available for writeback. Otherwise * we'll just busyloop. */ - if (!list_empty(&wb->b_more_io)) { - trace_writeback_wait(wb, work); - inode = wb_inode(wb->b_more_io.prev); - spin_lock(&inode->i_lock); - spin_unlock(&wb->list_lock); - /* This function drops i_lock... */ - inode_sleep_on_writeback(inode); - spin_lock(&wb->list_lock); - } + trace_writeback_wait(wb, work); + inode = wb_inode(wb->b_more_io.prev); + spin_lock(&inode->i_lock); + spin_unlock(&wb->list_lock); + /* This function drops i_lock... */ + inode_sleep_on_writeback(inode); + spin_lock(&wb->list_lock); } spin_unlock(&wb->list_lock); blk_finish_plug(&plug); From f1f5929cd9715c1cdfe07a890f12ac7d2c5304ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20Lefaure?= Date: Mon, 12 Dec 2016 16:43:23 -0800 Subject: [PATCH 056/123] shmem: fix compilation warnings on unused functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compiling shmem.c with SHMEM and TRANSAPRENT_HUGE_PAGECACHE enabled raises warnings on two unused functions when CONFIG_TMPFS and CONFIG_SYSFS are both disabled: mm/shmem.c:390:20: warning: `shmem_format_huge' defined but not used [-Wunused-function] static const char *shmem_format_huge(int huge) ^~~~~~~~~~~~~~~~~ mm/shmem.c:373:12: warning: `shmem_parse_huge' defined but not used [-Wunused-function] static int shmem_parse_huge(const char *str) ^~~~~~~~~~~~~~~~ A conditional compilation on tmpfs or sysfs removes the warnings. Link: http://lkml.kernel.org/r/20161118055749.11313-1-jeremy.lefaure@lse.epita.fr Signed-off-by: Jérémy Lefaure Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/shmem.c b/mm/shmem.c index ba0d7644ee20..ec7aa562343e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -370,6 +370,7 @@ static bool shmem_confirm_swap(struct address_space *mapping, int shmem_huge __read_mostly; +#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) static int shmem_parse_huge(const char *str) { if (!strcmp(str, "never")) @@ -407,6 +408,7 @@ static const char *shmem_format_huge(int huge) return "bad_val"; } } +#endif static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shrink_control *sc, unsigned long nr_to_split) From 9491ae4aade6814afcfa67f4eb3e3342c2b39750 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 12 Dec 2016 16:43:26 -0800 Subject: [PATCH 057/123] mm: don't cap request size based on read-ahead setting We ran into a funky issue, where someone doing 256K buffered reads saw 128K requests at the device level. Turns out it is read-ahead capping the request size, since we use 128K as the default setting. This doesn't make a lot of sense - if someone is issuing 256K reads, they should see 256K reads, regardless of the read-ahead setting, if the underlying device can support a 256K read in a single command. This patch introduces a bdi hint, io_pages. This is the soft max IO size for the lower level, I've hooked it up to the bdev settings here. Read-ahead is modified to issue the maximum of the user request size, and the read-ahead max size, but capped to the max request size on the device side. The latter is done to avoid reading ahead too much, if the application asks for a huge read. With this patch, the kernel behaves like the application expects. Link: http://lkml.kernel.org/r/1479498073-8657-1-git-send-email-axboe@fb.com Signed-off-by: Jens Axboe Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/blk-settings.c | 1 + block/blk-sysfs.c | 1 + include/linux/backing-dev-defs.h | 1 + mm/readahead.c | 39 +++++++++++++++++++++++--------- 4 files changed, 31 insertions(+), 11 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index f679ae122843..65f16cf4f850 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -249,6 +249,7 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors); max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS); limits->max_sectors = max_sectors; + q->backing_dev_info.io_pages = max_sectors >> (PAGE_SHIFT - 9); } EXPORT_SYMBOL(blk_queue_max_hw_sectors); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 9cc8d7c5439a..ea374e820775 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -212,6 +212,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) spin_lock_irq(q->queue_lock); q->limits.max_sectors = max_sectors_kb << 1; + q->backing_dev_info.io_pages = max_sectors_kb >> (PAGE_SHIFT - 10); spin_unlock_irq(q->queue_lock); return ret; diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index c357f27d5483..b8144b2d59ce 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -136,6 +136,7 @@ struct bdi_writeback { struct backing_dev_info { struct list_head bdi_list; unsigned long ra_pages; /* max readahead in PAGE_SIZE units */ + unsigned long io_pages; /* max allowed IO size */ unsigned int capabilities; /* Device capabilities */ congested_fn *congested_fn; /* Function pointer if device is md/dm */ void *congested_data; /* Pointer to aux data for congested func */ diff --git a/mm/readahead.c b/mm/readahead.c index c8a955b1297e..c4ca70239233 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -207,12 +207,21 @@ out: * memory at once. */ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read) + pgoff_t offset, unsigned long nr_to_read) { + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + struct file_ra_state *ra = &filp->f_ra; + unsigned long max_pages; + if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) return -EINVAL; - nr_to_read = min(nr_to_read, inode_to_bdi(mapping->host)->ra_pages); + /* + * If the request exceeds the readahead window, allow the read to + * be up to the optimal hardware IO size + */ + max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages); + nr_to_read = min(nr_to_read, max_pages); while (nr_to_read) { int err; @@ -369,9 +378,17 @@ ondemand_readahead(struct address_space *mapping, bool hit_readahead_marker, pgoff_t offset, unsigned long req_size) { - unsigned long max = ra->ra_pages; + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + unsigned long max_pages = ra->ra_pages; pgoff_t prev_offset; + /* + * If the request exceeds the readahead window, allow the read to + * be up to the optimal hardware IO size + */ + if (req_size > max_pages && bdi->io_pages > max_pages) + max_pages = min(req_size, bdi->io_pages); + /* * start of file */ @@ -385,7 +402,7 @@ ondemand_readahead(struct address_space *mapping, if ((offset == (ra->start + ra->size - ra->async_size) || offset == (ra->start + ra->size))) { ra->start += ra->size; - ra->size = get_next_ra_size(ra, max); + ra->size = get_next_ra_size(ra, max_pages); ra->async_size = ra->size; goto readit; } @@ -400,16 +417,16 @@ ondemand_readahead(struct address_space *mapping, pgoff_t start; rcu_read_lock(); - start = page_cache_next_hole(mapping, offset + 1, max); + start = page_cache_next_hole(mapping, offset + 1, max_pages); rcu_read_unlock(); - if (!start || start - offset > max) + if (!start || start - offset > max_pages) return 0; ra->start = start; ra->size = start - offset; /* old async_size */ ra->size += req_size; - ra->size = get_next_ra_size(ra, max); + ra->size = get_next_ra_size(ra, max_pages); ra->async_size = ra->size; goto readit; } @@ -417,7 +434,7 @@ ondemand_readahead(struct address_space *mapping, /* * oversize read */ - if (req_size > max) + if (req_size > max_pages) goto initial_readahead; /* @@ -433,7 +450,7 @@ ondemand_readahead(struct address_space *mapping, * Query the page cache and look for the traces(cached history pages) * that a sequential stream would leave behind. */ - if (try_context_readahead(mapping, ra, offset, req_size, max)) + if (try_context_readahead(mapping, ra, offset, req_size, max_pages)) goto readit; /* @@ -444,7 +461,7 @@ ondemand_readahead(struct address_space *mapping, initial_readahead: ra->start = offset; - ra->size = get_init_ra_size(req_size, max); + ra->size = get_init_ra_size(req_size, max_pages); ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; readit: @@ -454,7 +471,7 @@ readit: * the resulted next readahead window into the current one. */ if (offset == ra->start && ra->size == ra->async_size) { - ra->async_size = get_next_ra_size(ra, max); + ra->async_size = get_next_ra_size(ra, max_pages); ra->size += ra->async_size; } From 8db378a570330fa0aaa9d75299fe264e4a5b6348 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 12 Dec 2016 16:43:29 -0800 Subject: [PATCH 058/123] include/linux/backing-dev-defs.h: shrink struct backing_dev_info Move the 4-byte `capabilities' field next to other 4-byte things. Shrinks sizeof(backing_dev_info) by 8 bytes on x86_64. Reviewed-by: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev-defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index b8144b2d59ce..0b5b1af35e5e 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -137,12 +137,12 @@ struct backing_dev_info { struct list_head bdi_list; unsigned long ra_pages; /* max readahead in PAGE_SIZE units */ unsigned long io_pages; /* max allowed IO size */ - unsigned int capabilities; /* Device capabilities */ congested_fn *congested_fn; /* Function pointer if device is md/dm */ void *congested_data; /* Pointer to aux data for congested func */ char *name; + unsigned int capabilities; /* Device capabilities */ unsigned int min_ratio; unsigned int max_ratio, max_prop_frac; From 91a45f71078a6569ec3ca5bef74e1ab58121d80e Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 12 Dec 2016 16:43:32 -0800 Subject: [PATCH 059/123] mm: khugepaged: close use-after-free race during shmem collapsing Patch series "mm: workingset: radix tree subtleties & single-page file refaults", v3. This is another revision of the radix tree / workingset patches based on feedback from Jan and Kirill. This is a follow-up to d3798ae8c6f3 ("mm: filemap: don't plant shadow entries without radix tree node"). That patch fixed an issue that was caused mainly by the page cache sneaking special shadow page entries into the radix tree and relying on subtleties in the radix tree code to make that work. The fix also had to stop tracking refaults for single-page files because shadow pages stored as direct pointers in radix_tree_root->rnode weren't properly handled during tree extension. These patches make the radix tree code explicitely support and track such special entries, to eliminate the subtleties and to restore the thrash detection for single-page files. This patch (of 9): When a radix tree iteration drops the tree lock, another thread might swoop in and free the node holding the current slot. The iteration needs to do another tree lookup from the current index to continue. [kirill.shutemov@linux.intel.com: re-lookup for replacement] Fixes: f3f0e1d2150b ("khugepaged: add support of collapse for tmpfs/shmem pages") Link: http://lkml.kernel.org/r/20161117191138.22769-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Kirill A. Shutemov Reviewed-by: Jan Kara Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 87e1a7ca3846..2779c63bdea0 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1403,6 +1403,9 @@ static void collapse_shmem(struct mm_struct *mm, spin_lock_irq(&mapping->tree_lock); + slot = radix_tree_lookup_slot(&mapping->page_tree, index); + VM_BUG_ON_PAGE(page != radix_tree_deref_slot_protected(slot, + &mapping->tree_lock), page); VM_BUG_ON_PAGE(page_mapped(page), page); /* @@ -1426,6 +1429,7 @@ static void collapse_shmem(struct mm_struct *mm, radix_tree_replace_slot(slot, new_page + (index % HPAGE_PMD_NR)); + slot = radix_tree_iter_next(&iter); index++; continue; out_lru: @@ -1537,6 +1541,7 @@ tree_unlocked: putback_lru_page(page); unlock_page(page); spin_lock_irq(&mapping->tree_lock); + slot = radix_tree_iter_next(&iter); } VM_BUG_ON(nr_none); spin_unlock_irq(&mapping->tree_lock); From 59749e6ce53735d8b696763742225f126e94603f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 12 Dec 2016 16:43:35 -0800 Subject: [PATCH 060/123] mm: khugepaged: fix radix tree node leak in shmem collapse error path The radix tree counts valid entries in each tree node. Entries stored in the tree cannot be removed by simpling storing NULL in the slot or the internal counters will be off and the node never gets freed again. When collapsing a shmem page fails, restore the holes that were filled with radix_tree_insert() with a proper radix tree deletion. Fixes: f3f0e1d2150b ("khugepaged: add support of collapse for tmpfs/shmem pages") Link: http://lkml.kernel.org/r/20161117191138.22769-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: Jan Kara Acked-by: Kirill A. Shutemov Reviewed-by: Jan Kara Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 2779c63bdea0..5d7c006373d3 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1525,9 +1525,11 @@ tree_unlocked: if (!page || iter.index < page->index) { if (!nr_none) break; - /* Put holes back where they were */ - radix_tree_replace_slot(slot, NULL); nr_none--; + /* Put holes back where they were */ + radix_tree_delete(&mapping->page_tree, + iter.index); + slot = radix_tree_iter_next(&iter); continue; } From b936887e8739d3fa83f87d899f68d136735d9816 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 12 Dec 2016 16:43:38 -0800 Subject: [PATCH 061/123] mm: workingset: turn shadow node shrinker bugs into warnings When the shadow page shrinker tries to reclaim a radix tree node but finds it in an unexpected state - it should contain no pages, and non-zero shadow entries - there is no need to kill the executing task or even the entire system. Warn about the invalid state, then leave that tree node be. Simply don't put it back on the shadow LRU for future reclaim and move on. Link: http://lkml.kernel.org/r/20161117191138.22769-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Jan Kara Cc: Kirill A. Shutemov Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/workingset.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/mm/workingset.c b/mm/workingset.c index fb1f9183d89a..98f830897b1b 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -418,23 +418,27 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, * no pages, so we expect to be able to remove them all and * delete and free the empty node afterwards. */ - BUG_ON(!workingset_node_shadows(node)); - BUG_ON(workingset_node_pages(node)); - + if (WARN_ON_ONCE(!workingset_node_shadows(node))) + goto out_invalid; + if (WARN_ON_ONCE(workingset_node_pages(node))) + goto out_invalid; for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { if (node->slots[i]) { - BUG_ON(!radix_tree_exceptional_entry(node->slots[i])); + if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i]))) + goto out_invalid; + if (WARN_ON_ONCE(!mapping->nrexceptional)) + goto out_invalid; node->slots[i] = NULL; workingset_node_shadows_dec(node); - BUG_ON(!mapping->nrexceptional); mapping->nrexceptional--; } } - BUG_ON(workingset_node_shadows(node)); + if (WARN_ON_ONCE(workingset_node_shadows(node))) + goto out_invalid; inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); - if (!__radix_tree_delete_node(&mapping->page_tree, node)) - BUG(); + __radix_tree_delete_node(&mapping->page_tree, node); +out_invalid: spin_unlock(&mapping->tree_lock); ret = LRU_REMOVED_RETRY; out: From f7942430e40f14c6d2ca48a1875add509938c07d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 12 Dec 2016 16:43:41 -0800 Subject: [PATCH 062/123] lib: radix-tree: native accounting of exceptional entries The way the page cache is sneaking shadow entries of evicted pages into the radix tree past the node entry accounting and tracking them manually in the upper bits of node->count is fraught with problems. These shadow entries are marked in the tree as exceptional entries, which are a native concept to the radix tree. Maintain an explicit counter of exceptional entries in the radix tree node. Subsequent patches will switch shadow entry tracking over to that counter. DAX and shmem are the other users of exceptional entries. Since slot replacements that change the entry type from regular to exceptional must now be accounted, introduce a __radix_tree_replace() function that does replacement and accounting, and switch DAX and shmem over. The increase in radix tree node size is temporary. A followup patch switches the shadow tracking to this new scheme and we'll no longer need the upper bits in node->count and shrink that back to one byte. Link: http://lkml.kernel.org/r/20161117192945.GA23430@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Jan Kara Cc: Kirill A. Shutemov Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 5 +++-- include/linux/radix-tree.h | 10 ++++++--- lib/radix-tree.c | 46 +++++++++++++++++++++++++++++++++++--- mm/shmem.c | 8 +++---- 4 files changed, 57 insertions(+), 12 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 014defd2e744..db78bae0dc0f 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -643,12 +643,13 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, } mapping->nrexceptional++; } else { + struct radix_tree_node *node; void **slot; void *ret; - ret = __radix_tree_lookup(page_tree, index, NULL, &slot); + ret = __radix_tree_lookup(page_tree, index, &node, &slot); WARN_ON_ONCE(ret != entry); - radix_tree_replace_slot(slot, new_entry); + __radix_tree_replace(page_tree, node, slot, new_entry); } if (vmf->flags & FAULT_FLAG_WRITE) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index af3581b8a451..7ced8a70cc8b 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -85,9 +85,10 @@ static inline bool radix_tree_is_internal_node(void *ptr) #define RADIX_TREE_COUNT_MASK ((1UL << RADIX_TREE_COUNT_SHIFT) - 1) struct radix_tree_node { - unsigned char shift; /* Bits remaining in each slot */ - unsigned char offset; /* Slot offset in parent */ - unsigned int count; + unsigned char shift; /* Bits remaining in each slot */ + unsigned char offset; /* Slot offset in parent */ + unsigned int count; /* Total entry count */ + unsigned char exceptional; /* Exceptional entry count */ union { struct { /* Used when ascending tree */ @@ -276,6 +277,9 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, struct radix_tree_node **nodep, void ***slotp); void *radix_tree_lookup(struct radix_tree_root *, unsigned long); void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); +void __radix_tree_replace(struct radix_tree_root *root, + struct radix_tree_node *node, + void **slot, void *item); bool __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 8e6d552c40dd..7885796d35ae 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -220,10 +220,10 @@ static void dump_node(struct radix_tree_node *node, unsigned long index) { unsigned long i; - pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d parent %p\n", + pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d exceptional %d parent %p\n", node, node->offset, node->tags[0][0], node->tags[1][0], node->tags[2][0], - node->shift, node->count, node->parent); + node->shift, node->count, node->exceptional, node->parent); for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { unsigned long first = index | (i << node->shift); @@ -522,8 +522,13 @@ static int radix_tree_extend(struct radix_tree_root *root, node->offset = 0; node->count = 1; node->parent = NULL; - if (radix_tree_is_internal_node(slot)) + if (radix_tree_is_internal_node(slot)) { entry_to_node(slot)->parent = node; + } else { + /* Moving an exceptional root->rnode to a node */ + if (radix_tree_exceptional_entry(slot)) + node->exceptional = 1; + } node->slots[0] = slot; slot = node_to_entry(node); rcu_assign_pointer(root->rnode, slot); @@ -649,6 +654,8 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index, if (node) { unsigned offset = get_slot_offset(node, slot); node->count++; + if (radix_tree_exceptional_entry(item)) + node->exceptional++; BUG_ON(tag_get(node, 0, offset)); BUG_ON(tag_get(node, 1, offset)); BUG_ON(tag_get(node, 2, offset)); @@ -746,6 +753,37 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) } EXPORT_SYMBOL(radix_tree_lookup); +/** + * __radix_tree_replace - replace item in a slot + * @root: radix tree root + * @node: pointer to tree node + * @slot: pointer to slot in @node + * @item: new item to store in the slot. + * + * For use with __radix_tree_lookup(). Caller must hold tree write locked + * across slot lookup and replacement. + */ +void __radix_tree_replace(struct radix_tree_root *root, + struct radix_tree_node *node, + void **slot, void *item) +{ + void *old = rcu_dereference_raw(*slot); + int exceptional; + + WARN_ON_ONCE(radix_tree_is_internal_node(item)); + WARN_ON_ONCE(!!item - !!old); + + exceptional = !!radix_tree_exceptional_entry(item) - + !!radix_tree_exceptional_entry(old); + + WARN_ON_ONCE(exceptional && !node && slot != (void **)&root->rnode); + + if (node) + node->exceptional += exceptional; + + rcu_assign_pointer(*slot, item); +} + /** * radix_tree_tag_set - set a tag on a radix tree node * @root: radix tree root @@ -1561,6 +1599,8 @@ void *radix_tree_delete_item(struct radix_tree_root *root, delete_sibling_entries(node, node_to_entry(slot), offset); node->slots[offset] = NULL; node->count--; + if (radix_tree_exceptional_entry(entry)) + node->exceptional--; __radix_tree_delete_node(root, node); diff --git a/mm/shmem.c b/mm/shmem.c index ec7aa562343e..3149ddee8f55 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -300,18 +300,18 @@ void shmem_uncharge(struct inode *inode, long pages) static int shmem_radix_tree_replace(struct address_space *mapping, pgoff_t index, void *expected, void *replacement) { + struct radix_tree_node *node; void **pslot; void *item; VM_BUG_ON(!expected); VM_BUG_ON(!replacement); - pslot = radix_tree_lookup_slot(&mapping->page_tree, index); - if (!pslot) + item = __radix_tree_lookup(&mapping->page_tree, index, &node, &pslot); + if (!item) return -ENOENT; - item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock); if (item != expected) return -ENOENT; - radix_tree_replace_slot(pslot, replacement); + __radix_tree_replace(&mapping->page_tree, node, pslot, replacement); return 0; } From 6d75f366b9242f9b17ed7d0b0604d7460f818f21 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 12 Dec 2016 16:43:43 -0800 Subject: [PATCH 063/123] lib: radix-tree: check accounting of existing slot replacement users The bug in khugepaged fixed earlier in this series shows that radix tree slot replacement is fragile; and it will become more so when not only NULL<->!NULL transitions need to be caught but transitions from and to exceptional entries as well. We need checks. Re-implement radix_tree_replace_slot() on top of the sanity-checked __radix_tree_replace(). This requires existing callers to also pass the radix tree root, but it'll warn us when somebody replaces slots with contents that need proper accounting (transitions between NULL entries, real entries, exceptional entries) and where a replacement through the slot pointer would corrupt the radix tree node counts. Link: http://lkml.kernel.org/r/20161117193021.GB23430@cmpxchg.org Signed-off-by: Johannes Weiner Suggested-by: Jan Kara Reviewed-by: Jan Kara Cc: Kirill A. Shutemov Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/mm/gmap.c | 2 +- drivers/sh/intc/virq.c | 2 +- fs/dax.c | 4 +- include/linux/radix-tree.h | 16 +------ lib/radix-tree.c | 63 +++++++++++++++++++++------ mm/filemap.c | 4 +- mm/khugepaged.c | 5 ++- mm/migrate.c | 4 +- mm/truncate.c | 2 +- tools/testing/radix-tree/multiorder.c | 2 +- 10 files changed, 64 insertions(+), 40 deletions(-) diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 3ba622702ce4..ec1f0dedb948 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -1015,7 +1015,7 @@ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, if (slot) { rmap->next = radix_tree_deref_slot_protected(slot, &sg->guest_table_lock); - radix_tree_replace_slot(slot, rmap); + radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); } else { rmap->next = NULL; radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT, diff --git a/drivers/sh/intc/virq.c b/drivers/sh/intc/virq.c index e7899624aa0b..35bbe288ddb4 100644 --- a/drivers/sh/intc/virq.c +++ b/drivers/sh/intc/virq.c @@ -254,7 +254,7 @@ restart: radix_tree_tag_clear(&d->tree, entry->enum_id, INTC_TAG_VIRQ_NEEDS_ALLOC); - radix_tree_replace_slot((void **)entries[i], + radix_tree_replace_slot(&d->tree, (void **)entries[i], &intc_irq_xlate[irq]); } diff --git a/fs/dax.c b/fs/dax.c index db78bae0dc0f..85930c2a2749 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -342,7 +342,7 @@ static inline void *lock_slot(struct address_space *mapping, void **slot) radix_tree_deref_slot_protected(slot, &mapping->tree_lock); entry |= RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(slot, (void *)entry); + radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); return (void *)entry; } @@ -356,7 +356,7 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot) radix_tree_deref_slot_protected(slot, &mapping->tree_lock); entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(slot, (void *)entry); + radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); return (void *)entry; } diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 7ced8a70cc8b..2d1b9b8be983 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -249,20 +249,6 @@ static inline int radix_tree_exception(void *arg) return unlikely((unsigned long)arg & RADIX_TREE_ENTRY_MASK); } -/** - * radix_tree_replace_slot - replace item in a slot - * @pslot: pointer to slot, returned by radix_tree_lookup_slot - * @item: new item to store in the slot. - * - * For use with radix_tree_lookup_slot(). Caller must hold tree write locked - * across slot lookup and replacement. - */ -static inline void radix_tree_replace_slot(void **pslot, void *item) -{ - BUG_ON(radix_tree_is_internal_node(item)); - rcu_assign_pointer(*pslot, item); -} - int __radix_tree_create(struct radix_tree_root *root, unsigned long index, unsigned order, struct radix_tree_node **nodep, void ***slotp); @@ -280,6 +266,8 @@ void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); void __radix_tree_replace(struct radix_tree_root *root, struct radix_tree_node *node, void **slot, void *item); +void radix_tree_replace_slot(struct radix_tree_root *root, + void **slot, void *item); bool __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 7885796d35ae..f91d5b0af654 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -753,6 +753,28 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) } EXPORT_SYMBOL(radix_tree_lookup); +static void replace_slot(struct radix_tree_root *root, + struct radix_tree_node *node, + void **slot, void *item, + bool warn_typeswitch) +{ + void *old = rcu_dereference_raw(*slot); + int exceptional; + + WARN_ON_ONCE(radix_tree_is_internal_node(item)); + WARN_ON_ONCE(!!item - !!old); + + exceptional = !!radix_tree_exceptional_entry(item) - + !!radix_tree_exceptional_entry(old); + + WARN_ON_ONCE(warn_typeswitch && exceptional); + + if (node) + node->exceptional += exceptional; + + rcu_assign_pointer(*slot, item); +} + /** * __radix_tree_replace - replace item in a slot * @root: radix tree root @@ -767,21 +789,34 @@ void __radix_tree_replace(struct radix_tree_root *root, struct radix_tree_node *node, void **slot, void *item) { - void *old = rcu_dereference_raw(*slot); - int exceptional; + /* + * This function supports replacing exceptional entries, but + * that needs accounting against the node unless the slot is + * root->rnode. + */ + replace_slot(root, node, slot, item, + !node && slot != (void **)&root->rnode); +} - WARN_ON_ONCE(radix_tree_is_internal_node(item)); - WARN_ON_ONCE(!!item - !!old); - - exceptional = !!radix_tree_exceptional_entry(item) - - !!radix_tree_exceptional_entry(old); - - WARN_ON_ONCE(exceptional && !node && slot != (void **)&root->rnode); - - if (node) - node->exceptional += exceptional; - - rcu_assign_pointer(*slot, item); +/** + * radix_tree_replace_slot - replace item in a slot + * @root: radix tree root + * @slot: pointer to slot + * @item: new item to store in the slot. + * + * For use with radix_tree_lookup_slot(), radix_tree_gang_lookup_slot(), + * radix_tree_gang_lookup_tag_slot(). Caller must hold tree write locked + * across slot lookup and replacement. + * + * NOTE: This cannot be used to switch between non-entries (empty slots), + * regular entries, and exceptional entries, as that requires accounting + * inside the radix tree node. When switching from one type of entry to + * another, use __radix_tree_lookup() and __radix_tree_replace(). + */ +void radix_tree_replace_slot(struct radix_tree_root *root, + void **slot, void *item) +{ + replace_slot(root, NULL, slot, item, true); } /** diff --git a/mm/filemap.c b/mm/filemap.c index caa779f8797f..1ba726aef708 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -147,7 +147,7 @@ static int page_cache_tree_insert(struct address_space *mapping, false); } } - radix_tree_replace_slot(slot, page); + radix_tree_replace_slot(&mapping->page_tree, slot, page); mapping->nrpages++; if (node) { workingset_node_pages_inc(node); @@ -196,7 +196,7 @@ static void page_cache_tree_delete(struct address_space *mapping, shadow = NULL; } - radix_tree_replace_slot(slot, shadow); + radix_tree_replace_slot(&mapping->page_tree, slot, shadow); if (!node) break; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 5d7c006373d3..7a50c726c5ae 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1426,7 +1426,7 @@ static void collapse_shmem(struct mm_struct *mm, list_add_tail(&page->lru, &pagelist); /* Finally, replace with the new page. */ - radix_tree_replace_slot(slot, + radix_tree_replace_slot(&mapping->page_tree, slot, new_page + (index % HPAGE_PMD_NR)); slot = radix_tree_iter_next(&iter); @@ -1538,7 +1538,8 @@ tree_unlocked: /* Unfreeze the page. */ list_del(&page->lru); page_ref_unfreeze(page, 2); - radix_tree_replace_slot(slot, page); + radix_tree_replace_slot(&mapping->page_tree, + slot, page); spin_unlock_irq(&mapping->tree_lock); putback_lru_page(page); unlock_page(page); diff --git a/mm/migrate.c b/mm/migrate.c index 66ce6b490b13..0ed24b1fa77b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -482,7 +482,7 @@ int migrate_page_move_mapping(struct address_space *mapping, SetPageDirty(newpage); } - radix_tree_replace_slot(pslot, newpage); + radix_tree_replace_slot(&mapping->page_tree, pslot, newpage); /* * Drop cache reference from old page by unfreezing @@ -556,7 +556,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, get_page(newpage); - radix_tree_replace_slot(pslot, newpage); + radix_tree_replace_slot(&mapping->page_tree, pslot, newpage); page_ref_unfreeze(page, expected_count - 1); diff --git a/mm/truncate.c b/mm/truncate.c index 8d8c62d89e6d..3c631c357873 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -49,7 +49,7 @@ static void clear_exceptional_entry(struct address_space *mapping, goto unlock; if (*slot != entry) goto unlock; - radix_tree_replace_slot(slot, NULL); + radix_tree_replace_slot(&mapping->page_tree, slot, NULL); mapping->nrexceptional--; if (!node) goto unlock; diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index 05d7bc488971..d1be94667a30 100644 --- a/tools/testing/radix-tree/multiorder.c +++ b/tools/testing/radix-tree/multiorder.c @@ -146,7 +146,7 @@ static void multiorder_check(unsigned long index, int order) slot = radix_tree_lookup_slot(&tree, index); free(*slot); - radix_tree_replace_slot(slot, item2); + radix_tree_replace_slot(&tree, slot, item2); for (i = min; i < max; i++) { struct item *item = item_lookup(&tree, i); assert(item != 0); From f4b109c6dad54257eca837f9dd16a23f2eeab832 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 12 Dec 2016 16:43:46 -0800 Subject: [PATCH 064/123] lib: radix-tree: add entry deletion support to __radix_tree_replace() Page cache shadow entry handling will be a lot simpler when it can use a single generic replacement function for pages, shadow entries, and emptying slots. Make __radix_tree_replace() properly account insertions and deletions in node->count and garbage collect nodes as they become empty. Then re-implement radix_tree_delete() on top of it. Link: http://lkml.kernel.org/r/20161117193058.GC23430@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Jan Kara Cc: Kirill A. Shutemov Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/radix-tree.c | 227 ++++++++++++++++++++++++----------------------- 1 file changed, 116 insertions(+), 111 deletions(-) diff --git a/lib/radix-tree.c b/lib/radix-tree.c index f91d5b0af654..5d8930f3b3d8 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -538,6 +538,107 @@ out: return maxshift + RADIX_TREE_MAP_SHIFT; } +/** + * radix_tree_shrink - shrink radix tree to minimum height + * @root radix tree root + */ +static inline bool radix_tree_shrink(struct radix_tree_root *root) +{ + bool shrunk = false; + + for (;;) { + struct radix_tree_node *node = root->rnode; + struct radix_tree_node *child; + + if (!radix_tree_is_internal_node(node)) + break; + node = entry_to_node(node); + + /* + * The candidate node has more than one child, or its child + * is not at the leftmost slot, or the child is a multiorder + * entry, we cannot shrink. + */ + if (node->count != 1) + break; + child = node->slots[0]; + if (!child) + break; + if (!radix_tree_is_internal_node(child) && node->shift) + break; + + if (radix_tree_is_internal_node(child)) + entry_to_node(child)->parent = NULL; + + /* + * We don't need rcu_assign_pointer(), since we are simply + * moving the node from one part of the tree to another: if it + * was safe to dereference the old pointer to it + * (node->slots[0]), it will be safe to dereference the new + * one (root->rnode) as far as dependent read barriers go. + */ + root->rnode = child; + + /* + * We have a dilemma here. The node's slot[0] must not be + * NULLed in case there are concurrent lookups expecting to + * find the item. However if this was a bottom-level node, + * then it may be subject to the slot pointer being visible + * to callers dereferencing it. If item corresponding to + * slot[0] is subsequently deleted, these callers would expect + * their slot to become empty sooner or later. + * + * For example, lockless pagecache will look up a slot, deref + * the page pointer, and if the page has 0 refcount it means it + * was concurrently deleted from pagecache so try the deref + * again. Fortunately there is already a requirement for logic + * to retry the entire slot lookup -- the indirect pointer + * problem (replacing direct root node with an indirect pointer + * also results in a stale slot). So tag the slot as indirect + * to force callers to retry. + */ + if (!radix_tree_is_internal_node(child)) + node->slots[0] = RADIX_TREE_RETRY; + + radix_tree_node_free(node); + shrunk = true; + } + + return shrunk; +} + +static bool delete_node(struct radix_tree_root *root, + struct radix_tree_node *node) +{ + bool deleted = false; + + do { + struct radix_tree_node *parent; + + if (node->count) { + if (node == entry_to_node(root->rnode)) + deleted |= radix_tree_shrink(root); + return deleted; + } + + parent = node->parent; + if (parent) { + parent->slots[node->offset] = NULL; + parent->count--; + } else { + root_tag_clear_all(root); + root->rnode = NULL; + } + + radix_tree_node_free(node); + deleted = true; + + node = parent; + } while (node); + + return deleted; +} + /** * __radix_tree_create - create a slot in a radix tree * @root: radix tree root @@ -759,18 +860,20 @@ static void replace_slot(struct radix_tree_root *root, bool warn_typeswitch) { void *old = rcu_dereference_raw(*slot); - int exceptional; + int count, exceptional; WARN_ON_ONCE(radix_tree_is_internal_node(item)); - WARN_ON_ONCE(!!item - !!old); + count = !!item - !!old; exceptional = !!radix_tree_exceptional_entry(item) - !!radix_tree_exceptional_entry(old); - WARN_ON_ONCE(warn_typeswitch && exceptional); + WARN_ON_ONCE(warn_typeswitch && (count || exceptional)); - if (node) + if (node) { + node->count += count; node->exceptional += exceptional; + } rcu_assign_pointer(*slot, item); } @@ -790,12 +893,14 @@ void __radix_tree_replace(struct radix_tree_root *root, void **slot, void *item) { /* - * This function supports replacing exceptional entries, but - * that needs accounting against the node unless the slot is - * root->rnode. + * This function supports replacing exceptional entries and + * deleting entries, but that needs accounting against the + * node unless the slot is root->rnode. */ replace_slot(root, node, slot, item, !node && slot != (void **)&root->rnode); + + delete_node(root, node); } /** @@ -810,8 +915,8 @@ void __radix_tree_replace(struct radix_tree_root *root, * * NOTE: This cannot be used to switch between non-entries (empty slots), * regular entries, and exceptional entries, as that requires accounting - * inside the radix tree node. When switching from one type of entry to - * another, use __radix_tree_lookup() and __radix_tree_replace(). + * inside the radix tree node. When switching from one type of entry or + * deleting, use __radix_tree_lookup() and __radix_tree_replace(). */ void radix_tree_replace_slot(struct radix_tree_root *root, void **slot, void *item) @@ -1466,75 +1571,6 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) } #endif /* CONFIG_SHMEM && CONFIG_SWAP */ -/** - * radix_tree_shrink - shrink radix tree to minimum height - * @root radix tree root - */ -static inline bool radix_tree_shrink(struct radix_tree_root *root) -{ - bool shrunk = false; - - for (;;) { - struct radix_tree_node *node = root->rnode; - struct radix_tree_node *child; - - if (!radix_tree_is_internal_node(node)) - break; - node = entry_to_node(node); - - /* - * The candidate node has more than one child, or its child - * is not at the leftmost slot, or the child is a multiorder - * entry, we cannot shrink. - */ - if (node->count != 1) - break; - child = node->slots[0]; - if (!child) - break; - if (!radix_tree_is_internal_node(child) && node->shift) - break; - - if (radix_tree_is_internal_node(child)) - entry_to_node(child)->parent = NULL; - - /* - * We don't need rcu_assign_pointer(), since we are simply - * moving the node from one part of the tree to another: if it - * was safe to dereference the old pointer to it - * (node->slots[0]), it will be safe to dereference the new - * one (root->rnode) as far as dependent read barriers go. - */ - root->rnode = child; - - /* - * We have a dilemma here. The node's slot[0] must not be - * NULLed in case there are concurrent lookups expecting to - * find the item. However if this was a bottom-level node, - * then it may be subject to the slot pointer being visible - * to callers dereferencing it. If item corresponding to - * slot[0] is subsequently deleted, these callers would expect - * their slot to become empty sooner or later. - * - * For example, lockless pagecache will look up a slot, deref - * the page pointer, and if the page has 0 refcount it means it - * was concurrently deleted from pagecache so try the deref - * again. Fortunately there is already a requirement for logic - * to retry the entire slot lookup -- the indirect pointer - * problem (replacing direct root node with an indirect pointer - * also results in a stale slot). So tag the slot as indirect - * to force callers to retry. - */ - if (!radix_tree_is_internal_node(child)) - node->slots[0] = RADIX_TREE_RETRY; - - radix_tree_node_free(node); - shrunk = true; - } - - return shrunk; -} - /** * __radix_tree_delete_node - try to free node after clearing a slot * @root: radix tree root @@ -1549,33 +1585,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) bool __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node) { - bool deleted = false; - - do { - struct radix_tree_node *parent; - - if (node->count) { - if (node == entry_to_node(root->rnode)) - deleted |= radix_tree_shrink(root); - return deleted; - } - - parent = node->parent; - if (parent) { - parent->slots[node->offset] = NULL; - parent->count--; - } else { - root_tag_clear_all(root); - root->rnode = NULL; - } - - radix_tree_node_free(node); - deleted = true; - - node = parent; - } while (node); - - return deleted; + return delete_node(root, node); } static inline void delete_sibling_entries(struct radix_tree_node *node, @@ -1632,12 +1642,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root, node_tag_clear(root, node, tag, offset); delete_sibling_entries(node, node_to_entry(slot), offset); - node->slots[offset] = NULL; - node->count--; - if (radix_tree_exceptional_entry(entry)) - node->exceptional--; - - __radix_tree_delete_node(root, node); + __radix_tree_replace(root, node, slot, NULL); return entry; } From 4d693d08607ab319095ec8942909df4b4aebdf66 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 12 Dec 2016 16:43:49 -0800 Subject: [PATCH 065/123] lib: radix-tree: update callback for changing leaf nodes Support handing __radix_tree_replace() a callback that gets invoked for all leaf nodes that change or get freed as a result of the slot replacement, to assist users tracking nodes with node->private_list. This prepares for putting page cache shadow entries into the radix tree root again and drastically simplifying the shadow tracking. Link: http://lkml.kernel.org/r/20161117193134.GD23430@cmpxchg.org Signed-off-by: Johannes Weiner Suggested-by: Jan Kara Reviewed-by: Jan Kara Cc: Kirill A. Shutemov Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 3 ++- include/linux/radix-tree.h | 4 +++- lib/radix-tree.c | 42 ++++++++++++++++++++++++++------------ mm/shmem.c | 3 ++- 4 files changed, 36 insertions(+), 16 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 85930c2a2749..6916ed37d463 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -649,7 +649,8 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, ret = __radix_tree_lookup(page_tree, index, &node, &slot); WARN_ON_ONCE(ret != entry); - __radix_tree_replace(page_tree, node, slot, new_entry); + __radix_tree_replace(page_tree, node, slot, + new_entry, NULL, NULL); } if (vmf->flags & FAULT_FLAG_WRITE) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 2d1b9b8be983..15c972ea9510 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -263,9 +263,11 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, struct radix_tree_node **nodep, void ***slotp); void *radix_tree_lookup(struct radix_tree_root *, unsigned long); void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); +typedef void (*radix_tree_update_node_t)(struct radix_tree_node *, void *); void __radix_tree_replace(struct radix_tree_root *root, struct radix_tree_node *node, - void **slot, void *item); + void **slot, void *item, + radix_tree_update_node_t update_node, void *private); void radix_tree_replace_slot(struct radix_tree_root *root, void **slot, void *item); bool __radix_tree_delete_node(struct radix_tree_root *root, diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 5d8930f3b3d8..df4ff18dd63c 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -325,7 +325,6 @@ static void radix_tree_node_rcu_free(struct rcu_head *head) tag_clear(node, i, 0); node->slots[0] = NULL; - node->count = 0; kmem_cache_free(radix_tree_node_cachep, node); } @@ -542,7 +541,9 @@ out: * radix_tree_shrink - shrink radix tree to minimum height * @root radix tree root */ -static inline bool radix_tree_shrink(struct radix_tree_root *root) +static inline bool radix_tree_shrink(struct radix_tree_root *root, + radix_tree_update_node_t update_node, + void *private) { bool shrunk = false; @@ -597,8 +598,12 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) * also results in a stale slot). So tag the slot as indirect * to force callers to retry. */ - if (!radix_tree_is_internal_node(child)) + node->count = 0; + if (!radix_tree_is_internal_node(child)) { node->slots[0] = RADIX_TREE_RETRY; + if (update_node) + update_node(node, private); + } radix_tree_node_free(node); shrunk = true; @@ -608,7 +613,8 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) } static bool delete_node(struct radix_tree_root *root, - struct radix_tree_node *node) + struct radix_tree_node *node, + radix_tree_update_node_t update_node, void *private) { bool deleted = false; @@ -617,7 +623,8 @@ static bool delete_node(struct radix_tree_root *root, if (node->count) { if (node == entry_to_node(root->rnode)) - deleted |= radix_tree_shrink(root); + deleted |= radix_tree_shrink(root, update_node, + private); return deleted; } @@ -880,17 +887,20 @@ static void replace_slot(struct radix_tree_root *root, /** * __radix_tree_replace - replace item in a slot - * @root: radix tree root - * @node: pointer to tree node - * @slot: pointer to slot in @node - * @item: new item to store in the slot. + * @root: radix tree root + * @node: pointer to tree node + * @slot: pointer to slot in @node + * @item: new item to store in the slot. + * @update_node: callback for changing leaf nodes + * @private: private data to pass to @update_node * * For use with __radix_tree_lookup(). Caller must hold tree write locked * across slot lookup and replacement. */ void __radix_tree_replace(struct radix_tree_root *root, struct radix_tree_node *node, - void **slot, void *item) + void **slot, void *item, + radix_tree_update_node_t update_node, void *private) { /* * This function supports replacing exceptional entries and @@ -900,7 +910,13 @@ void __radix_tree_replace(struct radix_tree_root *root, replace_slot(root, node, slot, item, !node && slot != (void **)&root->rnode); - delete_node(root, node); + if (!node) + return; + + if (update_node) + update_node(node, private); + + delete_node(root, node, update_node, private); } /** @@ -1585,7 +1601,7 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) bool __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node) { - return delete_node(root, node); + return delete_node(root, node, NULL, NULL); } static inline void delete_sibling_entries(struct radix_tree_node *node, @@ -1642,7 +1658,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root, node_tag_clear(root, node, tag, offset); delete_sibling_entries(node, node_to_entry(slot), offset); - __radix_tree_replace(root, node, slot, NULL); + __radix_tree_replace(root, node, slot, NULL, NULL, NULL); return entry; } diff --git a/mm/shmem.c b/mm/shmem.c index 3149ddee8f55..abd7403aba41 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -311,7 +311,8 @@ static int shmem_radix_tree_replace(struct address_space *mapping, return -ENOENT; if (item != expected) return -ENOENT; - __radix_tree_replace(&mapping->page_tree, node, pslot, replacement); + __radix_tree_replace(&mapping->page_tree, node, pslot, + replacement, NULL, NULL); return 0; } From 14b468791fa955d442f962fdf5207dfd39a131c8 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 12 Dec 2016 16:43:52 -0800 Subject: [PATCH 066/123] mm: workingset: move shadow entry tracking to radix tree exceptional tracking Currently, we track the shadow entries in the page cache in the upper bits of the radix_tree_node->count, behind the back of the radix tree implementation. Because the radix tree code has no awareness of them, we rely on random subtleties throughout the implementation (such as the node->count != 1 check in the shrinking code, which is meant to exclude multi-entry nodes but also happens to skip nodes with only one shadow entry, as that's accounted in the upper bits). This is error prone and has, in fact, caused the bug fixed in d3798ae8c6f3 ("mm: filemap: don't plant shadow entries without radix tree node"). To remove these subtleties, this patch moves shadow entry tracking from the upper bits of node->count to the existing counter for exceptional entries. node->count goes back to being a simple counter of valid entries in the tree node and can be shrunk to a single byte. This vastly simplifies the page cache code. All accounting happens natively inside the radix tree implementation, and maintaining the LRU linkage of shadow nodes is consolidated into a single function in the workingset code that is called for leaf nodes affected by a change in the page cache tree. This also removes the last user of the __radix_delete_node() return value. Eliminate it. Link: http://lkml.kernel.org/r/20161117193211.GE23430@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Jan Kara Cc: Kirill A. Shutemov Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 8 ++---- include/linux/swap.h | 34 +---------------------- lib/radix-tree.c | 25 ++++------------- mm/filemap.c | 54 ++++-------------------------------- mm/truncate.c | 21 ++------------ mm/workingset.c | 56 +++++++++++++++++++++++++++++--------- 6 files changed, 60 insertions(+), 138 deletions(-) diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 15c972ea9510..744486057e9e 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -80,14 +80,10 @@ static inline bool radix_tree_is_internal_node(void *ptr) #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ RADIX_TREE_MAP_SHIFT)) -/* Internally used bits of node->count */ -#define RADIX_TREE_COUNT_SHIFT (RADIX_TREE_MAP_SHIFT + 1) -#define RADIX_TREE_COUNT_MASK ((1UL << RADIX_TREE_COUNT_SHIFT) - 1) - struct radix_tree_node { unsigned char shift; /* Bits remaining in each slot */ unsigned char offset; /* Slot offset in parent */ - unsigned int count; /* Total entry count */ + unsigned char count; /* Total entry count */ unsigned char exceptional; /* Exceptional entry count */ union { struct { @@ -270,7 +266,7 @@ void __radix_tree_replace(struct radix_tree_root *root, radix_tree_update_node_t update_node, void *private); void radix_tree_replace_slot(struct radix_tree_root *root, void **slot, void *item); -bool __radix_tree_delete_node(struct radix_tree_root *root, +void __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); void *radix_tree_delete(struct radix_tree_root *, unsigned long); diff --git a/include/linux/swap.h b/include/linux/swap.h index a56523cefb9b..09b212d37f1d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -246,39 +246,7 @@ struct swap_info_struct { void *workingset_eviction(struct address_space *mapping, struct page *page); bool workingset_refault(void *shadow); void workingset_activation(struct page *page); -extern struct list_lru workingset_shadow_nodes; - -static inline unsigned int workingset_node_pages(struct radix_tree_node *node) -{ - return node->count & RADIX_TREE_COUNT_MASK; -} - -static inline void workingset_node_pages_inc(struct radix_tree_node *node) -{ - node->count++; -} - -static inline void workingset_node_pages_dec(struct radix_tree_node *node) -{ - VM_WARN_ON_ONCE(!workingset_node_pages(node)); - node->count--; -} - -static inline unsigned int workingset_node_shadows(struct radix_tree_node *node) -{ - return node->count >> RADIX_TREE_COUNT_SHIFT; -} - -static inline void workingset_node_shadows_inc(struct radix_tree_node *node) -{ - node->count += 1U << RADIX_TREE_COUNT_SHIFT; -} - -static inline void workingset_node_shadows_dec(struct radix_tree_node *node) -{ - VM_WARN_ON_ONCE(!workingset_node_shadows(node)); - node->count -= 1U << RADIX_TREE_COUNT_SHIFT; -} +void workingset_update_node(struct radix_tree_node *node, void *private); /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; diff --git a/lib/radix-tree.c b/lib/radix-tree.c index df4ff18dd63c..9dbfaac05e6c 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -541,12 +541,10 @@ out: * radix_tree_shrink - shrink radix tree to minimum height * @root radix tree root */ -static inline bool radix_tree_shrink(struct radix_tree_root *root, +static inline void radix_tree_shrink(struct radix_tree_root *root, radix_tree_update_node_t update_node, void *private) { - bool shrunk = false; - for (;;) { struct radix_tree_node *node = root->rnode; struct radix_tree_node *child; @@ -606,26 +604,20 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root, } radix_tree_node_free(node); - shrunk = true; } - - return shrunk; } -static bool delete_node(struct radix_tree_root *root, +static void delete_node(struct radix_tree_root *root, struct radix_tree_node *node, radix_tree_update_node_t update_node, void *private) { - bool deleted = false; - do { struct radix_tree_node *parent; if (node->count) { if (node == entry_to_node(root->rnode)) - deleted |= radix_tree_shrink(root, update_node, - private); - return deleted; + radix_tree_shrink(root, update_node, private); + return; } parent = node->parent; @@ -638,12 +630,9 @@ static bool delete_node(struct radix_tree_root *root, } radix_tree_node_free(node); - deleted = true; node = parent; } while (node); - - return deleted; } /** @@ -1595,13 +1584,11 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) * After clearing the slot at @index in @node from radix tree * rooted at @root, call this function to attempt freeing the * node and shrinking the tree. - * - * Returns %true if @node was freed, %false otherwise. */ -bool __radix_tree_delete_node(struct radix_tree_root *root, +void __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node) { - return delete_node(root, node, NULL, NULL); + delete_node(root, node, NULL, NULL); } static inline void delete_sibling_entries(struct radix_tree_node *node, diff --git a/mm/filemap.c b/mm/filemap.c index 1ba726aef708..dc3e5fce0b7b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -132,37 +132,19 @@ static int page_cache_tree_insert(struct address_space *mapping, if (!dax_mapping(mapping)) { if (shadowp) *shadowp = p; - if (node) - workingset_node_shadows_dec(node); } else { /* DAX can replace empty locked entry with a hole */ WARN_ON_ONCE(p != (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | RADIX_DAX_ENTRY_LOCK)); - /* DAX accounts exceptional entries as normal pages */ - if (node) - workingset_node_pages_dec(node); /* Wakeup waiters for exceptional entry lock */ dax_wake_mapping_entry_waiter(mapping, page->index, false); } } - radix_tree_replace_slot(&mapping->page_tree, slot, page); + __radix_tree_replace(&mapping->page_tree, node, slot, page, + workingset_update_node, mapping); mapping->nrpages++; - if (node) { - workingset_node_pages_inc(node); - /* - * Don't track node that contains actual pages. - * - * Avoid acquiring the list_lru lock if already - * untracked. The list_empty() test is safe as - * node->private_list is protected by - * mapping->tree_lock. - */ - if (!list_empty(&node->private_list)) - list_lru_del(&workingset_shadow_nodes, - &node->private_list); - } return 0; } @@ -185,8 +167,6 @@ static void page_cache_tree_delete(struct address_space *mapping, __radix_tree_lookup(&mapping->page_tree, page->index + i, &node, &slot); - radix_tree_clear_tags(&mapping->page_tree, node, slot); - if (!node) { VM_BUG_ON_PAGE(nr != 1, page); /* @@ -196,33 +176,9 @@ static void page_cache_tree_delete(struct address_space *mapping, shadow = NULL; } - radix_tree_replace_slot(&mapping->page_tree, slot, shadow); - - if (!node) - break; - - workingset_node_pages_dec(node); - if (shadow) - workingset_node_shadows_inc(node); - else - if (__radix_tree_delete_node(&mapping->page_tree, node)) - continue; - - /* - * Track node that only contains shadow entries. DAX mappings - * contain no shadow entries and may contain other exceptional - * entries so skip those. - * - * Avoid acquiring the list_lru lock if already tracked. - * The list_empty() test is safe as node->private_list is - * protected by mapping->tree_lock. - */ - if (!dax_mapping(mapping) && !workingset_node_pages(node) && - list_empty(&node->private_list)) { - node->private_data = mapping; - list_lru_add(&workingset_shadow_nodes, - &node->private_list); - } + radix_tree_clear_tags(&mapping->page_tree, node, slot); + __radix_tree_replace(&mapping->page_tree, node, slot, shadow, + workingset_update_node, mapping); } if (shadow) { diff --git a/mm/truncate.c b/mm/truncate.c index 3c631c357873..fd97f1dbce29 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -44,28 +44,13 @@ static void clear_exceptional_entry(struct address_space *mapping, * without the tree itself locked. These unlocked entries * need verification under the tree lock. */ - if (!__radix_tree_lookup(&mapping->page_tree, index, &node, - &slot)) + if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) goto unlock; if (*slot != entry) goto unlock; - radix_tree_replace_slot(&mapping->page_tree, slot, NULL); + __radix_tree_replace(&mapping->page_tree, node, slot, NULL, + workingset_update_node, mapping); mapping->nrexceptional--; - if (!node) - goto unlock; - workingset_node_shadows_dec(node); - /* - * Don't track node without shadow entries. - * - * Avoid acquiring the list_lru lock if already untracked. - * The list_empty() test is safe as node->private_list is - * protected by mapping->tree_lock. - */ - if (!workingset_node_shadows(node) && - !list_empty(&node->private_list)) - list_lru_del(&workingset_shadow_nodes, - &node->private_list); - __radix_tree_delete_node(&mapping->page_tree, node); unlock: spin_unlock_irq(&mapping->tree_lock); } diff --git a/mm/workingset.c b/mm/workingset.c index 98f830897b1b..ef556bf1323d 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -334,18 +335,45 @@ out: * point where they would still be useful. */ -struct list_lru workingset_shadow_nodes; +static struct list_lru shadow_nodes; + +void workingset_update_node(struct radix_tree_node *node, void *private) +{ + struct address_space *mapping = private; + + /* Only regular page cache has shadow entries */ + if (dax_mapping(mapping) || shmem_mapping(mapping)) + return; + + /* + * Track non-empty nodes that contain only shadow entries; + * unlink those that contain pages or are being freed. + * + * Avoid acquiring the list_lru lock when the nodes are + * already where they should be. The list_empty() test is safe + * as node->private_list is protected by &mapping->tree_lock. + */ + if (node->count && node->count == node->exceptional) { + if (list_empty(&node->private_list)) { + node->private_data = mapping; + list_lru_add(&shadow_nodes, &node->private_list); + } + } else { + if (!list_empty(&node->private_list)) + list_lru_del(&shadow_nodes, &node->private_list); + } +} static unsigned long count_shadow_nodes(struct shrinker *shrinker, struct shrink_control *sc) { - unsigned long shadow_nodes; unsigned long max_nodes; + unsigned long nodes; unsigned long pages; /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ local_irq_disable(); - shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc); + nodes = list_lru_shrink_count(&shadow_nodes, sc); local_irq_enable(); if (sc->memcg) { @@ -372,10 +400,10 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, */ max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3); - if (shadow_nodes <= max_nodes) + if (nodes <= max_nodes) return 0; - return shadow_nodes - max_nodes; + return nodes - max_nodes; } static enum lru_status shadow_lru_isolate(struct list_head *item, @@ -418,22 +446,25 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, * no pages, so we expect to be able to remove them all and * delete and free the empty node afterwards. */ - if (WARN_ON_ONCE(!workingset_node_shadows(node))) + if (WARN_ON_ONCE(!node->exceptional)) goto out_invalid; - if (WARN_ON_ONCE(workingset_node_pages(node))) + if (WARN_ON_ONCE(node->count != node->exceptional)) goto out_invalid; for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { if (node->slots[i]) { if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i]))) goto out_invalid; + if (WARN_ON_ONCE(!node->exceptional)) + goto out_invalid; if (WARN_ON_ONCE(!mapping->nrexceptional)) goto out_invalid; node->slots[i] = NULL; - workingset_node_shadows_dec(node); + node->exceptional--; + node->count--; mapping->nrexceptional--; } } - if (WARN_ON_ONCE(workingset_node_shadows(node))) + if (WARN_ON_ONCE(node->exceptional)) goto out_invalid; inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); __radix_tree_delete_node(&mapping->page_tree, node); @@ -456,8 +487,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker, /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ local_irq_disable(); - ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc, - shadow_lru_isolate, NULL); + ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL); local_irq_enable(); return ret; } @@ -496,7 +526,7 @@ static int __init workingset_init(void) pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", timestamp_bits, max_order, bucket_order); - ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key); + ret = list_lru_init_key(&shadow_nodes, &shadow_nodes_key); if (ret) goto err; ret = register_shrinker(&workingset_shadow_shrinker); @@ -504,7 +534,7 @@ static int __init workingset_init(void) goto err_list_lru; return 0; err_list_lru: - list_lru_destroy(&workingset_shadow_nodes); + list_lru_destroy(&shadow_nodes); err: return ret; } From dbc446b88e7041cb1d076e51726ccee497cb6ee3 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 12 Dec 2016 16:43:55 -0800 Subject: [PATCH 067/123] mm: workingset: restore refault tracking for single-page files Shadow entries in the page cache used to be accounted behind the radix tree implementation's back in the upper bits of node->count, and the radix tree code extending a single-entry tree with a shadow entry in root->rnode would corrupt that counter. As a result, we could not put shadow entries at index 0 if the tree didn't have any other entries, and that means no refault detection for any single-page file. Now that the shadow entries are tracked natively in the radix tree's exceptional counter, this is no longer necessary. Extending and shrinking the tree from and to single entries in root->rnode now does the right thing when the entry is exceptional, remove that limitation. Link: http://lkml.kernel.org/r/20161117193244.GF23430@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Jan Kara Cc: Kirill A. Shutemov Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index dc3e5fce0b7b..5b4dd03130da 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -167,14 +167,7 @@ static void page_cache_tree_delete(struct address_space *mapping, __radix_tree_lookup(&mapping->page_tree, page->index + i, &node, &slot); - if (!node) { - VM_BUG_ON_PAGE(nr != 1, page); - /* - * We need a node to properly account shadow - * entries. Don't plant any without. XXX - */ - shadow = NULL; - } + VM_BUG_ON_PAGE(!node && nr != 1, page); radix_tree_clear_tags(&mapping->page_tree, node, slot); __radix_tree_replace(&mapping->page_tree, node, slot, shadow, From b53889987862b69179560e50992f7ea8a5eb61ed Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 12 Dec 2016 16:43:58 -0800 Subject: [PATCH 068/123] mm: workingset: update shadow limit to reflect bigger active list Since commit 59dc76b0d4df ("mm: vmscan: reduce size of inactive file list") the size of the active file list is no longer limited to half of memory. Increase the shadow node limit accordingly to avoid throwing out shadow entries that might still result in eligible refaults. The exact size of the active list now depends on the overall size of the page cache, but converges toward taking up most of the space: In mm/vmscan.c::inactive_list_is_low(), * total target max * memory ratio inactive * ------------------------------------- * 10MB 1 5MB * 100MB 1 50MB * 1GB 3 250MB * 10GB 10 0.9GB * 100GB 31 3GB * 1TB 101 10GB * 10TB 320 32GB It would be possible to apply the same precise ratios when determining the limit for radix tree nodes containing shadow entries, but since it is merely an approximation of the oldest refault distances in the wild and the code also makes assumptions about the node population density, keep it simple and always target the full cache size. While at it, clarify the comment and the formula for memory footprint. Link: http://lkml.kernel.org/r/20161117214701.29000-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/workingset.c | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/mm/workingset.c b/mm/workingset.c index ef556bf1323d..241fa5d6b3b2 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -369,40 +369,46 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, { unsigned long max_nodes; unsigned long nodes; - unsigned long pages; + unsigned long cache; /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ local_irq_disable(); nodes = list_lru_shrink_count(&shadow_nodes, sc); local_irq_enable(); - if (sc->memcg) { - pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, - LRU_ALL_FILE); - } else { - pages = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) + - node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE); - } - /* - * Active cache pages are limited to 50% of memory, and shadow - * entries that represent a refault distance bigger than that - * do not have any effect. Limit the number of shadow nodes - * such that shadow entries do not exceed the number of active - * cache pages, assuming a worst-case node population density - * of 1/8th on average. + * Approximate a reasonable limit for the radix tree nodes + * containing shadow entries. We don't need to keep more + * shadow entries than possible pages on the active list, + * since refault distances bigger than that are dismissed. + * + * The size of the active list converges toward 100% of + * overall page cache as memory grows, with only a tiny + * inactive list. Assume the total cache size for that. + * + * Nodes might be sparsely populated, with only one shadow + * entry in the extreme case. Obviously, we cannot keep one + * node for every eligible shadow entry, so compromise on a + * worst-case density of 1/8th. Below that, not all eligible + * refaults can be detected anymore. * * On 64-bit with 7 radix_tree_nodes per page and 64 slots * each, this will reclaim shadow entries when they consume - * ~2% of available memory: + * ~1.8% of available memory: * - * PAGE_SIZE / radix_tree_nodes / node_entries / PAGE_SIZE + * PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE */ - max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3); + if (sc->memcg) { + cache = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, + LRU_ALL_FILE); + } else { + cache = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) + + node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE); + } + max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3); if (nodes <= max_nodes) return 0; - return nodes - max_nodes; } From c8eef01e2f98e09a6733f2acdc675b4cf87a22a1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Dec 2016 16:44:01 -0800 Subject: [PATCH 069/123] mm: remove free_unmap_vmap_area_noflush() Patch series "reduce latency in __purge_vmap_area_lazy", v2. This patch (of 10): Sort out the long lock hold times in __purge_vmap_area_lazy. It is based on a patch from Joel. Inline free_unmap_vmap_area_noflush() it into the only caller. Link: http://lkml.kernel.org/r/1479474236-4139-2-git-send-email-hch@lst.de Signed-off-by: Christoph Hellwig Tested-by: Jisheng Zhang Cc: Andrey Ryabinin Cc: Joel Fernandes Cc: Chris Wilson Cc: John Dias Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e73948afac70..c3261143a0af 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -710,23 +710,14 @@ static void free_vmap_area_noflush(struct vmap_area *va) try_purge_vmap_area_lazy(); } -/* - * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been - * called for the correct range previously. - */ -static void free_unmap_vmap_area_noflush(struct vmap_area *va) -{ - unmap_vmap_area(va); - free_vmap_area_noflush(va); -} - /* * Free and unmap a vmap area */ static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); - free_unmap_vmap_area_noflush(va); + unmap_vmap_area(va); + free_vmap_area_noflush(va); } static struct vmap_area *find_vmap_area(unsigned long addr) From 9c3acf6043ac437ae0a45de4657ee700c3dc8850 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Dec 2016 16:44:04 -0800 Subject: [PATCH 070/123] mm: remove free_unmap_vmap_area_addr() Just inline it into the only caller. Link: http://lkml.kernel.org/r/1479474236-4139-3-git-send-email-hch@lst.de Signed-off-by: Christoph Hellwig Tested-by: Jisheng Zhang Cc: Andrey Ryabinin Cc: Joel Fernandes Cc: Chris Wilson Cc: John Dias Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index c3261143a0af..842ea986adcd 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -731,16 +731,6 @@ static struct vmap_area *find_vmap_area(unsigned long addr) return va; } -static void free_unmap_vmap_area_addr(unsigned long addr) -{ - struct vmap_area *va; - - va = find_vmap_area(addr); - BUG_ON(!va); - free_unmap_vmap_area(va); -} - - /*** Per cpu kva allocator ***/ /* @@ -1098,6 +1088,7 @@ void vm_unmap_ram(const void *mem, unsigned int count) { unsigned long size = (unsigned long)count << PAGE_SHIFT; unsigned long addr = (unsigned long)mem; + struct vmap_area *va; BUG_ON(!addr); BUG_ON(addr < VMALLOC_START); @@ -1107,10 +1098,14 @@ void vm_unmap_ram(const void *mem, unsigned int count) debug_check_no_locks_freed(mem, size); vmap_debug_free_range(addr, addr+size); - if (likely(count <= VMAP_MAX_ALLOC)) + if (likely(count <= VMAP_MAX_ALLOC)) { vb_free(mem, size); - else - free_unmap_vmap_area_addr(addr); + return; + } + + va = find_vmap_area(addr); + BUG_ON(!va); + free_unmap_vmap_area(va); } EXPORT_SYMBOL(vm_unmap_ram); From 0574ecd141df28d573d4364adec59766ddf5f38d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Dec 2016 16:44:07 -0800 Subject: [PATCH 071/123] mm: refactor __purge_vmap_area_lazy() Move the purge_lock synchronization to the callers, move the call to purge_fragmented_blocks_allcpus at the beginning of the function to the callers that need it, move the force_flush behavior to the caller that needs it, and pass start and end by value instead of by reference. No change in behavior. Link: http://lkml.kernel.org/r/1479474236-4139-4-git-send-email-hch@lst.de Signed-off-by: Christoph Hellwig Tested-by: Jisheng Zhang Cc: Andrey Ryabinin Cc: Joel Fernandes Cc: Chris Wilson Cc: John Dias Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 80 +++++++++++++++++++++++----------------------------- 1 file changed, 35 insertions(+), 45 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 842ea986adcd..1f5501b43026 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -601,6 +601,13 @@ static unsigned long lazy_max_pages(void) static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); +/* + * Serialize vmap purging. There is no actual criticial section protected + * by this look, but we want to avoid concurrent calls for performance + * reasons and to make the pcpu_get_vm_areas more deterministic. + */ +static DEFINE_SPINLOCK(vmap_purge_lock); + /* for per-CPU blocks */ static void purge_fragmented_blocks_allcpus(void); @@ -615,59 +622,36 @@ void set_iounmap_nonlazy(void) /* * Purges all lazily-freed vmap areas. - * - * If sync is 0 then don't purge if there is already a purge in progress. - * If force_flush is 1, then flush kernel TLBs between *start and *end even - * if we found no lazy vmap areas to unmap (callers can use this to optimise - * their own TLB flushing). - * Returns with *start = min(*start, lowest purged address) - * *end = max(*end, highest purged address) */ -static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, - int sync, int force_flush) +static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) { - static DEFINE_SPINLOCK(purge_lock); struct llist_node *valist; struct vmap_area *va; struct vmap_area *n_va; int nr = 0; - /* - * If sync is 0 but force_flush is 1, we'll go sync anyway but callers - * should not expect such behaviour. This just simplifies locking for - * the case that isn't actually used at the moment anyway. - */ - if (!sync && !force_flush) { - if (!spin_trylock(&purge_lock)) - return; - } else - spin_lock(&purge_lock); - - if (sync) - purge_fragmented_blocks_allcpus(); + lockdep_assert_held(&vmap_purge_lock); valist = llist_del_all(&vmap_purge_list); llist_for_each_entry(va, valist, purge_list) { - if (va->va_start < *start) - *start = va->va_start; - if (va->va_end > *end) - *end = va->va_end; + if (va->va_start < start) + start = va->va_start; + if (va->va_end > end) + end = va->va_end; nr += (va->va_end - va->va_start) >> PAGE_SHIFT; } - if (nr) - atomic_sub(nr, &vmap_lazy_nr); + if (!nr) + return false; - if (nr || force_flush) - flush_tlb_kernel_range(*start, *end); + atomic_sub(nr, &vmap_lazy_nr); + flush_tlb_kernel_range(start, end); - if (nr) { - spin_lock(&vmap_area_lock); - llist_for_each_entry_safe(va, n_va, valist, purge_list) - __free_vmap_area(va); - spin_unlock(&vmap_area_lock); - } - spin_unlock(&purge_lock); + spin_lock(&vmap_area_lock); + llist_for_each_entry_safe(va, n_va, valist, purge_list) + __free_vmap_area(va); + spin_unlock(&vmap_area_lock); + return true; } /* @@ -676,9 +660,10 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, */ static void try_purge_vmap_area_lazy(void) { - unsigned long start = ULONG_MAX, end = 0; - - __purge_vmap_area_lazy(&start, &end, 0, 0); + if (spin_trylock(&vmap_purge_lock)) { + __purge_vmap_area_lazy(ULONG_MAX, 0); + spin_unlock(&vmap_purge_lock); + } } /* @@ -686,9 +671,10 @@ static void try_purge_vmap_area_lazy(void) */ static void purge_vmap_area_lazy(void) { - unsigned long start = ULONG_MAX, end = 0; - - __purge_vmap_area_lazy(&start, &end, 1, 0); + spin_lock(&vmap_purge_lock); + purge_fragmented_blocks_allcpus(); + __purge_vmap_area_lazy(ULONG_MAX, 0); + spin_unlock(&vmap_purge_lock); } /* @@ -1075,7 +1061,11 @@ void vm_unmap_aliases(void) rcu_read_unlock(); } - __purge_vmap_area_lazy(&start, &end, 1, flush); + spin_lock(&vmap_purge_lock); + purge_fragmented_blocks_allcpus(); + if (!__purge_vmap_area_lazy(start, end) && flush) + flush_tlb_kernel_range(start, end); + spin_unlock(&vmap_purge_lock); } EXPORT_SYMBOL_GPL(vm_unmap_aliases); From bf22e37a641327e34681b7b6959d9646e3886770 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Mon, 12 Dec 2016 16:44:10 -0800 Subject: [PATCH 072/123] mm: add vfree_atomic() We are going to use sleeping lock for freeing vmap. However some vfree() users want to free memory from atomic (but not from interrupt) context. For this we add vfree_atomic() - deferred variation of vfree() which can be used in any atomic context (except NMIs). [akpm@linux-foundation.org: tweak comment grammar] [aryabinin@virtuozzo.com: use raw_cpu_ptr() instead of this_cpu_ptr()] Link: http://lkml.kernel.org/r/1481553981-3856-1-git-send-email-aryabinin@virtuozzo.com Link: http://lkml.kernel.org/r/1479474236-4139-5-git-send-email-hch@lst.de Signed-off-by: Andrey Ryabinin Signed-off-by: Christoph Hellwig Cc: Joel Fernandes Cc: Jisheng Zhang Cc: Chris Wilson Cc: John Dias Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 1 + mm/vmalloc.c | 42 +++++++++++++++++++++++++++++++++++------ 2 files changed, 37 insertions(+), 6 deletions(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 3d9d786a943c..d68edffbf142 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -82,6 +82,7 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align, const void *caller); extern void vfree(const void *addr); +extern void vfree_atomic(const void *addr); extern void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 1f5501b43026..4ac776f10ad1 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1486,7 +1486,39 @@ static void __vunmap(const void *addr, int deallocate_pages) kfree(area); return; } - + +static inline void __vfree_deferred(const void *addr) +{ + /* + * Use raw_cpu_ptr() because this can be called from preemptible + * context. Preemption is absolutely fine here, because the llist_add() + * implementation is lockless, so it works even if we are adding to + * nother cpu's list. schedule_work() should be fine with this too. + */ + struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); + + if (llist_add((struct llist_node *)addr, &p->list)) + schedule_work(&p->wq); +} + +/** + * vfree_atomic - release memory allocated by vmalloc() + * @addr: memory base address + * + * This one is just like vfree() but can be called in any atomic context + * except NMIs. + */ +void vfree_atomic(const void *addr) +{ + BUG_ON(in_nmi()); + + kmemleak_free(addr); + + if (!addr) + return; + __vfree_deferred(addr); +} + /** * vfree - release memory allocated by vmalloc() * @addr: memory base address @@ -1509,11 +1541,9 @@ void vfree(const void *addr) if (!addr) return; - if (unlikely(in_interrupt())) { - struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred); - if (llist_add((struct llist_node *)addr, &p->list)) - schedule_work(&p->wq); - } else + if (unlikely(in_interrupt())) + __vfree_deferred(addr); + else __vunmap(addr, 1); } EXPORT_SYMBOL(vfree); From 0f110a9b956c1678b53986b003d59794604807ba Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Mon, 12 Dec 2016 16:44:14 -0800 Subject: [PATCH 073/123] kernel/fork: use vfree_atomic() to free thread stack vfree() is going to use sleeping lock. Thread stack freed in atomic context, therefore we must use vfree_atomic() here. Link: http://lkml.kernel.org/r/1479474236-4139-6-git-send-email-hch@lst.de Signed-off-by: Andrey Ryabinin Signed-off-by: Christoph Hellwig Cc: Joel Fernandes Cc: Jisheng Zhang Cc: Chris Wilson Cc: John Dias Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/fork.c b/kernel/fork.c index 7ffa16033ded..00492b22adfe 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -229,7 +229,7 @@ static inline void free_thread_stack(struct task_struct *tsk) } local_irq_restore(flags); - vfree(tsk->stack); + vfree_atomic(tsk->stack); return; } #endif From 8d5341a6260a59cf15c4ae0efbf0bcd8e1b8a6bb Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Mon, 12 Dec 2016 16:44:17 -0800 Subject: [PATCH 074/123] x86/ldt: use vfree_atomic() to free ldt entries vfree() is going to use sleeping lock. free_ldt_struct() may be called with disabled preemption, therefore we must use vfree_atomic() here. E.g. call trace: vfree() free_ldt_struct() destroy_context_ldt() __mmdrop() finish_task_switch() schedule_tail() ret_from_fork() Link: http://lkml.kernel.org/r/1479474236-4139-7-git-send-email-hch@lst.de Signed-off-by: Andrey Ryabinin Signed-off-by: Christoph Hellwig Cc: Joel Fernandes Cc: Jisheng Zhang Cc: Chris Wilson Cc: John Dias Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/ldt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 6707039b9032..4d12cdf2b453 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -93,7 +93,7 @@ static void free_ldt_struct(struct ldt_struct *ldt) paravirt_free_ldt(ldt->entries, ldt->size); if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(ldt->entries); + vfree_atomic(ldt->entries); else free_page((unsigned long)ldt->entries); kfree(ldt); From 5803ed292e63a1bf00722d6655d0229794607183 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Dec 2016 16:44:20 -0800 Subject: [PATCH 075/123] mm: mark all calls into the vmalloc subsystem as potentially sleeping We will take a sleeping lock in later in this series, so this adds the proper safeguards. Link: http://lkml.kernel.org/r/1479474236-4139-9-git-send-email-hch@lst.de Signed-off-by: Christoph Hellwig Tested-by: Jisheng Zhang Cc: Andrey Ryabinin Cc: Joel Fernandes Cc: Chris Wilson Cc: John Dias Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4ac776f10ad1..3308007d8427 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -365,7 +365,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, BUG_ON(offset_in_page(size)); BUG_ON(!is_power_of_2(align)); - might_sleep_if(gfpflags_allow_blocking(gfp_mask)); + might_sleep(); va = kmalloc_node(sizeof(struct vmap_area), gfp_mask & GFP_RECLAIM_MASK, node); @@ -1037,6 +1037,8 @@ void vm_unmap_aliases(void) if (unlikely(!vmap_initialized)) return; + might_sleep(); + for_each_possible_cpu(cpu) { struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); struct vmap_block *vb; @@ -1080,6 +1082,7 @@ void vm_unmap_ram(const void *mem, unsigned int count) unsigned long addr = (unsigned long)mem; struct vmap_area *va; + might_sleep(); BUG_ON(!addr); BUG_ON(addr < VMALLOC_START); BUG_ON(addr > VMALLOC_END); @@ -1431,6 +1434,8 @@ struct vm_struct *remove_vm_area(const void *addr) { struct vmap_area *va; + might_sleep(); + va = find_vmap_area((unsigned long)addr); if (va && va->flags & VM_VM_AREA) { struct vm_struct *vm = va->vm; From f9e09977671b618aeb25ddc0d4c9a84d5b5cde9d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Dec 2016 16:44:23 -0800 Subject: [PATCH 076/123] mm: turn vmap_purge_lock into a mutex The purge_lock spinlock causes high latencies with non RT kernel. This has been reported multiple times on lkml [1] [2] and affects applications like audio. This patch replaces it with a mutex to allow preemption while holding the lock. Thanks to Joel Fernandes for the detailed report and analysis as well as an earlier attempt at fixing this issue. [1] http://lists.openwall.net/linux-kernel/2016/03/23/29 [2] https://lkml.org/lkml/2016/10/9/59 Link: http://lkml.kernel.org/r/1479474236-4139-10-git-send-email-hch@lst.de Signed-off-by: Christoph Hellwig Tested-by: Jisheng Zhang Cc: Andrey Ryabinin Cc: Joel Fernandes Cc: Chris Wilson Cc: John Dias Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 3308007d8427..d3c1f5ee48b4 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -606,7 +606,7 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); * by this look, but we want to avoid concurrent calls for performance * reasons and to make the pcpu_get_vm_areas more deterministic. */ -static DEFINE_SPINLOCK(vmap_purge_lock); +static DEFINE_MUTEX(vmap_purge_lock); /* for per-CPU blocks */ static void purge_fragmented_blocks_allcpus(void); @@ -660,9 +660,9 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) */ static void try_purge_vmap_area_lazy(void) { - if (spin_trylock(&vmap_purge_lock)) { + if (mutex_trylock(&vmap_purge_lock)) { __purge_vmap_area_lazy(ULONG_MAX, 0); - spin_unlock(&vmap_purge_lock); + mutex_unlock(&vmap_purge_lock); } } @@ -671,10 +671,10 @@ static void try_purge_vmap_area_lazy(void) */ static void purge_vmap_area_lazy(void) { - spin_lock(&vmap_purge_lock); + mutex_lock(&vmap_purge_lock); purge_fragmented_blocks_allcpus(); __purge_vmap_area_lazy(ULONG_MAX, 0); - spin_unlock(&vmap_purge_lock); + mutex_unlock(&vmap_purge_lock); } /* @@ -1063,11 +1063,11 @@ void vm_unmap_aliases(void) rcu_read_unlock(); } - spin_lock(&vmap_purge_lock); + mutex_lock(&vmap_purge_lock); purge_fragmented_blocks_allcpus(); if (!__purge_vmap_area_lazy(start, end) && flush) flush_tlb_kernel_range(start, end); - spin_unlock(&vmap_purge_lock); + mutex_unlock(&vmap_purge_lock); } EXPORT_SYMBOL_GPL(vm_unmap_aliases); From 763b218ddfaf56761c19923beb7e16656f66ec62 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 12 Dec 2016 16:44:26 -0800 Subject: [PATCH 077/123] mm: add preempt points into __purge_vmap_area_lazy() Use cond_resched_lock to avoid holding the vmap_area_lock for a potentially long time and thus creating bad latencies for various workloads. [hch: split from a larger patch by Joel, wrote the crappy changelog] Link: http://lkml.kernel.org/r/1479474236-4139-11-git-send-email-hch@lst.de Signed-off-by: Joel Fernandes Signed-off-by: Christoph Hellwig Tested-by: Jisheng Zhang Cc: Andrey Ryabinin Cc: Chris Wilson Cc: John Dias Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d3c1f5ee48b4..a5584384eabc 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -628,7 +628,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) struct llist_node *valist; struct vmap_area *va; struct vmap_area *n_va; - int nr = 0; + bool do_free = false; lockdep_assert_held(&vmap_purge_lock); @@ -638,18 +638,22 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) start = va->va_start; if (va->va_end > end) end = va->va_end; - nr += (va->va_end - va->va_start) >> PAGE_SHIFT; + do_free = true; } - if (!nr) + if (!do_free) return false; - atomic_sub(nr, &vmap_lazy_nr); flush_tlb_kernel_range(start, end); spin_lock(&vmap_area_lock); - llist_for_each_entry_safe(va, n_va, valist, purge_list) + llist_for_each_entry_safe(va, n_va, valist, purge_list) { + int nr = (va->va_end - va->va_start) >> PAGE_SHIFT; + __free_vmap_area(va); + atomic_sub(nr, &vmap_lazy_nr); + cond_resched_lock(&vmap_area_lock); + } spin_unlock(&vmap_area_lock); return true; } From 1dd38b6c27d59414e89c08dd1ae9677a8e12cbc4 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 12 Dec 2016 16:44:29 -0800 Subject: [PATCH 078/123] mm: move vma_is_anonymous check within pmd_move_must_withdraw Independent of whether the vma is for anonymous memory, some arches like ppc64 would like to override pmd_move_must_withdraw(). One option is to encapsulate the vma_is_anonymous() check for general architectures inside pmd_move_must_withdraw() so that is always called and architectures that need unconditional overriding can override this function. ppc64 needs to override the function when the MMU is configured to use hash PTE's. [bsingharora@gmail.com: reworked changelog] Link: http://lkml.kernel.org/r/20161113150025.17942-1-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Acked-by: Kirill A. Shutemov Acked-by: Michael Ellerman (powerpc) Cc: Benjamin Herrenschmidt Cc: Michael Neuling Cc: Paul Mackerras Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/book3s/64/pgtable.h | 3 ++- include/asm-generic/pgtable.h | 12 ------------ mm/huge_memory.c | 18 ++++++++++++++++-- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 9fd77f8794a0..700301bc5190 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1009,7 +1009,8 @@ static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma, #define pmd_move_must_withdraw pmd_move_must_withdraw struct spinlock; static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, - struct spinlock *old_pmd_ptl) + struct spinlock *old_pmd_ptl, + struct vm_area_struct *vma) { if (radix_enabled()) return false; diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 41b95d82a185..2065e81701fc 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -652,18 +652,6 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp) } #endif -#ifndef pmd_move_must_withdraw -static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, - spinlock_t *old_pmd_ptl) -{ - /* - * With split pmd lock we also need to move preallocated - * PTE page table if new_pmd is on different PMD page table. - */ - return new_pmd_ptl != old_pmd_ptl; -} -#endif - /* * This function is meant to be used by sites walking pagetables with * the mmap_sem hold in read mode to protect against MADV_DONTNEED and diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 26fd1161ca85..b54044c21076 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1429,6 +1429,21 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, return 1; } +#ifndef pmd_move_must_withdraw +static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, + spinlock_t *old_pmd_ptl, + struct vm_area_struct *vma) +{ + /* + * With split pmd lock we also need to move preallocated + * PTE page table if new_pmd is on different PMD page table. + * + * We also don't deposit and withdraw tables for file pages. + */ + return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma); +} +#endif + bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush) @@ -1466,8 +1481,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, force_flush = true; VM_BUG_ON(!pmd_none(*new_pmd)); - if (pmd_move_must_withdraw(new_ptl, old_ptl) && - vma_is_anonymous(vma)) { + if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) { pgtable_t pgtable; pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); pgtable_trans_huge_deposit(mm, new_pmd, pgtable); From 953c66c2b22a304dbc3c3d7fc8e8c25cd97a03d8 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 12 Dec 2016 16:44:32 -0800 Subject: [PATCH 079/123] mm: THP page cache support for ppc64 Add arch specific callback in the generic THP page cache code that will deposit and withdarw preallocated page table. Archs like ppc64 use this preallocated table to store the hash pte slot information. Testing: kernel build of the patch series on tmpfs mounted with option huge=always The related thp stat: thp_fault_alloc 72939 thp_fault_fallback 60547 thp_collapse_alloc 603 thp_collapse_alloc_failed 0 thp_file_alloc 253763 thp_file_mapped 4251 thp_split_page 51518 thp_split_page_failed 1 thp_deferred_split_page 73566 thp_split_pmd 665 thp_zero_page_alloc 3 thp_zero_page_alloc_failed 0 [akpm@linux-foundation.org: remove unneeded parentheses, per Kirill] Link: http://lkml.kernel.org/r/20161113150025.17942-2-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Acked-by: Kirill A. Shutemov Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Michael Neuling Cc: Paul Mackerras Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/book3s/64/pgtable.h | 10 ++++ include/asm-generic/pgtable.h | 3 + mm/Kconfig | 6 +- mm/huge_memory.c | 17 ++++++ mm/khugepaged.c | 21 ++++++- mm/memory.c | 60 ++++++++++++++++---- 6 files changed, 100 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 700301bc5190..0ebfbc8f0449 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1021,6 +1021,16 @@ static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, */ return true; } + + +#define arch_needs_pgtable_deposit arch_needs_pgtable_deposit +static inline bool arch_needs_pgtable_deposit(void) +{ + if (radix_enabled()) + return false; + return true; +} + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */ diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 2065e81701fc..18af2bcefe6a 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -652,6 +652,9 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp) } #endif +#ifndef arch_needs_pgtable_deposit +#define arch_needs_pgtable_deposit() (false) +#endif /* * This function is meant to be used by sites walking pagetables with * the mmap_sem hold in read mode to protect against MADV_DONTNEED and diff --git a/mm/Kconfig b/mm/Kconfig index 33a9b06ec618..9b8fccb969dc 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -447,13 +447,9 @@ choice benefit. endchoice -# -# We don't deposit page tables on file THP mapping, -# but Power makes use of them to address MMU quirk. -# config TRANSPARENT_HUGE_PAGECACHE def_bool y - depends on TRANSPARENT_HUGEPAGE && !PPC + depends on TRANSPARENT_HUGEPAGE # # UP and nommu archs use km based percpu allocator diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b54044c21076..2b44ac11178f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1380,6 +1380,15 @@ out_unlocked: return ret; } +static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) +{ + pgtable_t pgtable; + + pgtable = pgtable_trans_huge_withdraw(mm, pmd); + pte_free(mm, pgtable); + atomic_long_dec(&mm->nr_ptes); +} + int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { @@ -1421,6 +1430,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, atomic_long_dec(&tlb->mm->nr_ptes); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); } else { + if (arch_needs_pgtable_deposit()) + zap_deposited_table(tlb->mm, pmd); add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR); } spin_unlock(ptl); @@ -1607,6 +1618,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (!vma_is_anonymous(vma)) { _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); + /* + * We are going to unmap this huge page. So + * just go ahead and zap it + */ + if (arch_needs_pgtable_deposit()) + zap_deposited_table(mm, pmd); if (vma_is_dax(vma)) return; page = pmd_page(_pmd); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 7a50c726c5ae..09460955e818 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1242,6 +1242,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) struct vm_area_struct *vma; unsigned long addr; pmd_t *pmd, _pmd; + bool deposited = false; i_mmap_lock_write(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { @@ -1266,10 +1267,26 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd); /* assume page table is clear */ _pmd = pmdp_collapse_flush(vma, addr, pmd); + /* + * now deposit the pgtable for arch that need it + * otherwise free it. + */ + if (arch_needs_pgtable_deposit()) { + /* + * The deposit should be visibile only after + * collapse is seen by others. + */ + smp_wmb(); + pgtable_trans_huge_deposit(vma->vm_mm, pmd, + pmd_pgtable(_pmd)); + deposited = true; + } spin_unlock(ptl); up_write(&vma->vm_mm->mmap_sem); - atomic_long_dec(&vma->vm_mm->nr_ptes); - pte_free(vma->vm_mm, pmd_pgtable(_pmd)); + if (!deposited) { + atomic_long_dec(&vma->vm_mm->nr_ptes); + pte_free(vma->vm_mm, pmd_pgtable(_pmd)); + } } } i_mmap_unlock_write(mapping); diff --git a/mm/memory.c b/mm/memory.c index 0a72f821ccdc..32e9b7aec366 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2935,6 +2935,19 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, return true; } +static void deposit_prealloc_pte(struct fault_env *fe) +{ + struct vm_area_struct *vma = fe->vma; + + pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, fe->prealloc_pte); + /* + * We are going to consume the prealloc table, + * count that as nr_ptes. + */ + atomic_long_inc(&vma->vm_mm->nr_ptes); + fe->prealloc_pte = 0; +} + static int do_set_pmd(struct fault_env *fe, struct page *page) { struct vm_area_struct *vma = fe->vma; @@ -2949,6 +2962,17 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) ret = VM_FAULT_FALLBACK; page = compound_head(page); + /* + * Archs like ppc64 need additonal space to store information + * related to pte entry. Use the preallocated table for that. + */ + if (arch_needs_pgtable_deposit() && !fe->prealloc_pte) { + fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address); + if (!fe->prealloc_pte) + return VM_FAULT_OOM; + smp_wmb(); /* See comment in __pte_alloc() */ + } + fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); if (unlikely(!pmd_none(*fe->pmd))) goto out; @@ -2962,6 +2986,11 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR); page_add_file_rmap(page, true); + /* + * deposit and withdraw with pmd lock held + */ + if (arch_needs_pgtable_deposit()) + deposit_prealloc_pte(fe); set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); @@ -2971,6 +3000,13 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) ret = 0; count_vm_event(THP_FILE_MAPPED); out: + /* + * If we are going to fallback to pte mapping, do a + * withdraw with pmd lock held. + */ + if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK) + fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm, + fe->pmd); spin_unlock(fe->ptl); return ret; } @@ -3010,18 +3046,20 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, ret = do_set_pmd(fe, page); if (ret != VM_FAULT_FALLBACK) - return ret; + goto fault_handled; } if (!fe->pte) { ret = pte_alloc_one_map(fe); if (ret) - return ret; + goto fault_handled; } /* Re-check under ptl */ - if (unlikely(!pte_none(*fe->pte))) - return VM_FAULT_NOPAGE; + if (unlikely(!pte_none(*fe->pte))) { + ret = VM_FAULT_NOPAGE; + goto fault_handled; + } flush_icache_page(vma, page); entry = mk_pte(page, vma->vm_page_prot); @@ -3041,8 +3079,15 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache(vma, fe->address, fe->pte); + ret = 0; - return 0; +fault_handled: + /* preallocated pagetable is unused: free it */ + if (fe->prealloc_pte) { + pte_free(fe->vma->vm_mm, fe->prealloc_pte); + fe->prealloc_pte = 0; + } + return ret; } static unsigned long fault_around_bytes __read_mostly = @@ -3141,11 +3186,6 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff); - /* preallocated pagetable is unused: free it */ - if (fe->prealloc_pte) { - pte_free(fe->vma->vm_mm, fe->prealloc_pte); - fe->prealloc_pte = 0; - } /* Huge page is mapped? Page fault is solved */ if (pmd_trans_huge(*fe->pmd)) { ret = VM_FAULT_NOPAGE; From 46e8a3a08c23d07d0f21fabeed182b671af68c93 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 12 Dec 2016 16:44:35 -0800 Subject: [PATCH 080/123] mm, debug: print raw struct page data in __dump_page() __dump_page() is used when a page metadata inconsistency is detected, either by standard runtime checks, or extra checks in CONFIG_DEBUG_VM builds. It prints some of the relevant metadata, but not the whole struct page, which is based on unions and interpretation is dependent on the context. This means that sometimes e.g. a VM_BUG_ON_PAGE() checks certain field, which is however not printed by __dump_page() and the resulting bug report may then lack clues that could help in determining the root cause. This patch solves the problem by simply printing the whole struct page word by word, so no part is missing, but the interpretation of the data is left to developers. This is similar to e.g. x86_64 raw stack dumps. Example output: page:ffffea00000475c0 count:1 mapcount:0 mapping: (null) index:0x0 flags: 0x100000000000400(reserved) raw: 0100000000000400 0000000000000000 0000000000000000 00000001ffffffff raw: ffffea00000475e0 ffffea00000475e0 0000000000000000 0000000000000000 page dumped because: VM_BUG_ON_PAGE(1) [aryabinin@virtuozzo.com: suggested print_hex_dump()] Link: http://lkml.kernel.org/r/2ff83214-70fe-741e-bf05-fe4a4073ec3e@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Kirill A. Shutemov Acked-by: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/debug.c b/mm/debug.c index 9feb699c5d25..db1cd26d8752 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -59,6 +59,10 @@ void __dump_page(struct page *page, const char *reason) pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags); + print_hex_dump(KERN_ALERT, "raw: ", DUMP_PREFIX_NONE, 32, + sizeof(unsigned long), page, + sizeof(struct page), false); + if (reason) pr_alert("page dumped because: %s\n", reason); From d5a187daf5856df9b997f9d208e5a7b64006eb2e Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 12 Dec 2016 16:44:38 -0800 Subject: [PATCH 081/123] mm, rmap: handle anon_vma_prepare() common case inline anon_vma_prepare() is mostly a large "if (unlikely(...))" block, as the expected common case is that an anon_vma already exists. We could turn the condition around and return 0, but it also makes sense to do it inline and avoid a call for the common case. Bloat-o-meter naturally shows that inlining the check has some code size costs: add/remove: 1/1 grow/shrink: 4/0 up/down: 475/-373 (102) function old new delta __anon_vma_prepare - 359 +359 handle_mm_fault 2744 2796 +52 hugetlb_cow 1146 1170 +24 hugetlb_fault 2123 2145 +22 wp_page_copy 1469 1487 +18 anon_vma_prepare 373 - -373 Checking the asm however confirms that the hot paths now avoid a call, which is moved away. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20161116074005.22768-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: "Kirill A. Shutemov" Cc: Johannes Weiner Cc: Konstantin Khlebnikov Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 10 +++++- mm/rmap.c | 73 ++++++++++++++++++++++---------------------- 2 files changed, 45 insertions(+), 38 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b46bb5620a76..15321fb1df6b 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -137,11 +137,19 @@ static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) * anon_vma helper functions. */ void anon_vma_init(void); /* create anon_vma_cachep */ -int anon_vma_prepare(struct vm_area_struct *); +int __anon_vma_prepare(struct vm_area_struct *); void unlink_anon_vmas(struct vm_area_struct *); int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *); int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *); +static inline int anon_vma_prepare(struct vm_area_struct *vma) +{ + if (likely(vma->anon_vma)) + return 0; + + return __anon_vma_prepare(vma); +} + static inline void anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) { diff --git a/mm/rmap.c b/mm/rmap.c index 1ef36404e7b2..91619fd70939 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -141,14 +141,15 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, } /** - * anon_vma_prepare - attach an anon_vma to a memory region + * __anon_vma_prepare - attach an anon_vma to a memory region * @vma: the memory region in question * * This makes sure the memory mapping described by 'vma' has * an 'anon_vma' attached to it, so that we can associate the * anonymous pages mapped into it with that anon_vma. * - * The common case will be that we already have one, but if + * The common case will be that we already have one, which + * is handled inline by anon_vma_prepare(). But if * not we either need to find an adjacent mapping that we * can re-use the anon_vma from (very common when the only * reason for splitting a vma has been mprotect()), or we @@ -167,48 +168,46 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, * * This must be called with the mmap_sem held for reading. */ -int anon_vma_prepare(struct vm_area_struct *vma) +int __anon_vma_prepare(struct vm_area_struct *vma) { - struct anon_vma *anon_vma = vma->anon_vma; + struct mm_struct *mm = vma->vm_mm; + struct anon_vma *anon_vma, *allocated; struct anon_vma_chain *avc; might_sleep(); - if (unlikely(!anon_vma)) { - struct mm_struct *mm = vma->vm_mm; - struct anon_vma *allocated; - avc = anon_vma_chain_alloc(GFP_KERNEL); - if (!avc) - goto out_enomem; + avc = anon_vma_chain_alloc(GFP_KERNEL); + if (!avc) + goto out_enomem; - anon_vma = find_mergeable_anon_vma(vma); - allocated = NULL; - if (!anon_vma) { - anon_vma = anon_vma_alloc(); - if (unlikely(!anon_vma)) - goto out_enomem_free_avc; - allocated = anon_vma; - } - - anon_vma_lock_write(anon_vma); - /* page_table_lock to protect against threads */ - spin_lock(&mm->page_table_lock); - if (likely(!vma->anon_vma)) { - vma->anon_vma = anon_vma; - anon_vma_chain_link(vma, avc, anon_vma); - /* vma reference or self-parent link for new root */ - anon_vma->degree++; - allocated = NULL; - avc = NULL; - } - spin_unlock(&mm->page_table_lock); - anon_vma_unlock_write(anon_vma); - - if (unlikely(allocated)) - put_anon_vma(allocated); - if (unlikely(avc)) - anon_vma_chain_free(avc); + anon_vma = find_mergeable_anon_vma(vma); + allocated = NULL; + if (!anon_vma) { + anon_vma = anon_vma_alloc(); + if (unlikely(!anon_vma)) + goto out_enomem_free_avc; + allocated = anon_vma; } + + anon_vma_lock_write(anon_vma); + /* page_table_lock to protect against threads */ + spin_lock(&mm->page_table_lock); + if (likely(!vma->anon_vma)) { + vma->anon_vma = anon_vma; + anon_vma_chain_link(vma, avc, anon_vma); + /* vma reference or self-parent link for new root */ + anon_vma->degree++; + allocated = NULL; + avc = NULL; + } + spin_unlock(&mm->page_table_lock); + anon_vma_unlock_write(anon_vma); + + if (unlikely(allocated)) + put_anon_vma(allocated); + if (unlikely(avc)) + anon_vma_chain_free(avc); + return 0; out_enomem_free_avc: From a6de734bc002fe2027ccc074fbbd87d72957b7a4 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 12 Dec 2016 16:44:41 -0800 Subject: [PATCH 082/123] mm, page_alloc: keep pcp count and list contents in sync if struct page is corrupted Vlastimil Babka pointed out that commit 479f854a207c ("mm, page_alloc: defer debugging checks of pages allocated from the PCP") will allow the per-cpu list counter to be out of sync with the per-cpu list contents if a struct page is corrupted. The consequence is an infinite loop if the per-cpu lists get fully drained by free_pcppages_bulk because all the lists are empty but the count is positive. The infinite loop occurs here do { batch_free++; if (++migratetype == MIGRATE_PCPTYPES) migratetype = 0; list = &pcp->lists[migratetype]; } while (list_empty(list)); What the user sees is a bad page warning followed by a soft lockup with interrupts disabled in free_pcppages_bulk(). This patch keeps the accounting in sync. Fixes: 479f854a207c ("mm, page_alloc: defer debugging checks of pages allocated from the PCP") Link: http://lkml.kernel.org/r/20161202112951.23346-2-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Michal Hocko Acked-by: Hillf Danton Cc: Christoph Lameter Cc: Johannes Weiner Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: [4.7+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2b69e28706b1..3f2c9e535f7f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2218,7 +2218,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, bool cold) { - int i; + int i, alloced = 0; spin_lock(&zone->lock); for (i = 0; i < count; ++i) { @@ -2243,13 +2243,21 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, else list_add_tail(&page->lru, list); list = &page->lru; + alloced++; if (is_migrate_cma(get_pcppage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, -(1 << order)); } + + /* + * i pages were removed from the buddy list even if some leak due + * to check_pcp_refill failing so adjust NR_FREE_PAGES based + * on i. Do not confuse with 'alloced' which is the number of + * pages added to the pcp list. + */ __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); spin_unlock(&zone->lock); - return i; + return alloced; } #ifdef CONFIG_NUMA From dc644a073769dd8949a75691eb4a5bdeb70a7d51 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 12 Dec 2016 16:44:44 -0800 Subject: [PATCH 083/123] mm: add three more cond_resched() in swapoff Add a cond_resched() in the unuse_pmd_range() loop (so as to call it even when pmd none or trans_huge, like zap_pmd_range() does); and in the unuse_mm() loop (since that might skip over many vmas). shmem_unuse() and radix_tree_locate_item() look good enough already. Those were the obvious places, but in fact the stalls came from find_next_to_unuse(), which sometimes scans through many unused entries. Apply scan_swap_map()'s LATENCY_LIMIT of 256 there too; and only go off to test frontswap_map when a used entry is found. Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1612052155140.13021@eggly.anvils Signed-off-by: Hugh Dickins Reported-by: Eric Dumazet Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index f30438970cd1..1c6e0321205d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1234,6 +1234,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { + cond_resched(); next = pmd_addr_end(addr, end); if (pmd_none_or_trans_huge_or_clear_bad(pmd)) continue; @@ -1313,6 +1314,7 @@ static int unuse_mm(struct mm_struct *mm, for (vma = mm->mmap; vma; vma = vma->vm_next) { if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) break; + cond_resched(); } up_read(&mm->mmap_sem); return (ret < 0)? ret: 0; @@ -1350,15 +1352,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, prev = 0; i = 1; } - if (frontswap) { - if (frontswap_test(si, i)) - break; - else - continue; - } count = READ_ONCE(si->swap_map[i]); if (count && swap_count(count) != SWAP_MAP_BAD) - break; + if (!frontswap || frontswap_test(si, i)) + break; + if ((i % LATENCY_LIMIT) == 0) + cond_resched(); } return i; } From a66c0410b97c07a5708881198528ce724f7a3226 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 12 Dec 2016 16:44:47 -0800 Subject: [PATCH 084/123] mm: add cond_resched() in gather_pte_stats() The other pagetable walks in task_mmu.c have a cond_resched() after walking their ptes: add a cond_resched() in gather_pte_stats() too, for reading /proc//numa_maps. Only pagemap_pmd_range() has a cond_resched() in its (unusually expensive) pmd_trans_huge case: more should probably be added, but leave them unchanged for now. Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1612052157400.13021@eggly.anvils Signed-off-by: Hugh Dickins Acked-by: Michal Hocko Cc: David Rientjes Cc: Gerald Schaefer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 35b92d81692f..958f32545064 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1588,6 +1588,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(orig_pte, ptl); + cond_resched(); return 0; } #ifdef CONFIG_HUGETLB_PAGE From 49920d28781dcced10cd30cb9a938e7d045a1c94 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 12 Dec 2016 16:44:50 -0800 Subject: [PATCH 085/123] mm: make transparent hugepage size public Test programs want to know the size of a transparent hugepage. While it is commonly the same as the size of a hugetlbfs page (shown as Hugepagesize in /proc/meminfo), that is not always so: powerpc implements transparent hugepages in a different way from hugetlbfs pages, so it's coincidence when their sizes are the same; and x86 and others can support more than one hugetlbfs page size. Add /sys/kernel/mm/transparent_hugepage/hpage_pmd_size to show the THP size in bytes - it's the same for Anonymous and Shmem hugepages. Call it hpage_pmd_size (after HPAGE_PMD_SIZE) rather than hpage_size, in case some transparent support for pud and pgd pages is added later. Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1612052200290.13021@eggly.anvils Signed-off-by: Hugh Dickins Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Greg Thelen Cc: David Rientjes Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Dave Hansen Cc: Dan Williams Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/transhuge.txt | 5 +++++ mm/huge_memory.c | 10 ++++++++++ 2 files changed, 15 insertions(+) diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt index 2ec6adb5a4ce..c4171e4519c2 100644 --- a/Documentation/vm/transhuge.txt +++ b/Documentation/vm/transhuge.txt @@ -136,6 +136,11 @@ or enable it back by writing 1: echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page +Some userspace (such as a test program, or an optimized memory allocation +library) may want to know the size (in bytes) of a transparent hugepage: + +cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size + khugepaged will be automatically started when transparent_hugepage/enabled is set to "always" or "madvise, and it'll be automatically shutdown if it's set to "never". diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2b44ac11178f..cee42cf05477 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -285,6 +285,15 @@ static ssize_t use_zero_page_store(struct kobject *kobj, } static struct kobj_attribute use_zero_page_attr = __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store); + +static ssize_t hpage_pmd_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", HPAGE_PMD_SIZE); +} +static struct kobj_attribute hpage_pmd_size_attr = + __ATTR_RO(hpage_pmd_size); + #ifdef CONFIG_DEBUG_VM static ssize_t debug_cow_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -307,6 +316,7 @@ static struct attribute *hugepage_attr[] = { &enabled_attr.attr, &defrag_attr.attr, &use_zero_page_attr.attr, + &hpage_pmd_size_attr.attr, #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &shmem_enabled_attr.attr, #endif From 5c5c1f36cedfb51ec291181e71817f7fe7e03ee2 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Mon, 12 Dec 2016 16:44:53 -0800 Subject: [PATCH 086/123] kasan: support panic_on_warn If user sets panic_on_warn, he wants kernel to panic if there is anything barely wrong with the kernel. KASAN-detected errors are definitely not less benign than an arbitrary kernel WARNING. Panic after KASAN errors if panic_on_warn is set. We use this for continuous fuzzing where we want kernel to stop and reboot on any error. Link: http://lkml.kernel.org/r/1476694764-31986-1-git-send-email-dvyukov@google.com Signed-off-by: Dmitry Vyukov Acked-by: Andrey Ryabinin Cc: Alexander Potapenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 073325aedc68..b82b3e215157 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -136,6 +136,8 @@ static void kasan_end_report(unsigned long *flags) pr_err("==================================================================\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irqrestore(&report_lock, *flags); + if (panic_on_warn) + panic("panic_on_warn set ...\n"); kasan_enable_current(); } From 64abdcb24351a27bed6e2b6a3c27348fe532c73f Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Mon, 12 Dec 2016 16:44:56 -0800 Subject: [PATCH 087/123] kasan: eliminate long stalls during quarantine reduction Currently we dedicate 1/32 of RAM for quarantine and then reduce it by 1/4 of total quarantine size. This can be a significant amount of memory. For example, with 4GB of RAM total quarantine size is 128MB and it is reduced by 32MB at a time. With 128GB of RAM total quarantine size is 4GB and it is reduced by 1GB. This leads to several problems: - freeing 1GB can take tens of seconds, causes rcu stall warnings and just introduces unexpected long delays at random places - if kmalloc() is called under a mutex, other threads stall on that mutex while a thread reduces quarantine - threads wait on quarantine_lock while one thread grabs a large batch of objects to evict - we walk the uncached list of object to free twice which makes all of the above worse - when a thread frees objects, they are already not accounted against global_quarantine.bytes; as the result we can have quarantine_size bytes in quarantine + unbounded amount of memory in large batches in threads that are in process of freeing Reduce size of quarantine in smaller batches to reduce the delays. The only reason to reduce it in batches is amortization of overheads, the new batch size of 1MB should be well enough to amortize spinlock lock/unlock and few function calls. Plus organize quarantine as a FIFO array of batches. This allows to not walk the list in quarantine_reduce() under quarantine_lock, which in turn reduces contention and is just faster. This improves performance of heavy load (syzkaller fuzzing) by ~20% with 4 CPUs and 32GB of RAM. Also this eliminates frequent (every 5 sec) drops of CPU consumption from ~400% to ~100% (one thread reduces quarantine while others are waiting on a mutex). Some reference numbers: 1. Machine with 4 CPUs and 4GB of memory. Quarantine size 128MB. Currently we free 32MB at at time. With new code we free 1MB at a time (1024 batches, ~128 are used). 2. Machine with 32 CPUs and 128GB of memory. Quarantine size 4GB. Currently we free 1GB at at time. With new code we free 8MB at a time (1024 batches, ~512 are used). 3. Machine with 4096 CPUs and 1TB of memory. Quarantine size 32GB. Currently we free 8GB at at time. With new code we free 4MB at a time (16K batches, ~8K are used). Link: http://lkml.kernel.org/r/1478756952-18695-1-git-send-email-dvyukov@google.com Signed-off-by: Dmitry Vyukov Cc: Eric Dumazet Cc: Greg Thelen Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andrey Konovalov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/quarantine.c | 92 ++++++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 45 deletions(-) diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index baabaad4a4aa..dae929c02bbb 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -86,24 +86,9 @@ static void qlist_move_all(struct qlist_head *from, struct qlist_head *to) qlist_init(from); } -static void qlist_move(struct qlist_head *from, struct qlist_node *last, - struct qlist_head *to, size_t size) -{ - if (unlikely(last == from->tail)) { - qlist_move_all(from, to); - return; - } - if (qlist_empty(to)) - to->head = from->head; - else - to->tail->next = from->head; - to->tail = last; - from->head = last->next; - last->next = NULL; - from->bytes -= size; - to->bytes += size; -} - +#define QUARANTINE_PERCPU_SIZE (1 << 20) +#define QUARANTINE_BATCHES \ + (1024 > 4 * CONFIG_NR_CPUS ? 1024 : 4 * CONFIG_NR_CPUS) /* * The object quarantine consists of per-cpu queues and a global queue, @@ -111,11 +96,22 @@ static void qlist_move(struct qlist_head *from, struct qlist_node *last, */ static DEFINE_PER_CPU(struct qlist_head, cpu_quarantine); -static struct qlist_head global_quarantine; +/* Round-robin FIFO array of batches. */ +static struct qlist_head global_quarantine[QUARANTINE_BATCHES]; +static int quarantine_head; +static int quarantine_tail; +/* Total size of all objects in global_quarantine across all batches. */ +static unsigned long quarantine_size; static DEFINE_SPINLOCK(quarantine_lock); /* Maximum size of the global queue. */ -static unsigned long quarantine_size; +static unsigned long quarantine_max_size; + +/* + * Target size of a batch in global_quarantine. + * Usually equal to QUARANTINE_PERCPU_SIZE unless we have too much RAM. + */ +static unsigned long quarantine_batch_size; /* * The fraction of physical memory the quarantine is allowed to occupy. @@ -124,9 +120,6 @@ static unsigned long quarantine_size; */ #define QUARANTINE_FRACTION 32 -#define QUARANTINE_LOW_SIZE (READ_ONCE(quarantine_size) * 3 / 4) -#define QUARANTINE_PERCPU_SIZE (1 << 20) - static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink) { return virt_to_head_page(qlink)->slab_cache; @@ -191,21 +184,30 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache) if (unlikely(!qlist_empty(&temp))) { spin_lock_irqsave(&quarantine_lock, flags); - qlist_move_all(&temp, &global_quarantine); + WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes); + qlist_move_all(&temp, &global_quarantine[quarantine_tail]); + if (global_quarantine[quarantine_tail].bytes >= + READ_ONCE(quarantine_batch_size)) { + int new_tail; + + new_tail = quarantine_tail + 1; + if (new_tail == QUARANTINE_BATCHES) + new_tail = 0; + if (new_tail != quarantine_head) + quarantine_tail = new_tail; + } spin_unlock_irqrestore(&quarantine_lock, flags); } } void quarantine_reduce(void) { - size_t new_quarantine_size, percpu_quarantines; + size_t total_size, new_quarantine_size, percpu_quarantines; unsigned long flags; struct qlist_head to_free = QLIST_INIT; - size_t size_to_free = 0; - struct qlist_node *last; - if (likely(READ_ONCE(global_quarantine.bytes) <= - READ_ONCE(quarantine_size))) + if (likely(READ_ONCE(quarantine_size) <= + READ_ONCE(quarantine_max_size))) return; spin_lock_irqsave(&quarantine_lock, flags); @@ -214,24 +216,23 @@ void quarantine_reduce(void) * Update quarantine size in case of hotplug. Allocate a fraction of * the installed memory to quarantine minus per-cpu queue limits. */ - new_quarantine_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) / + total_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) / QUARANTINE_FRACTION; percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus(); - new_quarantine_size = (new_quarantine_size < percpu_quarantines) ? - 0 : new_quarantine_size - percpu_quarantines; - WRITE_ONCE(quarantine_size, new_quarantine_size); + new_quarantine_size = (total_size < percpu_quarantines) ? + 0 : total_size - percpu_quarantines; + WRITE_ONCE(quarantine_max_size, new_quarantine_size); + /* Aim at consuming at most 1/2 of slots in quarantine. */ + WRITE_ONCE(quarantine_batch_size, max((size_t)QUARANTINE_PERCPU_SIZE, + 2 * total_size / QUARANTINE_BATCHES)); - last = global_quarantine.head; - while (last) { - struct kmem_cache *cache = qlink_to_cache(last); - - size_to_free += cache->size; - if (!last->next || size_to_free > - global_quarantine.bytes - QUARANTINE_LOW_SIZE) - break; - last = last->next; + if (likely(quarantine_size > quarantine_max_size)) { + qlist_move_all(&global_quarantine[quarantine_head], &to_free); + WRITE_ONCE(quarantine_size, quarantine_size - to_free.bytes); + quarantine_head++; + if (quarantine_head == QUARANTINE_BATCHES) + quarantine_head = 0; } - qlist_move(&global_quarantine, last, &to_free, size_to_free); spin_unlock_irqrestore(&quarantine_lock, flags); @@ -275,13 +276,14 @@ static void per_cpu_remove_cache(void *arg) void quarantine_remove_cache(struct kmem_cache *cache) { - unsigned long flags; + unsigned long flags, i; struct qlist_head to_free = QLIST_INIT; on_each_cpu(per_cpu_remove_cache, cache, 1); spin_lock_irqsave(&quarantine_lock, flags); - qlist_move_cache(&global_quarantine, &to_free, cache); + for (i = 0; i < QUARANTINE_BATCHES; i++) + qlist_move_cache(&global_quarantine[i], &to_free, cache); spin_unlock_irqrestore(&quarantine_lock, flags); qlist_free_all(&to_free, cache); From c5caf21ab0cf884ef15b25af234f620e4a233139 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Mon, 12 Dec 2016 16:44:59 -0800 Subject: [PATCH 088/123] kasan: turn on -fsanitize-address-use-after-scope In the upcoming gcc7 release, the -fsanitize=kernel-address option at first implied new -fsanitize-address-use-after-scope option. This would cause link errors on older kernels because they don't have two new functions required for use-after-scope support. Therefore, gcc7 changed default to -fno-sanitize-address-use-after-scope. Now the kernel has everything required for that feature since commit 828347f8f9a5 ("kasan: support use-after-scope detection"). So, to make it work, we just have to enable use-after-scope in CFLAGS. Link: http://lkml.kernel.org/r/1481207977-28654-1-git-send-email-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Acked-by: Dmitry Vyukov Cc: Alexander Potapenko Cc: Andrey Konovalov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/Makefile.kasan | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/Makefile.kasan b/scripts/Makefile.kasan index 37323b0df374..9576775a86f6 100644 --- a/scripts/Makefile.kasan +++ b/scripts/Makefile.kasan @@ -28,4 +28,6 @@ else CFLAGS_KASAN := $(CFLAGS_KASAN_MINIMAL) endif endif + +CFLAGS_KASAN += $(call cc-option, -fsanitize-address-use-after-scope) endif From 8f6066049c54ef0f726869c27d610cef5d15e084 Mon Sep 17 00:00:00 2001 From: zijun_hu Date: Mon, 12 Dec 2016 16:45:02 -0800 Subject: [PATCH 089/123] mm/percpu.c: fix panic triggered by BUG_ON() falsely As shown by pcpu_build_alloc_info(), the number of units within a percpu group is deduced by rounding up the number of CPUs within the group to @upa boundary/ Therefore, the number of CPUs isn't equal to the units's if it isn't aligned to @upa normally. However, pcpu_page_first_chunk() uses BUG_ON() to assert that one number is equal to the other roughly, so a panic is maybe triggered by the BUG_ON() incorrectly. In order to fix this issue, the number of CPUs is rounded up then compared with units's and the BUG_ON() is replaced with a warning and return of an error code as well, to keep system alive as much as possible. Link: http://lkml.kernel.org/r/57FCF07C.2020103@zoho.com Signed-off-by: zijun_hu Cc: Tejun Heo Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/percpu.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 255714302394..f696385bcc44 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -2093,6 +2093,8 @@ int __init pcpu_page_first_chunk(size_t reserved_size, size_t pages_size; struct page **pages; int unit, i, j, rc; + int upa; + int nr_g0_units; snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); @@ -2100,7 +2102,12 @@ int __init pcpu_page_first_chunk(size_t reserved_size, if (IS_ERR(ai)) return PTR_ERR(ai); BUG_ON(ai->nr_groups != 1); - BUG_ON(ai->groups[0].nr_units != num_possible_cpus()); + upa = ai->alloc_size/ai->unit_size; + nr_g0_units = roundup(num_possible_cpus(), upa); + if (unlikely(WARN_ON(ai->groups[0].nr_units != nr_g0_units))) { + pcpu_free_alloc_info(ai); + return -EINVAL; + } unit_pages = ai->unit_size >> PAGE_SHIFT; @@ -2111,21 +2118,22 @@ int __init pcpu_page_first_chunk(size_t reserved_size, /* allocate pages */ j = 0; - for (unit = 0; unit < num_possible_cpus(); unit++) + for (unit = 0; unit < num_possible_cpus(); unit++) { + unsigned int cpu = ai->groups[0].cpu_map[unit]; for (i = 0; i < unit_pages; i++) { - unsigned int cpu = ai->groups[0].cpu_map[unit]; void *ptr; ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); if (!ptr) { pr_warn("failed to allocate %s page for cpu%u\n", - psize_str, cpu); + psize_str, cpu); goto enomem; } /* kmemleak tracks the percpu allocations separately */ kmemleak_free(ptr); pages[j++] = virt_to_page(ptr); } + } /* allocate vm area, map the pages and copy static data */ vm.flags = VM_ALLOC; From af884cd4a5ae62fcf5e321fecf0ec1014730353d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 12 Dec 2016 16:45:05 -0800 Subject: [PATCH 090/123] proc: report no_new_privs state Similar to being able to examine if a process has been correctly confined with seccomp, the state of no_new_privs is equally interesting, so this adds it to /proc/$pid/status. Link: http://lkml.kernel.org/r/20161103214041.GA58566@beast Signed-off-by: Kees Cook Reviewed-by: Jann Horn Cc: Jonathan Corbet Cc: Vlastimil Babka Cc: Michal Hocko Cc: Konstantin Khlebnikov Cc: Hugh Dickins Cc: Naoya Horiguchi Cc: Rodrigo Freire Cc: John Stultz Cc: Ross Zwisler Cc: Robert Ho Cc: Jerome Marchand Cc: Andy Lutomirski Cc: Johannes Weiner Cc: Alexey Dobriyan Cc: "Richard W.M. Jones" Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 2 ++ fs/proc/array.c | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 74329fd0add2..c03f2f91c6ab 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -191,6 +191,7 @@ read the file /proc/PID/status: CapPrm: 0000000000000000 CapEff: 0000000000000000 CapBnd: ffffffffffffffff + NoNewPrivs: 0 Seccomp: 0 voluntary_ctxt_switches: 0 nonvoluntary_ctxt_switches: 1 @@ -262,6 +263,7 @@ Table 1-2: Contents of the status files (as of 4.1) CapPrm bitmap of permitted capabilities CapEff bitmap of effective capabilities CapBnd bitmap of capabilities bounding set + NoNewPrivs no_new_privs, like prctl(PR_GET_NO_NEW_PRIV, ...) Seccomp seccomp mode, like prctl(PR_GET_SECCOMP, ...) Cpus_allowed mask of CPUs on which this process may run Cpus_allowed_list Same as previous, but in "list format" diff --git a/fs/proc/array.c b/fs/proc/array.c index 81818adb8e9e..082676ab4878 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -342,10 +342,11 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p) static inline void task_seccomp(struct seq_file *m, struct task_struct *p) { + seq_put_decimal_ull(m, "NoNewPrivs:\t", task_no_new_privs(p)); #ifdef CONFIG_SECCOMP - seq_put_decimal_ull(m, "Seccomp:\t", p->seccomp.mode); - seq_putc(m, '\n'); + seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode); #endif + seq_putc(m, '\n'); } static inline void task_context_switch_counts(struct seq_file *m, From 623f594e7d20e4b5dac50aba5bd23822f2a95d6f Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 12 Dec 2016 16:45:08 -0800 Subject: [PATCH 091/123] proc: make struct pid_entry::len unsigned "unsigned int" is better on x86_64 because it most of the time it autoexpands to 64-bit value while "int" requires MOVSX instruction. Link: http://lkml.kernel.org/r/20161029160810.GF1246@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index ca651ac00660..e1227bc57090 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -106,7 +106,7 @@ struct pid_entry { const char *name; - int len; + unsigned int len; umode_t mode; const struct inode_operations *iop; const struct file_operations *fop; From 9a87fe0d7c2d4fb62255cb69088da5a812df3516 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 12 Dec 2016 16:45:11 -0800 Subject: [PATCH 092/123] proc: make struct struct map_files_info::len unsigned int Linux doesn't support 4GB+ filenames in /proc, so unsigned long is too much. MOV r64, r/m64 is larger than MOV r32, r/m32. Link: http://lkml.kernel.org/r/20161029161123.GG1246@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index e1227bc57090..7c843024b406 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1967,7 +1967,7 @@ out: struct map_files_info { fmode_t mode; - unsigned long len; + unsigned int len; unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ }; From 06a0c4175db68912981fa34c24384d8b1a58c6dc Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 12 Dec 2016 16:45:14 -0800 Subject: [PATCH 093/123] proc: just list_del() struct pde_opener list_del_init() is too much, structure will be freed in three lines anyway. Link: http://lkml.kernel.org/r/20161029155313.GA1246@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index e69ebe648a34..907265009b03 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -152,7 +152,7 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) file = pdeo->file; pde->proc_fops->release(file_inode(file), file); spin_lock(&pde->pde_unload_lock); - list_del_init(&pdeo->lh); + list_del(&pdeo->lh); if (pdeo->c) complete(pdeo->c); kfree(pdeo); From f5887c71cf682a6c27e26cb83296d49729f62b3c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 12 Dec 2016 16:45:17 -0800 Subject: [PATCH 094/123] proc: fix type of struct pde_opener::closing field struct pde_opener::closing is boolean. Link: http://lkml.kernel.org/r/20161029155439.GB1246@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 2 +- fs/proc/internal.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 907265009b03..f623a3ca2746 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -147,7 +147,7 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) spin_lock(&pde->pde_unload_lock); } else { struct file *file; - pdeo->closing = 1; + pdeo->closing = true; spin_unlock(&pde->pde_unload_lock); file = pdeo->file; pde->proc_fops->release(file_inode(file), file); diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 5378441ec1b7..153db5f85b47 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -203,7 +203,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name); struct pde_opener { struct file *file; struct list_head lh; - int closing; + bool closing; struct completion *c; }; extern const struct inode_operations proc_link_inode_operations; From 39a10ac23cfdb6469550e1641a2bc2ed80663ceb Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 12 Dec 2016 16:45:20 -0800 Subject: [PATCH 095/123] proc: kmalloc struct pde_opener kzalloc is too much, half of the fields will be reinitialized anyway. If proc file doesn't have ->release hook (some still do not), clearing is unnecessary because it will be freed immediately. Link: http://lkml.kernel.org/r/20161029155747.GC1246@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index f623a3ca2746..57f548e2eb59 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -321,7 +321,7 @@ static int proc_reg_open(struct inode *inode, struct file *file) * by hand in remove_proc_entry(). For this, save opener's credentials * for later. */ - pdeo = kzalloc(sizeof(struct pde_opener), GFP_KERNEL); + pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL); if (!pdeo) return -ENOMEM; @@ -338,6 +338,8 @@ static int proc_reg_open(struct inode *inode, struct file *file) if (rv == 0 && release) { /* To know what to release. */ pdeo->file = file; + pdeo->closing = false; + pdeo->c = NULL; /* Strictly for "too late" ->release in proc_reg_release(). */ spin_lock(&pde->pde_unload_lock); list_add(&pdeo->lh, &pde->pde_openers); From 492b2da6056e7051917516368e75e062422c3557 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 12 Dec 2016 16:45:22 -0800 Subject: [PATCH 096/123] proc: tweak comments about 2 stage open and everything Some comments were obsoleted since commit 05c0ae21c034 ("try a saner locking for pde_opener..."). Some new comments added. Some confusing comments replaced with equally confusing ones. Link: http://lkml.kernel.org/r/20161029160231.GD1246@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 57f548e2eb59..783bc19644d1 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -138,6 +138,16 @@ static void unuse_pde(struct proc_dir_entry *pde) /* pde is locked */ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) { + /* + * close() (proc_reg_release()) can't delete an entry and proceed: + * ->release hook needs to be available at the right moment. + * + * rmmod (remove_proc_entry() et al) can't delete an entry and proceed: + * "struct file" needs to be available at the right moment. + * + * Therefore, first process to enter this function does ->release() and + * signals its completion to the other process which does nothing. + */ if (pdeo->closing) { /* somebody else is doing that, just wait */ DECLARE_COMPLETION_ONSTACK(c); @@ -152,6 +162,7 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) file = pdeo->file; pde->proc_fops->release(file_inode(file), file); spin_lock(&pde->pde_unload_lock); + /* After ->release. */ list_del(&pdeo->lh); if (pdeo->c) complete(pdeo->c); @@ -167,6 +178,8 @@ void proc_entry_rundown(struct proc_dir_entry *de) if (atomic_add_return(BIAS, &de->in_use) != BIAS) wait_for_completion(&c); + /* ->pde_openers list can't grow from now on. */ + spin_lock(&de->pde_unload_lock); while (!list_empty(&de->pde_openers)) { struct pde_opener *pdeo; @@ -312,14 +325,15 @@ static int proc_reg_open(struct inode *inode, struct file *file) struct pde_opener *pdeo; /* - * What for, you ask? Well, we can have open, rmmod, remove_proc_entry - * sequence. ->release won't be called because ->proc_fops will be - * cleared. Depending on complexity of ->release, consequences vary. + * Ensure that + * 1) PDE's ->release hook will be called no matter what + * either normally by close()/->release, or forcefully by + * rmmod/remove_proc_entry. * - * We can't wait for mercy when close will be done for real, it's - * deadlockable: rmmod foo release - * by hand in remove_proc_entry(). For this, save opener's credentials - * for later. + * 2) rmmod isn't blocked by opening file in /proc and sitting on + * the descriptor (including "rmmod foo release hook. */ pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL); if (!pdeo) @@ -340,7 +354,6 @@ static int proc_reg_open(struct inode *inode, struct file *file) pdeo->file = file; pdeo->closing = false; pdeo->c = NULL; - /* Strictly for "too late" ->release in proc_reg_release(). */ spin_lock(&pde->pde_unload_lock); list_add(&pdeo->lh, &pde->pde_openers); spin_unlock(&pde->pde_unload_lock); From 209b14dc030760d3a17029a5c3bd92c9d6fd3f37 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 12 Dec 2016 16:45:25 -0800 Subject: [PATCH 097/123] fs/proc/array.c: slightly improve render_sigset_t format_decode and vsnprintf occasionally show up in perf top, so I went looking for places that might not need the full printf power. With the help of kprobes, I gathered some statistics on which format strings we mostly pass to vsnprintf. On a trivial desktop workload, I hit "%x" 25% of the time, so something apparently reads /proc/pid/status (which does 5*16 printf("%x") calls) a lot. With this patch, reading /proc/pid/status is 30% faster according to this microbenchmark: char buf[4096]; int i, fd; for (i = 0; i < 10000; ++i) { fd = open("/proc/self/status", O_RDONLY); read(fd, buf, sizeof(buf)); close(fd); } Link: http://lkml.kernel.org/r/1474410485-1305-1-git-send-email-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Acked-by: Andrei Vagin Acked-by: Kees Cook Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index 082676ab4878..51a4213afa2e 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -245,7 +245,7 @@ void render_sigset_t(struct seq_file *m, const char *header, if (sigismember(set, i+2)) x |= 2; if (sigismember(set, i+3)) x |= 4; if (sigismember(set, i+4)) x |= 8; - seq_printf(m, "%x", x); + seq_putc(m, hex_asc[x]); } while (i >= 4); seq_putc(m, '\n'); From bac5f5d56bbcb0ef7d3a926dd28b5f1db09117b7 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 12 Dec 2016 16:45:28 -0800 Subject: [PATCH 098/123] fs/proc/base.c: save decrement during lookup/readdir in /proc/$PID Comparison for "<" works equally well as comparison for "<=" but one SUB/LEA is saved (no, it is not optimised away, at least here). Link: http://lkml.kernel.org/r/20161122195143.GA29812@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 7c843024b406..04a5fcad4c34 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2412,14 +2412,14 @@ static struct dentry *proc_pident_lookup(struct inode *dir, * Yes, it does not scale. And it should not. Don't add * new entries into /proc// without very good reasons. */ - last = &ents[nents - 1]; - for (p = ents; p <= last; p++) { + last = &ents[nents]; + for (p = ents; p < last; p++) { if (p->len != dentry->d_name.len) continue; if (!memcmp(dentry->d_name.name, p->name, p->len)) break; } - if (p > last) + if (p >= last) goto out; error = proc_pident_instantiate(dir, dentry, task, p); @@ -2444,7 +2444,7 @@ static int proc_pident_readdir(struct file *file, struct dir_context *ctx, if (ctx->pos >= nents + 2) goto out; - for (p = ents + (ctx->pos - 2); p <= ents + nents - 1; p++) { + for (p = ents + (ctx->pos - 2); p < ents + nents; p++) { if (!proc_fill_cache(file, ctx, p->name, p->len, proc_pident_instantiate, task, p)) break; From 1270dd8d994039b677d0504ba7260873d608bf75 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 12 Dec 2016 16:45:32 -0800 Subject: [PATCH 099/123] fs/proc: calculate /proc/* and /proc/*/task/* nlink at init time Runtime nlink calculation works but meh. I don't know how to do it at compile time, but I know how to do it at init time. Shift "2+" part into init time as a bonus. Link: http://lkml.kernel.org/r/20161122195549.GB29812@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Vegard Nossum Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 19 +++++++++++++------ fs/proc/internal.h | 1 + fs/proc/root.c | 1 + 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 04a5fcad4c34..9b99df4893a4 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -104,6 +104,9 @@ * in /proc for a task before it execs a suid executable. */ +static u8 nlink_tid; +static u8 nlink_tgid; + struct pid_entry { const char *name; unsigned int len; @@ -139,13 +142,13 @@ struct pid_entry { * Count the number of hardlinks for the pid_entry table, excluding the . * and .. links. */ -static unsigned int pid_entry_count_dirs(const struct pid_entry *entries, +static unsigned int __init pid_entry_nlink(const struct pid_entry *entries, unsigned int n) { unsigned int i; unsigned int count; - count = 0; + count = 2; for (i = 0; i < n; ++i) { if (S_ISDIR(entries[i].mode)) ++count; @@ -3068,8 +3071,7 @@ static int proc_pid_instantiate(struct inode *dir, inode->i_fop = &proc_tgid_base_operations; inode->i_flags|=S_IMMUTABLE; - set_nlink(inode, 2 + pid_entry_count_dirs(tgid_base_stuff, - ARRAY_SIZE(tgid_base_stuff))); + set_nlink(inode, nlink_tgid); d_set_d_op(dentry, &pid_dentry_operations); @@ -3361,8 +3363,7 @@ static int proc_task_instantiate(struct inode *dir, inode->i_fop = &proc_tid_base_operations; inode->i_flags|=S_IMMUTABLE; - set_nlink(inode, 2 + pid_entry_count_dirs(tid_base_stuff, - ARRAY_SIZE(tid_base_stuff))); + set_nlink(inode, nlink_tid); d_set_d_op(dentry, &pid_dentry_operations); @@ -3552,3 +3553,9 @@ static const struct file_operations proc_task_operations = { .iterate_shared = proc_task_readdir, .llseek = generic_file_llseek, }; + +void __init set_proc_pid_nlink(void) +{ + nlink_tid = pid_entry_nlink(tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); + nlink_tgid = pid_entry_nlink(tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); +} diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 153db5f85b47..bbba5d22aada 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -211,6 +211,7 @@ extern const struct inode_operations proc_link_inode_operations; extern const struct inode_operations proc_pid_link_inode_operations; extern void proc_init_inodecache(void); +void set_proc_pid_nlink(void); extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); extern int proc_fill_super(struct super_block *, void *data, int flags); extern void proc_entry_rundown(struct proc_dir_entry *); diff --git a/fs/proc/root.c b/fs/proc/root.c index 8d3e484055a6..4bd0373576b5 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -122,6 +122,7 @@ void __init proc_root_init(void) int err; proc_init_inodecache(); + set_proc_pid_nlink(); err = register_filesystem(&proc_fs_type); if (err) return; From 4ca5ede07c9871c13ae422c96d6d08dbd0df5eda Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 12 Dec 2016 16:45:35 -0800 Subject: [PATCH 100/123] hung_task: decrement sysctl_hung_task_warnings only if it is positive Since sysctl_hung_task_warnings == -1 is allowed (infinite warnings), commit 48a6d64edadb ("hung_task: allow hung_task_panic when hung_task_warnings is 0") should decrement it only when it is not -1. This prevents the kernel from ceasing warnings after the first 4294967295 ;) Signed-off-by: Tetsuo Handa Cc: John Siddle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hung_task.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 2b59c82cc3e1..40c07e4fa116 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -106,7 +106,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) * complain: */ if (sysctl_hung_task_warnings) { - sysctl_hung_task_warnings--; + if (sysctl_hung_task_warnings > 0) + sysctl_hung_task_warnings--; pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", t->comm, t->pid, timeout); pr_err(" %s %s %.*s\n", From 8e8780a547d987b6465c9458402177fe706c5624 Mon Sep 17 00:00:00 2001 From: Benjamin Peterson Date: Mon, 12 Dec 2016 16:45:38 -0800 Subject: [PATCH 101/123] compiler-gcc.h: use "proved" instead of "proofed" Link: http://lkml.kernel.org/r/1477894241.1103202.772260161.1B0A5995@webmail.messagingengine.com Signed-off-by: Benjamin Peterson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-gcc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 928e5ca0caee..0444b1336268 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -21,7 +21,7 @@ * clobbered. The issue is as follows: while the inline asm might * access any memory it wants, the compiler could have fit all of * @ptr into memory registers instead, and since @ptr never escaped - * from that, it proofed that the inline asm wasn't touching any of + * from that, it proved that the inline asm wasn't touching any of * it. This version works well with both compilers, i.e. we're telling * the compiler that the inline asm absolutely may see the contents * of @ptr. See also: https://llvm.org/bugs/show_bug.cgi?id=15495 From 4a998e322abc935e95efc1a8108e6102be636a43 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 12 Dec 2016 16:45:40 -0800 Subject: [PATCH 102/123] printk/NMI: fix up handling of the full nmi log buffer vsnprintf() adds the trailing '\0' but it does not count it into the number of printed characters. The result is that there is one byte less space for the real characters in the buffer. The broken check for the free space might cause that we will repeatedly try to print 1 character into the buffer, never reach the full buffer, and do not count the messages as missed. Also vsnprintf() returns the number of characters that would be printed if the buffer was big enough. As a result, s->len might be bigger than the size of the buffer[*]. And the printk() function might return bigger len than it really printed. Both problems are fixed by using vscnprintf() instead. Note that I though about increasing the number of missed messages even when the message was shrunken. But it made the code even more complicated. I think that it is not worth it. Shrunken messages are usually easy to recognize. And it should be a corner case. [*] The overflown s->len value is crazy and unexpected. I "made a mistake" and reported this situation as an internal error when fixed handling of PR_CONT headers in some other patch. Link: http://lkml.kernel.org/r/20161208174912.GA17042@linux.suse Signed-off-by: Petr Mladek CcL Sergey Senozhatsky Cc: Chris Mason Cc: David Sterba Cc: Jason Wessel Cc: Josef Bacik Cc: Joe Perches Cc: Jaroslav Kysela Cc: Steven Rostedt Cc: Takashi Iwai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/nmi.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index 16bab471c7e2..152533edc56f 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c @@ -67,7 +67,8 @@ static int vprintk_nmi(const char *fmt, va_list args) again: len = atomic_read(&s->len); - if (len >= sizeof(s->buffer)) { + /* The trailing '\0' is not counted into len. */ + if (len >= sizeof(s->buffer) - 1) { atomic_inc(&nmi_message_lost); return 0; } @@ -79,7 +80,7 @@ again: if (!len) smp_rmb(); - add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); + add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); /* * Do it once again if the buffer has been flushed in the meantime. From 22c2c7b2ef7864389b1b75f9fd604da14b21e2c2 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 12 Dec 2016 16:45:44 -0800 Subject: [PATCH 103/123] printk/NMI: handle continuous lines and missing newline Commit 4bcc595ccd80 ("printk: reinstate KERN_CONT for printing continuation lines") added back KERN_CONT message header. As a result it might appear in the middle of the line when the parts are squashed via the temporary NMI buffer. A reasonable solution seems to be to split the text in the NNI temporary not only by newlines but also by the message headers. Another solution would be to filter out KERN_CONT when writing to the temporary buffer. But this would complicate the lockless handling. Also it would not solve problems with a missing newline that was there even before the KERN_CONT stuff. This patch moves the temporary buffer handling into separate function. I played with it and it seems that using the char pointers make the code easier to read. Also it prints the final newline as a continuous line. Finally, it moves handling of the s->len overflow into the paranoid check. And allows to recover from the disaster. Link: http://lkml.kernel.org/r/1478695291-12169-2-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Reviewed-by: Sergey Senozhatsky Cc: Joe Perches Cc: Steven Rostedt Cc: Jason Wessel Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: Chris Mason Cc: Josef Bacik Cc: David Sterba Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/nmi.c | 78 +++++++++++++++++++++++++++++---------------- 1 file changed, 50 insertions(+), 28 deletions(-) diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index 152533edc56f..f011aaef583c 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c @@ -114,16 +114,51 @@ static void printk_nmi_flush_line(const char *text, int len) } -/* - * printk one line from the temporary buffer from @start index until - * and including the @end index. - */ -static void printk_nmi_flush_seq_line(struct nmi_seq_buf *s, - int start, int end) +/* printk part of the temporary buffer line by line */ +static int printk_nmi_flush_buffer(const char *start, size_t len) { - const char *buf = s->buffer + start; + const char *c, *end; + bool header; - printk_nmi_flush_line(buf, (end - start) + 1); + c = start; + end = start + len; + header = true; + + /* Print line by line. */ + while (c < end) { + if (*c == '\n') { + printk_nmi_flush_line(start, c - start + 1); + start = ++c; + header = true; + continue; + } + + /* Handle continuous lines or missing new line. */ + if ((c + 1 < end) && printk_get_level(c)) { + if (header) { + c = printk_skip_level(c); + continue; + } + + printk_nmi_flush_line(start, c - start); + start = c++; + header = true; + continue; + } + + header = false; + c++; + } + + /* Check if there was a partial line. Ignore pure header. */ + if (start < end && !header) { + static const char newline[] = KERN_CONT "\n"; + + printk_nmi_flush_line(start, end - start); + printk_nmi_flush_line(newline, strlen(newline)); + } + + return len; } /* @@ -136,8 +171,8 @@ static void __printk_nmi_flush(struct irq_work *work) __RAW_SPIN_LOCK_INITIALIZER(read_lock); struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work); unsigned long flags; - size_t len, size; - int i, last_i; + size_t len; + int i; /* * The lock has two functions. First, one reader has to flush all @@ -155,12 +190,14 @@ more: /* * This is just a paranoid check that nobody has manipulated * the buffer an unexpected way. If we printed something then - * @len must only increase. + * @len must only increase. Also it should never overflow the + * buffer size. */ - if (i && i >= len) { + if ((i && i >= len) || len > sizeof(s->buffer)) { const char *msg = "printk_nmi_flush: internal error\n"; printk_nmi_flush_line(msg, strlen(msg)); + len = 0; } if (!len) @@ -168,22 +205,7 @@ more: /* Make sure that data has been written up to the @len */ smp_rmb(); - - size = min(len, sizeof(s->buffer)); - last_i = i; - - /* Print line by line. */ - for (; i < size; i++) { - if (s->buffer[i] == '\n') { - printk_nmi_flush_seq_line(s, last_i, i); - last_i = i + 1; - } - } - /* Check if there was a partial line. */ - if (last_i < size) { - printk_nmi_flush_seq_line(s, last_i, size - 1); - printk_nmi_flush_line("\n", strlen("\n")); - } + i += printk_nmi_flush_buffer(s->buffer + i, len - i); /* * Check that nothing has got added in the meantime and truncate From 497957576cf8a2150d723aedd74ea60b5d498bfe Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 12 Dec 2016 16:45:47 -0800 Subject: [PATCH 104/123] printk/kdb: handle more message headers Commit 4bcc595ccd80 ("printk: reinstate KERN_CONT for printing continuation lines") allows to define more message headers for a single message. The motivation is that continuous lines might get mixed. Therefore it make sense to define the right log level for every piece of a cont line. This patch introduces printk_skip_headers() that will skip all headers and uses it in the kdb code instead of printk_skip_level(). This approach helps to fix other printk_skip_level() users independently. Link: http://lkml.kernel.org/r/1478695291-12169-3-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Cc: Joe Perches Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Jason Wessel Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: Chris Mason Cc: Josef Bacik Cc: David Sterba Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 8 ++++++++ kernel/debug/kdb/kdb_io.c | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/include/linux/printk.h b/include/linux/printk.h index eac1af8502bb..a0859e169bc3 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -31,6 +31,14 @@ static inline const char *printk_skip_level(const char *buffer) return buffer; } +static inline const char *printk_skip_headers(const char *buffer) +{ + while (printk_get_level(buffer)) + buffer = printk_skip_level(buffer); + + return buffer; +} + #define CONSOLE_EXT_LOG_MAX 8192 /* printk's without a loglevel use this.. */ diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index fc1ef736253c..98c9011eac78 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -697,7 +697,7 @@ kdb_printit: * Write to all consoles. */ retlen = strlen(kdb_buffer); - cp = (char *) printk_skip_level(kdb_buffer); + cp = (char *) printk_skip_headers(kdb_buffer); if (!dbg_kdb_mode && kgdb_connected) { gdbstub_msg_write(cp, retlen - (cp - kdb_buffer)); } else { From 262c5e86fec7cfd59754732001a9ff5b13eba501 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 12 Dec 2016 16:45:50 -0800 Subject: [PATCH 105/123] printk/btrfs: handle more message headers Commit 4bcc595ccd80 ("printk: reinstate KERN_CONT for printing continuation lines") allows to define more message headers for a single message. The motivation is that continuous lines might get mixed. Therefore it make sense to define the right log level for every piece of a cont line. The current btrfs_printk() macros do not support continuous lines at the moment. But better be prepared for a custom messages and avoid potential "lvl" buffer overflow. This patch iterates over the entire message header. It is interested only into the message level like the original code. This patch also introduces PRINTK_MAX_SINGLE_HEADER_LEN. Three bytes are enough for the message level header at the moment. But it used to be three, see the commit 04d2c8c83d0e ("printk: convert the format for KERN_ to a 2 byte pattern"). Also I fixed the default ratelimit level. It looked very strange when it was different from the default log level. [pmladek@suse.com: Fix a check of the valid message level] Link: http://lkml.kernel.org/r/20161111183236.GD2145@dhcp128.suse.cz Link: http://lkml.kernel.org/r/1478695291-12169-4-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Acked-by: David Sterba Cc: Joe Perches Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Jason Wessel Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: Chris Mason Cc: Josef Bacik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/super.c | 26 +++++++++++++++----------- include/linux/printk.h | 2 ++ 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 74ed5aae6cea..180f910339f4 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -202,27 +202,31 @@ static struct ratelimit_state printk_limits[] = { void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { struct super_block *sb = fs_info->sb; - char lvl[4]; + char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1]; struct va_format vaf; va_list args; - const char *type = logtypes[4]; + const char *type = NULL; int kern_level; struct ratelimit_state *ratelimit; va_start(args, fmt); - kern_level = printk_get_level(fmt); - if (kern_level) { + while ((kern_level = printk_get_level(fmt)) != 0) { size_t size = printk_skip_level(fmt) - fmt; - memcpy(lvl, fmt, size); - lvl[size] = '\0'; + + if (kern_level >= '0' && kern_level <= '7') { + memcpy(lvl, fmt, size); + lvl[size] = '\0'; + type = logtypes[kern_level - '0']; + ratelimit = &printk_limits[kern_level - '0']; + } fmt += size; - type = logtypes[kern_level - '0']; - ratelimit = &printk_limits[kern_level - '0']; - } else { + } + + if (!type) { *lvl = '\0'; - /* Default to debug output */ - ratelimit = &printk_limits[7]; + type = logtypes[4]; + ratelimit = &printk_limits[4]; } vaf.fmt = fmt; diff --git a/include/linux/printk.h b/include/linux/printk.h index a0859e169bc3..afe8ccec1672 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -10,6 +10,8 @@ extern const char linux_banner[]; extern const char linux_proc_banner[]; +#define PRINTK_MAX_SINGLE_HEADER_LEN 2 + static inline int printk_get_level(const char *buffer) { if (buffer[0] == KERN_SOH_ASCII && buffer[1]) { From 0a4824bf8f8b88ba62c3c6e01608e8bfc2a99a17 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 12 Dec 2016 16:45:53 -0800 Subject: [PATCH 106/123] printk/sound: handle more message headers Commit 4bcc595ccd80 ("printk: reinstate KERN_CONT for printing continuation lines") allows to define more message headers for a single message. The motivation is that continuous lines might get mixed. Therefore it make sense to define the right log level for every piece of a cont line. This patch allows to copy only the real message level. We should ignore KERN_CONT because is added for each message. By other words, we want to know where each piece of the line comes from. [pmladek@suse.com: fix a check of the valid message level] Link: http://lkml.kernel.org/r/20161111183444.GE2145@dhcp128.suse.cz Link: http://lkml.kernel.org/r/1478695291-12169-5-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Cc: Joe Perches Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Jason Wessel Cc: Jaroslav Kysela Cc: Takashi Iwai Cc: Chris Mason Cc: Josef Bacik Cc: David Sterba Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- sound/core/misc.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/sound/core/misc.c b/sound/core/misc.c index f2e8226c88fb..21b228046e88 100644 --- a/sound/core/misc.c +++ b/sound/core/misc.c @@ -71,6 +71,7 @@ void __snd_printk(unsigned int level, const char *path, int line, int kern_level; struct va_format vaf; char verbose_fmt[] = KERN_DEFAULT "ALSA %s:%d %pV"; + bool level_found = false; #endif #ifdef CONFIG_SND_DEBUG @@ -83,15 +84,22 @@ void __snd_printk(unsigned int level, const char *path, int line, vaf.fmt = format; vaf.va = &args; - kern_level = printk_get_level(format); - if (kern_level) { - const char *end_of_header = printk_skip_level(format); - memcpy(verbose_fmt, format, end_of_header - format); - vaf.fmt = end_of_header; - } else if (level) - memcpy(verbose_fmt, KERN_DEBUG, sizeof(KERN_DEBUG) - 1); - printk(verbose_fmt, sanity_file_name(path), line, &vaf); + while ((kern_level = printk_get_level(vaf.fmt)) != 0) { + const char *end_of_header = printk_skip_level(vaf.fmt); + /* Ignore KERN_CONT. We print filename:line for each piece. */ + if (kern_level >= '0' && kern_level <= '7') { + memcpy(verbose_fmt, vaf.fmt, end_of_header - vaf.fmt); + level_found = true; + } + + vaf.fmt = end_of_header; + } + + if (!level_found && level) + memcpy(verbose_fmt, KERN_DEBUG, sizeof(KERN_DEBUG) - 1); + + printk(verbose_fmt, sanity_file_name(path), line, &vaf); #else vprintk(format, args); #endif From a8cfdc68f6cfc0c7ffc6d664406fe7f06f17eef4 Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Mon, 12 Dec 2016 16:45:56 -0800 Subject: [PATCH 107/123] printk: add Kconfig option to set default console loglevel Add a configuration option to set the default console loglevel. This is, as before, still possible to override at runtime through bootargs (loglevel=), sysrq and /proc/printk. There are cases where adding additional arguments on the commandline is impractical, and changing the default for the kernel when being built makes more sense. Provide such a method here, for those who choose to do so. Also, while touching this code, clarify the difference between MESSAGE_LOGLEVEL_DEFAULT and CONSOLE_LOGLEVEL_DEFAULT. Link: http://lkml.kernel.org/r/1479676829-30031-1-git-send-email-olof@lixom.net Signed-off-by: Olof Johansson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 7 ++++++- lib/Kconfig.debug | 19 +++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/include/linux/printk.h b/include/linux/printk.h index afe8ccec1672..3472cc6b7a60 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -50,10 +50,15 @@ static inline const char *printk_skip_headers(const char *buffer) #define CONSOLE_LOGLEVEL_SILENT 0 /* Mum's the word */ #define CONSOLE_LOGLEVEL_MIN 1 /* Minimum loglevel we let people use */ #define CONSOLE_LOGLEVEL_QUIET 4 /* Shhh ..., when booted with "quiet" */ -#define CONSOLE_LOGLEVEL_DEFAULT 7 /* anything MORE serious than KERN_DEBUG */ #define CONSOLE_LOGLEVEL_DEBUG 10 /* issue debug messages */ #define CONSOLE_LOGLEVEL_MOTORMOUTH 15 /* You can't shut this one up */ +/* + * Default used to be hard-coded at 7, we're now allowing it to be set from + * kernel config. + */ +#define CONSOLE_LOGLEVEL_DEFAULT CONFIG_CONSOLE_LOGLEVEL_DEFAULT + extern int console_printk[]; #define console_loglevel (console_printk[0]) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 9bb7d825ba14..65a619e0ad5d 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -15,6 +15,21 @@ config PRINTK_TIME The behavior is also controlled by the kernel command line parameter printk.time=1. See Documentation/kernel-parameters.txt +config CONSOLE_LOGLEVEL_DEFAULT + int "Default console loglevel (1-15)" + range 1 15 + default "7" + help + Default loglevel to determine what will be printed on the console. + + Setting a default here is equivalent to passing in loglevel= in + the kernel bootargs. loglevel= continues to override whatever + value is specified here as well. + + Note: This does not affect the log level of un-prefixed prink() + usage in the kernel. That is controlled by the MESSAGE_LOGLEVEL_DEFAULT + option. + config MESSAGE_LOGLEVEL_DEFAULT int "Default message log level (1-7)" range 1 7 @@ -26,6 +41,10 @@ config MESSAGE_LOGLEVEL_DEFAULT that are auditing their logs closely may want to set it to a lower priority. + Note: This does not affect what message level gets printed on the console + by default. To change that, use loglevel= in the kernel bootargs, + or pick a different CONSOLE_LOGLEVEL_DEFAULT configuration value. + config BOOT_PRINTK_DELAY bool "Delay each boot printk message by N milliseconds" depends on DEBUG_KERNEL && PRINTK && GENERIC_CALIBRATE_DELAY From 03aed214b25c068cb9955063d4742273679b49f3 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 12 Dec 2016 16:45:59 -0800 Subject: [PATCH 108/123] get_maintainer: look for arbitrary letter prefixes in sections Jani Nikula proposes patches to add a few new letter prefixes for "B:" bug reporting and "C:" maintainer chatting to the various sections of MAINTAINERS. Add a generic mechanism to get_maintainer.pl to find sections that have any combination of "[A-Z]" letter prefix types in a section. Link: http://lkml.kernel.org/r/1477332323.1984.8.camel@perches.com Signed-off-by: Joe Perches Cc: Jani Nikula Cc: Daniel Vetter Cc: Dave Airlie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index aed4511f0304..633f2dd3de27 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -49,6 +49,7 @@ my $scm = 0; my $web = 0; my $subsystem = 0; my $status = 0; +my $letters = ""; my $keywords = 1; my $sections = 0; my $file_emails = 0; @@ -241,6 +242,7 @@ if (!GetOptions( 'status!' => \$status, 'scm!' => \$scm, 'web!' => \$web, + 'letters=s' => \$letters, 'pattern-depth=i' => \$pattern_depth, 'k|keywords!' => \$keywords, 'sections!' => \$sections, @@ -271,7 +273,8 @@ $output_multiline = 0 if ($output_separator ne ", "); $output_rolestats = 1 if ($interactive); $output_roles = 1 if ($output_rolestats); -if ($sections) { +if ($sections || $letters ne "") { + $sections = 1; $email = 0; $email_list = 0; $scm = 0; @@ -682,8 +685,10 @@ sub get_maintainers { $line =~ s/\\\./\./g; ##Convert \. to . $line =~ s/\.\*/\*/g; ##Convert .* to * } - $line =~ s/^([A-Z]):/$1:\t/g; - print("$line\n"); + my $count = $line =~ s/^([A-Z]):/$1:\t/g; + if ($letters eq "" || (!$count || $letters =~ /$1/i)) { + print("$line\n"); + } } print("\n"); } @@ -814,6 +819,7 @@ Other options: --pattern-depth => Number of pattern directory traversals (default: 0 (all)) --keywords => scan patch for keywords (default: $keywords) --sections => print all of the subsystem sections with pattern matches + --letters => print all matching 'letter' types from all matching sections --mailmap => use .mailmap file (default: $email_use_mailmap) --version => show version --help => show this help information From 2de2bd95f45639b8e21b1da7f72f506d43400b4d Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Mon, 12 Dec 2016 16:46:02 -0800 Subject: [PATCH 109/123] MAINTAINERS: add "B:" for URI where to file bugs Different subsystems and drivers have different preferences for where to file bugs and what information to include. Add "B:" entry for specifying the URI for the bug tracker directly, a web page for detailed info on filing bugs, or a mailto: URI. Link: http://lkml.kernel.org/r/1476966135-26943-1-git-send-email-jani.nikula@intel.com Signed-off-by: Jani Nikula Reviewed-by: Andrew Donnellan Acked-by: Daniel Vetter Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index cfbb164acd20..db40318c46da 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -74,6 +74,8 @@ Descriptions of section entries: These reviewers should be CCed on patches. L: Mailing list that is relevant to this area W: Web-page with status/info + B: URI for where to file bugs. A web-page with detailed bug + filing info, a direct bug tracker link, or a mailto: URI. Q: Patchwork web based patch tracking system site T: SCM tree type and location. Type is one of: git, hg, quilt, stgit, topgit From 51b06f9f84eb05f98755fd2d6293063a1a23980c Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Mon, 12 Dec 2016 16:46:05 -0800 Subject: [PATCH 110/123] MAINTAINERS: add drm and drm/i915 bug filing info Link: http://lkml.kernel.org/r/1476966135-26943-2-git-send-email-jani.nikula@intel.com Signed-off-by: Jani Nikula Reviewed-by: Andrew Donnellan Cc: Daniel Vetter Cc: Dave Airlie Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index db40318c46da..58700bf78e19 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4026,6 +4026,7 @@ DRM DRIVERS M: David Airlie L: dri-devel@lists.freedesktop.org T: git git://people.freedesktop.org/~airlied/linux +B: https://bugs.freedesktop.org/ S: Maintained F: drivers/gpu/drm/ F: drivers/gpu/vga/ @@ -4078,6 +4079,7 @@ M: Jani Nikula L: intel-gfx@lists.freedesktop.org L: dri-devel@lists.freedesktop.org W: https://01.org/linuxgraphics/ +B: https://01.org/linuxgraphics/documentation/how-report-bugs Q: http://patchwork.freedesktop.org/project/intel-gfx/ T: git git://anongit.freedesktop.org/drm-intel S: Supported From 57599f9b879cae3c85e8646772e94ae568854319 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Mon, 12 Dec 2016 16:46:08 -0800 Subject: [PATCH 111/123] MAINTAINERS: add "C:" for URI for chat where developers hang out Make it easier to find the developer chat for the subsystem or driver. Link: http://lkml.kernel.org/r/1476966135-26943-3-git-send-email-jani.nikula@intel.com Signed-off-by: Jani Nikula Reviewed-by: Andrew Donnellan Cc: Daniel Vetter Cc: Dave Airlie Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 58700bf78e19..550d2ea7db3c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -76,6 +76,8 @@ Descriptions of section entries: W: Web-page with status/info B: URI for where to file bugs. A web-page with detailed bug filing info, a direct bug tracker link, or a mailto: URI. + C: URI for chat protocol, server and channel where developers + usually hang out, for example irc://server/channel. Q: Patchwork web based patch tracking system site T: SCM tree type and location. Type is one of: git, hg, quilt, stgit, topgit From 5fc41a70d804f93b88924bed1d2f494f3b735571 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Mon, 12 Dec 2016 16:46:11 -0800 Subject: [PATCH 112/123] MAINTAINERS: add drm and drm/i915 irc channels Link: http://lkml.kernel.org/r/1476966135-26943-4-git-send-email-jani.nikula@intel.com Signed-off-by: Jani Nikula Reviewed-by: Andrew Donnellan Cc: Daniel Vetter Cc: Dave Airlie Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 550d2ea7db3c..c1738f42e7cc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4029,6 +4029,7 @@ M: David Airlie L: dri-devel@lists.freedesktop.org T: git git://people.freedesktop.org/~airlied/linux B: https://bugs.freedesktop.org/ +C: irc://chat.freenode.net/dri-devel S: Maintained F: drivers/gpu/drm/ F: drivers/gpu/vga/ @@ -4082,6 +4083,7 @@ L: intel-gfx@lists.freedesktop.org L: dri-devel@lists.freedesktop.org W: https://01.org/linuxgraphics/ B: https://01.org/linuxgraphics/documentation/how-report-bugs +C: irc://chat.freenode.net/intel-gfx Q: http://patchwork.freedesktop.org/project/intel-gfx/ T: git git://anongit.freedesktop.org/drm-intel S: Supported From 6b2a65c7ff612035deb1012388738b54e08ab2a6 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Mon, 12 Dec 2016 16:46:14 -0800 Subject: [PATCH 113/123] lib/Kconfig.debug: make CONFIG_STRICT_DEVMEM depend on CONFIG_DEVMEM With CONFIG_DEVMEM not set, CONFIG_STRICT_DEVMEM will be useless even if it is set =y, thus let's update the dependency in Kconfig. Link: http://lkml.kernel.org/r/20161006051217.GA31027@dhcp-128-65.nay.redhat.com Signed-off-by: Dave Young Acked-by: Kees Cook Cc: Ingo Molnar Cc: Dan Williams Cc: Josh Poimboeuf Cc: Tejun Heo Cc: Andrey Ryabinin Cc: Nikolay Aleksandrov Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 65a619e0ad5d..e40a0715f422 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2005,7 +2005,7 @@ config ARCH_HAS_DEVMEM_IS_ALLOWED config STRICT_DEVMEM bool "Filter access to /dev/mem" - depends on MMU + depends on MMU && DEVMEM depends on ARCH_HAS_DEVMEM_IS_ALLOWED default y if TILE || PPC ---help--- From ce093a04543c403d52c1a5788d8cb92e47453aba Mon Sep 17 00:00:00 2001 From: Jie Chen Date: Mon, 12 Dec 2016 16:46:17 -0800 Subject: [PATCH 114/123] lib/rbtree.c: fix typo in comment of ____rb_erase_color In Case 3 of `sibling == parent->rb_right': Right rotation will not change color of sl and S in the diagram (i.e. should not change "sl" to "Sl", "S" to "s") In Case 3 of `sibling == parent->rb_left': (p) (p) / \ / \ S N --> sr N / \ / Sl sr S / Sl This is actually left rotation at "S", not right rotation. In Case 4 of `sibling == parent->rb_left': (p) (s) / \ / \ S N --> Sl P / \ / \ sl (sr) (sr) N This is actually right rotation at "(p)" + color flips, not left rotation + color flips. Link: http://lkml.kernel.org/r/1472391115-3702-1-git-send-email-fykcee1@gmail.com Signed-off-by: Jie Chen Cc: Wei Yang Cc: Xiao Guangrong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/lib/rbtree.c b/lib/rbtree.c index eb8a19fee110..1f8b112a7c35 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -296,11 +296,26 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, * * (p) (p) * / \ / \ - * N S --> N Sl + * N S --> N sl * / \ \ - * sl Sr s + * sl Sr S * \ * Sr + * + * Note: p might be red, and then both + * p and sl are red after rotation(which + * breaks property 4). This is fixed in + * Case 4 (in __rb_rotate_set_parents() + * which set sl the color of p + * and set p RB_BLACK) + * + * (p) (sl) + * / \ / \ + * N sl --> P S + * \ / \ + * S N Sr + * \ + * Sr */ tmp1 = tmp2->rb_right; WRITE_ONCE(sibling->rb_left, tmp1); @@ -365,7 +380,7 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, } break; } - /* Case 3 - right rotate at sibling */ + /* Case 3 - left rotate at sibling */ tmp1 = tmp2->rb_left; WRITE_ONCE(sibling->rb_right, tmp1); WRITE_ONCE(tmp2->rb_left, sibling); @@ -377,7 +392,7 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, tmp1 = sibling; sibling = tmp2; } - /* Case 4 - left rotate at parent + color flips */ + /* Case 4 - right rotate at parent + color flips */ tmp2 = sibling->rb_right; WRITE_ONCE(parent->rb_left, tmp2); WRITE_ONCE(sibling->rb_right, parent); From a2ef9471c771427c2ddd56677b8de45021f6fd71 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Mon, 12 Dec 2016 16:46:20 -0800 Subject: [PATCH 115/123] lib/ida: document locking requirements a bit better I wanted to wrap a bunch of ida_simple_get calls into their own locking, until I dug around and read the original commit message. Stuff like this should imo be added to the kernel doc, let's do that. Link: http://lkml.kernel.org/r/20161027072216.20411-1-daniel.vetter@ffwll.ch Signed-off-by: Daniel Vetter Acked-by: Tejun Heo Cc: Mel Gorman Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/idr.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/lib/idr.c b/lib/idr.c index 6098336df267..52d2979a05e8 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -927,6 +927,9 @@ EXPORT_SYMBOL(ida_pre_get); * and go back to the ida_pre_get() call. If the ida is full, it will * return %-ENOSPC. * + * Note that callers must ensure that concurrent access to @ida is not possible. + * See ida_simple_get() for a varaint which takes care of locking. + * * @p_id returns a value in the range @starting_id ... %0x7fffffff. */ int ida_get_new_above(struct ida *ida, int starting_id, int *p_id) @@ -1073,6 +1076,9 @@ EXPORT_SYMBOL(ida_destroy); * Allocates an id in the range start <= id < end, or returns -ENOSPC. * On memory allocation failure, returns -ENOMEM. * + * Compared to ida_get_new_above() this function does its own locking, and + * should be used unless there are special requirements. + * * Use ida_simple_remove() to get rid of an id. */ int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end, @@ -1119,6 +1125,11 @@ EXPORT_SYMBOL(ida_simple_get); * ida_simple_remove - remove an allocated id. * @ida: the (initialized) ida. * @id: the id returned by ida_simple_get. + * + * Use to release an id allocated with ida_simple_get(). + * + * Compared to ida_remove() this function does its own locking, and should be + * used unless there are special requirements. */ void ida_simple_remove(struct ida *ida, unsigned int id) { From f2c19c2f380fcfdc85eb750016d73f7cd3e77573 Mon Sep 17 00:00:00 2001 From: Jerome Forissier Date: Mon, 12 Dec 2016 16:46:23 -0800 Subject: [PATCH 116/123] checkpatch: don't try to get maintained status when --no-tree is given Fixes the following warning: Use of uninitialized value $root in concatenation (.) or string at /path/to/checkpatch.pl line 764. Link: http://lkml.kernel.org/r/1476719709-16668-1-git-send-email-jerome.forissier@linaro.org Signed-off-by: Jerome Forissier Reviewed-by: Brian Norris Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 23f462f64a3f..7000adb5820c 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -761,7 +761,7 @@ sub seed_camelcase_file { sub is_maintained_obsolete { my ($filename) = @_; - return 0 if (!(-e "$root/scripts/get_maintainer.pl")); + return 0 if (!$tree || !(-e "$root/scripts/get_maintainer.pl")); my $status = `perl $root/scripts/get_maintainer.pl --status --nom --nol --nogit --nogit-fallback -f $filename 2>&1`; From 224236d9c3a65d23cd3f113042404cf5e09e393c Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 12 Dec 2016 16:46:26 -0800 Subject: [PATCH 117/123] scripts/checkpatch.pl: fix spelling s/preceeded/preceded/ Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 7000adb5820c..9f651bcde046 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -5925,7 +5925,7 @@ sub process { } if (!$has_break && $has_statement) { WARN("MISSING_BREAK", - "Possible switch case/default not preceeded by break or fallthrough comment\n" . $herecurr); + "Possible switch case/default not preceded by break or fallthrough comment\n" . $herecurr); } } From d6430f71805aa98d6e8fbc67e605922aefcf9ceb Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 12 Dec 2016 16:46:28 -0800 Subject: [PATCH 118/123] checkpatch: don't check .pl files, improve absolute path commit log test perl files (*.pl) are mostly inappropriate to check coding styles so exempt them from long line checks and various .[ch] file type tests. And as well, only scan absolute paths in the commit log, not in the patch. Link: http://lkml.kernel.org/r/85b101d50acafe6c0261d9f7df283c827da52c4a.1477340110.git.joe@perches.com Signed-off-by: Joe Perches Cc: Andy Whitcroft Cc: Heinrich Schuchardt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 9f651bcde046..0c20f035ed99 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2601,20 +2601,6 @@ sub process { $herecurr) if (!$emitted_corrupt++); } -# Check for absolute kernel paths. - if ($tree) { - while ($line =~ m{(?:^|\s)(/\S*)}g) { - my $file = $1; - - if ($file =~ m{^(.*?)(?::\d+)+:?$} && - check_absolute_file($1, $herecurr)) { - # - } else { - check_absolute_file($file, $herecurr); - } - } - } - # UTF-8 regex found at http://www.w3.org/International/questions/qa-forms-utf-8.en.php if (($realfile =~ /^$/ || $line =~ /^\+/) && $rawline !~ m/^$UTF8*$/) { @@ -2652,6 +2638,20 @@ sub process { "8-bit UTF-8 used in possible commit log\n" . $herecurr); } +# Check for absolute kernel paths in commit message + if ($tree && $in_commit_log) { + while ($line =~ m{(?:^|\s)(/\S*)}g) { + my $file = $1; + + if ($file =~ m{^(.*?)(?::\d+)+:?$} && + check_absolute_file($1, $herecurr)) { + # + } else { + check_absolute_file($file, $herecurr); + } + } + } + # Check for various typo / spelling mistakes if (defined($misspellings) && ($in_commit_log || $line =~ /^(?:\+|Subject:)/i)) { @@ -2805,7 +2805,7 @@ sub process { } # check we are in a valid source file if not then ignore this hunk - next if ($realfile !~ /\.(h|c|s|S|pl|sh|dtsi|dts)$/); + next if ($realfile !~ /\.(h|c|s|S|sh|dtsi|dts)$/); # line length limit (with some exclusions) # From 11ca40a0f8e3a788a987f14cb80d836a34d109ae Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 12 Dec 2016 16:46:31 -0800 Subject: [PATCH 119/123] checkpatch: avoid multiple line dereferences Code that puts a single dereferencing identifier on multiple lines like: struct_identifier->member[index]. member = ; is generally hard to follow. Prefer that dereferencing identifiers be single line. Link: http://lkml.kernel.org/r/e9c191ae3f41bedc8ffd5c0fbcc5a1cec1d1d2df.1478120869.git.joe@perches.com Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 0c20f035ed99..cf95d3ae0754 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3440,6 +3440,18 @@ sub process { #ignore lines not being added next if ($line =~ /^[^\+]/); +# check for dereferences that span multiple lines + if ($prevline =~ /^\+.*$Lval\s*(?:\.|->)\s*$/ && + $line =~ /^\+\s*(?!\#\s*(?!define\s+|if))\s*$Lval/) { + $prevline =~ /($Lval\s*(?:\.|->))\s*$/; + my $ref = $1; + $line =~ /^.\s*($Lval)/; + $ref .= $1; + $ref =~ s/\s//g; + WARN("MULTILINE_DEREFERENCE", + "Avoid multiple line dereference - prefer '$ref'\n" . $hereprev); + } + # check for declarations of signed or unsigned without int while ($line =~ m{\b($Declare)\s*(?!char\b|short\b|int\b|long\b)\s*($Ident)?\s*[=,;\[\)\(]}g) { my $type = $1; From fd39f904b17686df0f409e880af1ccfa2fe4ae1a Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Mon, 12 Dec 2016 16:46:34 -0800 Subject: [PATCH 120/123] checkpatch: don't check c99 types like uint8_t under tools Tools contains user space code so uintX_t types are just fine. Link: http://lkml.kernel.org/r/1479286379-853-1-git-send-email-tomas.winkler@intel.com Signed-off-by: Tomas Winkler Acked-by: Joe Perches Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index cf95d3ae0754..7997300d5d30 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -5560,8 +5560,9 @@ sub process { "Using weak declarations can have unintended link defects\n" . $herecurr); } -# check for c99 types like uint8_t used outside of uapi/ +# check for c99 types like uint8_t used outside of uapi/ and tools/ if ($realfile !~ m@\binclude/uapi/@ && + $realfile !~ m@\btools/@ && $line =~ /\b($Declare)\s*$Ident\s*[=;,\[]/) { my $type = $1; if ($type =~ /\b($typeC99Typedefs)\b/) { From a82603a8325f89ec53d4c3e249a2e831b747a69d Mon Sep 17 00:00:00 2001 From: Andrew Jeffery Date: Mon, 12 Dec 2016 16:46:37 -0800 Subject: [PATCH 121/123] checkpatch: don't emit unified-diff error for rename-only patches I generated a patch with `git format-patch` which checkpatch thinks is invalid: $ ./scripts/checkpatch.pl lpc-dt/0006-mfd-dt-Move-syscon-bindings-to-syscon-subdirectory.patch WARNING: added, moved or deleted file(s), does MAINTAINERS need updating? Documentation/devicetree/bindings/mfd/{ => syscon}/aspeed-scu.txt | 0 ERROR: Does not appear to be a unified-diff format patch total: 1 errors, 1 warnings, 0 lines checked NOTE: For some of the reported defects, checkpatch may be able to mechanically convert to the typical style using --fix or --fix-inplace. lpc-dt/0006-mfd-dt-Move-syscon-bindings-to-syscon-subdirectory.patch has style problems, please review. NOTE: If any of the errors are false positives, please report them to the maintainer, see CHECKPATCH in MAINTAINERS. The patch in question was all renames with no edits, giving 100% similarity and thus no diff markers. Set '$is_patch = 1;' in the add/remove/rename detection to avoid generating spurious warnings. Link: http://lkml.kernel.org/r/20161205232224.22685-1-andrew@aj.id.au Signed-off-by: Andrew Jeffery Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 7997300d5d30..ac5656ef2aec 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2589,6 +2589,7 @@ sub process { $line =~ /^rename (?:from|to) [\w\/\.\-]+\s*$/ || ($line =~ /\{\s*([\w\/\.\-]*)\s*\=\>\s*([\w\/\.\-]*)\s*\}/ && (defined($1) || defined($2))))) { + $is_patch = 1; $reported_maintainer_file = 1; WARN("FILE_PATH_CHANGES", "added, moved or deleted file(s), does MAINTAINERS need updating?\n" . $herecurr); From 30f74aa0854c2d5a331b507b14fe421ba4980511 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Mon, 12 Dec 2016 16:46:40 -0800 Subject: [PATCH 122/123] binfmt_elf: use vmalloc() for allocation of vma_filesz We have observed page allocations failures of order 4 during core dump while trying to allocate vma_filesz. This results in a useless core file of size 0. To improve reliability use vmalloc(). Note that the vmalloc() allocation is bounded by sysctl_max_map_count, which is 65,530 by default. So with a 4k page size, and 8 bytes per seg, this is a max of 128 pages or an order 7 allocation. Other parts of the core dump path, such as fill_files_note() are already using vmalloc() for presumably similar reasons. Link: http://lkml.kernel.org/r/1479745791-17611-1-git-send-email-jbaron@akamai.com Signed-off-by: Jason Baron Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 2472af2798c7..e6c1bd443806 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -2204,7 +2204,9 @@ static int elf_core_dump(struct coredump_params *cprm) dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - vma_filesz = kmalloc_array(segs - 1, sizeof(*vma_filesz), GFP_KERNEL); + if (segs - 1 > ULONG_MAX / sizeof(*vma_filesz)) + goto end_coredump; + vma_filesz = vmalloc((segs - 1) * sizeof(*vma_filesz)); if (!vma_filesz) goto end_coredump; @@ -2311,7 +2313,7 @@ end_coredump: cleanup: free_note_info(&info); kfree(shdr4extnum); - kfree(vma_filesz); + vfree(vma_filesz); kfree(phdr4note); kfree(elf); out: From 39a0e975c37dee93fa1b8ea5f7eacd1c4c8a586e Mon Sep 17 00:00:00 2001 From: Jungseung Lee Date: Mon, 12 Dec 2016 16:46:43 -0800 Subject: [PATCH 123/123] init: reduce rootwait polling interval time to 5ms For several devices, the rootwait time is sensitive because it directly affects booting time. The polling interval of rootwait is currently 100ms. To save unnessesary waiting time, reduce the polling interval to 5 ms. [akpm@linux-foundation.org: remove used-once #define] Link: http://lkml.kernel.org/r/20161207060743.1728-1-js07.lee@samsung.com Signed-off-by: Jungseung Lee Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/do_mounts.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/do_mounts.c b/init/do_mounts.c index dea5de95c2dd..c2de5104aad2 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -588,7 +588,7 @@ void __init prepare_namespace(void) saved_root_name); while (driver_probe_done() != 0 || (ROOT_DEV = name_to_dev_t(saved_root_name)) == 0) - msleep(100); + msleep(5); async_synchronize_full(); }