From 30dd3478c3cd7d01cc5afc4952e885ba4eefb730 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Thu, 14 Mar 2024 10:17:13 +0800 Subject: [PATCH 01/62] ocfs2: correctly use ocfs2_find_next_zero_bit() If no bits are zero, ocfs2_find_next_zero_bit() will return max size, so check the return value with -1 is meaningless. Correct this usage and cleanup the code. Link: https://lkml.kernel.org/r/20240314021713.240796-1-joseph.qi@linux.alibaba.com Signed-off-by: Joseph Qi Reviewed-by: Heming Zhao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/localalloc.c | 19 ++++++------------- fs/ocfs2/reservations.c | 2 +- fs/ocfs2/suballoc.c | 6 ++---- 3 files changed, 9 insertions(+), 18 deletions(-) diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index c803c10dd97e..33aeaaa056d7 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -863,14 +863,8 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, numfound = bitoff = startoff = 0; left = le32_to_cpu(alloc->id1.bitmap1.i_total); - while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) != -1) { - if (bitoff == left) { - /* mlog(0, "bitoff (%d) == left", bitoff); */ - break; - } - /* mlog(0, "Found a zero: bitoff = %d, startoff = %d, " - "numfound = %d\n", bitoff, startoff, numfound);*/ - + while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) < + left) { /* Ok, we found a zero bit... is it contig. or do we * start over?*/ if (bitoff == startoff) { @@ -976,9 +970,9 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, start = count = 0; left = le32_to_cpu(alloc->id1.bitmap1.i_total); - while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start)) - != -1) { - if ((bit_off < left) && (bit_off == start)) { + while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start)) < + left) { + if (bit_off == start) { count++; start++; continue; @@ -1002,8 +996,7 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, goto bail; } } - if (bit_off >= left) - break; + count = 1; start = bit_off + 1; } diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c index a9d1296d736d..1fe61974d9f0 100644 --- a/fs/ocfs2/reservations.c +++ b/fs/ocfs2/reservations.c @@ -414,7 +414,7 @@ static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap, start = search_start; while ((offset = ocfs2_find_next_zero_bit(bitmap, resmap->m_bitmap_len, - start)) != -1) { + start)) < resmap->m_bitmap_len) { /* Search reached end of the region */ if (offset >= (search_start + search_len)) break; diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 166c8918c825..961998415308 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1290,10 +1290,8 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, found = start = best_offset = best_size = 0; bitmap = bg->bg_bitmap; - while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) { - if (offset == total_bits) - break; - + while ((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) < + total_bits) { if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) { /* We found a zero, but we can't use it as it * hasn't been put to disk yet! */ From c9abe099865b2d0df66346657b76922d8efd43e4 Mon Sep 17 00:00:00 2001 From: Su Yue Date: Mon, 18 Mar 2024 19:56:09 +0800 Subject: [PATCH 02/62] ocfs2: update inode ctime in ocfs2_fileattr_set inode ctime should be updated if ocfs2_fileattr_set is called. Link: https://lkml.kernel.org/r/20240318115609.3194-1-l@damenly.org Signed-off-by: Su Yue Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/ioctl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index b1550ba73f96..71beef7f8a60 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -125,6 +125,7 @@ int ocfs2_fileattr_set(struct mnt_idmap *idmap, ocfs2_inode->ip_attr = flags; ocfs2_set_inode_flags(inode); + inode_set_ctime_current(inode); status = ocfs2_mark_inode_dirty(handle, inode, bh); if (status < 0) From 5ef6dc08cfde240b8c748733759185646e654570 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 13 Mar 2024 22:19:56 +0100 Subject: [PATCH 03/62] lib/build_OID_registry: don't mention the full path of the script in output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This change strips the full path of the script generating lib/oid_registry_data.c to just lib/build_OID_registry. The motivation for this change is Yocto emitting a build warning File /usr/src/debug/linux-lxatac/6.7-r0/lib/oid_registry_data.c in package linux-lxatac-src contains reference to TMPDIR [buildpaths] So this change brings us one step closer to make the build result reproducible independent of the build path. Link: https://lkml.kernel.org/r/20240313211957.884561-2-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König Cc: Masahiro Yamada Reviewed-by: Nicolas Schier Signed-off-by: Andrew Morton --- lib/build_OID_registry | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/build_OID_registry b/lib/build_OID_registry index d7fc32ea8ac2..56d8bafeb848 100755 --- a/lib/build_OID_registry +++ b/lib/build_OID_registry @@ -8,6 +8,7 @@ # use strict; +use Cwd qw(abs_path); my @names = (); my @oids = (); @@ -17,6 +18,8 @@ if ($#ARGV != 1) { exit(2); } +my $abs_srctree = abs_path($ENV{'srctree'}); + # # Open the file to read from # @@ -35,7 +38,7 @@ close IN_FILE || die; # open C_FILE, ">$ARGV[1]" or die; print C_FILE "/*\n"; -print C_FILE " * Automatically generated by ", $0, ". Do not edit\n"; +print C_FILE " * Automatically generated by ", $0 =~ s#^\Q$abs_srctree/\E##r, ". Do not edit\n"; print C_FILE " */\n"; # From 212f863fa8811c780abacc1d0404c573fdc0a2de Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 20 Mar 2024 11:19:52 +0100 Subject: [PATCH 04/62] bootconfig: do not put quotes on cmdline items unless necessary When trying to migrate to using bootconfig to embed the kernel's and PID1's command line with the kernel image itself, and so allowing changing that without modifying the bootloader, I noticed that /proc/cmdline changed from e.g. console=ttymxc0,115200n8 cma=128M quiet -- --log-level=notice to console="ttymxc0,115200n8" cma="128M" quiet -- --log-level="notice" The kernel parameters are parsed just fine, and the quotes are indeed stripped from the actual argv[] given to PID1. However, the quoting doesn't really serve any purpose and looks excessive, and might confuse some (naive) userspace tool trying to parse /proc/cmdline. So do not quote the value unless it contains whitespace. Link: https://lkml.kernel.org/r/20240320101952.62135-1-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Cc: Masami Hiramatsu Signed-off-by: Andrew Morton --- init/main.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/init/main.c b/init/main.c index 881f6230ee59..6b4b656cef1d 100644 --- a/init/main.c +++ b/init/main.c @@ -327,7 +327,7 @@ static int __init xbc_snprint_cmdline(char *buf, size_t size, { struct xbc_node *knode, *vnode; char *end = buf + size; - const char *val; + const char *val, *q; int ret; xbc_node_for_each_key_value(root, knode, val) { @@ -345,8 +345,14 @@ static int __init xbc_snprint_cmdline(char *buf, size_t size, continue; } xbc_array_for_each_value(vnode, val) { - ret = snprintf(buf, rest(buf, end), "%s=\"%s\" ", - xbc_namebuf, val); + /* + * For prettier and more readable /proc/cmdline, only + * quote the value when necessary, i.e. when it contains + * whitespace. + */ + q = strpbrk(val, " \t\r\n") ? "\"" : ""; + ret = snprintf(buf, rest(buf, end), "%s=%s%s%s ", + xbc_namebuf, q, val, q); if (ret < 0) return ret; buf += ret; From 3429055f0451cd3a281f8ed6691335ead626b136 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Wed, 20 Mar 2024 11:18:49 +0100 Subject: [PATCH 05/62] mm: kmsan: implement kmsan_memmove() Provide a hook that can be used by custom memcpy implementations to tell KMSAN that the metadata needs to be copied. Without that, false positive reports are possible in the cases where KMSAN fails to intercept memory initialization. Link: https://lore.kernel.org/all/3b7dbd88-0861-4638-b2d2-911c97a4cadf@I-love.SAKURA.ne.jp/ Link: https://lkml.kernel.org/r/20240320101851.2589698-1-glider@google.com Signed-off-by: Alexander Potapenko Suggested-by: Tetsuo Handa Reviewed-by: Marco Elver Cc: Dmitry Vyukov Cc: Linus Torvalds Cc: Tetsuo Handa Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/kmsan-checks.h | 15 +++++++++++++++ mm/kmsan/hooks.c | 11 +++++++++++ 2 files changed, 26 insertions(+) diff --git a/include/linux/kmsan-checks.h b/include/linux/kmsan-checks.h index c4cae333deec..e1082dc40abc 100644 --- a/include/linux/kmsan-checks.h +++ b/include/linux/kmsan-checks.h @@ -61,6 +61,17 @@ void kmsan_check_memory(const void *address, size_t size); void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy, size_t left); +/** + * kmsan_memmove() - Notify KMSAN about a data copy within kernel. + * @to: destination address in the kernel. + * @from: source address in the kernel. + * @size: number of bytes to copy. + * + * Invoked after non-instrumented version (e.g. implemented using assembly + * code) of memmove()/memcpy() is called, in order to copy KMSAN's metadata. + */ +void kmsan_memmove(void *to, const void *from, size_t to_copy); + #else static inline void kmsan_poison_memory(const void *address, size_t size, @@ -78,6 +89,10 @@ static inline void kmsan_copy_to_user(void __user *to, const void *from, { } +static inline void kmsan_memmove(void *to, const void *from, size_t to_copy) +{ +} + #endif #endif /* _LINUX_KMSAN_CHECKS_H */ diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 0b09daa188ef..22e8657800ef 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -285,6 +285,17 @@ void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy, } EXPORT_SYMBOL(kmsan_copy_to_user); +void kmsan_memmove(void *to, const void *from, size_t size) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + kmsan_enter_runtime(); + kmsan_internal_memmove_metadata(to, (void *)from, size); + kmsan_leave_runtime(); +} +EXPORT_SYMBOL(kmsan_memmove); + /* Helper function to check an URB. */ void kmsan_handle_urb(const struct urb *urb, bool is_out) { From 922621a6828430ea3119b869336157d253489334 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Wed, 20 Mar 2024 11:18:50 +0100 Subject: [PATCH 06/62] instrumented.h: add instrument_memcpy_before, instrument_memcpy_after Bug detection tools based on compiler instrumentation may miss memory accesses in custom memcpy implementations (such as copy_mc_to_kernel). Provide instrumentation hooks that tell KASAN, KCSAN, and KMSAN about such accesses. Link: https://lore.kernel.org/all/3b7dbd88-0861-4638-b2d2-911c97a4cadf@I-love.SAKURA.ne.jp/ Link: https://lkml.kernel.org/r/20240320101851.2589698-2-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Dmitry Vyukov Cc: Tetsuo Handa Cc: Linus Torvalds Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/instrumented.h | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/include/linux/instrumented.h b/include/linux/instrumented.h index 1b608e00290a..711a1f0d1a73 100644 --- a/include/linux/instrumented.h +++ b/include/linux/instrumented.h @@ -147,6 +147,41 @@ instrument_copy_from_user_after(const void *to, const void __user *from, kmsan_unpoison_memory(to, n - left); } +/** + * instrument_memcpy_before - add instrumentation before non-instrumented memcpy + * @to: destination address + * @from: source address + * @n: number of bytes to copy + * + * Instrument memory accesses that happen in custom memcpy implementations. The + * instrumentation should be inserted before the memcpy call. + */ +static __always_inline void instrument_memcpy_before(void *to, const void *from, + unsigned long n) +{ + kasan_check_write(to, n); + kasan_check_read(from, n); + kcsan_check_write(to, n); + kcsan_check_read(from, n); +} + +/** + * instrument_memcpy_after - add instrumentation after non-instrumented memcpy + * @to: destination address + * @from: source address + * @n: number of bytes to copy + * @left: number of bytes not copied (if known) + * + * Instrument memory accesses that happen in custom memcpy implementations. The + * instrumentation should be inserted after the memcpy call. + */ +static __always_inline void instrument_memcpy_after(void *to, const void *from, + unsigned long n, + unsigned long left) +{ + kmsan_memmove(to, from, n - left); +} + /** * instrument_get_user() - add instrumentation to get_user()-like macros * @to: destination variable, may not be address-taken From 61b258b0d2f60858888e248da9744d63e0fa6856 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Wed, 20 Mar 2024 11:18:51 +0100 Subject: [PATCH 07/62] x86: call instrumentation hooks from copy_mc.c Memory accesses in copy_mc_to_kernel() and copy_mc_to_user() are performed by assembly routines and are invisible to KASAN, KCSAN, and KMSAN. Add hooks from instrumentation.h to tell the tools these functions have memcpy/copy_from_user semantics. The call to copy_mc_fragile() in copy_mc_fragile_handle_tail() is left intact, because the latter is only called from the assembly implementation of copy_mc_fragile(), so the memory accesses in it are covered by the instrumentation in copy_mc_to_kernel() and copy_mc_to_user(). Link: https://lore.kernel.org/all/3b7dbd88-0861-4638-b2d2-911c97a4cadf@I-love.SAKURA.ne.jp/ Link: https://lkml.kernel.org/r/20240320101851.2589698-3-glider@google.com Signed-off-by: Alexander Potapenko Suggested-by: Linus Torvalds Cc: Dmitry Vyukov Cc: Marco Elver Cc: Tetsuo Handa Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- arch/x86/lib/copy_mc.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/arch/x86/lib/copy_mc.c b/arch/x86/lib/copy_mc.c index 6e8b7e600def..97e88e58567b 100644 --- a/arch/x86/lib/copy_mc.c +++ b/arch/x86/lib/copy_mc.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -61,10 +62,20 @@ unsigned long copy_mc_enhanced_fast_string(void *dst, const void *src, unsigned */ unsigned long __must_check copy_mc_to_kernel(void *dst, const void *src, unsigned len) { - if (copy_mc_fragile_enabled) - return copy_mc_fragile(dst, src, len); - if (static_cpu_has(X86_FEATURE_ERMS)) - return copy_mc_enhanced_fast_string(dst, src, len); + unsigned long ret; + + if (copy_mc_fragile_enabled) { + instrument_memcpy_before(dst, src, len); + ret = copy_mc_fragile(dst, src, len); + instrument_memcpy_after(dst, src, len, ret); + return ret; + } + if (static_cpu_has(X86_FEATURE_ERMS)) { + instrument_memcpy_before(dst, src, len); + ret = copy_mc_enhanced_fast_string(dst, src, len); + instrument_memcpy_after(dst, src, len, ret); + return ret; + } memcpy(dst, src, len); return 0; } @@ -75,6 +86,7 @@ unsigned long __must_check copy_mc_to_user(void __user *dst, const void *src, un unsigned long ret; if (copy_mc_fragile_enabled) { + instrument_copy_to_user(dst, src, len); __uaccess_begin(); ret = copy_mc_fragile((__force void *)dst, src, len); __uaccess_end(); @@ -82,6 +94,7 @@ unsigned long __must_check copy_mc_to_user(void __user *dst, const void *src, un } if (static_cpu_has(X86_FEATURE_ERMS)) { + instrument_copy_to_user(dst, src, len); __uaccess_begin(); ret = copy_mc_enhanced_fast_string((__force void *)dst, src, len); __uaccess_end(); From 4d9784c00a15631b68a5d95be907aa308f269551 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 22 Mar 2024 15:37:24 +0800 Subject: [PATCH 08/62] fs: add kernel-doc comments to fat_parse_long() Add kernel-doc style comments with complete parameter descriptions for fat_parse_long(). Link: https://lkml.kernel.org/r/20240322073724.102332-1-yang.lee@linux.alibaba.com Signed-off-by: Yang Li Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton --- fs/fat/dir.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 00235b8a1823..acbec5bdd521 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -269,6 +269,18 @@ enum { PARSE_INVALID = 1, PARSE_NOT_LONGNAME, PARSE_EOF, }; /** * fat_parse_long - Parse extended directory entry. * + * @dir: Pointer to the inode that represents the directory. + * @pos: On input, contains the starting position to read from. + * On output, updated with the new position. + * @bh: Pointer to the buffer head that may be used for reading directory + * entries. May be updated. + * @de: On input, points to the current directory entry. + * On output, points to the next directory entry. + * @unicode: Pointer to a buffer where the parsed Unicode long filename will be + * stored. + * @nr_slots: Pointer to a variable that will store the number of longname + * slots found. + * * This function returns zero on success, negative value on error, or one of * the following: * From f9899c028151468d8c4af0bcbb3d5e87619b0973 Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Fri, 26 Jan 2024 14:44:51 +0800 Subject: [PATCH 09/62] NUMA: early use of cpu_to_node() returns 0 instead of the correct node id During the kernel booting, the generic cpu_to_node() is called too early in arm64, powerpc and riscv when CONFIG_NUMA is enabled. There are at least four places in the common code where the generic cpu_to_node() is called before it is initialized: 1.) early_trace_init() in kernel/trace/trace.c 2.) sched_init() in kernel/sched/core.c 3.) init_sched_fair_class() in kernel/sched/fair.c 4.) workqueue_init_early() in kernel/workqueue.c This will harm performance since there is an increase in off node accesses. In order to fix the bug, the patch introduces early_numa_node_init() which is called after smp_prepare_boot_cpu() in start_kernel. early_numa_node_init will initialize the "numa_node" as soon as the early_cpu_to_node() is ready, before the cpu_to_node() is called at the first time. Link: https://lkml.kernel.org/r/20240126064451.5465-1-shijie@os.amperecomputing.com Signed-off-by: Huang Shijie Acked-by: Palmer Dabbelt [RISC-V] Cc: Albert Ou Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Greg Kroah-Hartman Cc: Huacai Chen Cc: Ingo Molnar Cc: Jakub Kicinski Cc: Jiaxun Yang Cc: Josh Poimboeuf Cc: Mark Rutland Cc: Masami Hiramatsu Cc: Michael Ellerman Cc: Michael Kelley (LINUX) Cc: "Mike Rapoport (IBM)" Cc: Nick Desaulniers Cc: Paul Walmsley Cc: Rafael J. Wysocki Cc: Thomas Gleixner Cc: Valentin Schneider Cc: Vlastimil Babka Cc: Will Deacon Cc: Yury Norov Signed-off-by: Andrew Morton --- init/main.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/init/main.c b/init/main.c index 6b4b656cef1d..0aecd2839c1f 100644 --- a/init/main.c +++ b/init/main.c @@ -882,6 +882,19 @@ static void __init print_unknown_bootoptions(void) memblock_free(unknown_options, len); } +static void __init early_numa_node_init(void) +{ +#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID +#ifndef cpu_to_node + int cpu; + + /* The early_cpu_to_node() should be ready here. */ + for_each_possible_cpu(cpu) + set_cpu_numa_node(cpu, early_cpu_to_node(cpu)); +#endif +#endif +} + asmlinkage __visible __init __no_sanitize_address __noreturn __no_stack_protector void start_kernel(void) { @@ -912,6 +925,7 @@ void start_kernel(void) setup_nr_cpu_ids(); setup_per_cpu_areas(); smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ + early_numa_node_init(); boot_cpu_hotplug_init(); pr_notice("Kernel command line: %s\n", saved_command_line); From 6b839b3b76cf17296ebd4a893841f32cae08229c Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Mon, 5 Feb 2024 09:26:30 -0800 Subject: [PATCH 10/62] regset: use kvzalloc() for regset_get_alloc() While browsing through ChromeOS crash reports, I found one with an allocation failure that looked like this: chrome: page allocation failure: order:7, mode:0x40dc0(GFP_KERNEL|__GFP_COMP|__GFP_ZERO), nodemask=(null),cpuset=urgent,mems_allowed=0 CPU: 7 PID: 3295 Comm: chrome Not tainted 5.15.133-20574-g8044615ac35c #1 (HASH:1162 1) Hardware name: Google Lazor (rev3 - 8) with KB Backlight (DT) Call trace: ... warn_alloc+0x104/0x174 __alloc_pages+0x5f0/0x6e4 kmalloc_order+0x44/0x98 kmalloc_order_trace+0x34/0x124 __kmalloc+0x228/0x36c __regset_get+0x68/0xcc regset_get_alloc+0x1c/0x28 elf_core_dump+0x3d8/0xd8c do_coredump+0xeb8/0x1378 get_signal+0x14c/0x804 ... An order 7 allocation is (1 << 7) contiguous pages, or 512K. It's not a surprise that this allocation failed on a system that's been running for a while. More digging showed that it was fairly easy to see the order 7 allocation by just sending a SIGQUIT to chrome (or other processes) to generate a core dump. The actual amount being allocated was 279,584 bytes and it was for "core_note_type" NT_ARM_SVE. There was quite a bit of discussion [1] on the mailing lists in response to my v1 patch attempting to switch to vmalloc. The overall conclusion was that we could likely reduce the 279,584 byte allocation by quite a bit and Mark Brown has sent a patch to that effect [2]. However even with the 279,584 byte allocation gone there are still 65,552 byte allocations. These are just barely more than the 65,536 bytes and thus would require an order 5 allocation. An order 5 allocation is still something to avoid unless necessary and nothing needs the memory here to be contiguous. Change the allocation to kvzalloc() which should still be efficient for small allocations but doesn't force the memory subsystem to work hard (and maybe fail) at getting a large contiguous chunk. [1] https://lore.kernel.org/r/20240201171159.1.Id9ad163b60d21c9e56c2d686b0cc9083a8ba7924@changeid [2] https://lore.kernel.org/r/20240203-arm64-sve-ptrace-regset-size-v1-1-2c3ba1386b9e@kernel.org Link: https://lkml.kernel.org/r/20240205092626.v2.1.Id9ad163b60d21c9e56c2d686b0cc9083a8ba7924@changeid Signed-off-by: Douglas Anderson Reviewed-by: Catalin Marinas Cc: Al Viro Cc: Christian Brauner Cc: Dave Martin Cc: Eric Biederman Cc: Jan Kara Cc: Kees Cook Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Oleg Nesterov Cc: Will Deacon Signed-off-by: Andrew Morton --- fs/binfmt_elf.c | 2 +- kernel/regset.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 5397b552fbeb..ac178ad38823 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1928,7 +1928,7 @@ static void free_note_info(struct elf_note_info *info) threads = t->next; WARN_ON(t->notes[0].data && t->notes[0].data != &t->prstatus); for (i = 1; i < info->thread_notes; ++i) - kfree(t->notes[i].data); + kvfree(t->notes[i].data); kfree(t); } kfree(info->psinfo.data); diff --git a/kernel/regset.c b/kernel/regset.c index 586823786f39..b2871fa68b2a 100644 --- a/kernel/regset.c +++ b/kernel/regset.c @@ -16,14 +16,14 @@ static int __regset_get(struct task_struct *target, if (size > regset->n * regset->size) size = regset->n * regset->size; if (!p) { - to_free = p = kzalloc(size, GFP_KERNEL); + to_free = p = kvzalloc(size, GFP_KERNEL); if (!p) return -ENOMEM; } res = regset->regset_get(target, regset, (struct membuf){.p = p, .left = size}); if (res < 0) { - kfree(to_free); + kvfree(to_free); return res; } *data = p; @@ -71,6 +71,6 @@ int copy_regset_to_user(struct task_struct *target, ret = regset_get_alloc(target, regset, size, &buf); if (ret > 0) ret = copy_to_user(data, buf, ret) ? -EFAULT : 0; - kfree(buf); + kvfree(buf); return ret; } From 4eb7b93e03101fd3f35e69affe566e4b1e3e3dca Mon Sep 17 00:00:00 2001 From: Heming Zhao Date: Thu, 28 Mar 2024 20:52:00 +0800 Subject: [PATCH 11/62] ocfs2: improve write IO performance when fragmentation is high The group_search function ocfs2_cluster_group_search() should bypass groups with insufficient space to avoid unnecessary searches. This patch is particularly useful when ocfs2 is handling huge number small files, and volume fragmentation is very high. In this case, ocfs2 is busy with looking up available la window from //global_bitmap. This patch introduces a new member in the Group Description (gd) struct called 'bg_contig_free_bits', representing the max contigous free bits in this gd. When ocfs2 allocates a new la window from //global_bitmap, 'bg_contig_free_bits' helps expedite the search process. Let's image below path. 1. la state (->local_alloc_state) is set THROTTLED or DISABLED. 2. when user delete a large file and trigger ocfs2_local_alloc_seen_free_bits set osb->local_alloc_state unconditionally. 3. a write IOs thread run and trigger the worst performance path ``` ocfs2_reserve_clusters_with_limit ocfs2_reserve_local_alloc_bits ocfs2_local_alloc_slide_window //[1] + ocfs2_local_alloc_reserve_for_window //[2] + ocfs2_local_alloc_new_window //[3] ocfs2_recalc_la_window ``` [1]: will be called when la window bits used up. [2]: under la state is ENABLED, and this func only check global_bitmap free bits, it will succeed in general. [3]: will use the default la window size to search clusters then fail. ocfs2_recalc_la_window attempts other la window sizes. the timing complexity is O(n^4), resulting in a significant time cost for scanning global bitmap. This leads to a dramatic slowdown in write I/Os (e.g., user space 'dd'). i.e. an ocfs2 partition size: 1.45TB, cluster size: 4KB, la window default size: 106MB. The partition is fragmentation by creating & deleting huge mount of small files. before this patch, the timing of [3] should be (the number got from real world): - la window size change order (size: MB): 106, 53, 26.5, 13, 6.5, 3.25, 1.6, 0.8 only 0.8MB succeed, 0.8MB also triggers la window to disable. ocfs2_local_alloc_new_window retries 8 times, first 7 times totally runs in worst case. - group chain number: 242 ocfs2_claim_suballoc_bits calls for-loop 242 times - each chain has 49 block group ocfs2_search_chain calls while-loop 49 times - each bg has 32256 blocks ocfs2_block_group_find_clear_bits calls while-loop for 32256 bits. for ocfs2_find_next_zero_bit uses ffz() to find zero bit, let's use (32256/64) (this is not worst value) for timing calucation. the loop times: 7*242*49*(32256/64) = 41835024 (~42 million times) In the worst case, user space writes 1MB data will trigger 42M scanning times. under this patch, the timing is '7*242*49 = 83006', reduced by three orders of magnitude. Link: https://lkml.kernel.org/r/20240328125203.20892-2-heming.zhao@suse.com Signed-off-by: Heming Zhao Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/move_extents.c | 2 +- fs/ocfs2/ocfs2_fs.h | 3 +- fs/ocfs2/resize.c | 8 ++++ fs/ocfs2/suballoc.c | 100 ++++++++++++++++++++++++++++++++++++---- fs/ocfs2/suballoc.h | 6 ++- 5 files changed, 108 insertions(+), 11 deletions(-) diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 1f9ed117e78b..f9d6a4f9ca92 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -685,7 +685,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, } ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh, - goal_bit, len); + goal_bit, len, 0, 0); if (ret) { ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len, le16_to_cpu(gd->bg_chain)); diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 7aebdbf5cc0a..c93689b568fe 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -883,7 +883,8 @@ struct ocfs2_group_desc __le16 bg_free_bits_count; /* Free bits count */ __le16 bg_chain; /* What chain I am in. */ /*10*/ __le32 bg_generation; - __le32 bg_reserved1; + __le16 bg_contig_free_bits; /* max contig free bits length */ + __le16 bg_reserved1; __le64 bg_next_group; /* Next group in my list, in blocks */ /*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index d65d43c61857..c4a4016d3866 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -91,6 +91,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, u16 cl_bpc = le16_to_cpu(cl->cl_bpc); u16 cl_cpg = le16_to_cpu(cl->cl_cpg); u16 old_bg_clusters; + u16 contig_bits; + __le16 old_bg_contig_free_bits; trace_ocfs2_update_last_group_and_inode(new_clusters, first_new_cluster); @@ -122,6 +124,11 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, le16_add_cpu(&group->bg_free_bits_count, -1 * backups); } + contig_bits = ocfs2_find_max_contig_free_bits(group->bg_bitmap, + le16_to_cpu(group->bg_bits), 0); + old_bg_contig_free_bits = group->bg_contig_free_bits; + group->bg_contig_free_bits = cpu_to_le16(contig_bits); + ocfs2_journal_dirty(handle, group_bh); /* update the inode accordingly. */ @@ -160,6 +167,7 @@ out_rollback: le16_add_cpu(&group->bg_free_bits_count, backups); le16_add_cpu(&group->bg_bits, -1 * num_bits); le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits); + group->bg_contig_free_bits = old_bg_contig_free_bits; } out: if (ret) diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 961998415308..7e30e9d79890 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -50,6 +50,10 @@ struct ocfs2_suballoc_result { u64 sr_blkno; /* The first allocated block */ unsigned int sr_bit_offset; /* The bit in the bg */ unsigned int sr_bits; /* How many bits we claimed */ + unsigned int sr_max_contig_bits; /* The length for contiguous + * free bits, only available + * for cluster group + */ }; static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res) @@ -1272,6 +1276,26 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, return ret; } +u16 ocfs2_find_max_contig_free_bits(void *bitmap, + u16 total_bits, u16 start) +{ + u16 offset, free_bits; + u16 contig_bits = 0; + + while (start < total_bits) { + offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start); + if (offset == total_bits) + break; + + start = ocfs2_find_next_bit(bitmap, total_bits, offset); + free_bits = start - offset; + if (contig_bits < free_bits) + contig_bits = free_bits; + } + + return contig_bits; +} + static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, struct buffer_head *bg_bh, unsigned int bits_wanted, @@ -1280,6 +1304,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, { void *bitmap; u16 best_offset, best_size; + u16 prev_best_size = 0; int offset, start, found, status = 0; struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; @@ -1306,6 +1331,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, /* got a zero after some ones */ found = 1; start = offset + 1; + prev_best_size = best_size; } if (found > best_size) { best_size = found; @@ -1318,6 +1344,8 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, } } + /* best_size will be allocated, we save prev_best_size */ + res->sr_max_contig_bits = prev_best_size; if (best_size) { res->sr_bit_offset = best_offset; res->sr_bits = best_size; @@ -1335,11 +1363,15 @@ int ocfs2_block_group_set_bits(handle_t *handle, struct ocfs2_group_desc *bg, struct buffer_head *group_bh, unsigned int bit_off, - unsigned int num_bits) + unsigned int num_bits, + unsigned int max_contig_bits, + int fastpath) { int status; void *bitmap = bg->bg_bitmap; int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; + unsigned int start = bit_off + num_bits; + u16 contig_bits; /* All callers get the descriptor via * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ @@ -1371,6 +1403,28 @@ int ocfs2_block_group_set_bits(handle_t *handle, while(num_bits--) ocfs2_set_bit(bit_off++, bitmap); + /* + * this is optimize path, caller set old contig value + * in max_contig_bits to bypass finding action. + */ + if (fastpath) { + bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits); + } else if (ocfs2_is_cluster_bitmap(alloc_inode)) { + /* + * Usually, the block group bitmap allocates only 1 bit + * at a time, while the cluster group allocates n bits + * each time. Therefore, we only save the contig bits for + * the cluster group. + */ + contig_bits = ocfs2_find_max_contig_free_bits(bitmap, + le16_to_cpu(bg->bg_bits), start); + if (contig_bits > max_contig_bits) + max_contig_bits = contig_bits; + bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits); + } else { + bg->bg_contig_free_bits = 0; + } + ocfs2_journal_dirty(handle, group_bh); bail: @@ -1484,7 +1538,12 @@ static int ocfs2_cluster_group_search(struct inode *inode, BUG_ON(!ocfs2_is_cluster_bitmap(inode)); - if (gd->bg_free_bits_count) { + if (le16_to_cpu(gd->bg_contig_free_bits) && + le16_to_cpu(gd->bg_contig_free_bits) < bits_wanted) + return -ENOSPC; + + /* ->bg_contig_free_bits may un-initialized, so compare again */ + if (le16_to_cpu(gd->bg_free_bits_count) >= bits_wanted) { max_bits = le16_to_cpu(gd->bg_bits); /* Tail groups in cluster bitmaps which aren't cpg @@ -1553,7 +1612,7 @@ static int ocfs2_block_group_search(struct inode *inode, BUG_ON(min_bits != 1); BUG_ON(ocfs2_is_cluster_bitmap(inode)); - if (bg->bg_free_bits_count) { + if (le16_to_cpu(bg->bg_free_bits_count) >= bits_wanted) { ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), group_bh, bits_wanted, le16_to_cpu(bg->bg_bits), @@ -1713,7 +1772,8 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, } ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, - res->sr_bit_offset, res->sr_bits); + res->sr_bit_offset, res->sr_bits, + res->sr_max_contig_bits, 0); if (ret < 0) { ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh, res->sr_bits, @@ -1847,7 +1907,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, bg, group_bh, res->sr_bit_offset, - res->sr_bits); + res->sr_bits, + res->sr_max_contig_bits, + 0); if (status < 0) { ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh, res->sr_bits, chain); @@ -2161,7 +2223,9 @@ int ocfs2_claim_new_inode_at_loc(handle_t *handle, bg, bg_bh, res->sr_bit_offset, - res->sr_bits); + res->sr_bits, + res->sr_max_contig_bits, + 0); if (ret < 0) { ocfs2_rollback_alloc_dinode_counts(ac->ac_inode, ac->ac_bh, res->sr_bits, chain); @@ -2380,11 +2444,13 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, struct buffer_head *group_bh, unsigned int bit_off, unsigned int num_bits, + unsigned int max_contig_bits, void (*undo_fn)(unsigned int bit, unsigned long *bmap)) { int status; unsigned int tmp; + u16 contig_bits; struct ocfs2_group_desc *undo_bg = NULL; struct journal_head *jh; @@ -2431,6 +2497,20 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, num_bits); } + /* + * TODO: even 'num_bits == 1' (the worst case, release 1 cluster), + * we still need to rescan whole bitmap. + */ + if (ocfs2_is_cluster_bitmap(alloc_inode)) { + contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap, + le16_to_cpu(bg->bg_bits), 0); + if (contig_bits > max_contig_bits) + max_contig_bits = contig_bits; + bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits); + } else { + bg->bg_contig_free_bits = 0; + } + if (undo_fn) spin_unlock(&jh->b_state_lock); @@ -2457,6 +2537,7 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle, struct ocfs2_chain_list *cl = &fe->id2.i_chain; struct buffer_head *group_bh = NULL; struct ocfs2_group_desc *group; + __le16 old_bg_contig_free_bits = 0; /* The alloc_bh comes from ocfs2_free_dinode() or * ocfs2_free_clusters(). The callers have all locked the @@ -2481,9 +2562,11 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle, BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits)); + if (ocfs2_is_cluster_bitmap(alloc_inode)) + old_bg_contig_free_bits = group->bg_contig_free_bits; status = ocfs2_block_group_clear_bits(handle, alloc_inode, group, group_bh, - start_bit, count, undo_fn); + start_bit, count, 0, undo_fn); if (status < 0) { mlog_errno(status); goto bail; @@ -2494,7 +2577,8 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle, if (status < 0) { mlog_errno(status); ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh, - start_bit, count); + start_bit, count, + le16_to_cpu(old_bg_contig_free_bits), 1); goto bail; } diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index 9c74eace3adc..b481b834857d 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -79,12 +79,16 @@ void ocfs2_rollback_alloc_dinode_counts(struct inode *inode, struct buffer_head *di_bh, u32 num_bits, u16 chain); +u16 ocfs2_find_max_contig_free_bits(void *bitmap, + u16 total_bits, u16 start); int ocfs2_block_group_set_bits(handle_t *handle, struct inode *alloc_inode, struct ocfs2_group_desc *bg, struct buffer_head *group_bh, unsigned int bit_off, - unsigned int num_bits); + unsigned int num_bits, + unsigned int max_contig_bits, + int fastpath); int ocfs2_claim_metadata(handle_t *handle, struct ocfs2_alloc_context *ac, From f51dac026f75863004ebfb7885cec98e6d3172bb Mon Sep 17 00:00:00 2001 From: Heming Zhao Date: Thu, 28 Mar 2024 20:52:01 +0800 Subject: [PATCH 12/62] ocfs2: adjust enabling place for la window Patch series "improve write IO performance when fragmentation is high", v6. This patch (of 4): After introducing gd->bg_contig_free_bits, the code path 'ocfs2_cluster_group_search() => ocfs2_local_alloc_seen_free_bits()' becomes death when all the gd->bg_contig_free_bits are set to the correct value. This patch relocates ocfs2_local_alloc_seen_free_bits() to a more appropriate location. (The new place being ocfs2_block_group_set_bits().) In ocfs2_local_alloc_seen_free_bits(), the scope of the spin-lock has been adjusted to reduce meaningless lock races. e.g: when userspace creates & deletes 1 cluster_size files in parallel, acquiring the spin-lock in ocfs2_local_alloc_seen_free_bits() is totally pointless and impedes IO performance. Link: https://lkml.kernel.org/r/20240328125203.20892-3-heming.zhao@suse.com Signed-off-by: Heming Zhao Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/localalloc.c | 11 ++++++----- fs/ocfs2/suballoc.c | 9 ++------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 33aeaaa056d7..c84ce53cdec0 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -212,14 +212,15 @@ static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb) void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb, unsigned int num_clusters) { - spin_lock(&osb->osb_lock); - if (osb->local_alloc_state == OCFS2_LA_DISABLED || - osb->local_alloc_state == OCFS2_LA_THROTTLED) - if (num_clusters >= osb->local_alloc_default_bits) { + if (num_clusters >= osb->local_alloc_default_bits) { + spin_lock(&osb->osb_lock); + if (osb->local_alloc_state == OCFS2_LA_DISABLED || + osb->local_alloc_state == OCFS2_LA_THROTTLED) { cancel_delayed_work(&osb->la_enable_wq); osb->local_alloc_state = OCFS2_LA_ENABLED; } - spin_unlock(&osb->osb_lock); + spin_unlock(&osb->osb_lock); + } } void ocfs2_la_enable_worker(struct work_struct *work) diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 7e30e9d79890..8314ec487cfb 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1372,6 +1372,7 @@ int ocfs2_block_group_set_bits(handle_t *handle, int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; unsigned int start = bit_off + num_bits; u16 contig_bits; + struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); /* All callers get the descriptor via * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ @@ -1421,6 +1422,7 @@ int ocfs2_block_group_set_bits(handle_t *handle, if (contig_bits > max_contig_bits) max_contig_bits = contig_bits; bg->bg_contig_free_bits = cpu_to_le16(max_contig_bits); + ocfs2_local_alloc_seen_free_bits(osb, max_contig_bits); } else { bg->bg_contig_free_bits = 0; } @@ -1587,13 +1589,6 @@ static int ocfs2_cluster_group_search(struct inode *inode, * of bits. */ if (min_bits <= res->sr_bits) search = 0; /* success */ - else if (res->sr_bits) { - /* - * Don't show bits which we'll be returning - * for allocation to the local alloc bitmap. - */ - ocfs2_local_alloc_seen_free_bits(osb, res->sr_bits); - } } return search; From 525350221beb55bc6795595443d4cdeecb68ebec Mon Sep 17 00:00:00 2001 From: Heming Zhao Date: Thu, 28 Mar 2024 20:52:02 +0800 Subject: [PATCH 13/62] ocfs2: speed up chain-list searching Add short-circuit code to speed up searching Link: https://lkml.kernel.org/r/20240328125203.20892-4-heming.zhao@suse.com Signed-off-by: Heming Zhao Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/suballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 8314ec487cfb..f7b483f0de2a 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -2006,7 +2006,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) { if (i == victim) continue; - if (!cl->cl_recs[i].c_free) + if (le32_to_cpu(cl->cl_recs[i].c_free) < bits_wanted) continue; ac->ac_chain = i; From fc07d2a2118aa83dca9bdc8fda6ab9fe2f28a8d6 Mon Sep 17 00:00:00 2001 From: Heming Zhao Date: Thu, 28 Mar 2024 20:52:03 +0800 Subject: [PATCH 14/62] ocfs2: fix sparse warnings 1. fs/ocfs2/localalloc.c:1224:41: warning: incorrect type in argument 1 (different base types) fs/ocfs2/localalloc.c:1224:41: expected unsigned long long val1 fs/ocfs2/localalloc.c:1224:41: got restricted __le32 [usertype] la_bm_off 2. fs/ocfs2/export.c:258:32: warning: cast to restricted __le32 fs/ocfs2/export.c:259:33: warning: cast to restricted __le32 fs/ocfs2/export.c:260:32: warning: cast to restricted __le32 fs/ocfs2/export.c:272:32: warning: cast to restricted __le32 fs/ocfs2/export.c:273:33: warning: cast to restricted __le32 fs/ocfs2/export.c:274:32: warning: cast to restricted __le32 3. fs/ocfs2/inode.c:1623:13: warning: context imbalance in 'ocfs2_inode_cache_lock' - wrong count at exit fs/ocfs2/inode.c:1630:13: warning: context imbalance in 'ocfs2_inode_cache_unlock' - unexpected unlock 4. fs/ocfs2/refcounttree.c:633:27: warning: incorrect type in assignment (different base types) fs/ocfs2/refcounttree.c:633:27: expected restricted __le32 [usertype] rf_generation fs/ocfs2/refcounttree.c:633:27: got unsigned int 5. fs/ocfs2/dlm/dlmdomain.c:1316:20: warning: context imbalance in 'dlm_query_nodeinfo_handler' - different lock contexts for basic block Link: https://lkml.kernel.org/r/20240328125203.20892-5-heming.zhao@suse.com Signed-off-by: Heming Zhao Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/dlm/dlmdomain.c | 11 +++++------ fs/ocfs2/export.c | 12 ++++++------ fs/ocfs2/inode.c | 2 ++ fs/ocfs2/localalloc.c | 4 ++-- fs/ocfs2/refcounttree.c | 2 +- 5 files changed, 16 insertions(+), 15 deletions(-) diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 5c04dde99981..2e0a2f338282 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1274,7 +1274,7 @@ static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len, { struct dlm_query_nodeinfo *qn; struct dlm_ctxt *dlm = NULL; - int locked = 0, status = -EINVAL; + int status = -EINVAL; qn = (struct dlm_query_nodeinfo *) msg->buf; @@ -1290,12 +1290,11 @@ static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len, } spin_lock(&dlm->spinlock); - locked = 1; if (dlm->joining_node != qn->qn_nodenum) { mlog(ML_ERROR, "Node %d queried nodes on domain %s but " "joining node is %d\n", qn->qn_nodenum, qn->qn_domain, dlm->joining_node); - goto bail; + goto unlock; } /* Support for node query was added in 1.1 */ @@ -1305,14 +1304,14 @@ static int dlm_query_nodeinfo_handler(struct o2net_msg *msg, u32 len, "but active dlm protocol is %d.%d\n", qn->qn_nodenum, qn->qn_domain, dlm->dlm_locking_proto.pv_major, dlm->dlm_locking_proto.pv_minor); - goto bail; + goto unlock; } status = dlm_match_nodes(dlm, qn); +unlock: + spin_unlock(&dlm->spinlock); bail: - if (locked) - spin_unlock(&dlm->spinlock); spin_unlock(&dlm_domain_lock); return status; diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index b8b6a191b5cb..96b684763b39 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -255,9 +255,9 @@ static struct dentry *ocfs2_fh_to_dentry(struct super_block *sb, if (fh_len < 3 || fh_type > 2) return NULL; - handle.ih_blkno = (u64)le32_to_cpu(fid->raw[0]) << 32; - handle.ih_blkno |= (u64)le32_to_cpu(fid->raw[1]); - handle.ih_generation = le32_to_cpu(fid->raw[2]); + handle.ih_blkno = (u64)le32_to_cpu((__force __le32)fid->raw[0]) << 32; + handle.ih_blkno |= (u64)le32_to_cpu((__force __le32)fid->raw[1]); + handle.ih_generation = le32_to_cpu((__force __le32)fid->raw[2]); return ocfs2_get_dentry(sb, &handle); } @@ -269,9 +269,9 @@ static struct dentry *ocfs2_fh_to_parent(struct super_block *sb, if (fh_type != 2 || fh_len < 6) return NULL; - parent.ih_blkno = (u64)le32_to_cpu(fid->raw[3]) << 32; - parent.ih_blkno |= (u64)le32_to_cpu(fid->raw[4]); - parent.ih_generation = le32_to_cpu(fid->raw[5]); + parent.ih_blkno = (u64)le32_to_cpu((__force __le32)fid->raw[3]) << 32; + parent.ih_blkno |= (u64)le32_to_cpu((__force __le32)fid->raw[4]); + parent.ih_generation = le32_to_cpu((__force __le32)fid->raw[5]); return ocfs2_get_dentry(sb, &parent); } diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 999111bfc271..2cc5c99fe941 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1621,6 +1621,7 @@ static struct super_block *ocfs2_inode_cache_get_super(struct ocfs2_caching_info } static void ocfs2_inode_cache_lock(struct ocfs2_caching_info *ci) +__acquires(&oi->ip_lock) { struct ocfs2_inode_info *oi = cache_info_to_inode(ci); @@ -1628,6 +1629,7 @@ static void ocfs2_inode_cache_lock(struct ocfs2_caching_info *ci) } static void ocfs2_inode_cache_unlock(struct ocfs2_caching_info *ci) +__releases(&oi->ip_lock) { struct ocfs2_inode_info *oi = cache_info_to_inode(ci); diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index c84ce53cdec0..5df34561c551 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -336,7 +336,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) "found = %u, set = %u, taken = %u, off = %u\n", num_used, le32_to_cpu(alloc->id1.bitmap1.i_used), le32_to_cpu(alloc->id1.bitmap1.i_total), - OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); + le32_to_cpu(OCFS2_LOCAL_ALLOC(alloc)->la_bm_off)); status = -EINVAL; goto bail; @@ -1214,7 +1214,7 @@ retry_enospc: OCFS2_LOCAL_ALLOC(alloc)->la_bitmap); trace_ocfs2_local_alloc_new_window_result( - OCFS2_LOCAL_ALLOC(alloc)->la_bm_off, + le32_to_cpu(OCFS2_LOCAL_ALLOC(alloc)->la_bm_off), le32_to_cpu(alloc->id1.bitmap1.i_total)); bail: diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 3f80a56d0d60..1f303b1adf1a 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -630,7 +630,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode, rb->rf_records.rl_count = cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb)); spin_lock(&osb->osb_lock); - rb->rf_generation = osb->s_next_generation++; + rb->rf_generation = cpu_to_le32(osb->s_next_generation++); spin_unlock(&osb->osb_lock); ocfs2_journal_dirty(handle, new_bh); From b0f970c50d439df46ade159950201faba36da10b Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Fri, 29 Mar 2024 21:28:25 +0800 Subject: [PATCH 15/62] Documentation: kdump: clean up the outdated description After commit 443cbaf9e2fd ("crash: split vmcoreinfo exporting code out from crash_core.c"), Kconfig item CRASH_CORE has gone away in kernel. Items VMCORE_INFO and CRASH_RESERVE are used instead. So clean up the outdated description about CRASH_CORE and update it accordingly. Link: https://lkml.kernel.org/r/20240329132825.1102459-3-bhe@redhat.com Signed-off-by: Baoquan He Cc: Jonathan Corbet Cc: Geert Uytterhoeven Cc: Huacai Chen Cc: WANG Xuerui Signed-off-by: Andrew Morton --- Documentation/admin-guide/kdump/kdump.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst index 0302a93b1d40..5376890adbeb 100644 --- a/Documentation/admin-guide/kdump/kdump.rst +++ b/Documentation/admin-guide/kdump/kdump.rst @@ -136,10 +136,6 @@ System kernel config options CONFIG_KEXEC_CORE=y - Subsequently, CRASH_CORE is selected by KEXEC_CORE:: - - CONFIG_CRASH_CORE=y - 2) Enable "sysfs file system support" in "Filesystem" -> "Pseudo filesystems." This is usually enabled by default:: @@ -168,6 +164,10 @@ Dump-capture kernel config options (Arch Independent) CONFIG_CRASH_DUMP=y + And this will select VMCORE_INFO and CRASH_RESERVE:: + CONFIG_VMCORE_INFO=y + CONFIG_CRASH_RESERVE=y + 2) Enable "/proc/vmcore support" under "Filesystems" -> "Pseudo filesystems":: CONFIG_PROC_VMCORE=y From 56fd61628b7a5c1ff474619580796f11d5c8973a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 28 Mar 2024 15:30:42 +0100 Subject: [PATCH 16/62] kcov: avoid clang out-of-range warning The area_size is never larger than the maximum on 64-bit architectutes: kernel/kcov.c:634:29: error: result of comparison of constant 1152921504606846975 with expression of type '__u32' (aka 'unsigned int') is always false [-Werror,-Wtautological-constant-out-of-range-compare] if (remote_arg->area_size > LONG_MAX / sizeof(unsigned long)) ~~~~~~~~~~~~~~~~~~~~~ ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The compiler can correctly optimize the check away and the code appears correct to me, so just add a cast to avoid the warning. Link: https://lkml.kernel.org/r/20240328143051.1069575-5-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Justin Stitt Cc: Andrey Konovalov Cc: Bill Wendling Cc: Dmitry Vyukov Cc: Nathan Chancellor Cc: Nick Desaulniers Signed-off-by: Andrew Morton --- kernel/kcov.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/kcov.c b/kernel/kcov.c index f9ac2e9e460f..c3124f6d5536 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -627,7 +627,8 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, mode = kcov_get_mode(remote_arg->trace_mode); if (mode < 0) return mode; - if (remote_arg->area_size > LONG_MAX / sizeof(unsigned long)) + if ((unsigned long)remote_arg->area_size > + LONG_MAX / sizeof(unsigned long)) return -EINVAL; kcov->mode = mode; t->kcov = kcov; From 5f08383c1558f30e3a52db3fd30110835d687b9b Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Thu, 28 Mar 2024 16:57:51 +0100 Subject: [PATCH 17/62] initrd: remove the now superfluous sentinel element from ctl_table array This commit comes at the tail end of a greater effort to remove the empty elements at the end of the ctl_table arrays (sentinels) which will reduce the overall build time size of the kernel and run time memory bloat by ~64 bytes per sentinel (further information Link : https://lore.kernel.org/all/ZO5Yx5JFogGi%2FcBo@bombadil.infradead.org/) Remove sentinel from kern_do_mounts_initrd_table. Link: https://lkml.kernel.org/r/20240328-jag-sysctl_remset_misc-v1-4-47c1463b3af2@samsung.com Signed-off-by: Joel Granados Signed-off-by: Andrew Morton --- init/do_mounts_initrd.c | 1 - 1 file changed, 1 deletion(-) diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index 425f4bcf4b77..22c7f41ff642 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -29,7 +29,6 @@ static struct ctl_table kern_do_mounts_initrd_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; static __init int kernel_do_mounts_initrd_sysctls_init(void) From 029c45bb24d02a4de7b1e3866fe7592d59c10718 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Thu, 28 Mar 2024 16:57:52 +0100 Subject: [PATCH 18/62] ipc: remove the now superfluous sentinel element from ctl_table array This commit comes at the tail end of a greater effort to remove the empty elements at the end of the ctl_table arrays (sentinels) which will reduce the overall build time size of the kernel and run time memory bloat by ~64 bytes per sentinel (further information Link : https://lore.kernel.org/all/ZO5Yx5JFogGi%2FcBo@bombadil.infradead.org/) Remove the sentinels from ipc_sysctls and mq_sysctls Link: https://lkml.kernel.org/r/20240328-jag-sysctl_remset_misc-v1-5-47c1463b3af2@samsung.com Signed-off-by: Joel Granados Signed-off-by: Andrew Morton --- ipc/ipc_sysctl.c | 1 - ipc/mq_sysctl.c | 1 - 2 files changed, 2 deletions(-) diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index 45cb1dabce29..0867535af96f 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -178,7 +178,6 @@ static struct ctl_table ipc_sysctls[] = { .extra2 = SYSCTL_INT_MAX, }, #endif - {} }; static struct ctl_table_set *set_lookup(struct ctl_table_root *root) diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c index 21fba3a6edaf..22ec532c7fa1 100644 --- a/ipc/mq_sysctl.c +++ b/ipc/mq_sysctl.c @@ -64,7 +64,6 @@ static struct ctl_table mq_sysctls[] = { .extra1 = &msg_maxsize_limit_min, .extra2 = &msg_maxsize_limit_max, }, - {} }; static struct ctl_table_set *set_lookup(struct ctl_table_root *root) From 040bf9a717885d3646dcd5b4062dc3a4877ec421 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Wed, 3 Apr 2024 19:33:52 +0100 Subject: [PATCH 19/62] Squashfs: remove deprecated strncpy by not copying the string Squashfs copied the passed string (name) into a temporary buffer to ensure it was NUL-terminated. This however is completely unnecessary as the string is already NUL-terminated. So remove the deprecated strncpy() by completely removing the string copy. The background behind this unnecessary string copy is that it dates back to the days when Squashfs was an out of kernel patch. The code deliberately did not assume the string was NUL-terminated in case in future this changed (due to kernel changes). This would mean the out of tree patches would be broken but still compile OK. Link: https://lkml.kernel.org/r/20240403183352.391308-1-phillip@squashfs.org.uk Signed-off-by: Phillip Lougher Reviewed-by: Kees Cook Reviewed-by: Justin Stitt Signed-off-by: Andrew Morton --- fs/squashfs/namei.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c index 11e4539b9eae..65aae7e2a859 100644 --- a/fs/squashfs/namei.c +++ b/fs/squashfs/namei.c @@ -62,27 +62,21 @@ */ static int get_dir_index_using_name(struct super_block *sb, u64 *next_block, int *next_offset, u64 index_start, - int index_offset, int i_count, const char *name, - int len) + int index_offset, int i_count, const char *name) { struct squashfs_sb_info *msblk = sb->s_fs_info; int i, length = 0, err; unsigned int size; struct squashfs_dir_index *index; - char *str; TRACE("Entered get_dir_index_using_name, i_count %d\n", i_count); - index = kmalloc(sizeof(*index) + SQUASHFS_NAME_LEN * 2 + 2, GFP_KERNEL); + index = kmalloc(sizeof(*index) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL); if (index == NULL) { ERROR("Failed to allocate squashfs_dir_index\n"); goto out; } - str = &index->name[SQUASHFS_NAME_LEN + 1]; - strncpy(str, name, len); - str[len] = '\0'; - for (i = 0; i < i_count; i++) { err = squashfs_read_metadata(sb, index, &index_start, &index_offset, sizeof(*index)); @@ -101,7 +95,7 @@ static int get_dir_index_using_name(struct super_block *sb, index->name[size] = '\0'; - if (strcmp(index->name, str) > 0) + if (strcmp(index->name, name) > 0) break; length = le32_to_cpu(index->index); @@ -153,7 +147,7 @@ static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry, length = get_dir_index_using_name(dir->i_sb, &block, &offset, squashfs_i(dir)->dir_idx_start, squashfs_i(dir)->dir_idx_offset, - squashfs_i(dir)->dir_idx_cnt, name, len); + squashfs_i(dir)->dir_idx_cnt, name); while (length < i_size_read(dir)) { /* From b157f0e97e3ed6158cde4f247da4c537c3f6a0f9 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Wed, 3 Apr 2024 15:25:47 +0200 Subject: [PATCH 20/62] kgdb: add HAS_IOPORT dependency In a future patch HAS_IOPORT=n will disable inb()/outb() and friends at compile time. We thus need to add HAS_IOPORT as dependency for those drivers using them. Link: https://lkml.kernel.org/r/20240403132547.762429-2-schnelle@linux.ibm.com Co-developed-by: Arnd Bergmann Signed-off-by: Arnd Bergmann Signed-off-by: Niklas Schnelle Signed-off-by: Andrew Morton --- lib/Kconfig.kgdb | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/Kconfig.kgdb b/lib/Kconfig.kgdb index b5c0e6576749..537e1b3f5734 100644 --- a/lib/Kconfig.kgdb +++ b/lib/Kconfig.kgdb @@ -122,6 +122,7 @@ config KDB_DEFAULT_ENABLE config KDB_KEYBOARD bool "KGDB_KDB: keyboard as input device" depends on VT && KGDB_KDB && !PARISC + depends on HAS_IOPORT default n help KDB can use a PS/2 type keyboard for an input device From f36c54f3ce3d69aadccc56bc7dc2cf6c464f3522 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 3 Apr 2024 13:46:56 +0300 Subject: [PATCH 21/62] devres: switch to use dev_err_probe() for unification Patch series "devres: A couple of cleanups". A couple of ad-hoc cleanups. No functional changes intended. This patch (of 2): The devm_*() APIs are supposed to be called during the ->probe() stage. Many drivers (especially new ones) have switched to use dev_err_probe() for error messaging for the sake of unification. Let's do the same in the devres APIs. Link: https://lkml.kernel.org/r/20240403104820.557487-1-andriy.shevchenko@linux.intel.com Link: https://lkml.kernel.org/r/20240403104820.557487-2-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Reviewed-by: Philipp Stanner Signed-off-by: Andrew Morton --- lib/devres.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/lib/devres.c b/lib/devres.c index fe0c63caeb68..27f280a39dca 100644 --- a/lib/devres.c +++ b/lib/devres.c @@ -125,12 +125,13 @@ __devm_ioremap_resource(struct device *dev, const struct resource *res, resource_size_t size; void __iomem *dest_ptr; char *pretty_name; + int ret; BUG_ON(!dev); if (!res || resource_type(res) != IORESOURCE_MEM) { - dev_err(dev, "invalid resource %pR\n", res); - return IOMEM_ERR_PTR(-EINVAL); + ret = dev_err_probe(dev, -EINVAL, "invalid resource %pR\n", res); + return IOMEM_ERR_PTR(ret); } if (type == DEVM_IOREMAP && res->flags & IORESOURCE_MEM_NONPOSTED) @@ -144,20 +145,20 @@ __devm_ioremap_resource(struct device *dev, const struct resource *res, else pretty_name = devm_kstrdup(dev, dev_name(dev), GFP_KERNEL); if (!pretty_name) { - dev_err(dev, "can't generate pretty name for resource %pR\n", res); - return IOMEM_ERR_PTR(-ENOMEM); + ret = dev_err_probe(dev, -ENOMEM, "can't generate pretty name for resource %pR\n", res); + return IOMEM_ERR_PTR(ret); } if (!devm_request_mem_region(dev, res->start, size, pretty_name)) { - dev_err(dev, "can't request region for resource %pR\n", res); - return IOMEM_ERR_PTR(-EBUSY); + ret = dev_err_probe(dev, -EBUSY, "can't request region for resource %pR\n", res); + return IOMEM_ERR_PTR(ret); } dest_ptr = __devm_ioremap(dev, res->start, size, type); if (!dest_ptr) { - dev_err(dev, "ioremap failed for resource %pR\n", res); devm_release_mem_region(dev, res->start, size); - dest_ptr = IOMEM_ERR_PTR(-ENOMEM); + ret = dev_err_probe(dev, -ENOMEM, "ioremap failed for resource %pR\n", res); + return IOMEM_ERR_PTR(ret); } return dest_ptr; From 3cc98aa11ec4c3b9b55fa185c8882e4ae20cbd6a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 3 Apr 2024 13:46:57 +0300 Subject: [PATCH 22/62] devres: don't use "proxy" headers Update header inclusions to follow IWYU (Include What You Use) principle. Link: https://lkml.kernel.org/r/20240403104820.557487-3-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Reviewed-by: Philipp Stanner Signed-off-by: Andrew Morton --- lib/devres.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/lib/devres.c b/lib/devres.c index 27f280a39dca..4fc152de6d8b 100644 --- a/lib/devres.c +++ b/lib/devres.c @@ -1,10 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include -#include -#include -#include +#include #include +#include +#include +#include #include +#include enum devm_ioremap_type { DEVM_IOREMAP = 0, From ad5f0eb540d31d95e6eefe6e24af0fe1b1b393fa Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Mon, 1 Apr 2024 18:39:55 +0000 Subject: [PATCH 23/62] vmcore: replace strncpy with strscpy_pad strncpy() is in the process of being replaced as it is deprecated [1]. We should move towards safer and less ambiguous string interfaces. Looking at vmcoredd_header's definition: | struct vmcoredd_header { | __u32 n_namesz; /* Name size */ | __u32 n_descsz; /* Content size */ | __u32 n_type; /* NT_VMCOREDD */ | __u8 name[8]; /* LINUX\0\0\0 */ | __u8 dump_name[VMCOREDD_MAX_NAME_BYTES]; /* Device dump's name */ | }; .. we see that @name wants to be NUL-padded. We're copying data->dump_name which is defined as: | char dump_name[VMCOREDD_MAX_NAME_BYTES]; /* Unique name of the dump */ .. which shares the same size as vdd_hdr->dump_name. Let's make sure we NUL-pad this as well. Use strscpy_pad() which NUL-terminates and NUL-pads its destination buffers. Specifically, use the new 2-argument version of strscpy_pad introduced in Commit e6584c3964f2f ("string: Allow 2-argument strscpy()"). Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strncpy-on-nul-terminated-strings [1] Link: https://github.com/KSPP/linux/issues/90 Link: https://lkml.kernel.org/r/20240401-strncpy-fs-proc-vmcore-c-v2-1-dd0a73f42635@google.com Signed-off-by: Justin Stitt Acked-by: Baoquan He Cc: Dave Young Cc: Vivek Goyal Signed-off-by: Andrew Morton --- fs/proc/vmcore.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 1fb213f379a5..5d08d4d159d3 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -1370,9 +1370,8 @@ static void vmcoredd_write_header(void *buf, struct vmcoredd_data *data, vdd_hdr->n_descsz = size + sizeof(vdd_hdr->dump_name); vdd_hdr->n_type = NT_VMCOREDD; - strncpy((char *)vdd_hdr->name, VMCOREDD_NOTE_NAME, - sizeof(vdd_hdr->name)); - memcpy(vdd_hdr->dump_name, data->dump_name, sizeof(vdd_hdr->dump_name)); + strscpy_pad(vdd_hdr->name, VMCOREDD_NOTE_NAME); + strscpy_pad(vdd_hdr->dump_name, data->dump_name); } /** From d11547071a9d916604a65394b169c0d1acf0a6d6 Mon Sep 17 00:00:00 2001 From: Su Yue Date: Mon, 8 Apr 2024 16:20:38 +0800 Subject: [PATCH 24/62] ocfs2: return real error code in ocfs2_dio_wr_get_block Patch series "ocfs2 bugs fixes exposed by fstests", v3. The patchset is to fix some wrong behavior of ocfs2 exposed by fstests. Patch 1 makes userspace happy when some error happens when doing direct io. Before the patch, DIO always return -EIO in case of error. After the patch, it returns real error code such like -ENOSPC, EDQUOT... Patch 2 fixes an error case when doing AIO+DIO and hole punching at same file position in parallel. generic/300 Patch 3 fixes inode link count mismatch after power failure. Without the patch, inode link would be wrong even fync was called on the file. tests/generic/040,041,104,107,336 patch 4 fixes wrong atime with mount option realtime. Without the patch, atime of new created file won't be updated in right time. tests/generic/192 For stable kernels, I added fixes to patch 2,3,4. The patch 1 is not recommended to be backported since ocfs2_dio_wr_get_block calls too many functions. It's diffcult to check every git history of ocfs2 for every LTS kernel. This patch (of 4): ocfs2_dio_wr_get_block always returns -EIO in case of errors. However, some programs expect right exit codes while doing dio. For example, tools like fio treat -ENOSPC as expected code while doing stress jobs. And quota tools expect -EDQUOT when disk quota exceeds. -EIO is too strong return code in the dio path. The caller of ocfs2_dio_wr_get_block is __blockdev_direct_IO which is widely used and it handles error codes well. I have checked functions called by ocfs2_dio_wr_get_block and their return codes look good and clear. So I think it's safe to let ocfs2_dio_wr_get_block return real error code. Link: https://lkml.kernel.org/r/20240408082041.20925-1-glass.su@suse.com Link: https://lkml.kernel.org/r/20240408082041.20925-2-glass.su@suse.com Signed-off-by: Su Yue Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/aops.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index b82185075de7..f0467d3b3c88 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2283,8 +2283,6 @@ unlock: ocfs2_inode_unlock(inode, 1); brelse(di_bh); out: - if (ret < 0) - ret = -EIO; return ret; } From 952b023f06a24b2ad6ba67304c4c84d45bea2f18 Mon Sep 17 00:00:00 2001 From: Su Yue Date: Mon, 8 Apr 2024 16:20:39 +0800 Subject: [PATCH 25/62] ocfs2: fix races between hole punching and AIO+DIO After commit "ocfs2: return real error code in ocfs2_dio_wr_get_block", fstests/generic/300 become from always failed to sometimes failed: ======================================================================== [ 473.293420 ] run fstests generic/300 [ 475.296983 ] JBD2: Ignoring recovery information on journal [ 475.302473 ] ocfs2: Mounting device (253,1) on (node local, slot 0) with ordered data mode. [ 494.290998 ] OCFS2: ERROR (device dm-1): ocfs2_change_extent_flag: Owner 5668 has an extent at cpos 78723 which can no longer be found [ 494.291609 ] On-disk corruption discovered. Please run fsck.ocfs2 once the filesystem is unmounted. [ 494.292018 ] OCFS2: File system is now read-only. [ 494.292224 ] (kworker/19:11,2628,19):ocfs2_mark_extent_written:5272 ERROR: status = -30 [ 494.292602 ] (kworker/19:11,2628,19):ocfs2_dio_end_io_write:2374 ERROR: status = -3 fio: io_u error on file /mnt/scratch/racer: Read-only file system: write offset=460849152, buflen=131072 ========================================================================= In __blockdev_direct_IO, ocfs2_dio_wr_get_block is called to add unwritten extents to a list. extents are also inserted into extent tree in ocfs2_write_begin_nolock. Then another thread call fallocate to puch a hole at one of the unwritten extent. The extent at cpos was removed by ocfs2_remove_extent(). At end io worker thread, ocfs2_search_extent_list found there is no such extent at the cpos. T1 T2 T3 inode lock ... insert extents ... inode unlock ocfs2_fallocate __ocfs2_change_file_space inode lock lock ip_alloc_sem ocfs2_remove_inode_range inode ocfs2_remove_btree_range ocfs2_remove_extent ^---remove the extent at cpos 78723 ... unlock ip_alloc_sem inode unlock ocfs2_dio_end_io ocfs2_dio_end_io_write lock ip_alloc_sem ocfs2_mark_extent_written ocfs2_change_extent_flag ocfs2_search_extent_list ^---failed to find extent ... unlock ip_alloc_sem In most filesystems, fallocate is not compatible with racing with AIO+DIO, so fix it by adding to wait for all dio before fallocate/punch_hole like ext4. Link: https://lkml.kernel.org/r/20240408082041.20925-3-glass.su@suse.com Fixes: b25801038da5 ("ocfs2: Support xfs style space reservation ioctls") Signed-off-by: Su Yue Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 0da8e7bd3261..ccc57038a977 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1936,6 +1936,8 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, inode_lock(inode); + /* Wait all existing dio workers, newcomers will block on i_rwsem */ + inode_dio_wait(inode); /* * This prevents concurrent writes on other nodes */ From 8c40984eeb8804cffcd28640f427f4fe829243fc Mon Sep 17 00:00:00 2001 From: Su Yue Date: Mon, 8 Apr 2024 16:20:40 +0800 Subject: [PATCH 26/62] ocfs2: update inode fsync transaction id in ocfs2_unlink and ocfs2_link transaction id should be updated in ocfs2_unlink and ocfs2_link. Otherwise, inode link will be wrong after journal replay even fsync was called before power failure: ======================================================================= $ touch testdir/bar $ ln testdir/bar testdir/bar_link $ fsync testdir/bar $ stat -c %h $SCRATCH_MNT/testdir/bar 1 $ stat -c %h $SCRATCH_MNT/testdir/bar 1 ======================================================================= Link: https://lkml.kernel.org/r/20240408082041.20925-4-glass.su@suse.com Fixes: ccd979bdbce9 ("[PATCH] OCFS2: The Second Oracle Cluster Filesystem") Signed-off-by: Su Yue Reviewed-by: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/namei.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 9221a33f917b..55c9d90caaaf 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -797,6 +797,7 @@ static int ocfs2_link(struct dentry *old_dentry, ocfs2_set_links_count(fe, inode->i_nlink); fe->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, fe_bh); err = ocfs2_add_entry(handle, dentry, inode, @@ -993,6 +994,7 @@ static int ocfs2_unlink(struct inode *dir, drop_nlink(inode); drop_nlink(inode); ocfs2_set_links_count(fe, inode->i_nlink); + ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, fe_bh); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); From b8cb324277ee16f3eca3055b96fce4735a5a41c6 Mon Sep 17 00:00:00 2001 From: Su Yue Date: Mon, 8 Apr 2024 16:20:41 +0800 Subject: [PATCH 27/62] ocfs2: use coarse time for new created files The default atime related mount option is '-o realtime' which means file atime should be updated if atime <= ctime or atime <= mtime. atime should be updated in the following scenario, but it is not: ========================================================== $ rm /mnt/testfile; $ echo test > /mnt/testfile $ stat -c "%X %Y %Z" /mnt/testfile 1711881646 1711881646 1711881646 $ sleep 5 $ cat /mnt/testfile > /dev/null $ stat -c "%X %Y %Z" /mnt/testfile 1711881646 1711881646 1711881646 ========================================================== And the reason the atime in the test is not updated is that ocfs2 calls ktime_get_real_ts64() in __ocfs2_mknod_locked during file creation. Then inode_set_ctime_current() is called in inode_set_ctime_current() calls ktime_get_coarse_real_ts64() to get current time. ktime_get_real_ts64() is more accurate than ktime_get_coarse_real_ts64(). In my test box, I saw ctime set by ktime_get_coarse_real_ts64() is less than ktime_get_real_ts64() even ctime is set later. The ctime of the new inode is smaller than atime. The call trace is like: ocfs2_create ocfs2_mknod __ocfs2_mknod_locked .... ktime_get_real_ts64 <------- set atime,ctime,mtime, more accurate ocfs2_populate_inode ... ocfs2_init_acl ocfs2_acl_set_mode inode_set_ctime_current current_time ktime_get_coarse_real_ts64 <-------less accurate ocfs2_file_read_iter ocfs2_inode_lock_atime ocfs2_should_update_atime atime <= ctime ? <-------- false, ctime < atime due to accuracy So here call ktime_get_coarse_real_ts64 to set inode time coarser while creating new files. It may lower the accuracy of file times. But it's not a big deal since we already use coarse time in other places like ocfs2_update_inode_atime and inode_set_ctime_current. Link: https://lkml.kernel.org/r/20240408082041.20925-5-glass.su@suse.com Fixes: c62c38f6b91b ("ocfs2: replace CURRENT_TIME macro") Signed-off-by: Su Yue Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 55c9d90caaaf..4d1ea8703fcd 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -566,7 +566,7 @@ static int __ocfs2_mknod_locked(struct inode *dir, fe->i_last_eb_blk = 0; strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL); - ktime_get_real_ts64(&ts); + ktime_get_coarse_real_ts64(&ts); fe->i_atime = fe->i_ctime = fe->i_mtime = cpu_to_le64(ts.tv_sec); fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec = From 3ef3a05ba6ac52998a52dc1eca90853dda771316 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 9 Apr 2024 16:00:54 +0200 Subject: [PATCH 28/62] test_hexdump: avoid string truncation warning gcc can warn when a string is too long to fit into the strncpy() destination buffer, as it is here depending on the function arguments: inlined from 'test_hexdump_prepare_test.constprop' at /home/arnd/arm-soc/lib/test_hexdump.c:116:3: include/linux/fortify-string.h:108:33: error: '__builtin_strncpy' output truncated copying between 0 and 32 bytes from a string of length 32 [-Werror=stringop-truncation] 108 | #define __underlying_strncpy __builtin_strncpy | ^ include/linux/fortify-string.h:187:16: note: in expansion of macro '__underlying_strncpy' 187 | return __underlying_strncpy(p, q, size); | ^~~~~~~~~~~~~~~~~~~~ The intention here is to copy exactly 'l' bytes without any padding or NUL-termination, so the most logical change is to use memcpy(), just as a previous change adapted the other output from strncpy() to memcpy(). Link: https://lkml.kernel.org/r/20240409140059.3806717-2-arnd@kernel.org Signed-off-by: Arnd Bergmann Acked-by: Justin Stitt Cc: Alexey Starikovskiy Cc: Bob Moore Cc: Jens Axboe Cc: Len Brown Cc: Lin Ming Cc: Masahiro Yamada Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Rafael J. Wysocki Cc: "Richard Russon (FlatCap)" Cc: Steven Rostedt Signed-off-by: Andrew Morton --- lib/test_hexdump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/test_hexdump.c b/lib/test_hexdump.c index b916801f23a8..fe2682bb21e6 100644 --- a/lib/test_hexdump.c +++ b/lib/test_hexdump.c @@ -113,7 +113,7 @@ static void __init test_hexdump_prepare_test(size_t len, int rowsize, *p++ = ' '; } while (p < test + rs * 2 + rs / gs + 1); - strncpy(p, data_a, l); + memcpy(p, data_a, l); p += l; } From 597bc741e5ac2608e3ec0afd4256163879ab506b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 9 Apr 2024 16:00:56 +0200 Subject: [PATCH 29/62] block/partitions/ldm: convert strncpy() to strscpy() The strncpy() here can cause a non-terminated string, which older gcc versions such as gcc-9 warn about: In function 'ldm_parse_tocblock', inlined from 'ldm_validate_tocblocks' at block/partitions/ldm.c:386:7, inlined from 'ldm_partition' at block/partitions/ldm.c:1457:7: block/partitions/ldm.c:134:2: error: 'strncpy' specified bound 16 equals destination size [-Werror=stringop-truncation] 134 | strncpy (toc->bitmap1_name, data + 0x24, sizeof (toc->bitmap1_name)); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ block/partitions/ldm.c:145:2: error: 'strncpy' specified bound 16 equals destination size [-Werror=stringop-truncation] 145 | strncpy (toc->bitmap2_name, data + 0x46, sizeof (toc->bitmap2_name)); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ New versions notice that the code is correct after all because of the following termination, but replacing the strncpy() with strscpy_pad() or strcpy() avoids the warning and simplifies the code at the same time. Use the padding version here to keep the existing behavior, in case the code relies on not including uninitialized data. Link: https://lkml.kernel.org/r/20240409140059.3806717-4-arnd@kernel.org Reviewed-by: Justin Stitt Signed-off-by: Arnd Bergmann Cc: Alexey Starikovskiy Cc: Bob Moore Cc: Jens Axboe Cc: Len Brown Cc: Lin Ming Cc: Masahiro Yamada Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Rafael J. Wysocki Cc: "Richard Russon (FlatCap)" Cc: Steven Rostedt Signed-off-by: Andrew Morton --- block/partitions/ldm.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c index 38e58960ae03..2bd42fedb907 100644 --- a/block/partitions/ldm.c +++ b/block/partitions/ldm.c @@ -131,8 +131,7 @@ static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc) ldm_crit ("Cannot find TOCBLOCK, database may be corrupt."); return false; } - strncpy (toc->bitmap1_name, data + 0x24, sizeof (toc->bitmap1_name)); - toc->bitmap1_name[sizeof (toc->bitmap1_name) - 1] = 0; + strscpy_pad(toc->bitmap1_name, data + 0x24, sizeof(toc->bitmap1_name)); toc->bitmap1_start = get_unaligned_be64(data + 0x2E); toc->bitmap1_size = get_unaligned_be64(data + 0x36); @@ -142,8 +141,7 @@ static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc) TOC_BITMAP1, toc->bitmap1_name); return false; } - strncpy (toc->bitmap2_name, data + 0x46, sizeof (toc->bitmap2_name)); - toc->bitmap2_name[sizeof (toc->bitmap2_name) - 1] = 0; + strscpy_pad(toc->bitmap2_name, data + 0x46, sizeof(toc->bitmap2_name)); toc->bitmap2_start = get_unaligned_be64(data + 0x50); toc->bitmap2_size = get_unaligned_be64(data + 0x58); if (strncmp (toc->bitmap2_name, TOC_BITMAP2, From 051e7503070179f2278c0d4fa7dee441adb558ca Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 9 Apr 2024 16:00:57 +0200 Subject: [PATCH 30/62] blktrace: convert strncpy() to strscpy_pad() gcc-9 warns about a possibly non-terminated string copy: kernel/trace/blktrace.c: In function 'do_blk_trace_setup': kernel/trace/blktrace.c:527:2: error: 'strncpy' specified bound 32 equals destination size [-Werror=stringop-truncation] Newer versions are fine here because they see the following explicit nul-termination. Using strscpy_pad() avoids the warning and simplifies the code a little. The padding helps give a clean buffer to userspace. Link: https://lkml.kernel.org/r/20240409140059.3806717-5-arnd@kernel.org Signed-off-by: Arnd Bergmann Acked-by: Justin Stitt Cc: Alexey Starikovskiy Cc: Bob Moore Cc: Jens Axboe Cc: Len Brown Cc: Lin Ming Cc: Masahiro Yamada Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Rafael J. Wysocki Cc: "Richard Russon (FlatCap)" Cc: Steven Rostedt Signed-off-by: Andrew Morton --- kernel/trace/blktrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index d5d94510afd3..8fd292d34d89 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -524,8 +524,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!buts->buf_size || !buts->buf_nr) return -EINVAL; - strncpy(buts->name, name, BLKTRACE_BDEV_SIZE); - buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0'; + strscpy_pad(buts->name, name, BLKTRACE_BDEV_SIZE); /* * some device names have larger paths - convert the slashes From 2725844080d2609c48a51e08f9a0a3fbe7316665 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Wed, 10 Apr 2024 16:56:27 +0900 Subject: [PATCH 31/62] nilfs2: add kernel-doc comments to nilfs_do_roll_forward() Patch series "nilfs2: fix missing kernel-doc comments". This commit adds kernel-doc style comments with complete parameter descriptions for the function nilfs_do_roll_forward. Link: https://lkml.kernel.org/r/20240410075629.3441-1-konishi.ryusuke@gmail.com Link: https://lkml.kernel.org/r/20240410075629.3441-2-konishi.ryusuke@gmail.com Signed-off-by: Yang Li Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/recovery.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index 49a70c68bf3c..e48372618ac4 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -563,6 +563,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, * checkpoint * @nilfs: nilfs object * @sb: super block instance + * @root: NILFS root instance * @ri: pointer to a nilfs_recovery_info */ static int nilfs_do_roll_forward(struct the_nilfs *nilfs, From 3da9b9650acc3a2a0c3d3f4542b93d4abe9da1de Mon Sep 17 00:00:00 2001 From: Yang Li Date: Wed, 10 Apr 2024 16:56:28 +0900 Subject: [PATCH 32/62] nilfs2: add kernel-doc comments to nilfs_btree_convert_and_insert() This commit adds kernel-doc style comments with complete parameter descriptions for the function nilfs_btree_convert_and_insert. Link: https://lkml.kernel.org/r/20240410075629.3441-3-konishi.ryusuke@gmail.com Signed-off-by: Yang Li Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/btree.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 65659fa0372e..a139970e4804 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -1857,13 +1857,22 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree, } /** - * nilfs_btree_convert_and_insert - - * @bmap: - * @key: - * @ptr: - * @keys: - * @ptrs: - * @n: + * nilfs_btree_convert_and_insert - Convert and insert entries into a B-tree + * @btree: NILFS B-tree structure + * @key: Key of the new entry to be inserted + * @ptr: Pointer (block number) associated with the key to be inserted + * @keys: Array of keys to be inserted in addition to @key + * @ptrs: Array of pointers associated with @keys + * @n: Number of keys and pointers in @keys and @ptrs + * + * This function is used to insert a new entry specified by @key and @ptr, + * along with additional entries specified by @keys and @ptrs arrays, into a + * NILFS B-tree. + * It prepares the necessary changes by allocating the required blocks and any + * necessary intermediate nodes. It converts configurations from other forms of + * block mapping (the one that currently exists is direct mapping) to a B-tree. + * + * Return: 0 on success or a negative error code on failure. */ int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree, __u64 key, __u64 ptr, From 4a458576941c78d905415a4a6edb48d4c04f7444 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Wed, 10 Apr 2024 16:56:29 +0900 Subject: [PATCH 33/62] nilfs2: add kernel-doc comments to nilfs_remove_all_gcinodes() This commit adds kernel-doc style comments with complete parameter descriptions for the function nilfs_remove_all_gcinodes. Link: https://lkml.kernel.org/r/20240410075629.3441-4-konishi.ryusuke@gmail.com Signed-off-by: Yang Li Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/gcinode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index bf9a11d58817..1c9ae36a03ab 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -175,6 +175,7 @@ int nilfs_init_gcinode(struct inode *inode) /** * nilfs_remove_all_gcinodes() - remove all unprocessed gc inodes + * @nilfs: NILFS filesystem instance */ void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs) { From 0f373e6d91b9dd75317d05a21abd999783cf70da Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 14 Apr 2024 11:28:13 +0200 Subject: [PATCH 34/62] intel_th: remove usage of the deprecated ida_simple_xx() API ida_alloc() and ida_free() should be preferred to the deprecated ida_simple_get() and ida_simple_remove(). This is less verbose. Link: https://lkml.kernel.org/r/2aca50a9d061faecfd4ded80b5874cd3be9b855d.1713086613.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Cc: Alexander Shishkin Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton --- drivers/hwtracing/intel_th/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/hwtracing/intel_th/core.c b/drivers/hwtracing/intel_th/core.c index cc7f879bb175..86c8efecd7c2 100644 --- a/drivers/hwtracing/intel_th/core.c +++ b/drivers/hwtracing/intel_th/core.c @@ -871,7 +871,7 @@ intel_th_alloc(struct device *dev, const struct intel_th_drvdata *drvdata, if (!th) return ERR_PTR(-ENOMEM); - th->id = ida_simple_get(&intel_th_ida, 0, 0, GFP_KERNEL); + th->id = ida_alloc(&intel_th_ida, GFP_KERNEL); if (th->id < 0) { err = th->id; goto err_alloc; @@ -931,7 +931,7 @@ err_chrdev: "intel_th/output"); err_ida: - ida_simple_remove(&intel_th_ida, th->id); + ida_free(&intel_th_ida, th->id); err_alloc: kfree(th); @@ -964,7 +964,7 @@ void intel_th_free(struct intel_th *th) __unregister_chrdev(th->major, 0, TH_POSSIBLE_OUTPUTS, "intel_th/output"); - ida_simple_remove(&intel_th_ida, th->id); + ida_free(&intel_th_ida, th->id); kfree(th); } From 55dbc5b5174d0e7d1fa397d05aa4cb145e8b887e Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 14 Apr 2024 12:10:17 +0200 Subject: [PATCH 35/62] pps: remove usage of the deprecated ida_simple_xx() API ida_alloc() and ida_free() should be preferred to the deprecated ida_simple_get() and ida_simple_remove(). This is less verbose. Link: https://lkml.kernel.org/r/9f681747d446b874952a892491387d79ffe565a9.1713089394.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Cc: Rodolfo Giometti Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton --- drivers/pps/clients/pps_parport.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/pps/clients/pps_parport.c b/drivers/pps/clients/pps_parport.c index 42f93d4c6ee3..af972cdc04b5 100644 --- a/drivers/pps/clients/pps_parport.c +++ b/drivers/pps/clients/pps_parport.c @@ -148,7 +148,7 @@ static void parport_attach(struct parport *port) return; } - index = ida_simple_get(&pps_client_index, 0, 0, GFP_KERNEL); + index = ida_alloc(&pps_client_index, GFP_KERNEL); memset(&pps_client_cb, 0, sizeof(pps_client_cb)); pps_client_cb.private = device; pps_client_cb.irq_func = parport_irq; @@ -188,7 +188,7 @@ err_release_dev: err_unregister_dev: parport_unregister_device(device->pardev); err_free: - ida_simple_remove(&pps_client_index, index); + ida_free(&pps_client_index, index); kfree(device); } @@ -208,7 +208,7 @@ static void parport_detach(struct parport *port) pps_unregister_source(device->pps); parport_release(pardev); parport_unregister_device(pardev); - ida_simple_remove(&pps_client_index, device->index); + ida_free(&pps_client_index, device->index); kfree(device); } From 200a289b342bae6cbeb0b7406b2af55feef0de94 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 14 Apr 2024 12:12:52 +0200 Subject: [PATCH 36/62] mux: remove usage of the deprecated ida_simple_xx() API ida_alloc() and ida_free() should be preferred to the deprecated ida_simple_get() and ida_simple_remove(). This is less verbose. Link: https://lkml.kernel.org/r/f82e013abe4c71f1c7d06819f96472f298acdcf3.1713089554.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Cc: Peter Rosin Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton --- drivers/mux/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mux/core.c b/drivers/mux/core.c index 775816112932..78c0022697ec 100644 --- a/drivers/mux/core.c +++ b/drivers/mux/core.c @@ -64,7 +64,7 @@ static void mux_chip_release(struct device *dev) { struct mux_chip *mux_chip = to_mux_chip(dev); - ida_simple_remove(&mux_ida, mux_chip->id); + ida_free(&mux_ida, mux_chip->id); kfree(mux_chip); } @@ -111,7 +111,7 @@ struct mux_chip *mux_chip_alloc(struct device *dev, mux_chip->dev.of_node = dev->of_node; dev_set_drvdata(&mux_chip->dev, mux_chip); - mux_chip->id = ida_simple_get(&mux_ida, 0, 0, GFP_KERNEL); + mux_chip->id = ida_alloc(&mux_ida, GFP_KERNEL); if (mux_chip->id < 0) { int err = mux_chip->id; From 055e09ac54ae9c8396c1086fe06a73e0ce9bdd10 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 17 Apr 2024 23:11:23 +0300 Subject: [PATCH 37/62] cpumask: delete unused reset_cpu_possible_mask() Link: https://lkml.kernel.org/r/20240417201123.2961-1-adobriyan@gmail.com Signed-off-by: Alexey Dobriyan Cc: Rasmus Villemoes Cc: Yury Norov Signed-off-by: Andrew Morton --- include/linux/cpumask.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 1c29947db848..04536a29f10f 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -1017,11 +1017,6 @@ void init_cpu_present(const struct cpumask *src); void init_cpu_possible(const struct cpumask *src); void init_cpu_online(const struct cpumask *src); -static inline void reset_cpu_possible_mask(void) -{ - bitmap_zero(cpumask_bits(&__cpu_possible_mask), NR_CPUS); -} - static inline void set_cpu_possible(unsigned int cpu, bool possible) { From 4707c13de3e42a47f0d99fe5fb58fa9dd23b455e Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 18 Apr 2024 11:58:43 +0800 Subject: [PATCH 38/62] crash: add prefix for crash dumping messages Add pr_fmt() to kernel/crash_core.c to add the module name to debugging message printed as prefix. And also add prefix 'crashkernel:' to two lines of message printing code in kernel/crash_reserve.c. In kernel/crash_reserve.c, almost all debugging messages have 'crashkernel:' prefix or there's keyword crashkernel at the beginning or in the middle, adding pr_fmt() makes it redundant. Link: https://lkml.kernel.org/r/20240418035843.1562887-1-bhe@redhat.com Signed-off-by: Baoquan He Cc: Dave Young Cc: Jiri Slaby Signed-off-by: Andrew Morton --- kernel/crash_core.c | 2 ++ kernel/crash_reserve.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 78b5dc7cee3a..1e7ac977f7c0 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -4,6 +4,8 @@ * Copyright (C) 2002-2004 Eric Biederman */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index 066668799f75..5b2722a93a48 100644 --- a/kernel/crash_reserve.c +++ b/kernel/crash_reserve.c @@ -109,7 +109,7 @@ static int __init parse_crashkernel_mem(char *cmdline, size = memparse(cur, &tmp); if (cur == tmp) { - pr_warn("Memory value expected\n"); + pr_warn("crashkernel: Memory value expected\n"); return -EINVAL; } cur = tmp; @@ -132,7 +132,7 @@ static int __init parse_crashkernel_mem(char *cmdline, cur++; *crash_base = memparse(cur, &tmp); if (cur == tmp) { - pr_warn("Memory value expected after '@'\n"); + pr_warn("crahskernel: Memory value expected after '@'\n"); return -EINVAL; } } From f4af41bf177add167e39e4b0203460b1d0b531f6 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 9 Apr 2024 12:22:38 +0800 Subject: [PATCH 39/62] kexec: fix the unexpected kexec_dprintk() macro Jiri reported that the current kexec_dprintk() always prints out debugging message whenever kexec/kdmmp loading is triggered. That is not wanted. The debugging message is supposed to be printed out when 'kexec -s -d' is specified for kexec/kdump loading. After investigating, the reason is the current kexec_dprintk() takes printk(KERN_INFO) or printk(KERN_DEBUG) depending on whether '-d' is specified. However, distros usually have defaulg log level like below: [~]# cat /proc/sys/kernel/printk 7 4 1 7 So, even though '-d' is not specified, printk(KERN_DEBUG) also always prints out. I thought printk(KERN_DEBUG) is equal to pr_debug(), it's not. Fix it by changing to use pr_info() instead which are expected to work. Link: https://lkml.kernel.org/r/20240409042238.1240462-1-bhe@redhat.com Fixes: cbc2fe9d9cb2 ("kexec_file: add kexec_file flag to control debug printing") Signed-off-by: Baoquan He Reported-by: Jiri Slaby Closes: https://lore.kernel.org/all/4c775fca-5def-4a2d-8437-7130b02722a2@kernel.org Reviewed-by: Dave Young Cc: Signed-off-by: Andrew Morton --- include/linux/kexec.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 060835bb82d5..f31bd304df45 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -461,10 +461,8 @@ static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) { extern bool kexec_file_dbg_print; -#define kexec_dprintk(fmt, ...) \ - printk("%s" fmt, \ - kexec_file_dbg_print ? KERN_INFO : KERN_DEBUG, \ - ##__VA_ARGS__) +#define kexec_dprintk(fmt, arg...) \ + do { if (kexec_file_dbg_print) pr_info(fmt, ##arg); } while (0) #else /* !CONFIG_KEXEC_CORE */ struct pt_regs; From 36defdd9d7c605ba042962ef1eeb66a9d3ff5884 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 25 Apr 2024 03:27:16 +0900 Subject: [PATCH 40/62] nilfs2: convert to use the new mount API Convert nilfs2 to use the new mount API. [sandeen@redhat.com: v2] Link: https://lkml.kernel.org/r/33d078a7-9072-4d8e-a3a9-dec23d4191da@redhat.com Link: https://lkml.kernel.org/r/20240425190526.10905-1-konishi.ryusuke@gmail.com [konishi.ryusuke: fixed missing SB_RDONLY flag repair in nilfs_reconfigure] Link: https://lkml.kernel.org/r/33d078a7-9072-4d8e-a3a9-dec23d4191da@redhat.com Link: https://lkml.kernel.org/r/20240424182716.6024-1-konishi.ryusuke@gmail.com Signed-off-by: Eric Sandeen Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/nilfs.h | 4 +- fs/nilfs2/super.c | 386 ++++++++++++++++++------------------------ fs/nilfs2/the_nilfs.c | 5 +- fs/nilfs2/the_nilfs.h | 6 +- 4 files changed, 173 insertions(+), 228 deletions(-) diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 2e29b98ba8ba..728e90be3570 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -335,8 +335,8 @@ void __nilfs_error(struct super_block *sb, const char *function, extern struct nilfs_super_block * nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **); -extern int nilfs_store_magic_and_option(struct super_block *, - struct nilfs_super_block *, char *); +extern int nilfs_store_magic(struct super_block *sb, + struct nilfs_super_block *sbp); extern int nilfs_check_feature_compatibility(struct super_block *, struct nilfs_super_block *); extern void nilfs_set_log_cursor(struct nilfs_super_block *, diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index ac24ed109ce9..e835e1f5a712 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -29,13 +29,13 @@ #include #include #include -#include #include #include #include #include #include #include +#include #include "nilfs.h" #include "export.h" #include "mdt.h" @@ -61,7 +61,6 @@ struct kmem_cache *nilfs_segbuf_cachep; struct kmem_cache *nilfs_btree_path_cache; static int nilfs_setup_super(struct super_block *sb, int is_mount); -static int nilfs_remount(struct super_block *sb, int *flags, char *data); void __nilfs_msg(struct super_block *sb, const char *fmt, ...) { @@ -702,105 +701,98 @@ static const struct super_operations nilfs_sops = { .freeze_fs = nilfs_freeze, .unfreeze_fs = nilfs_unfreeze, .statfs = nilfs_statfs, - .remount_fs = nilfs_remount, .show_options = nilfs_show_options }; enum { - Opt_err_cont, Opt_err_panic, Opt_err_ro, - Opt_barrier, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery, - Opt_discard, Opt_nodiscard, Opt_err, + Opt_err, Opt_barrier, Opt_snapshot, Opt_order, Opt_norecovery, + Opt_discard, }; -static match_table_t tokens = { - {Opt_err_cont, "errors=continue"}, - {Opt_err_panic, "errors=panic"}, - {Opt_err_ro, "errors=remount-ro"}, - {Opt_barrier, "barrier"}, - {Opt_nobarrier, "nobarrier"}, - {Opt_snapshot, "cp=%u"}, - {Opt_order, "order=%s"}, - {Opt_norecovery, "norecovery"}, - {Opt_discard, "discard"}, - {Opt_nodiscard, "nodiscard"}, - {Opt_err, NULL} +static const struct constant_table nilfs_param_err[] = { + {"continue", NILFS_MOUNT_ERRORS_CONT}, + {"panic", NILFS_MOUNT_ERRORS_PANIC}, + {"remount-ro", NILFS_MOUNT_ERRORS_RO}, + {} }; -static int parse_options(char *options, struct super_block *sb, int is_remount) +static const struct fs_parameter_spec nilfs_param_spec[] = { + fsparam_enum ("errors", Opt_err, nilfs_param_err), + fsparam_flag_no ("barrier", Opt_barrier), + fsparam_u64 ("cp", Opt_snapshot), + fsparam_string ("order", Opt_order), + fsparam_flag ("norecovery", Opt_norecovery), + fsparam_flag_no ("discard", Opt_discard), + {} +}; + +struct nilfs_fs_context { + unsigned long ns_mount_opt; + __u64 cno; +}; + +static int nilfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - struct the_nilfs *nilfs = sb->s_fs_info; - char *p; - substring_t args[MAX_OPT_ARGS]; + struct nilfs_fs_context *nilfs = fc->fs_private; + int is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE; + struct fs_parse_result result; + int opt; - if (!options) - return 1; + opt = fs_parse(fc, nilfs_param_spec, param, &result); + if (opt < 0) + return opt; - while ((p = strsep(&options, ",")) != NULL) { - int token; - - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_barrier: - nilfs_set_opt(nilfs, BARRIER); - break; - case Opt_nobarrier: + switch (opt) { + case Opt_barrier: + if (result.negated) nilfs_clear_opt(nilfs, BARRIER); - break; - case Opt_order: - if (strcmp(args[0].from, "relaxed") == 0) - /* Ordered data semantics */ - nilfs_clear_opt(nilfs, STRICT_ORDER); - else if (strcmp(args[0].from, "strict") == 0) - /* Strict in-order semantics */ - nilfs_set_opt(nilfs, STRICT_ORDER); - else - return 0; - break; - case Opt_err_panic: - nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_PANIC); - break; - case Opt_err_ro: - nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_RO); - break; - case Opt_err_cont: - nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_CONT); - break; - case Opt_snapshot: - if (is_remount) { - nilfs_err(sb, - "\"%s\" option is invalid for remount", - p); - return 0; - } - break; - case Opt_norecovery: - nilfs_set_opt(nilfs, NORECOVERY); - break; - case Opt_discard: - nilfs_set_opt(nilfs, DISCARD); - break; - case Opt_nodiscard: - nilfs_clear_opt(nilfs, DISCARD); - break; - default: - nilfs_err(sb, "unrecognized mount option \"%s\"", p); - return 0; + else + nilfs_set_opt(nilfs, BARRIER); + break; + case Opt_order: + if (strcmp(param->string, "relaxed") == 0) + /* Ordered data semantics */ + nilfs_clear_opt(nilfs, STRICT_ORDER); + else if (strcmp(param->string, "strict") == 0) + /* Strict in-order semantics */ + nilfs_set_opt(nilfs, STRICT_ORDER); + else + return -EINVAL; + break; + case Opt_err: + nilfs->ns_mount_opt &= ~NILFS_MOUNT_ERROR_MODE; + nilfs->ns_mount_opt |= result.uint_32; + break; + case Opt_snapshot: + if (is_remount) { + struct super_block *sb = fc->root->d_sb; + + nilfs_err(sb, + "\"%s\" option is invalid for remount", + param->key); + return -EINVAL; } + if (result.uint_64 == 0) { + nilfs_err(NULL, + "invalid option \"cp=0\": invalid checkpoint number 0"); + return -EINVAL; + } + nilfs->cno = result.uint_64; + break; + case Opt_norecovery: + nilfs_set_opt(nilfs, NORECOVERY); + break; + case Opt_discard: + if (result.negated) + nilfs_clear_opt(nilfs, DISCARD); + else + nilfs_set_opt(nilfs, DISCARD); + break; + default: + return -EINVAL; } - return 1; -} -static inline void -nilfs_set_default_options(struct super_block *sb, - struct nilfs_super_block *sbp) -{ - struct the_nilfs *nilfs = sb->s_fs_info; - - nilfs->ns_mount_opt = - NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER; + return 0; } static int nilfs_setup_super(struct super_block *sb, int is_mount) @@ -857,9 +849,8 @@ struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb, return (struct nilfs_super_block *)((char *)(*pbh)->b_data + offset); } -int nilfs_store_magic_and_option(struct super_block *sb, - struct nilfs_super_block *sbp, - char *data) +int nilfs_store_magic(struct super_block *sb, + struct nilfs_super_block *sbp) { struct the_nilfs *nilfs = sb->s_fs_info; @@ -870,14 +861,12 @@ int nilfs_store_magic_and_option(struct super_block *sb, sb->s_flags |= SB_NOATIME; #endif - nilfs_set_default_options(sb, sbp); - nilfs->ns_resuid = le16_to_cpu(sbp->s_def_resuid); nilfs->ns_resgid = le16_to_cpu(sbp->s_def_resgid); nilfs->ns_interval = le32_to_cpu(sbp->s_c_interval); nilfs->ns_watermark = le32_to_cpu(sbp->s_c_block_max); - return !parse_options(data, sb, 0) ? -EINVAL : 0; + return 0; } int nilfs_check_feature_compatibility(struct super_block *sb, @@ -1035,17 +1024,17 @@ int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno) /** * nilfs_fill_super() - initialize a super block instance * @sb: super_block - * @data: mount options - * @silent: silent mode flag + * @fc: filesystem context * * This function is called exclusively by nilfs->ns_mount_mutex. * So, the recovery process is protected from other simultaneous mounts. */ static int -nilfs_fill_super(struct super_block *sb, void *data, int silent) +nilfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct the_nilfs *nilfs; struct nilfs_root *fsroot; + struct nilfs_fs_context *ctx = fc->fs_private; __u64 cno; int err; @@ -1055,10 +1044,13 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_fs_info = nilfs; - err = init_nilfs(nilfs, sb, (char *)data); + err = init_nilfs(nilfs, sb); if (err) goto failed_nilfs; + /* Copy in parsed mount options */ + nilfs->ns_mount_opt = ctx->ns_mount_opt; + sb->s_op = &nilfs_sops; sb->s_export_op = &nilfs_export_ops; sb->s_root = NULL; @@ -1117,34 +1109,25 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent) return err; } -static int nilfs_remount(struct super_block *sb, int *flags, char *data) +static int nilfs_reconfigure(struct fs_context *fc) { + struct nilfs_fs_context *ctx = fc->fs_private; + struct super_block *sb = fc->root->d_sb; struct the_nilfs *nilfs = sb->s_fs_info; - unsigned long old_sb_flags; - unsigned long old_mount_opt; int err; sync_filesystem(sb); - old_sb_flags = sb->s_flags; - old_mount_opt = nilfs->ns_mount_opt; - - if (!parse_options(data, sb, 1)) { - err = -EINVAL; - goto restore_opts; - } - sb->s_flags = (sb->s_flags & ~SB_POSIXACL); err = -EINVAL; if (!nilfs_valid_fs(nilfs)) { nilfs_warn(sb, "couldn't remount because the filesystem is in an incomplete recovery state"); - goto restore_opts; + goto ignore_opts; } - - if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) + if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb)) goto out; - if (*flags & SB_RDONLY) { + if (fc->sb_flags & SB_RDONLY) { sb->s_flags |= SB_RDONLY; /* @@ -1172,138 +1155,67 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) "couldn't remount RDWR because of unsupported optional features (%llx)", (unsigned long long)features); err = -EROFS; - goto restore_opts; + goto ignore_opts; } sb->s_flags &= ~SB_RDONLY; root = NILFS_I(d_inode(sb->s_root))->i_root; err = nilfs_attach_log_writer(sb, root); - if (err) - goto restore_opts; + if (err) { + sb->s_flags |= SB_RDONLY; + goto ignore_opts; + } down_write(&nilfs->ns_sem); nilfs_setup_super(sb, true); up_write(&nilfs->ns_sem); } out: + sb->s_flags = (sb->s_flags & ~SB_POSIXACL); + /* Copy over parsed remount options */ + nilfs->ns_mount_opt = ctx->ns_mount_opt; + return 0; - restore_opts: - sb->s_flags = old_sb_flags; - nilfs->ns_mount_opt = old_mount_opt; + ignore_opts: return err; } -struct nilfs_super_data { - __u64 cno; - int flags; -}; - -static int nilfs_parse_snapshot_option(const char *option, - const substring_t *arg, - struct nilfs_super_data *sd) +static int +nilfs_get_tree(struct fs_context *fc) { - unsigned long long val; - const char *msg = NULL; - int err; - - if (!(sd->flags & SB_RDONLY)) { - msg = "read-only option is not specified"; - goto parse_error; - } - - err = kstrtoull(arg->from, 0, &val); - if (err) { - if (err == -ERANGE) - msg = "too large checkpoint number"; - else - msg = "malformed argument"; - goto parse_error; - } else if (val == 0) { - msg = "invalid checkpoint number 0"; - goto parse_error; - } - sd->cno = val; - return 0; - -parse_error: - nilfs_err(NULL, "invalid option \"%s\": %s", option, msg); - return 1; -} - -/** - * nilfs_identify - pre-read mount options needed to identify mount instance - * @data: mount options - * @sd: nilfs_super_data - */ -static int nilfs_identify(char *data, struct nilfs_super_data *sd) -{ - char *p, *options = data; - substring_t args[MAX_OPT_ARGS]; - int token; - int ret = 0; - - do { - p = strsep(&options, ","); - if (p != NULL && *p) { - token = match_token(p, tokens, args); - if (token == Opt_snapshot) - ret = nilfs_parse_snapshot_option(p, &args[0], - sd); - } - if (!options) - break; - BUG_ON(options == data); - *(options - 1) = ','; - } while (!ret); - return ret; -} - -static int nilfs_set_bdev_super(struct super_block *s, void *data) -{ - s->s_dev = *(dev_t *)data; - return 0; -} - -static int nilfs_test_bdev_super(struct super_block *s, void *data) -{ - return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data; -} - -static struct dentry * -nilfs_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) -{ - struct nilfs_super_data sd = { .flags = flags }; + struct nilfs_fs_context *ctx = fc->fs_private; struct super_block *s; dev_t dev; int err; - if (nilfs_identify(data, &sd)) - return ERR_PTR(-EINVAL); + if (ctx->cno && !(fc->sb_flags & SB_RDONLY)) { + nilfs_err(NULL, + "invalid option \"cp=%llu\": read-only option is not specified", + ctx->cno); + return -EINVAL; + } - err = lookup_bdev(dev_name, &dev); + err = lookup_bdev(fc->source, &dev); if (err) - return ERR_PTR(err); + return err; - s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, flags, - &dev); + s = sget_dev(fc, dev); if (IS_ERR(s)) - return ERR_CAST(s); + return PTR_ERR(s); if (!s->s_root) { - err = setup_bdev_super(s, flags, NULL); + err = setup_bdev_super(s, fc->sb_flags, fc); if (!err) - err = nilfs_fill_super(s, data, - flags & SB_SILENT ? 1 : 0); + err = nilfs_fill_super(s, fc); if (err) goto failed_super; s->s_flags |= SB_ACTIVE; - } else if (!sd.cno) { + } else if (!ctx->cno) { if (nilfs_tree_is_busy(s->s_root)) { - if ((flags ^ s->s_flags) & SB_RDONLY) { + if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) { nilfs_err(s, "the device already has a %s mount.", sb_rdonly(s) ? "read-only" : "read/write"); @@ -1312,37 +1224,75 @@ nilfs_mount(struct file_system_type *fs_type, int flags, } } else { /* - * Try remount to setup mount states if the current + * Try reconfigure to setup mount states if the current * tree is not mounted and only snapshots use this sb. + * + * Since nilfs_reconfigure() requires fc->root to be + * set, set it first and release it on failure. */ - err = nilfs_remount(s, &flags, data); - if (err) + fc->root = dget(s->s_root); + err = nilfs_reconfigure(fc); + if (err) { + dput(fc->root); + fc->root = NULL; /* prevent double release */ goto failed_super; + } + return 0; } } - if (sd.cno) { + if (ctx->cno) { struct dentry *root_dentry; - err = nilfs_attach_snapshot(s, sd.cno, &root_dentry); + err = nilfs_attach_snapshot(s, ctx->cno, &root_dentry); if (err) goto failed_super; - return root_dentry; + fc->root = root_dentry; + return 0; } - return dget(s->s_root); + fc->root = dget(s->s_root); + return 0; failed_super: deactivate_locked_super(s); - return ERR_PTR(err); + return err; +} + +static void nilfs_free_fc(struct fs_context *fc) +{ + kfree(fc->fs_private); +} + +static const struct fs_context_operations nilfs_context_ops = { + .parse_param = nilfs_parse_param, + .get_tree = nilfs_get_tree, + .reconfigure = nilfs_reconfigure, + .free = nilfs_free_fc, +}; + +static int nilfs_init_fs_context(struct fs_context *fc) +{ + struct nilfs_fs_context *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->ns_mount_opt = NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER; + fc->fs_private = ctx; + fc->ops = &nilfs_context_ops; + + return 0; } struct file_system_type nilfs_fs_type = { .owner = THIS_MODULE, .name = "nilfs2", - .mount = nilfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = nilfs_init_fs_context, + .parameters = nilfs_param_spec, }; MODULE_ALIAS_FS("nilfs2"); diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 2ae2c1bbf6d1..db322068678f 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -659,7 +659,6 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs, * init_nilfs - initialize a NILFS instance. * @nilfs: the_nilfs structure * @sb: super block - * @data: mount options * * init_nilfs() performs common initialization per block device (e.g. * reading the super block, getting disk layout information, initializing @@ -668,7 +667,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs, * Return Value: On success, 0 is returned. On error, a negative error * code is returned. */ -int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data) +int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb) { struct nilfs_super_block *sbp; int blocksize; @@ -686,7 +685,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data) if (err) goto out; - err = nilfs_store_magic_and_option(sb, sbp, data); + err = nilfs_store_magic(sb, sbp); if (err) goto failed_sbh; diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index cd4ae1b8ae16..85da0629415d 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -219,10 +219,6 @@ THE_NILFS_FNS(PURGING, purging) #define nilfs_set_opt(nilfs, opt) \ ((nilfs)->ns_mount_opt |= NILFS_MOUNT_##opt) #define nilfs_test_opt(nilfs, opt) ((nilfs)->ns_mount_opt & NILFS_MOUNT_##opt) -#define nilfs_write_opt(nilfs, mask, opt) \ - ((nilfs)->ns_mount_opt = \ - (((nilfs)->ns_mount_opt & ~NILFS_MOUNT_##mask) | \ - NILFS_MOUNT_##opt)) \ /** * struct nilfs_root - nilfs root object @@ -276,7 +272,7 @@ static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs) void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); struct the_nilfs *alloc_nilfs(struct super_block *sb); void destroy_nilfs(struct the_nilfs *nilfs); -int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data); +int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb); int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb); unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs); void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs); From f492fb365699448949a2eb4aa295e0e8fbf79b10 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 23 Apr 2024 23:30:18 +0100 Subject: [PATCH 41/62] ocfs2: remove redundant assignment to variable status Variable status is being assigned and error code that is never read, it is being assigned inside of a do-while loop. The assignment is redundant and can be removed. Cleans up clang scan build warning: fs/ocfs2/dlm/dlmdomain.c:1530:2: warning: Value stored to 'status' is never read [deadcode.DeadStores] Link: https://lkml.kernel.org/r/20240423223018.1573213-1-colin.i.king@gmail.com Signed-off-by: Colin Ian King Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Cc: Heming Zhao Cc: Dan Carpenter Signed-off-by: Andrew Morton --- fs/ocfs2/dlm/dlmdomain.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 2e0a2f338282..2018501b2249 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1527,7 +1527,6 @@ static void dlm_send_join_asserts(struct dlm_ctxt *dlm, { int status, node, live; - status = 0; node = -1; while ((node = find_next_bit(node_map, O2NM_MAX_NODES, node + 1)) < O2NM_MAX_NODES) { From bbed8b9ffed16572357c5a348c7172846d106cfe Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 23 Apr 2024 17:27:57 -0300 Subject: [PATCH 42/62] tools lib rbtree: pick some improvements from the kernel rbtree code The tools/lib/rbtree.c code came from the kernel. Remove the EXPORT_SYMBOL() that make sense only there. Unfortunately it is not being checked with tools/perf/check_headers.sh. Will try to remedy this. Until then pick the improvements from: b0687c1119b4e8c8 ("lib/rbtree: use '+' instead of '|' for setting color.") That I noticed by doing: diff -u tools/lib/rbtree.c lib/rbtree.c diff -u tools/include/linux/rbtree_augmented.h include/linux/rbtree_augmented.h There is one other cases, but lets pick it in separate patches. Link: https://lkml.kernel.org/r/ZigZzeFoukzRKG1Q@x1 Signed-off-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Noah Goldstein Signed-off-by: Andrew Morton --- tools/include/linux/rbtree_augmented.h | 4 ++-- tools/lib/rbtree.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/include/linux/rbtree_augmented.h b/tools/include/linux/rbtree_augmented.h index 570bb9794421..95483c7d81df 100644 --- a/tools/include/linux/rbtree_augmented.h +++ b/tools/include/linux/rbtree_augmented.h @@ -158,13 +158,13 @@ RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) { - rb->__rb_parent_color = rb_color(rb) | (unsigned long)p; + rb->__rb_parent_color = rb_color(rb) + (unsigned long)p; } static inline void rb_set_parent_color(struct rb_node *rb, struct rb_node *p, int color) { - rb->__rb_parent_color = (unsigned long)p | color; + rb->__rb_parent_color = (unsigned long)p + color; } static inline void diff --git a/tools/lib/rbtree.c b/tools/lib/rbtree.c index 727396de6be5..9e7307186b7f 100644 --- a/tools/lib/rbtree.c +++ b/tools/lib/rbtree.c @@ -58,7 +58,7 @@ static inline void rb_set_black(struct rb_node *rb) { - rb->__rb_parent_color |= RB_BLACK; + rb->__rb_parent_color += RB_BLACK; } static inline struct rb_node *rb_red_parent(struct rb_node *red) From 1f65ce65a3749f858b2b3cd0898483ea61313549 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 23 Apr 2024 22:23:08 +0300 Subject: [PATCH 43/62] media: rc: add missing io.h Patch series "kfifo: Clean up kfifo.h", v2. To reduce dependency hell a degree, clean up kfifo.h (mainly getting rid of kernel.h in the global header). This patch (of 3): In many remote control drivers the io.h is implied by others. This is not good as it prevents from cleanups done in other headers. Add missing include. Link: https://lkml.kernel.org/r/20240423192529.3249134-1-andriy.shevchenko@linux.intel.com Link: https://lkml.kernel.org/r/20240423192529.3249134-2-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Alain Volmat Cc: AngeloGioacchino Del Regno Cc: Chen-Yu Tsai Cc: Hans Verkuil Cc: Jernej Skrabec Cc: Matthias Brugger Cc: Mauro Carvalho Chehab Cc: Patrice Chotard Cc: Rob Herring Cc: Samuel Holland Cc: Sean Wang Cc: Sean Young Cc: Stefani Seibold Signed-off-by: Andrew Morton --- drivers/media/rc/mtk-cir.c | 1 + drivers/media/rc/serial_ir.c | 1 + drivers/media/rc/st_rc.c | 1 + drivers/media/rc/sunxi-cir.c | 1 + 4 files changed, 4 insertions(+) diff --git a/drivers/media/rc/mtk-cir.c b/drivers/media/rc/mtk-cir.c index 4e294e59d3cb..b2f82b2d1c8d 100644 --- a/drivers/media/rc/mtk-cir.c +++ b/drivers/media/rc/mtk-cir.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/rc/serial_ir.c b/drivers/media/rc/serial_ir.c index 96ae0294ac10..fc5fd3927177 100644 --- a/drivers/media/rc/serial_ir.c +++ b/drivers/media/rc/serial_ir.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/rc/st_rc.c b/drivers/media/rc/st_rc.c index 28477aa95563..988b09191c4c 100644 --- a/drivers/media/rc/st_rc.c +++ b/drivers/media/rc/st_rc.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/rc/sunxi-cir.c b/drivers/media/rc/sunxi-cir.c index bf58c965ead8..b49df8355e6b 100644 --- a/drivers/media/rc/sunxi-cir.c +++ b/drivers/media/rc/sunxi-cir.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include From 495ae16a2895d6475384bca59f5128c9964dee0c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 23 Apr 2024 22:23:09 +0300 Subject: [PATCH 44/62] media: stih-cec: add missing io.h In the driver the io.h is implied by others. This is not good as it prevents from cleanups done in other headers. Add missing include. Link: https://lkml.kernel.org/r/20240423192529.3249134-3-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Alain Volmat Cc: AngeloGioacchino Del Regno Cc: Chen-Yu Tsai Cc: Hans Verkuil Cc: Jernej Skrabec Cc: Matthias Brugger Cc: Mauro Carvalho Chehab Cc: Patrice Chotard Cc: Rob Herring Cc: Samuel Holland Cc: Sean Wang Cc: Sean Young Cc: Stefani Seibold Signed-off-by: Andrew Morton --- drivers/media/cec/platform/sti/stih-cec.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/cec/platform/sti/stih-cec.c b/drivers/media/cec/platform/sti/stih-cec.c index a20fc5c0c88d..99978a7c8d9b 100644 --- a/drivers/media/cec/platform/sti/stih-cec.c +++ b/drivers/media/cec/platform/sti/stih-cec.c @@ -6,6 +6,7 @@ */ #include #include +#include #include #include #include From 22bcc915ae910bc823d8351542f6d9e7623fff24 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 23 Apr 2024 22:23:10 +0300 Subject: [PATCH 45/62] kfifo: don't use "proxy" headers Update header inclusions to follow IWYU (Include What You Use) principle. Link: https://lkml.kernel.org/r/20240423192529.3249134-4-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Alain Volmat Cc: AngeloGioacchino Del Regno Cc: Chen-Yu Tsai Cc: Hans Verkuil Cc: Jernej Skrabec Cc: Matthias Brugger Cc: Mauro Carvalho Chehab Cc: Patrice Chotard Cc: Rob Herring Cc: Samuel Holland Cc: Sean Wang Cc: Sean Young Cc: Stefani Seibold Signed-off-by: Andrew Morton --- include/linux/kfifo.h | 9 +++++++-- lib/kfifo.c | 10 +++++----- samples/kfifo/dma-example.c | 3 ++- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index 0b35a41440ff..6b28d642f332 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -36,10 +36,15 @@ * to lock the reader. */ -#include +#include #include #include -#include +#include + +#include +#include + +struct scatterlist; struct __kfifo { unsigned int in; diff --git a/lib/kfifo.c b/lib/kfifo.c index 12f5a347aa13..15acdee4a8f3 100644 --- a/lib/kfifo.c +++ b/lib/kfifo.c @@ -5,13 +5,13 @@ * Copyright (C) 2009/2010 Stefani Seibold */ -#include -#include -#include #include -#include -#include +#include #include +#include +#include +#include +#include /* * internal helper to calculate the unused elements in a fifo diff --git a/samples/kfifo/dma-example.c b/samples/kfifo/dma-example.c index 0cf27483cb36..74fe915b7ffe 100644 --- a/samples/kfifo/dma-example.c +++ b/samples/kfifo/dma-example.c @@ -6,8 +6,9 @@ */ #include -#include #include +#include +#include /* * This module shows how to handle fifo dma operations. From ec0b6d17a5f89da2182ec8e2f978c20bbedf6ae2 Mon Sep 17 00:00:00 2001 From: Florian Rommel Date: Thu, 25 Apr 2024 17:34:58 +0200 Subject: [PATCH 46/62] scripts/gdb: fix failing KGDB detection during probe Patch series "scripts/gdb: Fixes for $lx_current and $lx_per_cpu". This series fixes several bugs in the GDB scripts related to the $lx_current and $lx_per_cpu functions. The changes were tested with GDB 10, 11, 12, 13, and 14. Patch 1 fixes false-negative results when probing for KGDB Patch 2 fixes the $lx_per_cpu function, which is currently non-functional in QEMU-GDB and KGDB. Patch 3 fixes an additional bug in $lx_per_cpu that occurs with KGDB. Patch 4 fixes the incorrect detection of the current CPU number in KGDB, which silently breaks $lx_per_cpu and $lx_current. This patch (of 4): The KGDB probe function sometimes failed to detect KGDB for SMP machines as it assumed that task 2 (kthreadd) is running on CPU 0, which is not necessarily the case. Now, the detection is agnostic to kthreadd's CPU. Link: https://lkml.kernel.org/r/20240425153501.749966-1-mail@florommel.de Link: https://lkml.kernel.org/r/20240425153501.749966-2-mail@florommel.de Signed-off-by: Florian Rommel Cc: Andrew Jones Cc: Deepak Gupta Cc: Jan Kiszka Cc: Kieran Bingham Cc: Kuan-Ying Lee Cc: Palmer Dabbelt Signed-off-by: Andrew Morton --- scripts/gdb/linux/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gdb/linux/utils.py b/scripts/gdb/linux/utils.py index 7d5278d815fa..245ab297ea84 100644 --- a/scripts/gdb/linux/utils.py +++ b/scripts/gdb/linux/utils.py @@ -196,7 +196,7 @@ def get_gdbserver_type(): def probe_kgdb(): try: thread_info = gdb.execute("info thread 2", to_string=True) - return "shadowCPU0" in thread_info + return "shadowCPU" in thread_info except gdb.error: return False From db08c53fdd542bb7f83bd57c3cfa3e1b95c9b54d Mon Sep 17 00:00:00 2001 From: Florian Rommel Date: Thu, 25 Apr 2024 17:34:59 +0200 Subject: [PATCH 47/62] scripts/gdb: fix parameter handling in $lx_per_cpu Before, the script tried to get the address by constructing a pointer to the parameter (by name). However, since GDB now passes the parameter as a GdbValue, we cannot get its name. Instead, we retrieve the address through GdbValue's address attribute. Before: >>> p $lx_per_cpu(cpu_info) Traceback (most recent call last): File "./scripts/gdb/linux/cpus.py", line 152, in invoke var_ptr = gdb.parse_and_eval("&" + var_name.string()) ^^^^^^^^^^^^^^^^^ gdb.error: Trying to read string with inappropriate type `struct cpuinfo_x86'. Link: https://lkml.kernel.org/r/20240425153501.749966-3-mail@florommel.de Signed-off-by: Florian Rommel Cc: Andrew Jones Cc: Deepak Gupta Cc: Jan Kiszka Cc: Kieran Bingham Cc: Kuan-Ying Lee Cc: Palmer Dabbelt Signed-off-by: Andrew Morton --- scripts/gdb/linux/cpus.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/gdb/linux/cpus.py b/scripts/gdb/linux/cpus.py index cba589e5b57d..2b51a3abd363 100644 --- a/scripts/gdb/linux/cpus.py +++ b/scripts/gdb/linux/cpus.py @@ -152,9 +152,8 @@ Note that VAR has to be quoted as string.""" def __init__(self): super(PerCpu, self).__init__("lx_per_cpu") - def invoke(self, var_name, cpu=-1): - var_ptr = gdb.parse_and_eval("&" + var_name.string()) - return per_cpu(var_ptr, cpu) + def invoke(self, var, cpu=-1): + return per_cpu(var.address, cpu) PerCpu() From 7566b063e9e4af908123ebe8b80cc0d0c7429507 Mon Sep 17 00:00:00 2001 From: Florian Rommel Date: Thu, 25 Apr 2024 17:35:00 +0200 Subject: [PATCH 48/62] scripts/gdb: make get_thread_info accept pointers get_thread_info ($lx_thread_info) only accepted a dereferenced task parameter. Passing a pointer to a task_struct (like $lx_per_cpu does with KGDB) threw an exception. With this patch, both (dereferenced values and pointers) are accepted. Before (on x86, KGDB): >>> p $lx_per_cpu(cpu_info) Traceback (most recent call last): File "./scripts/gdb/linux/cpus.py", line 158, in invoke return per_cpu(var_ptr, cpu) ^^^^^^^^^^^^^^^^^^^^^ File "./scripts/gdb/linux/cpus.py", line 42, in per_cpu cpu = get_current_cpu() ^^^^^^^^^^^^^^^^^ File "./scripts/gdb/linux/cpus.py", line 33, in get_current_cpu return tasks.get_thread_info(tasks.get_task_by_pid(tid))['cpu'] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "./scripts/gdb/linux/tasks.py", line 88, in get_thread_info if task.type.fields()[0].type == thread_info_type.get_type(): ~~~~~~~~~~~~~~~~~~^^^ IndexError: list index out of range Link: https://lkml.kernel.org/r/20240425153501.749966-4-mail@florommel.de Signed-off-by: Florian Rommel Cc: Andrew Jones Cc: Deepak Gupta Cc: Jan Kiszka Cc: Kieran Bingham Cc: Kuan-Ying Lee Cc: Palmer Dabbelt Signed-off-by: Andrew Morton --- scripts/gdb/linux/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/gdb/linux/tasks.py b/scripts/gdb/linux/tasks.py index 6793d6e86e77..62348397c1f5 100644 --- a/scripts/gdb/linux/tasks.py +++ b/scripts/gdb/linux/tasks.py @@ -85,7 +85,7 @@ thread_info_type = utils.CachedType("struct thread_info") def get_thread_info(task): thread_info_ptr_type = thread_info_type.get_type().pointer() - if task.type.fields()[0].type == thread_info_type.get_type(): + if task_type.get_type().fields()[0].type == thread_info_type.get_type(): return task['thread_info'] thread_info = task['stack'].cast(thread_info_ptr_type) return thread_info.dereference() From 40eea5abbb9ccae6df55dfd94c3c85c023e2521b Mon Sep 17 00:00:00 2001 From: Florian Rommel Date: Thu, 25 Apr 2024 17:35:01 +0200 Subject: [PATCH 49/62] scripts/gdb: fix detection of current CPU in KGDB Directly read the current CPU number from the kgdb_active variable. Before, the active CPU was obtained through the current task, which required searching the task list for the pid of GDB's selected thread. Obtaining the pid was buggy: GDB may use selected_thread().ptid[1] (LWPID) instead of .ptid[2] (TID) to store the threads pid; see https://sourceware.org/gdb/current/onlinedocs/gdb.html/Threads-In-Python.html As a result, the detection could return the wrong CPU number, leading to incorrect results for $lx_per_cpu and $lx_current. As a side effect, the patch significantly speeds up $lx_per_cpu and $lx_current in KGDB by avoiding the task-list iteration. Link: https://lkml.kernel.org/r/20240425153501.749966-5-mail@florommel.de Signed-off-by: Florian Rommel Cc: Andrew Jones Cc: Deepak Gupta Cc: Jan Kiszka Cc: Kieran Bingham Cc: Kuan-Ying Lee Cc: Palmer Dabbelt Signed-off-by: Andrew Morton --- scripts/gdb/linux/cpus.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/scripts/gdb/linux/cpus.py b/scripts/gdb/linux/cpus.py index 2b51a3abd363..2f11c4f9c345 100644 --- a/scripts/gdb/linux/cpus.py +++ b/scripts/gdb/linux/cpus.py @@ -26,11 +26,7 @@ def get_current_cpu(): if utils.get_gdbserver_type() == utils.GDBSERVER_QEMU: return gdb.selected_thread().num - 1 elif utils.get_gdbserver_type() == utils.GDBSERVER_KGDB: - tid = gdb.selected_thread().ptid[2] - if tid > (0x100000000 - MAX_CPUS - 2): - return 0x100000000 - tid - 2 - else: - return tasks.get_thread_info(tasks.get_task_by_pid(tid))['cpu'] + return gdb.parse_and_eval("kgdb_active.counter") else: raise gdb.GdbError("Sorry, obtaining the current CPU is not yet " "supported with this gdb server.") From 675f02e5e65651553c945a464e4f7630859f2032 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 20 Apr 2024 03:50:17 +0100 Subject: [PATCH 50/62] squashfs: convert squashfs_symlink_read_folio to use folio APIs Remove use of page APIs, return the errno instead of 0, switch from kmap_atomic to kmap_local and use folio_end_read() to unify the two exit paths. Link: https://lkml.kernel.org/r/20240420025029.2166544-23-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Phillip Lougher Reviewed-by: Phillip Lougher Signed-off-by: Andrew Morton --- fs/squashfs/symlink.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c index 2bf977a52c2c..6ef735bd841a 100644 --- a/fs/squashfs/symlink.c +++ b/fs/squashfs/symlink.c @@ -32,20 +32,19 @@ static int squashfs_symlink_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct super_block *sb = inode->i_sb; struct squashfs_sb_info *msblk = sb->s_fs_info; - int index = page->index << PAGE_SHIFT; + int index = folio_pos(folio); u64 block = squashfs_i(inode)->start; int offset = squashfs_i(inode)->offset; int length = min_t(int, i_size_read(inode) - index, PAGE_SIZE); - int bytes, copied; + int bytes, copied, error; void *pageaddr; struct squashfs_cache_entry *entry; TRACE("Entered squashfs_symlink_readpage, page index %ld, start block " - "%llx, offset %x\n", page->index, block, offset); + "%llx, offset %x\n", folio->index, block, offset); /* * Skip index bytes into symlink metadata. @@ -57,14 +56,15 @@ static int squashfs_symlink_read_folio(struct file *file, struct folio *folio) ERROR("Unable to read symlink [%llx:%x]\n", squashfs_i(inode)->start, squashfs_i(inode)->offset); - goto error_out; + error = bytes; + goto out; } } /* * Read length bytes from symlink metadata. Squashfs_read_metadata * is not used here because it can sleep and we want to use - * kmap_atomic to map the page. Instead call the underlying + * kmap_local to map the folio. Instead call the underlying * squashfs_cache_get routine. As length bytes may overlap metadata * blocks, we may need to call squashfs_cache_get multiple times. */ @@ -75,29 +75,26 @@ static int squashfs_symlink_read_folio(struct file *file, struct folio *folio) squashfs_i(inode)->start, squashfs_i(inode)->offset); squashfs_cache_put(entry); - goto error_out; + error = entry->error; + goto out; } - pageaddr = kmap_atomic(page); + pageaddr = kmap_local_folio(folio, 0); copied = squashfs_copy_data(pageaddr + bytes, entry, offset, length - bytes); if (copied == length - bytes) memset(pageaddr + length, 0, PAGE_SIZE - length); else block = entry->next_index; - kunmap_atomic(pageaddr); + kunmap_local(pageaddr); squashfs_cache_put(entry); } - flush_dcache_page(page); - SetPageUptodate(page); - unlock_page(page); - return 0; - -error_out: - SetPageError(page); - unlock_page(page); - return 0; + flush_dcache_folio(folio); + error = 0; +out: + folio_end_read(folio, error == 0); + return error; } From bbf45b7e68555569489ab2428dd9c23960cdc9bf Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 20 Apr 2024 03:50:18 +0100 Subject: [PATCH 51/62] squashfs: remove calls to set the folio error flag Nobody checks the error flag on squashfs folios, so stop setting it. Link: https://lkml.kernel.org/r/20240420025029.2166544-24-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Phillip Lougher Reviewed-by: Phillip Lougher Signed-off-by: Andrew Morton --- fs/squashfs/file.c | 6 +----- fs/squashfs/file_direct.c | 3 +-- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index e8df6430444b..a8c1e7f9a609 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -375,8 +375,6 @@ void squashfs_fill_page(struct page *page, struct squashfs_cache_entry *buffer, flush_dcache_page(page); if (copied == avail) SetPageUptodate(page); - else - SetPageError(page); } /* Copy data into page cache */ @@ -471,7 +469,7 @@ static int squashfs_read_folio(struct file *file, struct folio *folio) res = read_blocklist(inode, index, &block); if (res < 0) - goto error_out; + goto out; if (res == 0) res = squashfs_readpage_sparse(page, expected); @@ -483,8 +481,6 @@ static int squashfs_read_folio(struct file *file, struct folio *folio) if (!res) return 0; -error_out: - SetPageError(page); out: pageaddr = kmap_atomic(page); memset(pageaddr, 0, PAGE_SIZE); diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index 763a3f7a75f6..2a689ce71de9 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -106,14 +106,13 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize, return 0; mark_errored: - /* Decompression failed, mark pages as errored. Target_page is + /* Decompression failed. Target_page is * dealt with by the caller */ for (i = 0; i < pages; i++) { if (page[i] == NULL || page[i] == target_page) continue; flush_dcache_page(page[i]); - SetPageError(page[i]); unlock_page(page[i]); put_page(page[i]); } From 91d743a9c8299de1fc1b47428d8bb4c85face00f Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Tue, 30 Apr 2024 17:00:19 +0900 Subject: [PATCH 52/62] nilfs2: make superblock data array index computation sparse friendly Upon running sparse, "warning: dubious: x & !y" is output at an array index calculation within nilfs_load_super_block(). The calculation is not wrong, but to eliminate the sparse warning, replace it with an equivalent calculation. Also, add a comment to make it easier to understand what the unintuitive array index calculation is doing and whether it's correct. Link: https://lkml.kernel.org/r/20240430080019.4242-3-konishi.ryusuke@gmail.com Fixes: e339ad31f599 ("nilfs2: introduce secondary super block") Signed-off-by: Ryusuke Konishi Cc: Bart Van Assche Cc: Jens Axboe Cc: kernel test robot Signed-off-by: Andrew Morton --- fs/nilfs2/the_nilfs.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index db322068678f..f41d7b6d432c 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -592,7 +592,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs, struct nilfs_super_block **sbp = nilfs->ns_sbp; struct buffer_head **sbh = nilfs->ns_sbh; u64 sb2off, devsize = bdev_nr_bytes(nilfs->ns_bdev); - int valid[2], swp = 0; + int valid[2], swp = 0, older; if (devsize < NILFS_SEG_MIN_BLOCKS * NILFS_MIN_BLOCK_SIZE + 4096) { nilfs_err(sb, "device size too small"); @@ -648,9 +648,25 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs, if (swp) nilfs_swap_super_block(nilfs); + /* + * Calculate the array index of the older superblock data. + * If one has been dropped, set index 0 pointing to the remaining one, + * otherwise set index 1 pointing to the old one (including if both + * are the same). + * + * Divided case valid[0] valid[1] swp -> older + * ------------------------------------------------------------- + * Both SBs are invalid 0 0 N/A (Error) + * SB1 is invalid 0 1 1 0 + * SB2 is invalid 1 0 0 0 + * SB2 is newer 1 1 1 0 + * SB2 is older or the same 1 1 0 1 + */ + older = valid[1] ^ swp; + nilfs->ns_sbwcount = 0; nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime); - nilfs->ns_prot_seq = le64_to_cpu(sbp[valid[1] & !swp]->s_last_seq); + nilfs->ns_prot_seq = le64_to_cpu(sbp[older]->s_last_seq); *sbpp = sbp[0]; return 0; } From 602ba77361160d99c77e3dadf7d891364f8c71b3 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 29 Apr 2024 23:02:35 -0700 Subject: [PATCH 53/62] watchdog: handle comma separated nmi_watchdog command line Per the document, the kernel can accept comma separated command line like nmi_watchdog=nopanic,0. However, the code doesn't really handle it. Fix the kernel to handle it properly. Link: https://lkml.kernel.org/r/20240430060236.1878002-1-song@kernel.org Signed-off-by: Song Liu Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- kernel/watchdog.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index d7b2125503af..7f54484de16f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -71,6 +71,7 @@ void __init hardlockup_detector_disable(void) static int __init hardlockup_panic_setup(char *str) { +next: if (!strncmp(str, "panic", 5)) hardlockup_panic = 1; else if (!strncmp(str, "nopanic", 7)) @@ -79,6 +80,12 @@ static int __init hardlockup_panic_setup(char *str) watchdog_hardlockup_user_enabled = 0; else if (!strncmp(str, "1", 1)) watchdog_hardlockup_user_enabled = 1; + while (*(str++)) { + if (*str == ',') { + str++; + goto next; + } + } return 1; } __setup("nmi_watchdog=", hardlockup_panic_setup); From 393fb313a2e150b768e4850658679e2afff431e9 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 29 Apr 2024 23:02:36 -0700 Subject: [PATCH 54/62] watchdog: allow nmi watchdog to use raw perf event NMI watchdog permanently consumes one hardware counters per CPU on the system. For systems that use many hardware counters, this causes more aggressive time multiplexing of perf events. OTOH, some CPUs (mostly Intel) support "ref-cycles" event, which is rarely used. Add kernel cmdline arg nmi_watchdog=rNNN to configure the watchdog to use raw event. For example, on Intel CPUs, we can use "r300" to configure the watchdog to use ref-cycles event. If the raw event does not work, fall back to use "cycles". [akpm@linux-foundation.org: fix kerneldoc] Link: https://lkml.kernel.org/r/20240430060236.1878002-2-song@kernel.org Signed-off-by: Song Liu Cc: Peter Zijlstra Cc: "Matthew Wilcox (Oracle)" Signed-off-by: Andrew Morton --- .../admin-guide/kernel-parameters.txt | 5 +- include/linux/nmi.h | 2 + kernel/watchdog.c | 2 + kernel/watchdog_perf.c | 46 +++++++++++++++++++ 4 files changed, 53 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 902ecd92a29f..1fa79a3d0d1a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3773,10 +3773,12 @@ Format: [state][,regs][,debounce][,die] nmi_watchdog= [KNL,BUGS=X86] Debugging features for SMP kernels - Format: [panic,][nopanic,][num] + Format: [panic,][nopanic,][rNNN,][num] Valid num: 0 or 1 0 - turn hardlockup detector in nmi_watchdog off 1 - turn hardlockup detector in nmi_watchdog on + rNNN - configure the watchdog with raw perf event 0xNNN + When panic is specified, panic when an NMI watchdog timeout occurs (or 'nopanic' to not panic on an NMI watchdog, if CONFIG_BOOTPARAM_HARDLOCKUP_PANIC is set) @@ -7464,4 +7466,3 @@ memory, and other data can't be written using xmon commands. off xmon is disabled. - diff --git a/include/linux/nmi.h b/include/linux/nmi.h index f53438eae815..a8dfb38c9bb6 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -105,10 +105,12 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs); extern void hardlockup_detector_perf_stop(void); extern void hardlockup_detector_perf_restart(void); extern void hardlockup_detector_perf_cleanup(void); +extern void hardlockup_config_perf_event(const char *str); #else static inline void hardlockup_detector_perf_stop(void) { } static inline void hardlockup_detector_perf_restart(void) { } static inline void hardlockup_detector_perf_cleanup(void) { } +static inline void hardlockup_config_perf_event(const char *str) { } #endif void watchdog_hardlockup_stop(void); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 7f54484de16f..ab0129b15f25 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -80,6 +80,8 @@ next: watchdog_hardlockup_user_enabled = 0; else if (!strncmp(str, "1", 1)) watchdog_hardlockup_user_enabled = 1; + else if (!strncmp(str, "r", 1)) + hardlockup_config_perf_event(str + 1); while (*(str++)) { if (*str == ',') { str++; diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index 8ea00c4a24b2..5f7d1f0d4268 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -90,6 +90,14 @@ static struct perf_event_attr wd_hw_attr = { .disabled = 1, }; +static struct perf_event_attr fallback_wd_hw_attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 1, +}; + /* Callback function for perf event subsystem */ static void watchdog_overflow_callback(struct perf_event *event, struct perf_sample_data *data, @@ -122,6 +130,13 @@ static int hardlockup_detector_event_create(void) /* Try to register using hardware perf events */ evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); + if (IS_ERR(evt)) { + wd_attr = &fallback_wd_hw_attr; + wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); + evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL, + watchdog_overflow_callback, NULL); + } + if (IS_ERR(evt)) { pr_debug("Perf event create on CPU %d failed with %ld\n", cpu, PTR_ERR(evt)); @@ -259,3 +274,34 @@ int __init watchdog_hardlockup_probe(void) } return ret; } + +/** + * hardlockup_config_perf_event - Overwrite config of wd_hw_attr. + * + * @str: number which identifies the raw perf event to use + */ +void __init hardlockup_config_perf_event(const char *str) +{ + u64 config; + char buf[24]; + char *comma = strchr(str, ','); + + if (!comma) { + if (kstrtoull(str, 16, &config)) + return; + } else { + unsigned int len = comma - str; + + if (len >= sizeof(buf)) + return; + + if (strscpy(buf, str, sizeof(buf)) < 0) + return; + buf[len] = 0; + if (kstrtoull(buf, 16, &config)) + return; + } + + wd_hw_attr.type = PERF_TYPE_RAW; + wd_hw_attr.config = config; +} From 8fcb916cac8985188a3f825966ffa99507a90cb1 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 4 May 2024 16:41:07 -0700 Subject: [PATCH 55/62] kernel/watchdog_perf.c: tidy up kerneldoc It is unconventional to have a blank line between name-of-function and description-of-args. Cc: Peter Zijlstra Cc: Song Liu Cc: "Matthew Wilcox (Oracle)" Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- kernel/watchdog_perf.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index 5f7d1f0d4268..d577c4a8321e 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -148,7 +148,6 @@ static int hardlockup_detector_event_create(void) /** * watchdog_hardlockup_enable - Enable the local event - * * @cpu: The CPU to enable hard lockup on. */ void watchdog_hardlockup_enable(unsigned int cpu) @@ -167,7 +166,6 @@ void watchdog_hardlockup_enable(unsigned int cpu) /** * watchdog_hardlockup_disable - Disable the local event - * * @cpu: The CPU to enable hard lockup on. */ void watchdog_hardlockup_disable(unsigned int cpu) @@ -277,7 +275,6 @@ int __init watchdog_hardlockup_probe(void) /** * hardlockup_config_perf_event - Overwrite config of wd_hw_attr. - * * @str: number which identifies the raw perf event to use */ void __init hardlockup_config_perf_event(const char *str) From a7ac59f4f23660473e6350306f9b88f24fcc38f1 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 30 Apr 2024 14:09:01 +0900 Subject: [PATCH 56/62] nilfs2: remove calls to folio_set_error() and folio_clear_error() Nobody checks this flag on nilfs2 folios, stop setting and clearing it. That lets us simplify nilfs_end_folio_io() slightly. Link: https://lkml.kernel.org/r/20240420025029.2166544-17-willy@infradead.org Link: https://lkml.kernel.org/r/20240430050901.3239-1-konishi.ryusuke@gmail.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ryusuke Konishi Cc: kernel test robot Cc: Peter Zijlstra Cc: Song Liu Signed-off-by: Andrew Morton --- fs/nilfs2/dir.c | 1 - fs/nilfs2/segment.c | 8 +------- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index bc846b904b68..0900fcad2d0c 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -174,7 +174,6 @@ Eend: dir->i_ino, (folio->index << PAGE_SHIFT) + offs, (unsigned long)le64_to_cpu(p->inode)); fail: - folio_set_error(folio); return false; } diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index aa5290cb7467..8654ab8ad534 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1725,14 +1725,8 @@ static void nilfs_end_folio_io(struct folio *folio, int err) return; } - if (!err) { - if (!nilfs_folio_buffers_clean(folio)) - filemap_dirty_folio(folio->mapping, folio); - folio_clear_error(folio); - } else { + if (err || !nilfs_folio_buffers_clean(folio)) filemap_dirty_folio(folio->mapping, folio); - folio_set_error(folio); - } folio_end_writeback(folio); } From eb59a58113717df04b8a8229befd8ab1e5dbf86e Mon Sep 17 00:00:00 2001 From: Edward Liaw Date: Mon, 29 Apr 2024 23:46:09 +0000 Subject: [PATCH 57/62] selftests/kcmp: remove unused open mode Android bionic warns that open modes are ignored if O_CREAT or O_TMPFILE aren't specified. The permissions for the file are set above: fd1 = open(kpath, O_RDWR | O_CREAT | O_TRUNC, 0644); Link: https://lkml.kernel.org/r/20240429234610.191144-1-edliaw@google.com Fixes: d97b46a64674 ("syscalls, x86: add __NR_kcmp syscall") Signed-off-by: Edward Liaw Reviewed-by: Cyrill Gorcunov Cc: Eric Biederman Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/kcmp/kcmp_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kcmp/kcmp_test.c b/tools/testing/selftests/kcmp/kcmp_test.c index 25110c7c0b3e..d7a8e321bb16 100644 --- a/tools/testing/selftests/kcmp/kcmp_test.c +++ b/tools/testing/selftests/kcmp/kcmp_test.c @@ -91,7 +91,7 @@ int main(int argc, char **argv) ksft_print_header(); ksft_set_plan(3); - fd2 = open(kpath, O_RDWR, 0644); + fd2 = open(kpath, O_RDWR); if (fd2 < 0) { perror("Can't open file"); ksft_exit_fail(); From 33580d667bb20e00356fd06500f5197ef1baa1f5 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 7 May 2024 23:24:54 +0900 Subject: [PATCH 58/62] nilfs2: use __field_struct() for a bitwise field As one can see in include/trace/stages/stage4_event_fields.h, the implementation of __field() uses the is_signed_type() macro. As one can see in commit dcf8e5633e2e ("tracing: Define the is_signed_type() macro once"), there has been an attempt to not make is_signed_type() trigger sparse warnings for bitwise types. Despite that change, sparse complains when passing a bitwise type to is_signed_type(). The reason is that in its definition below, an inequality comparison will be made against bitwise types, which are random collections of bits (the casts to bitwise types themselves are semantically valid and not problematic): #define is_signed_type(type) (((type)(-1)) < (__force type)1) So, as a workaround, follow the example of and suppress the following sparse warnings by changing __field() into __field_struct() that doesn't use is_signed_type(): fs/nilfs2/segment.c: note: in included file (through include/trace/trace_events.h, include/trace/define_trace.h, include/trace/events/nilfs2.h): ./include/trace/events/nilfs2.h:191:1: warning: cast to restricted blk_opf_t ./include/trace/events/nilfs2.h:191:1: warning: restricted blk_opf_t degrades to integer ./include/trace/events/nilfs2.h:191:1: warning: restricted blk_opf_t degrades to integer [konishi.ryusuke: describe the reason for the warnings per Linus's explanation] Link: https://lkml.kernel.org/r/20240507222041.4876-1-konishi.ryusuke@gmail.com Link: https://lkml.kernel.org/r/20240507142454.3344-1-konishi.ryusuke@gmail.com Signed-off-by: Bart Van Assche Signed-off-by: Ryusuke Konishi Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401092241.I4mm9OWl-lkp@intel.com/ Reported-by: Ryusuke Konishi Closes: https://lore.kernel.org/all/20240430080019.4242-2-konishi.ryusuke@gmail.com/ Cc: Linus Torvalds Cc: Rasmus Villemoes Signed-off-by: Andrew Morton --- include/trace/events/nilfs2.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/trace/events/nilfs2.h b/include/trace/events/nilfs2.h index 8efc6236f57c..8880c11733dd 100644 --- a/include/trace/events/nilfs2.h +++ b/include/trace/events/nilfs2.h @@ -200,7 +200,11 @@ TRACE_EVENT(nilfs2_mdt_submit_block, __field(struct inode *, inode) __field(unsigned long, ino) __field(unsigned long, blkoff) - __field(enum req_op, mode) + /* + * Use field_struct() to avoid is_signed_type() on the + * bitwise type enum req_op. + */ + __field_struct(enum req_op, mode) ), TP_fast_assign( From 6813216bbdba18e182759d949589be95ebef290f Mon Sep 17 00:00:00 2001 From: Barry Song Date: Tue, 7 May 2024 15:27:56 +1200 Subject: [PATCH 59/62] Documentation: coding-style: ask function-like macros to evaluate parameters Patch series "codingstyle: avoid unused parameters for a function-like macro", v7. A function-like macro could result in build warnings such as "unused variable." This patchset updates the guidance to recommend always using a static inline function instead and also provides checkpatch support for this new rule. This patch (of 2): Recent commit 77292bb8ca69c80 ("crypto: scomp - remove memcpy if sg_nents is 1 and pages are lowmem") leads to warnings on xtensa and loongarch, In file included from crypto/scompress.c:12: include/crypto/scatterwalk.h: In function 'scatterwalk_pagedone': include/crypto/scatterwalk.h:76:30: warning: variable 'page' set but not used [-Wunused-but-set-variable] 76 | struct page *page; | ^~~~ crypto/scompress.c: In function 'scomp_acomp_comp_decomp': >> crypto/scompress.c:174:38: warning: unused variable 'dst_page' [-Wunused-variable] 174 | struct page *dst_page = sg_page(req->dst); | The reason is that flush_dcache_page() is implemented as a noop macro on these platforms as below, #define flush_dcache_page(page) do { } while (0) The driver code, for itself, seems be quite innocent and placing maybe_unused seems pointless, struct page *dst_page = sg_page(req->dst); for (i = 0; i < nr_pages; i++) flush_dcache_page(dst_page + i); And it should be independent of architectural implementation differences. Let's provide guidance on coding style for requesting parameter evaluation or proposing the migration to a static inline function. Link: https://lkml.kernel.org/r/20240507032757.146386-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20240507032757.146386-2-21cnbao@gmail.com Signed-off-by: Barry Song Suggested-by: Max Filippov Reviewed-by: Mark Brown Acked-by: Joe Perches Cc: Chris Zankel Cc: Huacai Chen Cc: Herbert Xu Cc: Guenter Roeck Cc: Stephen Rothwell Cc: Andy Whitcroft Cc: Dwaipayan Ray Cc: Joe Perches Cc: Jonathan Corbet Cc: Lukas Bulwahn Cc: Xining Xu Cc: Charlemagne Lasse Cc: Jeff Johnson Signed-off-by: Andrew Morton --- Documentation/process/coding-style.rst | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/Documentation/process/coding-style.rst b/Documentation/process/coding-style.rst index 9c7cf7347394..7e768c65aa92 100644 --- a/Documentation/process/coding-style.rst +++ b/Documentation/process/coding-style.rst @@ -827,6 +827,29 @@ Macros with multiple statements should be enclosed in a do - while block: do_this(b, c); \ } while (0) +Function-like macros with unused parameters should be replaced by static +inline functions to avoid the issue of unused variables: + +.. code-block:: c + + static inline void fun(struct foo *foo) + { + } + +Due to historical practices, many files still employ the "cast to (void)" +approach to evaluate parameters. However, this method is not advisable. +Inline functions address the issue of "expression with side effects +evaluated more than once", circumvent unused-variable problems, and +are generally better documented than macros for some reason. + +.. code-block:: c + + /* + * Avoid doing this whenever possible and instead opt for static + * inline functions + */ + #define macrofun(foo) do { (void) (foo); } while (0) + Things to avoid when using macros: 1) macros that affect control flow: From b1be5844c1a0124a49a30a20a189d0a53aa10578 Mon Sep 17 00:00:00 2001 From: Xining Xu Date: Tue, 7 May 2024 15:27:57 +1200 Subject: [PATCH 60/62] scripts: checkpatch: check unused parameters for function-like macro If function-like macros do not utilize a parameter, it might result in a build warning. In our coding style guidelines, we advocate for utilizing static inline functions to replace such macros. This patch verifies compliance with the new rule. For a macro such as the one below, #define test(a) do { } while (0) The test result is as follows. WARNING: Argument 'a' is not used in function-like macro #21: FILE: mm/init-mm.c:20: +#define test(a) do { } while (0) total: 0 errors, 1 warnings, 8 lines checked Link: https://lkml.kernel.org/r/20240507032757.146386-3-21cnbao@gmail.com Signed-off-by: Xining Xu Tested-by: Barry Song Signed-off-by: Barry Song Acked-by: Joe Perches Cc: Chris Zankel Cc: Huacai Chen Cc: Herbert Xu Cc: Guenter Roeck Cc: Stephen Rothwell Cc: Mark Brown Cc: Andy Whitcroft Cc: Dwaipayan Ray Cc: Jonathan Corbet Cc: Lukas Bulwahn Cc: Max Filippov Cc: Jeff Johnson Cc: Charlemagne Lasse Signed-off-by: Andrew Morton --- Documentation/dev-tools/checkpatch.rst | 14 ++++++++++++++ scripts/checkpatch.pl | 6 ++++++ 2 files changed, 20 insertions(+) diff --git a/Documentation/dev-tools/checkpatch.rst b/Documentation/dev-tools/checkpatch.rst index 127968995847..a9fac978a525 100644 --- a/Documentation/dev-tools/checkpatch.rst +++ b/Documentation/dev-tools/checkpatch.rst @@ -906,6 +906,20 @@ Macros, Attributes and Symbols See: https://lore.kernel.org/lkml/1399671106.2912.21.camel@joe-AO725/ + **MACRO_ARG_UNUSED** + If function-like macros do not utilize a parameter, it might result + in a build warning. We advocate for utilizing static inline functions + to replace such macros. + For example, for a macro such as the one below:: + + #define test(a) do { } while (0) + + there would be a warning like below:: + + WARNING: Argument 'a' is not used in function-like macro. + + See: https://www.kernel.org/doc/html/latest/process/coding-style.html#macros-enums-and-rtl + **SINGLE_STATEMENT_DO_WHILE_MACRO** For the multi-statement macros, it is necessary to use the do-while loop to avoid unpredictable code paths. The do-while loop helps to diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 9c4c4a61bc83..2b812210b412 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -6040,6 +6040,12 @@ sub process { CHK("MACRO_ARG_PRECEDENCE", "Macro argument '$arg' may be better as '($arg)' to avoid precedence issues\n" . "$herectx"); } + +# check if this is an unused argument + if ($define_stmt !~ /\b$arg\b/) { + WARN("MACRO_ARG_UNUSED", + "Argument '$arg' is not used in function-like macro\n" . "$herectx"); + } } # check for macros with flow control, but without ## concatenation From 0a73eac1ed10097d1799c10dff2172605fd40c75 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 9 May 2024 07:14:29 +0900 Subject: [PATCH 61/62] nilfs2: convert BUG_ON() in nilfs_finish_roll_forward() to WARN_ON() The BUG_ON check performed on the return value of __getblk() in nilfs_finish_roll_forward() assumes that a buffer that has been successfully read once is retrieved with the same parameters and does not fail (__getblk() does not return an error due to memory allocation failure). Also, nilfs_finish_roll_forward() is called at most once during mount. Taking these into consideration, rewrite the check to use WARN_ON() to avoid using BUG_ON(). Link: https://lkml.kernel.org/r/20240508221429.7559-1-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/nilfs2/recovery.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index e48372618ac4..020f304c600e 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -699,7 +699,9 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs, return; bh = __getblk(nilfs->ns_bdev, ri->ri_lsegs_start, nilfs->ns_blocksize); - BUG_ON(!bh); + if (WARN_ON(!bh)) + return; /* should never happen */ + memset(bh->b_data, 0, bh->b_size); set_buffer_dirty(bh); err = sync_dirty_buffer(bh); From 5cbcb62dddf5346077feb82b7b0c9254222d3445 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 7 May 2024 09:18:58 -0400 Subject: [PATCH 62/62] fs/proc: fix softlockup in __read_vmcore While taking a kernel core dump with makedumpfile on a larger system, softlockup messages often appear. While softlockup warnings can be harmless, they can also interfere with things like RCU freeing memory, which can be problematic when the kdump kexec image is configured with as little memory as possible. Avoid the softlockup, and give things like work items and RCU a chance to do their thing during __read_vmcore by adding a cond_resched. Link: https://lkml.kernel.org/r/20240507091858.36ff767f@imladris.surriel.com Signed-off-by: Rik van Riel Acked-by: Baoquan He Cc: Dave Young Cc: Vivek Goyal Cc: Signed-off-by: Andrew Morton --- fs/proc/vmcore.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 5d08d4d159d3..b52d85f8ad59 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -383,6 +383,8 @@ static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos) /* leave now if filled buffer already */ if (!iov_iter_count(iter)) return acc; + + cond_resched(); } list_for_each_entry(m, &vmcore_list, list) {