From 8f7262cd66699a4b02eb7549b35c81b2116aad95 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 14 Sep 2021 23:38:37 +0900 Subject: [PATCH 001/101] kprobes: Do not use local variable when creating debugfs file debugfs_create_file() takes a pointer argument that can be used during file operation callbacks (accessible via i_private in the inode structure). An obvious requirement is for the pointer to refer to valid memory when used. When creating the debugfs file to dynamically enable / disable kprobes, a pointer to local variable is passed to debugfs_create_file(); which will go out of scope when the init function returns. The reason this hasn't triggered random memory corruption is because the pointer is not accessed during the debugfs file callbacks. Since the enabled state is managed by the kprobes_all_disabled global variable, the local variable is not needed. Fix the incorrect (and unnecessary) usage of local variable during debugfs_file_create() by passing NULL instead. Link: https://lkml.kernel.org/r/163163031686.489837.4476867635937014973.stgit@devnote2 Fixes: bf8f6e5b3e51 ("Kprobes: The ON/OFF knob thru debugfs") Signed-off-by: Punit Agrawal Acked-by: Masami Hiramatsu Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 790a573bbe00..1cf8bca1ea86 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2809,13 +2809,12 @@ static const struct file_operations fops_kp = { static int __init debugfs_kprobe_init(void) { struct dentry *dir; - unsigned int value = 1; dir = debugfs_create_dir("kprobes", NULL); debugfs_create_file("list", 0400, dir, NULL, &kprobes_fops); - debugfs_create_file("enabled", 0600, dir, &value, &fops_kp); + debugfs_create_file("enabled", 0600, dir, NULL, &fops_kp); debugfs_create_file("blacklist", 0400, dir, NULL, &kprobe_blacklist_fops); From 5d6de7d7fb4b0f752adff80ca003b4fd4b467b64 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 14 Sep 2021 23:38:46 +0900 Subject: [PATCH 002/101] kprobes: Use helper to parse boolean input from userspace The "enabled" file provides a debugfs interface to arm / disarm kprobes in the kernel. In order to parse the buffer containing the values written from userspace, the callback manually parses the user input to convert it to a boolean value. As taking a string value from userspace and converting it to boolean is a common operation, a helper kstrtobool_from_user() is already available in the kernel. Update the callback to use the common helper to parse the write buffer from userspace. Link: https://lkml.kernel.org/r/163163032637.489837.10678039554832855327.stgit@devnote2 Signed-off-by: Punit Agrawal Acked-by: Masami Hiramatsu Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1cf8bca1ea86..26fc9904c3b1 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2770,30 +2770,14 @@ static ssize_t read_enabled_file_bool(struct file *file, static ssize_t write_enabled_file_bool(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { - char buf[32]; - size_t buf_size; - int ret = 0; + bool enable; + int ret; - buf_size = min(count, (sizeof(buf)-1)); - if (copy_from_user(buf, user_buf, buf_size)) - return -EFAULT; - - buf[buf_size] = '\0'; - switch (buf[0]) { - case 'y': - case 'Y': - case '1': - ret = arm_all_kprobes(); - break; - case 'n': - case 'N': - case '0': - ret = disarm_all_kprobes(); - break; - default: - return -EINVAL; - } + ret = kstrtobool_from_user(user_buf, count, &enable); + if (ret) + return ret; + ret = enable ? arm_all_kprobes() : disarm_all_kprobes(); if (ret) return ret; From 02afb8d6048d6526619e6e2dcdc95ce9c2bdb52f Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 14 Sep 2021 23:38:57 +0900 Subject: [PATCH 003/101] kprobe: Simplify prepare_kprobe() by dropping redundant version The function prepare_kprobe() is called during kprobe registration and is responsible for ensuring any architecture related preparation for the kprobe is done before returning. One of two versions of prepare_kprobe() is chosen depending on the availability of KPROBE_ON_FTRACE in the kernel configuration. Simplify the code by dropping the version when KPROBE_ON_FTRACE is not selected - instead relying on kprobe_ftrace() to return false when KPROBE_ON_FTRACE is not set. No functional change. Link: https://lkml.kernel.org/r/163163033696.489837.9264661820279300788.stgit@devnote2 Signed-off-by: Punit Agrawal Acked-by: Masami Hiramatsu Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/kprobes.h | 5 +++++ kernel/kprobes.c | 23 +++++++++-------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index e4f3bfe08757..0b75549b2815 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -354,6 +354,11 @@ static inline void wait_for_kprobe_optimizer(void) { } extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct ftrace_regs *fregs); extern int arch_prepare_kprobe_ftrace(struct kprobe *p); +#else +static inline int arch_prepare_kprobe_ftrace(struct kprobe *p) +{ + return -EINVAL; +} #endif int arch_check_ftrace_location(struct kprobe *p); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 26fc9904c3b1..cfa9d3c263eb 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1033,15 +1033,6 @@ static struct ftrace_ops kprobe_ipmodify_ops __read_mostly = { static int kprobe_ipmodify_enabled; static int kprobe_ftrace_enabled; -/* Must ensure p->addr is really on ftrace */ -static int prepare_kprobe(struct kprobe *p) -{ - if (!kprobe_ftrace(p)) - return arch_prepare_kprobe(p); - - return arch_prepare_kprobe_ftrace(p); -} - /* Caller must lock kprobe_mutex */ static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, int *cnt) @@ -1113,11 +1104,6 @@ static int disarm_kprobe_ftrace(struct kprobe *p) ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled); } #else /* !CONFIG_KPROBES_ON_FTRACE */ -static inline int prepare_kprobe(struct kprobe *p) -{ - return arch_prepare_kprobe(p); -} - static inline int arm_kprobe_ftrace(struct kprobe *p) { return -ENODEV; @@ -1129,6 +1115,15 @@ static inline int disarm_kprobe_ftrace(struct kprobe *p) } #endif +static int prepare_kprobe(struct kprobe *p) +{ + /* Must ensure p->addr is really on ftrace */ + if (kprobe_ftrace(p)) + return arch_prepare_kprobe_ftrace(p); + + return arch_prepare_kprobe(p); +} + /* Arm a kprobe with text_mutex */ static int arm_kprobe(struct kprobe *kp) { From 71bdc8fe22ace3554144911a49d5d973b7e8a49f Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 14 Sep 2021 23:39:06 +0900 Subject: [PATCH 004/101] csky: ftrace: Drop duplicate implementation of arch_check_ftrace_location() The csky specific arch_check_ftrace_location() shadows a weak implementation of the function in core code that offers the same functionality but with additional error checking. Drop the architecture specific function as a step towards further cleanup in core code. Link: https://lkml.kernel.org/r/163163034617.489837.7789033031868135258.stgit@devnote2 Signed-off-by: Punit Agrawal Acked-by: Guo Ren Acked-by: Masami Hiramatsu Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- arch/csky/kernel/probes/ftrace.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/csky/kernel/probes/ftrace.c b/arch/csky/kernel/probes/ftrace.c index ef2bb9bd9605..b388228abbf2 100644 --- a/arch/csky/kernel/probes/ftrace.c +++ b/arch/csky/kernel/probes/ftrace.c @@ -2,13 +2,6 @@ #include -int arch_check_ftrace_location(struct kprobe *p) -{ - if (ftrace_location((unsigned long)p->addr)) - p->flags |= KPROBE_FLAG_FTRACE; - return 0; -} - /* Ftrace callback handler for kprobes -- called under preepmt disabled */ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct ftrace_regs *fregs) From 4402deae8993fb0e25a19bb999b38df13e25a7e0 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 14 Sep 2021 23:39:16 +0900 Subject: [PATCH 005/101] kprobes: Make arch_check_ftrace_location static arch_check_ftrace_location() was introduced as a weak function in commit f7f242ff004499 ("kprobes: introduce weak arch_check_ftrace_location() helper function") to allow architectures to handle kprobes call site on their own. Recently, the only architecture (csky) to implement arch_check_ftrace_location() was migrated to using the common version. As a result, further cleanup the code to drop the weak attribute and rename the function to remove the architecture specific implementation. Link: https://lkml.kernel.org/r/163163035673.489837.2367816318195254104.stgit@devnote2 Signed-off-by: Punit Agrawal Acked-by: Masami Hiramatsu Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/kprobes.h | 2 -- kernel/kprobes.c | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 0b75549b2815..8a9412bb0d5e 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -361,8 +361,6 @@ static inline int arch_prepare_kprobe_ftrace(struct kprobe *p) } #endif -int arch_check_ftrace_location(struct kprobe *p); - /* Get the kprobe at this addr (if any) - called with preemption disabled */ struct kprobe *get_kprobe(void *addr); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index cfa9d3c263eb..30199bfcc74a 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1524,7 +1524,7 @@ static inline int warn_kprobe_rereg(struct kprobe *p) return ret; } -int __weak arch_check_ftrace_location(struct kprobe *p) +static int check_ftrace_location(struct kprobe *p) { unsigned long ftrace_addr; @@ -1547,7 +1547,7 @@ static int check_kprobe_address_safe(struct kprobe *p, { int ret; - ret = arch_check_ftrace_location(p); + ret = check_ftrace_location(p); if (ret) return ret; jump_label_lock(); From 9c89bb8e327203bc27e09ebd82d8f61ac2ae8b24 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:39:25 +0900 Subject: [PATCH 006/101] kprobes: treewide: Cleanup the error messages for kprobes This clean up the error/notification messages in kprobes related code. Basically this defines 'pr_fmt()' macros for each files and update the messages which describes - what happened, - what is the kernel going to do or not do, - is the kernel fine, - what can the user do about it. Also, if the message is not needed (e.g. the function returns unique error code, or other error message is already shown.) remove it, and replace the message with WARN_*() macros if suitable. Link: https://lkml.kernel.org/r/163163036568.489837.14085396178727185469.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- arch/arm/probes/kprobes/core.c | 4 +++- arch/arm64/kernel/probes/kprobes.c | 5 ++++- arch/csky/kernel/probes/kprobes.c | 10 ++++----- arch/mips/kernel/kprobes.c | 11 +++++---- arch/riscv/kernel/probes/kprobes.c | 11 +++++---- arch/s390/kernel/kprobes.c | 4 +++- kernel/kprobes.c | 36 +++++++++++++----------------- 7 files changed, 41 insertions(+), 40 deletions(-) diff --git a/arch/arm/probes/kprobes/core.c b/arch/arm/probes/kprobes/core.c index 27e0af78e88b..a59e38de4a03 100644 --- a/arch/arm/probes/kprobes/core.c +++ b/arch/arm/probes/kprobes/core.c @@ -11,6 +11,8 @@ * Copyright (C) 2007 Marvell Ltd. */ +#define pr_fmt(fmt) "kprobes: " fmt + #include #include #include @@ -278,7 +280,7 @@ void __kprobes kprobe_handler(struct pt_regs *regs) break; case KPROBE_REENTER: /* A nested probe was hit in FIQ, it is a BUG */ - pr_warn("Unrecoverable kprobe detected.\n"); + pr_warn("Failed to recover from reentered kprobes.\n"); dump_kprobe(p); fallthrough; default: diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 6dbcc89f6662..ce429cbacd35 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -7,6 +7,9 @@ * Copyright (C) 2013 Linaro Limited. * Author: Sandeepa Prabhu */ + +#define pr_fmt(fmt) "kprobes: " fmt + #include #include #include @@ -218,7 +221,7 @@ static int __kprobes reenter_kprobe(struct kprobe *p, break; case KPROBE_HIT_SS: case KPROBE_REENTER: - pr_warn("Unrecoverable kprobe detected.\n"); + pr_warn("Failed to recover from reentered kprobes.\n"); dump_kprobe(p); BUG(); break; diff --git a/arch/csky/kernel/probes/kprobes.c b/arch/csky/kernel/probes/kprobes.c index 8fffa34d4e1c..632407bf45d5 100644 --- a/arch/csky/kernel/probes/kprobes.c +++ b/arch/csky/kernel/probes/kprobes.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0+ +#define pr_fmt(fmt) "kprobes: " fmt + #include #include #include @@ -77,10 +79,8 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) { unsigned long probe_addr = (unsigned long)p->addr; - if (probe_addr & 0x1) { - pr_warn("Address not aligned.\n"); - return -EINVAL; - } + if (probe_addr & 0x1) + return -EILSEQ; /* copy instruction */ p->opcode = le32_to_cpu(*p->addr); @@ -225,7 +225,7 @@ static int __kprobes reenter_kprobe(struct kprobe *p, break; case KPROBE_HIT_SS: case KPROBE_REENTER: - pr_warn("Unrecoverable kprobe detected.\n"); + pr_warn("Failed to recover from reentered kprobes.\n"); dump_kprobe(p); BUG(); break; diff --git a/arch/mips/kernel/kprobes.c b/arch/mips/kernel/kprobes.c index 75bff0f77319..b0934a0d7aed 100644 --- a/arch/mips/kernel/kprobes.c +++ b/arch/mips/kernel/kprobes.c @@ -11,6 +11,8 @@ * Copyright (C) IBM Corporation, 2002, 2004 */ +#define pr_fmt(fmt) "kprobes: " fmt + #include #include #include @@ -80,8 +82,7 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) insn = p->addr[0]; if (insn_has_ll_or_sc(insn)) { - pr_notice("Kprobes for ll and sc instructions are not" - "supported\n"); + pr_notice("Kprobes for ll and sc instructions are not supported\n"); ret = -EINVAL; goto out; } @@ -219,7 +220,7 @@ static int evaluate_branch_instruction(struct kprobe *p, struct pt_regs *regs, return 0; unaligned: - pr_notice("%s: unaligned epc - sending SIGBUS.\n", current->comm); + pr_notice("Failed to emulate branch instruction because of unaligned epc - sending SIGBUS to %s.\n", current->comm); force_sig(SIGBUS); return -EFAULT; @@ -238,10 +239,8 @@ static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs, regs->cp0_epc = (unsigned long)p->addr; else if (insn_has_delayslot(p->opcode)) { ret = evaluate_branch_instruction(p, regs, kcb); - if (ret < 0) { - pr_notice("Kprobes: Error in evaluating branch\n"); + if (ret < 0) return; - } } regs->cp0_epc = (unsigned long)&p->ainsn.insn[0]; } diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c index 00088dc6da4b..cab6f874358e 100644 --- a/arch/riscv/kernel/probes/kprobes.c +++ b/arch/riscv/kernel/probes/kprobes.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0+ +#define pr_fmt(fmt) "kprobes: " fmt + #include #include #include @@ -50,11 +52,8 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) { unsigned long probe_addr = (unsigned long)p->addr; - if (probe_addr & 0x1) { - pr_warn("Address not aligned.\n"); - - return -EINVAL; - } + if (probe_addr & 0x1) + return -EILSEQ; /* copy instruction */ p->opcode = *p->addr; @@ -191,7 +190,7 @@ static int __kprobes reenter_kprobe(struct kprobe *p, break; case KPROBE_HIT_SS: case KPROBE_REENTER: - pr_warn("Unrecoverable kprobe detected.\n"); + pr_warn("Failed to recover from reentered kprobes.\n"); dump_kprobe(p); BUG(); break; diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 52d056a5f89f..952d44b0610b 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -7,6 +7,8 @@ * s390 port, used ppc64 as template. Mike Grundy */ +#define pr_fmt(fmt) "kprobes: " fmt + #include #include #include @@ -259,7 +261,7 @@ static void kprobe_reenter_check(struct kprobe_ctlblk *kcb, struct kprobe *p) * is a BUG. The code path resides in the .kprobes.text * section and is executed with interrupts disabled. */ - pr_err("Invalid kprobe detected.\n"); + pr_err("Failed to recover from reentered kprobes.\n"); dump_kprobe(p); BUG(); } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 30199bfcc74a..7663c8a51889 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -18,6 +18,9 @@ * and Prasanna S Panchamukhi * added function-return probes. */ + +#define pr_fmt(fmt) "kprobes: " fmt + #include #include #include @@ -892,7 +895,7 @@ static void optimize_all_kprobes(void) optimize_kprobe(p); } cpus_read_unlock(); - printk(KERN_INFO "Kprobes globally optimized\n"); + pr_info("kprobe jump-optimization is enabled. All kprobes are optimized if possible.\n"); out: mutex_unlock(&kprobe_mutex); } @@ -925,7 +928,7 @@ static void unoptimize_all_kprobes(void) /* Wait for unoptimizing completion */ wait_for_kprobe_optimizer(); - printk(KERN_INFO "Kprobes globally unoptimized\n"); + pr_info("kprobe jump-optimization is disabled. All kprobes are based on software breakpoint.\n"); } static DEFINE_MUTEX(kprobe_sysctl_mutex); @@ -1003,7 +1006,7 @@ static int reuse_unused_kprobe(struct kprobe *ap) * unregistered. * Thus there should be no chance to reuse unused kprobe. */ - printk(KERN_ERR "Error: There should be no unused kprobe here.\n"); + WARN_ON_ONCE(1); return -EINVAL; } @@ -1040,18 +1043,13 @@ static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, int ret = 0; ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 0, 0); - if (ret) { - pr_debug("Failed to arm kprobe-ftrace at %pS (%d)\n", - p->addr, ret); + if (WARN_ONCE(ret < 0, "Failed to arm kprobe-ftrace at %pS (error %d)\n", p->addr, ret)) return ret; - } if (*cnt == 0) { ret = register_ftrace_function(ops); - if (ret) { - pr_debug("Failed to init kprobe-ftrace (%d)\n", ret); + if (WARN(ret < 0, "Failed to register kprobe-ftrace (error %d)\n", ret)) goto err_ftrace; - } } (*cnt)++; @@ -1083,14 +1081,14 @@ static int __disarm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, if (*cnt == 1) { ret = unregister_ftrace_function(ops); - if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (%d)\n", ret)) + if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (error %d)\n", ret)) return ret; } (*cnt)--; ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0); - WARN_ONCE(ret < 0, "Failed to disarm kprobe-ftrace at %pS (%d)\n", + WARN_ONCE(ret < 0, "Failed to disarm kprobe-ftrace at %pS (error %d)\n", p->addr, ret); return ret; } @@ -1880,7 +1878,7 @@ unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs, node = node->next; } - pr_err("Oops! Kretprobe fails to find correct return address.\n"); + pr_err("kretprobe: Return address not found, not execute handler. Maybe there is a bug in the kernel.\n"); BUG_ON(1); found: @@ -2209,8 +2207,7 @@ EXPORT_SYMBOL_GPL(enable_kprobe); /* Caller must NOT call this in usual path. This is only for critical case */ void dump_kprobe(struct kprobe *kp) { - pr_err("Dumping kprobe:\n"); - pr_err("Name: %s\nOffset: %x\nAddress: %pS\n", + pr_err("Dump kprobe:\n.symbol_name = %s, .offset = %x, .addr = %pS\n", kp->symbol_name, kp->offset, kp->addr); } NOKPROBE_SYMBOL(dump_kprobe); @@ -2473,8 +2470,7 @@ static int __init init_kprobes(void) err = populate_kprobe_blacklist(__start_kprobe_blacklist, __stop_kprobe_blacklist); if (err) { - pr_err("kprobes: failed to populate blacklist: %d\n", err); - pr_err("Please take care of using kprobes.\n"); + pr_err("Failed to populate blacklist (error %d), kprobes not restricted, be careful using them!\n", err); } if (kretprobe_blacklist_size) { @@ -2483,7 +2479,7 @@ static int __init init_kprobes(void) kretprobe_blacklist[i].addr = kprobe_lookup_name(kretprobe_blacklist[i].name, 0); if (!kretprobe_blacklist[i].addr) - printk("kretprobe: lookup failed: %s\n", + pr_err("Failed to lookup symbol '%s' for kretprobe blacklist. Maybe the target function is removed or renamed.\n", kretprobe_blacklist[i].name); } } @@ -2687,7 +2683,7 @@ static int arm_all_kprobes(void) } if (errors) - pr_warn("Kprobes globally enabled, but failed to arm %d out of %d probes\n", + pr_warn("Kprobes globally enabled, but failed to enable %d out of %d probes. Please check which kprobes are kept disabled via debugfs.\n", errors, total); else pr_info("Kprobes globally enabled\n"); @@ -2730,7 +2726,7 @@ static int disarm_all_kprobes(void) } if (errors) - pr_warn("Kprobes globally disabled, but failed to disarm %d out of %d probes\n", + pr_warn("Kprobes globally disabled, but failed to disable %d out of %d probes. Please check which kprobes are kept enabled via debugfs.\n", errors, total); else pr_info("Kprobes globally disabled\n"); From 223a76b268c9cfa265d454879ae09e2c9c808f87 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:39:34 +0900 Subject: [PATCH 007/101] kprobes: Fix coding style issues Fix coding style issues reported by checkpatch.pl and update comments to quote variable names and add "()" to function name. One TODO comment in __disarm_kprobe() is removed because it has been done by following commit. Link: https://lkml.kernel.org/r/163163037468.489837.4282347782492003960.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/kprobes.h | 40 ++++--- kernel/kprobes.c | 236 +++++++++++++++++++++------------------- 2 files changed, 145 insertions(+), 131 deletions(-) diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 8a9412bb0d5e..756d3d23ce37 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -3,7 +3,6 @@ #define _LINUX_KPROBES_H /* * Kernel Probes (KProbes) - * include/linux/kprobes.h * * Copyright (C) IBM Corporation, 2002, 2004 * @@ -39,7 +38,7 @@ #define KPROBE_REENTER 0x00000004 #define KPROBE_HIT_SSDONE 0x00000008 -#else /* CONFIG_KPROBES */ +#else /* !CONFIG_KPROBES */ #include typedef int kprobe_opcode_t; struct arch_specific_insn { @@ -228,7 +227,7 @@ static nokprobe_inline struct kretprobe *get_kretprobe(struct kretprobe_instance return READ_ONCE(ri->rph->rp); } -#else /* CONFIG_KRETPROBES */ +#else /* !CONFIG_KRETPROBES */ static inline void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) { @@ -239,11 +238,15 @@ static inline int arch_trampoline_kprobe(struct kprobe *p) } #endif /* CONFIG_KRETPROBES */ +/* Markers of '_kprobe_blacklist' section */ +extern unsigned long __start_kprobe_blacklist[]; +extern unsigned long __stop_kprobe_blacklist[]; + extern struct kretprobe_blackpoint kretprobe_blacklist[]; #ifdef CONFIG_KPROBES_SANITY_TEST extern int init_test_probes(void); -#else +#else /* !CONFIG_KPROBES_SANITY_TEST */ static inline int init_test_probes(void) { return 0; @@ -303,7 +306,7 @@ static inline bool is_kprobe_##__name##_slot(unsigned long addr) \ #define KPROBE_OPTINSN_PAGE_SYM "kprobe_optinsn_page" int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum, unsigned long *value, char *type, char *sym); -#else /* __ARCH_WANT_KPROBES_INSN_SLOT */ +#else /* !__ARCH_WANT_KPROBES_INSN_SLOT */ #define DEFINE_INSN_CACHE_OPS(__name) \ static inline bool is_kprobe_##__name##_slot(unsigned long addr) \ { \ @@ -345,11 +348,12 @@ extern int sysctl_kprobes_optimization; extern int proc_kprobes_optimization_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos); -#endif +#endif /* CONFIG_SYSCTL */ extern void wait_for_kprobe_optimizer(void); -#else +#else /* !CONFIG_OPTPROBES */ static inline void wait_for_kprobe_optimizer(void) { } #endif /* CONFIG_OPTPROBES */ + #ifdef CONFIG_KPROBES_ON_FTRACE extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct ftrace_regs *fregs); @@ -359,7 +363,7 @@ static inline int arch_prepare_kprobe_ftrace(struct kprobe *p) { return -EINVAL; } -#endif +#endif /* CONFIG_KPROBES_ON_FTRACE */ /* Get the kprobe at this addr (if any) - called with preemption disabled */ struct kprobe *get_kprobe(void *addr); @@ -367,7 +371,7 @@ struct kprobe *get_kprobe(void *addr); /* kprobe_running() will just return the current_kprobe on this CPU */ static inline struct kprobe *kprobe_running(void) { - return (__this_cpu_read(current_kprobe)); + return __this_cpu_read(current_kprobe); } static inline void reset_current_kprobe(void) @@ -431,11 +435,11 @@ static inline struct kprobe *kprobe_running(void) } static inline int register_kprobe(struct kprobe *p) { - return -ENOSYS; + return -EOPNOTSUPP; } static inline int register_kprobes(struct kprobe **kps, int num) { - return -ENOSYS; + return -EOPNOTSUPP; } static inline void unregister_kprobe(struct kprobe *p) { @@ -445,11 +449,11 @@ static inline void unregister_kprobes(struct kprobe **kps, int num) } static inline int register_kretprobe(struct kretprobe *rp) { - return -ENOSYS; + return -EOPNOTSUPP; } static inline int register_kretprobes(struct kretprobe **rps, int num) { - return -ENOSYS; + return -EOPNOTSUPP; } static inline void unregister_kretprobe(struct kretprobe *rp) { @@ -465,11 +469,11 @@ static inline void kprobe_free_init_mem(void) } static inline int disable_kprobe(struct kprobe *kp) { - return -ENOSYS; + return -EOPNOTSUPP; } static inline int enable_kprobe(struct kprobe *kp) { - return -ENOSYS; + return -EOPNOTSUPP; } static inline bool within_kprobe_blacklist(unsigned long addr) @@ -482,6 +486,7 @@ static inline int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, return -ERANGE; } #endif /* CONFIG_KPROBES */ + static inline int disable_kretprobe(struct kretprobe *rp) { return disable_kprobe(&rp->kp); @@ -496,13 +501,14 @@ static inline bool is_kprobe_insn_slot(unsigned long addr) { return false; } -#endif +#endif /* !CONFIG_KPROBES */ + #ifndef CONFIG_OPTPROBES static inline bool is_kprobe_optinsn_slot(unsigned long addr) { return false; } -#endif +#endif /* !CONFIG_OPTPROBES */ /* Returns true if kprobes handled the fault */ static nokprobe_inline bool kprobe_page_fault(struct pt_regs *regs, diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 7663c8a51889..ad39eeaa4371 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* * Kernel Probes (KProbes) - * kernel/kprobes.c * * Copyright (C) IBM Corporation, 2002, 2004 * @@ -52,18 +51,18 @@ static int kprobes_initialized; /* kprobe_table can be accessed by - * - Normal hlist traversal and RCU add/del under kprobe_mutex is held. + * - Normal hlist traversal and RCU add/del under 'kprobe_mutex' is held. * Or * - RCU hlist traversal under disabling preempt (breakpoint handlers) */ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; -/* NOTE: change this value only with kprobe_mutex held */ +/* NOTE: change this value only with 'kprobe_mutex' held */ static bool kprobes_all_disarmed; -/* This protects kprobe_table and optimizing_list */ +/* This protects 'kprobe_table' and 'optimizing_list' */ static DEFINE_MUTEX(kprobe_mutex); -static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; +static DEFINE_PER_CPU(struct kprobe *, kprobe_instance); kprobe_opcode_t * __weak kprobe_lookup_name(const char *name, unsigned int __unused) @@ -71,12 +70,15 @@ kprobe_opcode_t * __weak kprobe_lookup_name(const char *name, return ((kprobe_opcode_t *)(kallsyms_lookup_name(name))); } -/* Blacklist -- list of struct kprobe_blacklist_entry */ +/* + * Blacklist -- list of 'struct kprobe_blacklist_entry' to store info where + * kprobes can not probe. + */ static LIST_HEAD(kprobe_blacklist); #ifdef __ARCH_WANT_KPROBES_INSN_SLOT /* - * kprobe->ainsn.insn points to the copy of the instruction to be + * 'kprobe::ainsn.insn' points to the copy of the instruction to be * single-stepped. x86_64, POWER4 and above have no-exec support and * stepping on the instruction on a vmalloced/kmalloced/data page * is a recipe for disaster @@ -107,6 +109,12 @@ enum kprobe_slot_state { void __weak *alloc_insn_page(void) { + /* + * Use module_alloc() so this page is within +/- 2GB of where the + * kernel image and loaded module images reside. This is required + * for most of the architectures. + * (e.g. x86-64 needs this to handle the %rip-relative fixups.) + */ return module_alloc(PAGE_SIZE); } @@ -142,6 +150,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) list_for_each_entry_rcu(kip, &c->pages, list) { if (kip->nused < slots_per_page(c)) { int i; + for (i = 0; i < slots_per_page(c); i++) { if (kip->slot_used[i] == SLOT_CLEAN) { kip->slot_used[i] = SLOT_USED; @@ -167,11 +176,6 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) if (!kip) goto out; - /* - * Use module_alloc so this page is within +/- 2GB of where the - * kernel image and loaded module images reside. This is required - * so x86_64 can correctly handle the %rip-relative fixups. - */ kip->insns = c->alloc(); if (!kip->insns) { kfree(kip); @@ -233,6 +237,7 @@ static int collect_garbage_slots(struct kprobe_insn_cache *c) list_for_each_entry_safe(kip, next, &c->pages, list) { int i; + if (kip->ngarbage == 0) continue; kip->ngarbage = 0; /* we will collect all garbages */ @@ -313,7 +318,7 @@ int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum, list_for_each_entry_rcu(kip, &c->pages, list) { if ((*symnum)--) continue; - strlcpy(sym, c->sym, KSYM_NAME_LEN); + strscpy(sym, c->sym, KSYM_NAME_LEN); *type = 't'; *value = (unsigned long)kip->insns; ret = 0; @@ -361,9 +366,9 @@ static inline void reset_kprobe_instance(void) /* * This routine is called either: - * - under the kprobe_mutex - during kprobe_[un]register() - * OR - * - with preemption disabled - from arch/xxx/kernel/kprobes.c + * - under the 'kprobe_mutex' - during kprobe_[un]register(). + * OR + * - with preemption disabled - from architecture specific code. */ struct kprobe *get_kprobe(void *addr) { @@ -383,22 +388,20 @@ NOKPROBE_SYMBOL(get_kprobe); static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs); -/* Return true if the kprobe is an aggregator */ +/* Return true if 'p' is an aggregator */ static inline int kprobe_aggrprobe(struct kprobe *p) { return p->pre_handler == aggr_pre_handler; } -/* Return true(!0) if the kprobe is unused */ +/* Return true if 'p' is unused */ static inline int kprobe_unused(struct kprobe *p) { return kprobe_aggrprobe(p) && kprobe_disabled(p) && list_empty(&p->list); } -/* - * Keep all fields in the kprobe consistent - */ +/* Keep all fields in the kprobe consistent. */ static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p) { memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t)); @@ -406,11 +409,11 @@ static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p) } #ifdef CONFIG_OPTPROBES -/* NOTE: change this value only with kprobe_mutex held */ +/* NOTE: This is protected by 'kprobe_mutex'. */ static bool kprobes_allow_optimization; /* - * Call all pre_handler on the list, but ignores its return value. + * Call all 'kprobe::pre_handler' on the list, but ignores its return value. * This must be called from arch-dep optimized caller. */ void opt_pre_handler(struct kprobe *p, struct pt_regs *regs) @@ -438,7 +441,7 @@ static void free_aggr_kprobe(struct kprobe *p) kfree(op); } -/* Return true(!0) if the kprobe is ready for optimization. */ +/* Return true if the kprobe is ready for optimization. */ static inline int kprobe_optready(struct kprobe *p) { struct optimized_kprobe *op; @@ -451,7 +454,7 @@ static inline int kprobe_optready(struct kprobe *p) return 0; } -/* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */ +/* Return true if the kprobe is disarmed. Note: p must be on hash list */ static inline int kprobe_disarmed(struct kprobe *p) { struct optimized_kprobe *op; @@ -465,7 +468,7 @@ static inline int kprobe_disarmed(struct kprobe *p) return kprobe_disabled(p) && list_empty(&op->list); } -/* Return true(!0) if the probe is queued on (un)optimizing lists */ +/* Return true if the probe is queued on (un)optimizing lists */ static int kprobe_queued(struct kprobe *p) { struct optimized_kprobe *op; @@ -480,7 +483,7 @@ static int kprobe_queued(struct kprobe *p) /* * Return an optimized kprobe whose optimizing code replaces - * instructions including addr (exclude breakpoint). + * instructions including 'addr' (exclude breakpoint). */ static struct kprobe *get_optimized_kprobe(unsigned long addr) { @@ -501,7 +504,7 @@ static struct kprobe *get_optimized_kprobe(unsigned long addr) return NULL; } -/* Optimization staging list, protected by kprobe_mutex */ +/* Optimization staging list, protected by 'kprobe_mutex' */ static LIST_HEAD(optimizing_list); static LIST_HEAD(unoptimizing_list); static LIST_HEAD(freeing_list); @@ -512,20 +515,20 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); /* * Optimize (replace a breakpoint with a jump) kprobes listed on - * optimizing_list. + * 'optimizing_list'. */ static void do_optimize_kprobes(void) { lockdep_assert_held(&text_mutex); /* - * The optimization/unoptimization refers online_cpus via - * stop_machine() and cpu-hotplug modifies online_cpus. - * And same time, text_mutex will be held in cpu-hotplug and here. - * This combination can cause a deadlock (cpu-hotplug try to lock - * text_mutex but stop_machine can not be done because online_cpus - * has been changed) - * To avoid this deadlock, caller must have locked cpu hotplug - * for preventing cpu-hotplug outside of text_mutex locking. + * The optimization/unoptimization refers 'online_cpus' via + * stop_machine() and cpu-hotplug modifies the 'online_cpus'. + * And same time, 'text_mutex' will be held in cpu-hotplug and here. + * This combination can cause a deadlock (cpu-hotplug tries to lock + * 'text_mutex' but stop_machine() can not be done because + * the 'online_cpus' has been changed) + * To avoid this deadlock, caller must have locked cpu-hotplug + * for preventing cpu-hotplug outside of 'text_mutex' locking. */ lockdep_assert_cpus_held(); @@ -539,7 +542,7 @@ static void do_optimize_kprobes(void) /* * Unoptimize (replace a jump with a breakpoint and remove the breakpoint - * if need) kprobes listed on unoptimizing_list. + * if need) kprobes listed on 'unoptimizing_list'. */ static void do_unoptimize_kprobes(void) { @@ -554,7 +557,7 @@ static void do_unoptimize_kprobes(void) return; arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list); - /* Loop free_list for disarming */ + /* Loop on 'freeing_list' for disarming */ list_for_each_entry_safe(op, tmp, &freeing_list, list) { /* Switching from detour code to origin */ op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; @@ -565,7 +568,7 @@ static void do_unoptimize_kprobes(void) /* * Remove unused probes from hash list. After waiting * for synchronization, these probes are reclaimed. - * (reclaiming is done by do_free_cleaned_kprobes.) + * (reclaiming is done by do_free_cleaned_kprobes().) */ hlist_del_rcu(&op->kp.hlist); } else @@ -573,7 +576,7 @@ static void do_unoptimize_kprobes(void) } } -/* Reclaim all kprobes on the free_list */ +/* Reclaim all kprobes on the 'freeing_list' */ static void do_free_cleaned_kprobes(void) { struct optimized_kprobe *op, *tmp; @@ -645,9 +648,9 @@ void wait_for_kprobe_optimizer(void) while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) { mutex_unlock(&kprobe_mutex); - /* this will also make optimizing_work execute immmediately */ + /* This will also make 'optimizing_work' execute immmediately */ flush_delayed_work(&optimizing_work); - /* @optimizing_work might not have been queued yet, relax */ + /* 'optimizing_work' might not have been queued yet, relax */ cpu_relax(); mutex_lock(&kprobe_mutex); @@ -678,7 +681,7 @@ static void optimize_kprobe(struct kprobe *p) (kprobe_disabled(p) || kprobes_all_disarmed)) return; - /* kprobes with post_handler can not be optimized */ + /* kprobes with 'post_handler' can not be optimized */ if (p->post_handler) return; @@ -698,7 +701,10 @@ static void optimize_kprobe(struct kprobe *p) } op->kp.flags |= KPROBE_FLAG_OPTIMIZED; - /* On unoptimizing/optimizing_list, op must have OPTIMIZED flag */ + /* + * On the 'unoptimizing_list' and 'optimizing_list', + * 'op' must have OPTIMIZED flag + */ if (WARN_ON_ONCE(!list_empty(&op->list))) return; @@ -768,7 +774,7 @@ static int reuse_unused_kprobe(struct kprobe *ap) WARN_ON_ONCE(list_empty(&op->list)); /* Enable the probe again */ ap->flags &= ~KPROBE_FLAG_DISABLED; - /* Optimize it again (remove from op->list) */ + /* Optimize it again. (remove from 'op->list') */ if (!kprobe_optready(ap)) return -EINVAL; @@ -818,7 +824,7 @@ static void prepare_optimized_kprobe(struct kprobe *p) __prepare_optimized_kprobe(op, p); } -/* Allocate new optimized_kprobe and try to prepare optimized instructions */ +/* Allocate new optimized_kprobe and try to prepare optimized instructions. */ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p) { struct optimized_kprobe *op; @@ -837,19 +843,19 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p) static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p); /* - * Prepare an optimized_kprobe and optimize it - * NOTE: p must be a normal registered kprobe + * Prepare an optimized_kprobe and optimize it. + * NOTE: 'p' must be a normal registered kprobe. */ static void try_to_optimize_kprobe(struct kprobe *p) { struct kprobe *ap; struct optimized_kprobe *op; - /* Impossible to optimize ftrace-based kprobe */ + /* Impossible to optimize ftrace-based kprobe. */ if (kprobe_ftrace(p)) return; - /* For preparing optimization, jump_label_text_reserved() is called */ + /* For preparing optimization, jump_label_text_reserved() is called. */ cpus_read_lock(); jump_label_lock(); mutex_lock(&text_mutex); @@ -860,14 +866,14 @@ static void try_to_optimize_kprobe(struct kprobe *p) op = container_of(ap, struct optimized_kprobe, kp); if (!arch_prepared_optinsn(&op->optinsn)) { - /* If failed to setup optimizing, fallback to kprobe */ + /* If failed to setup optimizing, fallback to kprobe. */ arch_remove_optimized_kprobe(op); kfree(op); goto out; } init_aggr_kprobe(ap, p); - optimize_kprobe(ap); /* This just kicks optimizer thread */ + optimize_kprobe(ap); /* This just kicks optimizer thread. */ out: mutex_unlock(&text_mutex); @@ -882,7 +888,7 @@ static void optimize_all_kprobes(void) unsigned int i; mutex_lock(&kprobe_mutex); - /* If optimization is already allowed, just return */ + /* If optimization is already allowed, just return. */ if (kprobes_allow_optimization) goto out; @@ -908,7 +914,7 @@ static void unoptimize_all_kprobes(void) unsigned int i; mutex_lock(&kprobe_mutex); - /* If optimization is already prohibited, just return */ + /* If optimization is already prohibited, just return. */ if (!kprobes_allow_optimization) { mutex_unlock(&kprobe_mutex); return; @@ -926,7 +932,7 @@ static void unoptimize_all_kprobes(void) cpus_read_unlock(); mutex_unlock(&kprobe_mutex); - /* Wait for unoptimizing completion */ + /* Wait for unoptimizing completion. */ wait_for_kprobe_optimizer(); pr_info("kprobe jump-optimization is disabled. All kprobes are based on software breakpoint.\n"); } @@ -953,12 +959,12 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write, } #endif /* CONFIG_SYSCTL */ -/* Put a breakpoint for a probe. Must be called with text_mutex locked */ +/* Put a breakpoint for a probe. Must be called with 'text_mutex' locked. */ static void __arm_kprobe(struct kprobe *p) { struct kprobe *_p; - /* Check collision with other optimized kprobes */ + /* Find the overlapping optimized kprobes. */ _p = get_optimized_kprobe((unsigned long)p->addr); if (unlikely(_p)) /* Fallback to unoptimized kprobe */ @@ -968,7 +974,7 @@ static void __arm_kprobe(struct kprobe *p) optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ } -/* Remove the breakpoint of a probe. Must be called with text_mutex locked */ +/* Remove the breakpoint of a probe. Must be called with 'text_mutex' locked. */ static void __disarm_kprobe(struct kprobe *p, bool reopt) { struct kprobe *_p; @@ -978,12 +984,17 @@ static void __disarm_kprobe(struct kprobe *p, bool reopt) if (!kprobe_queued(p)) { arch_disarm_kprobe(p); - /* If another kprobe was blocked, optimize it. */ + /* If another kprobe was blocked, re-optimize it. */ _p = get_optimized_kprobe((unsigned long)p->addr); if (unlikely(_p) && reopt) optimize_kprobe(_p); } - /* TODO: reoptimize others after unoptimized this probe */ + /* + * TODO: Since unoptimization and real disarming will be done by + * the worker thread, we can not check whether another probe are + * unoptimized because of this probe here. It should be re-optimized + * by the worker thread. + */ } #else /* !CONFIG_OPTPROBES */ @@ -1036,7 +1047,7 @@ static struct ftrace_ops kprobe_ipmodify_ops __read_mostly = { static int kprobe_ipmodify_enabled; static int kprobe_ftrace_enabled; -/* Caller must lock kprobe_mutex */ +/* Caller must lock 'kprobe_mutex' */ static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, int *cnt) { @@ -1073,7 +1084,7 @@ static int arm_kprobe_ftrace(struct kprobe *p) ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled); } -/* Caller must lock kprobe_mutex */ +/* Caller must lock 'kprobe_mutex'. */ static int __disarm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, int *cnt) { @@ -1122,7 +1133,7 @@ static int prepare_kprobe(struct kprobe *p) return arch_prepare_kprobe(p); } -/* Arm a kprobe with text_mutex */ +/* Arm a kprobe with 'text_mutex'. */ static int arm_kprobe(struct kprobe *kp) { if (unlikely(kprobe_ftrace(kp))) @@ -1137,7 +1148,7 @@ static int arm_kprobe(struct kprobe *kp) return 0; } -/* Disarm a kprobe with text_mutex */ +/* Disarm a kprobe with 'text_mutex'. */ static int disarm_kprobe(struct kprobe *kp, bool reopt) { if (unlikely(kprobe_ftrace(kp))) @@ -1187,17 +1198,17 @@ static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, } NOKPROBE_SYMBOL(aggr_post_handler); -/* Walks the list and increments nmissed count for multiprobe case */ +/* Walks the list and increments 'nmissed' if 'p' has child probes. */ void kprobes_inc_nmissed_count(struct kprobe *p) { struct kprobe *kp; + if (!kprobe_aggrprobe(p)) { p->nmissed++; } else { list_for_each_entry_rcu(kp, &p->list, list) kp->nmissed++; } - return; } NOKPROBE_SYMBOL(kprobes_inc_nmissed_count); @@ -1215,9 +1226,9 @@ static void recycle_rp_inst(struct kretprobe_instance *ri) { struct kretprobe *rp = get_kretprobe(ri); - if (likely(rp)) { + if (likely(rp)) freelist_add(&ri->freelist, &rp->freelist); - } else + else call_rcu(&ri->rcu, free_rp_inst_rcu); } NOKPROBE_SYMBOL(recycle_rp_inst); @@ -1243,8 +1254,8 @@ void kprobe_busy_end(void) } /* - * This function is called from finish_task_switch when task tk becomes dead, - * so that we can recycle any function-return probe instances associated + * This function is called from finish_task_switch() when task 'tk' becomes + * dead, so that we can recycle any kretprobe instances associated * with this task. These left over instances represent probed functions * that have been called but will never return. */ @@ -1292,7 +1303,7 @@ static inline void free_rp_inst(struct kretprobe *rp) } } -/* Add the new probe to ap->list */ +/* Add the new probe to 'ap->list'. */ static int add_new_kprobe(struct kprobe *ap, struct kprobe *p) { if (p->post_handler) @@ -1306,12 +1317,12 @@ static int add_new_kprobe(struct kprobe *ap, struct kprobe *p) } /* - * Fill in the required fields of the "manager kprobe". Replace the - * earlier kprobe in the hlist with the manager kprobe + * Fill in the required fields of the aggregator kprobe. Replace the + * earlier kprobe in the hlist with the aggregator kprobe. */ static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) { - /* Copy p's insn slot to ap */ + /* Copy the insn slot of 'p' to 'ap'. */ copy_kprobe(p, ap); flush_insn_slot(ap); ap->addr = p->addr; @@ -1329,8 +1340,7 @@ static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) } /* - * This is the second or subsequent kprobe at the address - handle - * the intricacies + * This registers the second or subsequent kprobe at the same address. */ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) { @@ -1344,7 +1354,7 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) mutex_lock(&text_mutex); if (!kprobe_aggrprobe(orig_p)) { - /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */ + /* If 'orig_p' is not an 'aggr_kprobe', create new one. */ ap = alloc_aggr_kprobe(orig_p); if (!ap) { ret = -ENOMEM; @@ -1369,8 +1379,8 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) if (ret) /* * Even if fail to allocate new slot, don't need to - * free aggr_probe. It will be used next time, or - * freed by unregister_kprobe. + * free the 'ap'. It will be used next time, or + * freed by unregister_kprobe(). */ goto out; @@ -1385,7 +1395,7 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p) | KPROBE_FLAG_DISABLED; } - /* Copy ap's insn slot to p */ + /* Copy the insn slot of 'p' to 'ap'. */ copy_kprobe(ap, p); ret = add_new_kprobe(ap, p); @@ -1411,7 +1421,7 @@ out: bool __weak arch_within_kprobe_blacklist(unsigned long addr) { - /* The __kprobes marked functions and entry code must not be probed */ + /* The '__kprobes' functions and entry code must not be probed. */ return addr >= (unsigned long)__kprobes_text_start && addr < (unsigned long)__kprobes_text_end; } @@ -1423,8 +1433,8 @@ static bool __within_kprobe_blacklist(unsigned long addr) if (arch_within_kprobe_blacklist(addr)) return true; /* - * If there exists a kprobe_blacklist, verify and - * fail any probe registration in the prohibited area + * If 'kprobe_blacklist' is defined, check the address and + * reject any probe registration in the prohibited area. */ list_for_each_entry(ent, &kprobe_blacklist, list) { if (addr >= ent->start_addr && addr < ent->end_addr) @@ -1454,7 +1464,7 @@ bool within_kprobe_blacklist(unsigned long addr) } /* - * If we have a symbol_name argument, look it up and add the offset field + * If 'symbol_name' is specified, look it up and add the 'offset' * to it. This way, we can specify a relative address to a symbol. * This returns encoded errors if it fails to look up symbol or invalid * combination of parameters. @@ -1484,7 +1494,10 @@ static kprobe_opcode_t *kprobe_addr(struct kprobe *p) return _kprobe_addr(p->addr, p->symbol_name, p->offset); } -/* Check passed kprobe is valid and return kprobe in kprobe_table. */ +/* + * Check the 'p' is valid and return the aggregator kprobe + * at the same address. + */ static struct kprobe *__get_valid_kprobe(struct kprobe *p) { struct kprobe *ap, *list_p; @@ -1561,7 +1574,7 @@ static int check_kprobe_address_safe(struct kprobe *p, goto out; } - /* Check if are we probing a module */ + /* Check if 'p' is probing a module. */ *probed_mod = __module_text_address((unsigned long) p->addr); if (*probed_mod) { /* @@ -1574,7 +1587,7 @@ static int check_kprobe_address_safe(struct kprobe *p, } /* - * If the module freed .init.text, we couldn't insert + * If the module freed '.init.text', we couldn't insert * kprobes in there. */ if (within_module_init((unsigned long)p->addr, *probed_mod) && @@ -1621,7 +1634,7 @@ int register_kprobe(struct kprobe *p) old_p = get_kprobe(p->addr); if (old_p) { - /* Since this may unoptimize old_p, locking text_mutex. */ + /* Since this may unoptimize 'old_p', locking 'text_mutex'. */ ret = register_aggr_kprobe(old_p, p); goto out; } @@ -1660,7 +1673,7 @@ out: } EXPORT_SYMBOL_GPL(register_kprobe); -/* Check if all probes on the aggrprobe are disabled */ +/* Check if all probes on the 'ap' are disabled. */ static int aggr_kprobe_disabled(struct kprobe *ap) { struct kprobe *kp; @@ -1670,15 +1683,15 @@ static int aggr_kprobe_disabled(struct kprobe *ap) list_for_each_entry(kp, &ap->list, list) if (!kprobe_disabled(kp)) /* - * There is an active probe on the list. - * We can't disable this ap. + * Since there is an active probe on the list, + * we can't disable this 'ap'. */ return 0; return 1; } -/* Disable one kprobe: Make sure called under kprobe_mutex is locked */ +/* Disable one kprobe: Make sure called under 'kprobe_mutex' is locked. */ static struct kprobe *__disable_kprobe(struct kprobe *p) { struct kprobe *orig_p; @@ -1697,7 +1710,7 @@ static struct kprobe *__disable_kprobe(struct kprobe *p) /* Try to disarm and disable this/parent probe */ if (p == orig_p || aggr_kprobe_disabled(orig_p)) { /* - * If kprobes_all_disarmed is set, orig_p + * If 'kprobes_all_disarmed' is set, 'orig_p' * should have already been disarmed, so * skip unneed disarming process. */ @@ -1984,7 +1997,7 @@ int register_kretprobe(struct kretprobe *rp) if (ret) return ret; - /* If only rp->kp.addr is specified, check reregistering kprobes */ + /* If only 'rp->kp.addr' is specified, check reregistering kprobes */ if (rp->kp.addr && warn_kprobe_rereg(&rp->kp)) return -EINVAL; @@ -2089,13 +2102,13 @@ EXPORT_SYMBOL_GPL(unregister_kretprobes); #else /* CONFIG_KRETPROBES */ int register_kretprobe(struct kretprobe *rp) { - return -ENOSYS; + return -EOPNOTSUPP; } EXPORT_SYMBOL_GPL(register_kretprobe); int register_kretprobes(struct kretprobe **rps, int num) { - return -ENOSYS; + return -EOPNOTSUPP; } EXPORT_SYMBOL_GPL(register_kretprobes); @@ -2144,7 +2157,7 @@ static void kill_kprobe(struct kprobe *p) /* * The module is going away. We should disarm the kprobe which * is using ftrace, because ftrace framework is still available at - * MODULE_STATE_GOING notification. + * 'MODULE_STATE_GOING' notification. */ if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed) disarm_kprobe_ftrace(p); @@ -2317,13 +2330,13 @@ static int __init populate_kprobe_blacklist(unsigned long *start, return ret; } - /* Symbols in __kprobes_text are blacklisted */ + /* Symbols in '__kprobes_text' are blacklisted */ ret = kprobe_add_area_blacklist((unsigned long)__kprobes_text_start, (unsigned long)__kprobes_text_end); if (ret) return ret; - /* Symbols in noinstr section are blacklisted */ + /* Symbols in 'noinstr' section are blacklisted */ ret = kprobe_add_area_blacklist((unsigned long)__noinstr_text_start, (unsigned long)__noinstr_text_end); @@ -2395,9 +2408,9 @@ static int kprobes_module_callback(struct notifier_block *nb, return NOTIFY_DONE; /* - * When MODULE_STATE_GOING was notified, both of module .text and - * .init.text sections would be freed. When MODULE_STATE_LIVE was - * notified, only .init.text section would be freed. We need to + * When 'MODULE_STATE_GOING' was notified, both of module '.text' and + * '.init.text' sections would be freed. When 'MODULE_STATE_LIVE' was + * notified, only '.init.text' section would be freed. We need to * disable kprobes which have been inserted in the sections. */ mutex_lock(&kprobe_mutex); @@ -2414,9 +2427,9 @@ static int kprobes_module_callback(struct notifier_block *nb, * * Note, this will also move any optimized probes * that are pending to be removed from their - * corresponding lists to the freeing_list and + * corresponding lists to the 'freeing_list' and * will not be touched by the delayed - * kprobe_optimizer work handler. + * kprobe_optimizer() work handler. */ kill_kprobe(p); } @@ -2432,10 +2445,6 @@ static struct notifier_block kprobe_module_nb = { .priority = 0 }; -/* Markers of _kprobe_blacklist section */ -extern unsigned long __start_kprobe_blacklist[]; -extern unsigned long __stop_kprobe_blacklist[]; - void kprobe_free_init_mem(void) { void *start = (void *)(&__init_begin); @@ -2446,7 +2455,7 @@ void kprobe_free_init_mem(void) mutex_lock(&kprobe_mutex); - /* Kill all kprobes on initmem */ + /* Kill all kprobes on initmem because the target code has been freed. */ for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry(p, head, hlist) { @@ -2469,9 +2478,8 @@ static int __init init_kprobes(void) err = populate_kprobe_blacklist(__start_kprobe_blacklist, __stop_kprobe_blacklist); - if (err) { + if (err) pr_err("Failed to populate blacklist (error %d), kprobes not restricted, be careful using them!\n", err); - } if (kretprobe_blacklist_size) { /* lookup the function address from its name */ @@ -2488,7 +2496,7 @@ static int __init init_kprobes(void) kprobes_all_disarmed = false; #if defined(CONFIG_OPTPROBES) && defined(__ARCH_WANT_KPROBES_INSN_SLOT) - /* Init kprobe_optinsn_slots for allocation */ + /* Init 'kprobe_optinsn_slots' for allocation */ kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE; #endif @@ -2622,7 +2630,7 @@ static int kprobe_blacklist_seq_show(struct seq_file *m, void *v) list_entry(v, struct kprobe_blacklist_entry, list); /* - * If /proc/kallsyms is not showing kernel address, we won't + * If '/proc/kallsyms' is not showing kernel address, we won't * show them here either. */ if (!kallsyms_show_value(m->file->f_cred)) From dfc05b55c3c6b15dbd889e9901ecb3fb695421bd Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:39:46 +0900 Subject: [PATCH 008/101] kprobes: Use IS_ENABLED() instead of kprobes_built_in() Use IS_ENABLED(CONFIG_KPROBES) instead of kprobes_built_in(). This inline function is introduced only for avoiding #ifdef. But since now we have IS_ENABLED(), it is no longer needed. Link: https://lkml.kernel.org/r/163163038581.489837.2805250706507372658.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/kprobes.h | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 756d3d23ce37..9c28fbb18e74 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -180,14 +180,6 @@ struct kprobe_blacklist_entry { DECLARE_PER_CPU(struct kprobe *, current_kprobe); DECLARE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); -/* - * For #ifdef avoidance: - */ -static inline int kprobes_built_in(void) -{ - return 1; -} - extern void kprobe_busy_begin(void); extern void kprobe_busy_end(void); @@ -417,10 +409,6 @@ int arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value, char *type, char *sym); #else /* !CONFIG_KPROBES: */ -static inline int kprobes_built_in(void) -{ - return 0; -} static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr) { return 0; @@ -514,7 +502,7 @@ static inline bool is_kprobe_optinsn_slot(unsigned long addr) static nokprobe_inline bool kprobe_page_fault(struct pt_regs *regs, unsigned int trap) { - if (!kprobes_built_in()) + if (!IS_ENABLED(CONFIG_KPROBES)) return false; if (user_mode(regs)) return false; From 57d4e31780106ad97516bfd197fac47a81482353 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:39:55 +0900 Subject: [PATCH 009/101] kprobes: Add assertions for required lock Add assertions for required locks instead of comment it so that the lockdep can inspect locks automatically. Link: https://lkml.kernel.org/r/163163039572.489837.18011973177537476885.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ad39eeaa4371..ec3d97fd8c6b 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -959,11 +959,13 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write, } #endif /* CONFIG_SYSCTL */ -/* Put a breakpoint for a probe. Must be called with 'text_mutex' locked. */ +/* Put a breakpoint for a probe. */ static void __arm_kprobe(struct kprobe *p) { struct kprobe *_p; + lockdep_assert_held(&text_mutex); + /* Find the overlapping optimized kprobes. */ _p = get_optimized_kprobe((unsigned long)p->addr); if (unlikely(_p)) @@ -974,11 +976,13 @@ static void __arm_kprobe(struct kprobe *p) optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ } -/* Remove the breakpoint of a probe. Must be called with 'text_mutex' locked. */ +/* Remove the breakpoint of a probe. */ static void __disarm_kprobe(struct kprobe *p, bool reopt) { struct kprobe *_p; + lockdep_assert_held(&text_mutex); + /* Try to unoptimize */ unoptimize_kprobe(p, kprobes_all_disarmed); @@ -1047,12 +1051,13 @@ static struct ftrace_ops kprobe_ipmodify_ops __read_mostly = { static int kprobe_ipmodify_enabled; static int kprobe_ftrace_enabled; -/* Caller must lock 'kprobe_mutex' */ static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, int *cnt) { int ret = 0; + lockdep_assert_held(&kprobe_mutex); + ret = ftrace_set_filter_ip(ops, (unsigned long)p->addr, 0, 0); if (WARN_ONCE(ret < 0, "Failed to arm kprobe-ftrace at %pS (error %d)\n", p->addr, ret)) return ret; @@ -1084,12 +1089,13 @@ static int arm_kprobe_ftrace(struct kprobe *p) ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled); } -/* Caller must lock 'kprobe_mutex'. */ static int __disarm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, int *cnt) { int ret = 0; + lockdep_assert_held(&kprobe_mutex); + if (*cnt == 1) { ret = unregister_ftrace_function(ops); if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (error %d)\n", ret)) @@ -1133,7 +1139,6 @@ static int prepare_kprobe(struct kprobe *p) return arch_prepare_kprobe(p); } -/* Arm a kprobe with 'text_mutex'. */ static int arm_kprobe(struct kprobe *kp) { if (unlikely(kprobe_ftrace(kp))) @@ -1148,7 +1153,6 @@ static int arm_kprobe(struct kprobe *kp) return 0; } -/* Disarm a kprobe with 'text_mutex'. */ static int disarm_kprobe(struct kprobe *kp, bool reopt) { if (unlikely(kprobe_ftrace(kp))) @@ -1691,12 +1695,13 @@ static int aggr_kprobe_disabled(struct kprobe *ap) return 1; } -/* Disable one kprobe: Make sure called under 'kprobe_mutex' is locked. */ static struct kprobe *__disable_kprobe(struct kprobe *p) { struct kprobe *orig_p; int ret; + lockdep_assert_held(&kprobe_mutex); + /* Get an original kprobe for return */ orig_p = __get_valid_kprobe(p); if (unlikely(orig_p == NULL)) From c42421e205fc2570a4d019184ea7d6c382c93f4c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:40:07 +0900 Subject: [PATCH 010/101] kprobes: treewide: Use 'kprobe_opcode_t *' for the code address in get_optimized_kprobe() Since get_optimized_kprobe() is only used inside kprobes, it doesn't need to use 'unsigned long' type for 'addr' parameter. Make it use 'kprobe_opcode_t *' for the 'addr' parameter and subsequent call of arch_within_optimized_kprobe() also should use 'kprobe_opcode_t *'. Note that MAX_OPTIMIZED_LENGTH and RELATIVEJUMP_SIZE are defined by byte-size, but the size of 'kprobe_opcode_t' depends on the architecture. Therefore, we must be careful when calculating addresses using those macros. Link: https://lkml.kernel.org/r/163163040680.489837.12133032364499833736.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- arch/arm/probes/kprobes/opt-arm.c | 7 ++++--- arch/powerpc/kernel/optprobes.c | 6 +++--- arch/x86/kernel/kprobes/opt.c | 6 +++--- include/linux/kprobes.h | 2 +- kernel/kprobes.c | 10 +++++----- 5 files changed, 16 insertions(+), 15 deletions(-) diff --git a/arch/arm/probes/kprobes/opt-arm.c b/arch/arm/probes/kprobes/opt-arm.c index c78180172120..dbef34ed933f 100644 --- a/arch/arm/probes/kprobes/opt-arm.c +++ b/arch/arm/probes/kprobes/opt-arm.c @@ -347,10 +347,11 @@ void arch_unoptimize_kprobes(struct list_head *oplist, } int arch_within_optimized_kprobe(struct optimized_kprobe *op, - unsigned long addr) + kprobe_opcode_t *addr) { - return ((unsigned long)op->kp.addr <= addr && - (unsigned long)op->kp.addr + RELATIVEJUMP_SIZE > addr); + return (op->kp.addr <= addr && + op->kp.addr + (RELATIVEJUMP_SIZE / sizeof(kprobe_opcode_t)) > addr); + } void arch_remove_optimized_kprobe(struct optimized_kprobe *op) diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c index c79899abcec8..325ba544883c 100644 --- a/arch/powerpc/kernel/optprobes.c +++ b/arch/powerpc/kernel/optprobes.c @@ -301,8 +301,8 @@ void arch_unoptimize_kprobes(struct list_head *oplist, struct list_head *done_li } } -int arch_within_optimized_kprobe(struct optimized_kprobe *op, unsigned long addr) +int arch_within_optimized_kprobe(struct optimized_kprobe *op, kprobe_opcode_t *addr) { - return ((unsigned long)op->kp.addr <= addr && - (unsigned long)op->kp.addr + RELATIVEJUMP_SIZE > addr); + return (op->kp.addr <= addr && + op->kp.addr + (RELATIVEJUMP_SIZE / sizeof(kprobe_opcode_t)) > addr); } diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 71425ebba98a..b4a54a52aa59 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -367,10 +367,10 @@ int arch_check_optimized_kprobe(struct optimized_kprobe *op) /* Check the addr is within the optimized instructions. */ int arch_within_optimized_kprobe(struct optimized_kprobe *op, - unsigned long addr) + kprobe_opcode_t *addr) { - return ((unsigned long)op->kp.addr <= addr && - (unsigned long)op->kp.addr + op->optinsn.size > addr); + return (op->kp.addr <= addr && + op->kp.addr + op->optinsn.size > addr); } /* Free optimized instruction slot */ diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 9c28fbb18e74..6a5995f334a0 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -329,7 +329,7 @@ extern void arch_unoptimize_kprobes(struct list_head *oplist, struct list_head *done_list); extern void arch_unoptimize_kprobe(struct optimized_kprobe *op); extern int arch_within_optimized_kprobe(struct optimized_kprobe *op, - unsigned long addr); + kprobe_opcode_t *addr); extern void opt_pre_handler(struct kprobe *p, struct pt_regs *regs); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ec3d97fd8c6b..b6f1dcf4bff3 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -485,15 +485,15 @@ static int kprobe_queued(struct kprobe *p) * Return an optimized kprobe whose optimizing code replaces * instructions including 'addr' (exclude breakpoint). */ -static struct kprobe *get_optimized_kprobe(unsigned long addr) +static struct kprobe *get_optimized_kprobe(kprobe_opcode_t *addr) { int i; struct kprobe *p = NULL; struct optimized_kprobe *op; /* Don't check i == 0, since that is a breakpoint case. */ - for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++) - p = get_kprobe((void *)(addr - i)); + for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH / sizeof(kprobe_opcode_t); i++) + p = get_kprobe(addr - i); if (p && kprobe_optready(p)) { op = container_of(p, struct optimized_kprobe, kp); @@ -967,7 +967,7 @@ static void __arm_kprobe(struct kprobe *p) lockdep_assert_held(&text_mutex); /* Find the overlapping optimized kprobes. */ - _p = get_optimized_kprobe((unsigned long)p->addr); + _p = get_optimized_kprobe(p->addr); if (unlikely(_p)) /* Fallback to unoptimized kprobe */ unoptimize_kprobe(_p, true); @@ -989,7 +989,7 @@ static void __disarm_kprobe(struct kprobe *p, bool reopt) if (!kprobe_queued(p)) { arch_disarm_kprobe(p); /* If another kprobe was blocked, re-optimize it. */ - _p = get_optimized_kprobe((unsigned long)p->addr); + _p = get_optimized_kprobe(p->addr); if (unlikely(_p) && reopt) optimize_kprobe(_p); } From 29e8077ae2beea6a85ad2d0bae9c550bd5d05ed9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:40:16 +0900 Subject: [PATCH 011/101] kprobes: Use bool type for functions which returns boolean value Use the 'bool' type instead of 'int' for the functions which returns a boolean value, because this makes clear that those functions don't return any error code. Link: https://lkml.kernel.org/r/163163041649.489837.17311187321419747536.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/kprobes.h | 8 ++++---- kernel/kprobes.c | 26 +++++++++++++------------- kernel/trace/trace_kprobe.c | 2 +- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 6a5995f334a0..0ba3f9e316d4 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -104,25 +104,25 @@ struct kprobe { #define KPROBE_FLAG_FTRACE 8 /* probe is using ftrace */ /* Has this kprobe gone ? */ -static inline int kprobe_gone(struct kprobe *p) +static inline bool kprobe_gone(struct kprobe *p) { return p->flags & KPROBE_FLAG_GONE; } /* Is this kprobe disabled ? */ -static inline int kprobe_disabled(struct kprobe *p) +static inline bool kprobe_disabled(struct kprobe *p) { return p->flags & (KPROBE_FLAG_DISABLED | KPROBE_FLAG_GONE); } /* Is this kprobe really running optimized path ? */ -static inline int kprobe_optimized(struct kprobe *p) +static inline bool kprobe_optimized(struct kprobe *p) { return p->flags & KPROBE_FLAG_OPTIMIZED; } /* Is this kprobe uses ftrace ? */ -static inline int kprobe_ftrace(struct kprobe *p) +static inline bool kprobe_ftrace(struct kprobe *p) { return p->flags & KPROBE_FLAG_FTRACE; } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b6f1dcf4bff3..8021bccb7770 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -198,8 +198,8 @@ out: return slot; } -/* Return 1 if all garbages are collected, otherwise 0. */ -static int collect_one_slot(struct kprobe_insn_page *kip, int idx) +/* Return true if all garbages are collected, otherwise false. */ +static bool collect_one_slot(struct kprobe_insn_page *kip, int idx) { kip->slot_used[idx] = SLOT_CLEAN; kip->nused--; @@ -223,9 +223,9 @@ static int collect_one_slot(struct kprobe_insn_page *kip, int idx) kip->cache->free(kip->insns); kfree(kip); } - return 1; + return true; } - return 0; + return false; } static int collect_garbage_slots(struct kprobe_insn_cache *c) @@ -389,13 +389,13 @@ NOKPROBE_SYMBOL(get_kprobe); static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs); /* Return true if 'p' is an aggregator */ -static inline int kprobe_aggrprobe(struct kprobe *p) +static inline bool kprobe_aggrprobe(struct kprobe *p) { return p->pre_handler == aggr_pre_handler; } /* Return true if 'p' is unused */ -static inline int kprobe_unused(struct kprobe *p) +static inline bool kprobe_unused(struct kprobe *p) { return kprobe_aggrprobe(p) && kprobe_disabled(p) && list_empty(&p->list); @@ -455,7 +455,7 @@ static inline int kprobe_optready(struct kprobe *p) } /* Return true if the kprobe is disarmed. Note: p must be on hash list */ -static inline int kprobe_disarmed(struct kprobe *p) +static inline bool kprobe_disarmed(struct kprobe *p) { struct optimized_kprobe *op; @@ -469,16 +469,16 @@ static inline int kprobe_disarmed(struct kprobe *p) } /* Return true if the probe is queued on (un)optimizing lists */ -static int kprobe_queued(struct kprobe *p) +static bool kprobe_queued(struct kprobe *p) { struct optimized_kprobe *op; if (kprobe_aggrprobe(p)) { op = container_of(p, struct optimized_kprobe, kp); if (!list_empty(&op->list)) - return 1; + return true; } - return 0; + return false; } /* @@ -1678,7 +1678,7 @@ out: EXPORT_SYMBOL_GPL(register_kprobe); /* Check if all probes on the 'ap' are disabled. */ -static int aggr_kprobe_disabled(struct kprobe *ap) +static bool aggr_kprobe_disabled(struct kprobe *ap) { struct kprobe *kp; @@ -1690,9 +1690,9 @@ static int aggr_kprobe_disabled(struct kprobe *ap) * Since there is an active probe on the list, * we can't disable this 'ap'. */ - return 0; + return false; - return 1; + return true; } static struct kprobe *__disable_kprobe(struct kprobe *p) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 3a64ba4bbad6..0e1e7ce5f7ed 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -97,7 +97,7 @@ static nokprobe_inline unsigned long trace_kprobe_offset(struct trace_kprobe *tk static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk) { - return !!(kprobe_gone(&tk->rp.kp)); + return kprobe_gone(&tk->rp.kp); } static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, From a7fe2378454cf46cd5e2776d05e72bbe8f0a468c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:40:27 +0900 Subject: [PATCH 012/101] ia64: kprobes: Fix to pass correct trampoline address to the handler The following commit: Commit e792ff804f49 ("ia64: kprobes: Use generic kretprobe trampoline handler") Passed the wrong trampoline address to __kretprobe_trampoline_handler(): it passes the descriptor address instead of function entry address. Pass the right parameter. Also use correct symbol dereference function to get the function address from 'kretprobe_trampoline' - an IA64 special. Link: https://lkml.kernel.org/r/163163042696.489837.12551102356265354730.stgit@devnote2 Fixes: e792ff804f49 ("ia64: kprobes: Use generic kretprobe trampoline handler") Cc: Josh Poimboeuf Cc: Ingo Molnar Cc: X86 ML Cc: Daniel Xu Cc: Thomas Gleixner Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Abhishek Sagar Cc: Andrii Nakryiko Cc: Paul McKenney Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- arch/ia64/kernel/kprobes.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index 441ed04b1037..d4048518a1d7 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -398,7 +398,8 @@ static void kretprobe_trampoline(void) int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) { - regs->cr_iip = __kretprobe_trampoline_handler(regs, kretprobe_trampoline, NULL); + regs->cr_iip = __kretprobe_trampoline_handler(regs, + dereference_function_descriptor(kretprobe_trampoline), NULL); /* * By returning a non-zero value, we are telling * kprobe_handler() that we don't want the post_handler @@ -414,7 +415,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, ri->fp = NULL; /* Replace the return addr with trampoline addr */ - regs->b0 = ((struct fnptr *)kretprobe_trampoline)->ip; + regs->b0 = (unsigned long)dereference_function_descriptor(kretprobe_trampoline); } /* Check the instruction in the slot is break */ @@ -902,14 +903,14 @@ static struct kprobe trampoline_p = { int __init arch_init_kprobes(void) { trampoline_p.addr = - (kprobe_opcode_t *)((struct fnptr *)kretprobe_trampoline)->ip; + dereference_function_descriptor(kretprobe_trampoline); return register_kprobe(&trampoline_p); } int __kprobes arch_trampoline_kprobe(struct kprobe *p) { if (p->addr == - (kprobe_opcode_t *)((struct fnptr *)kretprobe_trampoline)->ip) + dereference_function_descriptor(kretprobe_trampoline)) return 1; return 0; From f2ec8d9a3b8c0f22cd6a2b4f5a2d9aee5206e3b7 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:40:36 +0900 Subject: [PATCH 013/101] kprobes: treewide: Replace arch_deref_entry_point() with dereference_symbol_descriptor() ~15 years ago kprobes grew the 'arch_deref_entry_point()' __weak function: 3d7e33825d87: ("jprobes: make jprobes a little safer for users") But this is just open-coded dereference_symbol_descriptor() in essence, and its obscure nature was causing bugs. Just use the real thing and remove arch_deref_entry_point(). Link: https://lkml.kernel.org/r/163163043630.489837.7924988885652708696.stgit@devnote2 Signed-off-by: Masami Hiramatsu Tested-by: Andrii Nakryiko Signed-off-by: Steven Rostedt (VMware) --- arch/ia64/kernel/kprobes.c | 5 ----- arch/powerpc/kernel/kprobes.c | 11 ----------- include/linux/kprobes.h | 1 - kernel/kprobes.c | 7 +------ lib/error-inject.c | 3 ++- 5 files changed, 3 insertions(+), 24 deletions(-) diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index d4048518a1d7..0f8573bbf520 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -891,11 +891,6 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, return ret; } -unsigned long arch_deref_entry_point(void *entry) -{ - return ((struct fnptr *)entry)->ip; -} - static struct kprobe trampoline_p = { .pre_handler = trampoline_probe_handler }; diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 7a7cd6bda53e..d422e297978b 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -542,17 +542,6 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr) } NOKPROBE_SYMBOL(kprobe_fault_handler); -unsigned long arch_deref_entry_point(void *entry) -{ -#ifdef PPC64_ELF_ABI_v1 - if (!kernel_text_address((unsigned long)entry)) - return ppc_global_function_entry(entry); - else -#endif - return (unsigned long)entry; -} -NOKPROBE_SYMBOL(arch_deref_entry_point); - static struct kprobe trampoline_p = { .addr = (kprobe_opcode_t *) &kretprobe_trampoline, .pre_handler = trampoline_probe_handler diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 0ba3f9e316d4..2ed61fcbc89c 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -381,7 +381,6 @@ int register_kprobe(struct kprobe *p); void unregister_kprobe(struct kprobe *p); int register_kprobes(struct kprobe **kps, int num); void unregister_kprobes(struct kprobe **kps, int num); -unsigned long arch_deref_entry_point(void *); int register_kretprobe(struct kretprobe *rp); void unregister_kretprobe(struct kretprobe *rp); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 8021bccb7770..550042d9a6ef 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1861,11 +1861,6 @@ static struct notifier_block kprobe_exceptions_nb = { .priority = 0x7fffffff /* we need to be notified first */ }; -unsigned long __weak arch_deref_entry_point(void *entry) -{ - return (unsigned long)entry; -} - #ifdef CONFIG_KRETPROBES unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs, @@ -2327,7 +2322,7 @@ static int __init populate_kprobe_blacklist(unsigned long *start, int ret; for (iter = start; iter < end; iter++) { - entry = arch_deref_entry_point((void *)*iter); + entry = (unsigned long)dereference_symbol_descriptor((void *)*iter); ret = kprobe_add_ksym_blacklist(entry); if (ret == -EINVAL) continue; diff --git a/lib/error-inject.c b/lib/error-inject.c index c73651b15b76..2ff5ef689d72 100644 --- a/lib/error-inject.c +++ b/lib/error-inject.c @@ -8,6 +8,7 @@ #include #include #include +#include /* Whitelist of symbols that can be overridden for error injection. */ static LIST_HEAD(error_injection_list); @@ -64,7 +65,7 @@ static void populate_error_injection_list(struct error_injection_entry *start, mutex_lock(&ei_mutex); for (iter = start; iter < end; iter++) { - entry = arch_deref_entry_point((void *)iter->addr); + entry = (unsigned long)dereference_symbol_descriptor((void *)iter->addr); if (!kernel_text_address(entry) || !kallsyms_lookup_size_offset(entry, &size, &offset)) { From 96fed8ac2bb64ab45497fdd8e3d390165b7a9be8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:40:45 +0900 Subject: [PATCH 014/101] kprobes: treewide: Remove trampoline_address from kretprobe_trampoline_handler() The __kretprobe_trampoline_handler() callback, called from low level arch kprobes methods, has the 'trampoline_address' parameter, which is entirely superfluous as it basically just replicates: dereference_kernel_function_descriptor(kretprobe_trampoline) In fact we had bugs in arch code where it wasn't replicated correctly. So remove this superfluous parameter and use kretprobe_trampoline_addr() instead. Link: https://lkml.kernel.org/r/163163044546.489837.13505751885476015002.stgit@devnote2 Signed-off-by: Masami Hiramatsu Tested-by: Andrii Nakryiko Signed-off-by: Steven Rostedt (VMware) --- arch/arc/kernel/kprobes.c | 2 +- arch/arm/probes/kprobes/core.c | 3 +-- arch/arm64/kernel/probes/kprobes.c | 3 +-- arch/csky/kernel/probes/kprobes.c | 2 +- arch/ia64/kernel/kprobes.c | 5 ++--- arch/mips/kernel/kprobes.c | 3 +-- arch/parisc/kernel/kprobes.c | 4 ++-- arch/powerpc/kernel/kprobes.c | 2 +- arch/riscv/kernel/probes/kprobes.c | 2 +- arch/s390/kernel/kprobes.c | 2 +- arch/sh/kernel/kprobes.c | 2 +- arch/sparc/kernel/kprobes.c | 2 +- arch/x86/include/asm/kprobes.h | 1 - arch/x86/kernel/kprobes/core.c | 2 +- include/linux/kprobes.h | 18 +++++++++++++----- kernel/kprobes.c | 3 +-- 16 files changed, 29 insertions(+), 27 deletions(-) diff --git a/arch/arc/kernel/kprobes.c b/arch/arc/kernel/kprobes.c index 5f0415fc7328..3cee75c87f97 100644 --- a/arch/arc/kernel/kprobes.c +++ b/arch/arc/kernel/kprobes.c @@ -381,7 +381,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, static int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) { - regs->ret = __kretprobe_trampoline_handler(regs, &kretprobe_trampoline, NULL); + regs->ret = __kretprobe_trampoline_handler(regs, NULL); /* By returning a non zero value, we are telling the kprobe handler * that we don't want the post_handler to run diff --git a/arch/arm/probes/kprobes/core.c b/arch/arm/probes/kprobes/core.c index a59e38de4a03..08098ed6f035 100644 --- a/arch/arm/probes/kprobes/core.c +++ b/arch/arm/probes/kprobes/core.c @@ -392,8 +392,7 @@ void __naked __kprobes kretprobe_trampoline(void) /* Called from kretprobe_trampoline */ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) { - return (void *)kretprobe_trampoline_handler(regs, &kretprobe_trampoline, - (void *)regs->ARM_fp); + return (void *)kretprobe_trampoline_handler(regs, (void *)regs->ARM_fp); } void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index ce429cbacd35..f627a12984a8 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -401,8 +401,7 @@ int __init arch_populate_kprobe_blacklist(void) void __kprobes __used *trampoline_probe_handler(struct pt_regs *regs) { - return (void *)kretprobe_trampoline_handler(regs, &kretprobe_trampoline, - (void *)kernel_stack_pointer(regs)); + return (void *)kretprobe_trampoline_handler(regs, (void *)kernel_stack_pointer(regs)); } void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, diff --git a/arch/csky/kernel/probes/kprobes.c b/arch/csky/kernel/probes/kprobes.c index 632407bf45d5..784c5aba7f66 100644 --- a/arch/csky/kernel/probes/kprobes.c +++ b/arch/csky/kernel/probes/kprobes.c @@ -386,7 +386,7 @@ int __init arch_populate_kprobe_blacklist(void) void __kprobes __used *trampoline_probe_handler(struct pt_regs *regs) { - return (void *)kretprobe_trampoline_handler(regs, &kretprobe_trampoline, NULL); + return (void *)kretprobe_trampoline_handler(regs, NULL); } void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index 0f8573bbf520..44c84c20b626 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -392,14 +392,13 @@ static void __kprobes set_current_kprobe(struct kprobe *p, __this_cpu_write(current_kprobe, p); } -static void kretprobe_trampoline(void) +void kretprobe_trampoline(void) { } int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) { - regs->cr_iip = __kretprobe_trampoline_handler(regs, - dereference_function_descriptor(kretprobe_trampoline), NULL); + regs->cr_iip = __kretprobe_trampoline_handler(regs, NULL); /* * By returning a non-zero value, we are telling * kprobe_handler() that we don't want the post_handler diff --git a/arch/mips/kernel/kprobes.c b/arch/mips/kernel/kprobes.c index b0934a0d7aed..b33bd2498651 100644 --- a/arch/mips/kernel/kprobes.c +++ b/arch/mips/kernel/kprobes.c @@ -485,8 +485,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, static int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) { - instruction_pointer(regs) = __kretprobe_trampoline_handler(regs, - kretprobe_trampoline, NULL); + instruction_pointer(regs) = __kretprobe_trampoline_handler(regs, NULL); /* * By returning a non-zero value, we are telling * kprobe_handler() that we don't want the post_handler diff --git a/arch/parisc/kernel/kprobes.c b/arch/parisc/kernel/kprobes.c index 6d21a515eea5..4a35ac6e2ca2 100644 --- a/arch/parisc/kernel/kprobes.c +++ b/arch/parisc/kernel/kprobes.c @@ -175,7 +175,7 @@ int __kprobes parisc_kprobe_ss_handler(struct pt_regs *regs) return 1; } -static inline void kretprobe_trampoline(void) +void kretprobe_trampoline(void) { asm volatile("nop"); asm volatile("nop"); @@ -193,7 +193,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p, { unsigned long orig_ret_address; - orig_ret_address = __kretprobe_trampoline_handler(regs, trampoline_p.addr, NULL); + orig_ret_address = __kretprobe_trampoline_handler(regs, NULL); instruction_pointer_set(regs, orig_ret_address); return 1; diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index d422e297978b..43c77142a262 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -417,7 +417,7 @@ static int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) { unsigned long orig_ret_address; - orig_ret_address = __kretprobe_trampoline_handler(regs, &kretprobe_trampoline, NULL); + orig_ret_address = __kretprobe_trampoline_handler(regs, NULL); /* * We get here through one of two paths: * 1. by taking a trap -> kprobe_handler() -> here diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c index cab6f874358e..62d477cf11da 100644 --- a/arch/riscv/kernel/probes/kprobes.c +++ b/arch/riscv/kernel/probes/kprobes.c @@ -347,7 +347,7 @@ int __init arch_populate_kprobe_blacklist(void) void __kprobes __used *trampoline_probe_handler(struct pt_regs *regs) { - return (void *)kretprobe_trampoline_handler(regs, &kretprobe_trampoline, NULL); + return (void *)kretprobe_trampoline_handler(regs, NULL); } void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 952d44b0610b..5fa86e54f129 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -343,7 +343,7 @@ static void __used kretprobe_trampoline_holder(void) */ static int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) { - regs->psw.addr = __kretprobe_trampoline_handler(regs, &kretprobe_trampoline, NULL); + regs->psw.addr = __kretprobe_trampoline_handler(regs, NULL); /* * By returning a non-zero value, we are telling * kprobe_handler() that we don't want the post_handler diff --git a/arch/sh/kernel/kprobes.c b/arch/sh/kernel/kprobes.c index 1c7f358ef0be..8e76a35e6e33 100644 --- a/arch/sh/kernel/kprobes.c +++ b/arch/sh/kernel/kprobes.c @@ -303,7 +303,7 @@ static void __used kretprobe_trampoline_holder(void) */ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) { - regs->pc = __kretprobe_trampoline_handler(regs, &kretprobe_trampoline, NULL); + regs->pc = __kretprobe_trampoline_handler(regs, NULL); return 1; } diff --git a/arch/sparc/kernel/kprobes.c b/arch/sparc/kernel/kprobes.c index 4c05a4ee6a0e..401534236c2e 100644 --- a/arch/sparc/kernel/kprobes.c +++ b/arch/sparc/kernel/kprobes.c @@ -451,7 +451,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p, { unsigned long orig_ret_address = 0; - orig_ret_address = __kretprobe_trampoline_handler(regs, &kretprobe_trampoline, NULL); + orig_ret_address = __kretprobe_trampoline_handler(regs, NULL); regs->tpc = orig_ret_address; regs->tnpc = orig_ret_address + 4; diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index bd7f5886a789..71ea2eab43d5 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -49,7 +49,6 @@ extern __visible kprobe_opcode_t optprobe_template_end[]; extern const int kretprobe_blacklist_size; void arch_remove_kprobe(struct kprobe *p); -asmlinkage void kretprobe_trampoline(void); extern void arch_kprobe_override_function(struct pt_regs *regs); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index b6e046e4b289..0c59ef5971de 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -1064,7 +1064,7 @@ __used __visible void *trampoline_handler(struct pt_regs *regs) regs->ip = (unsigned long)&kretprobe_trampoline; regs->orig_ax = ~0UL; - return (void *)kretprobe_trampoline_handler(regs, &kretprobe_trampoline, ®s->sp); + return (void *)kretprobe_trampoline_handler(regs, ®s->sp); } NOKPROBE_SYMBOL(trampoline_handler); diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 2ed61fcbc89c..96f5df93e36e 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -188,15 +188,23 @@ extern void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs); extern int arch_trampoline_kprobe(struct kprobe *p); +void kretprobe_trampoline(void); +/* + * Since some architecture uses structured function pointer, + * use dereference_function_descriptor() to get real function address. + */ +static nokprobe_inline void *kretprobe_trampoline_addr(void) +{ + return dereference_kernel_function_descriptor(kretprobe_trampoline); +} + /* If the trampoline handler called from a kprobe, use this version */ unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs, - void *trampoline_address, - void *frame_pointer); + void *frame_pointer); static nokprobe_inline unsigned long kretprobe_trampoline_handler(struct pt_regs *regs, - void *trampoline_address, - void *frame_pointer) + void *frame_pointer) { unsigned long ret; /* @@ -205,7 +213,7 @@ unsigned long kretprobe_trampoline_handler(struct pt_regs *regs, * be running at this point. */ kprobe_busy_begin(); - ret = __kretprobe_trampoline_handler(regs, trampoline_address, frame_pointer); + ret = __kretprobe_trampoline_handler(regs, frame_pointer); kprobe_busy_end(); return ret; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 550042d9a6ef..6ed755111eea 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1864,7 +1864,6 @@ static struct notifier_block kprobe_exceptions_nb = { #ifdef CONFIG_KRETPROBES unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs, - void *trampoline_address, void *frame_pointer) { kprobe_opcode_t *correct_ret_addr = NULL; @@ -1879,7 +1878,7 @@ unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs, BUG_ON(ri->fp != frame_pointer); - if (ri->ret_addr != trampoline_address) { + if (ri->ret_addr != kretprobe_trampoline_addr()) { correct_ret_addr = ri->ret_addr; /* * This is the real return address. Any other From adf8a61a940c49fea6fab9c3865f2b69b8ceef28 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:40:54 +0900 Subject: [PATCH 015/101] kprobes: treewide: Make it harder to refer kretprobe_trampoline directly Since now there is kretprobe_trampoline_addr() for referring the address of kretprobe trampoline code, we don't need to access kretprobe_trampoline directly. Make it harder to refer by renaming it to __kretprobe_trampoline(). Link: https://lkml.kernel.org/r/163163045446.489837.14510577516938803097.stgit@devnote2 Suggested-by: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- arch/arc/include/asm/kprobes.h | 2 +- arch/arc/kernel/kprobes.c | 11 ++++++----- arch/arm/probes/kprobes/core.c | 6 +++--- arch/arm64/include/asm/kprobes.h | 2 +- arch/arm64/kernel/probes/kprobes.c | 2 +- arch/arm64/kernel/probes/kprobes_trampoline.S | 4 ++-- arch/csky/include/asm/kprobes.h | 2 +- arch/csky/kernel/probes/kprobes.c | 2 +- arch/csky/kernel/probes/kprobes_trampoline.S | 4 ++-- arch/ia64/kernel/kprobes.c | 8 ++++---- arch/mips/kernel/kprobes.c | 12 ++++++------ arch/parisc/kernel/kprobes.c | 4 ++-- arch/powerpc/include/asm/kprobes.h | 2 +- arch/powerpc/kernel/kprobes.c | 16 ++++++++-------- arch/powerpc/kernel/optprobes.c | 2 +- arch/powerpc/kernel/stacktrace.c | 2 +- arch/riscv/include/asm/kprobes.h | 2 +- arch/riscv/kernel/probes/kprobes.c | 2 +- arch/riscv/kernel/probes/kprobes_trampoline.S | 4 ++-- arch/s390/include/asm/kprobes.h | 2 +- arch/s390/kernel/kprobes.c | 10 +++++----- arch/s390/kernel/stacktrace.c | 2 +- arch/sh/include/asm/kprobes.h | 2 +- arch/sh/kernel/kprobes.c | 10 +++++----- arch/sparc/include/asm/kprobes.h | 2 +- arch/sparc/kernel/kprobes.c | 10 +++++----- arch/x86/kernel/kprobes/core.c | 18 +++++++++--------- include/linux/kprobes.h | 4 ++-- kernel/trace/trace_output.c | 2 +- 29 files changed, 76 insertions(+), 75 deletions(-) diff --git a/arch/arc/include/asm/kprobes.h b/arch/arc/include/asm/kprobes.h index 2134721dce44..de1566e32cb8 100644 --- a/arch/arc/include/asm/kprobes.h +++ b/arch/arc/include/asm/kprobes.h @@ -46,7 +46,7 @@ struct kprobe_ctlblk { }; int kprobe_fault_handler(struct pt_regs *regs, unsigned long cause); -void kretprobe_trampoline(void); +void __kretprobe_trampoline(void); void trap_is_kprobe(unsigned long address, struct pt_regs *regs); #else #define trap_is_kprobe(address, regs) diff --git a/arch/arc/kernel/kprobes.c b/arch/arc/kernel/kprobes.c index 3cee75c87f97..e71d64119d71 100644 --- a/arch/arc/kernel/kprobes.c +++ b/arch/arc/kernel/kprobes.c @@ -363,8 +363,9 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, static void __used kretprobe_trampoline_holder(void) { - __asm__ __volatile__(".global kretprobe_trampoline\n" - "kretprobe_trampoline:\n" "nop\n"); + __asm__ __volatile__(".global __kretprobe_trampoline\n" + "__kretprobe_trampoline:\n" + "nop\n"); } void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, @@ -375,7 +376,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, ri->fp = NULL; /* Replace the return addr with trampoline addr */ - regs->blink = (unsigned long)&kretprobe_trampoline; + regs->blink = (unsigned long)&__kretprobe_trampoline; } static int __kprobes trampoline_probe_handler(struct kprobe *p, @@ -390,7 +391,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p, } static struct kprobe trampoline_p = { - .addr = (kprobe_opcode_t *) &kretprobe_trampoline, + .addr = (kprobe_opcode_t *) &__kretprobe_trampoline, .pre_handler = trampoline_probe_handler }; @@ -402,7 +403,7 @@ int __init arch_init_kprobes(void) int __kprobes arch_trampoline_kprobe(struct kprobe *p) { - if (p->addr == (kprobe_opcode_t *) &kretprobe_trampoline) + if (p->addr == (kprobe_opcode_t *) &__kretprobe_trampoline) return 1; return 0; diff --git a/arch/arm/probes/kprobes/core.c b/arch/arm/probes/kprobes/core.c index 08098ed6f035..67ce7eb8f285 100644 --- a/arch/arm/probes/kprobes/core.c +++ b/arch/arm/probes/kprobes/core.c @@ -373,7 +373,7 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, * for kretprobe handlers which should normally be interested in r0 only * anyway. */ -void __naked __kprobes kretprobe_trampoline(void) +void __naked __kprobes __kretprobe_trampoline(void) { __asm__ __volatile__ ( "stmdb sp!, {r0 - r11} \n\t" @@ -389,7 +389,7 @@ void __naked __kprobes kretprobe_trampoline(void) : : : "memory"); } -/* Called from kretprobe_trampoline */ +/* Called from __kretprobe_trampoline */ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) { return (void *)kretprobe_trampoline_handler(regs, (void *)regs->ARM_fp); @@ -402,7 +402,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, ri->fp = (void *)regs->ARM_fp; /* Replace the return addr with trampoline addr. */ - regs->ARM_lr = (unsigned long)&kretprobe_trampoline; + regs->ARM_lr = (unsigned long)&__kretprobe_trampoline; } int __kprobes arch_trampoline_kprobe(struct kprobe *p) diff --git a/arch/arm64/include/asm/kprobes.h b/arch/arm64/include/asm/kprobes.h index 5d38ff4a4806..05cd82eeca13 100644 --- a/arch/arm64/include/asm/kprobes.h +++ b/arch/arm64/include/asm/kprobes.h @@ -39,7 +39,7 @@ void arch_remove_kprobe(struct kprobe *); int kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr); int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data); -void kretprobe_trampoline(void); +void __kretprobe_trampoline(void); void __kprobes *trampoline_probe_handler(struct pt_regs *regs); #endif /* CONFIG_KPROBES */ diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index f627a12984a8..e7ad6da980e8 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -411,7 +411,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, ri->fp = (void *)kernel_stack_pointer(regs); /* replace return addr (x30) with trampoline */ - regs->regs[30] = (long)&kretprobe_trampoline; + regs->regs[30] = (long)&__kretprobe_trampoline; } int __kprobes arch_trampoline_kprobe(struct kprobe *p) diff --git a/arch/arm64/kernel/probes/kprobes_trampoline.S b/arch/arm64/kernel/probes/kprobes_trampoline.S index 288a84e253cc..520ee8711db1 100644 --- a/arch/arm64/kernel/probes/kprobes_trampoline.S +++ b/arch/arm64/kernel/probes/kprobes_trampoline.S @@ -61,7 +61,7 @@ ldp x28, x29, [sp, #S_X28] .endm -SYM_CODE_START(kretprobe_trampoline) +SYM_CODE_START(__kretprobe_trampoline) sub sp, sp, #PT_REGS_SIZE save_all_base_regs @@ -79,4 +79,4 @@ SYM_CODE_START(kretprobe_trampoline) add sp, sp, #PT_REGS_SIZE ret -SYM_CODE_END(kretprobe_trampoline) +SYM_CODE_END(__kretprobe_trampoline) diff --git a/arch/csky/include/asm/kprobes.h b/arch/csky/include/asm/kprobes.h index b647bbde4d6d..55267cbf5204 100644 --- a/arch/csky/include/asm/kprobes.h +++ b/arch/csky/include/asm/kprobes.h @@ -41,7 +41,7 @@ void arch_remove_kprobe(struct kprobe *p); int kprobe_fault_handler(struct pt_regs *regs, unsigned int trapnr); int kprobe_breakpoint_handler(struct pt_regs *regs); int kprobe_single_step_handler(struct pt_regs *regs); -void kretprobe_trampoline(void); +void __kretprobe_trampoline(void); void __kprobes *trampoline_probe_handler(struct pt_regs *regs); #endif /* CONFIG_KPROBES */ diff --git a/arch/csky/kernel/probes/kprobes.c b/arch/csky/kernel/probes/kprobes.c index 784c5aba7f66..42920f25e73c 100644 --- a/arch/csky/kernel/probes/kprobes.c +++ b/arch/csky/kernel/probes/kprobes.c @@ -394,7 +394,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, { ri->ret_addr = (kprobe_opcode_t *)regs->lr; ri->fp = NULL; - regs->lr = (unsigned long) &kretprobe_trampoline; + regs->lr = (unsigned long) &__kretprobe_trampoline; } int __kprobes arch_trampoline_kprobe(struct kprobe *p) diff --git a/arch/csky/kernel/probes/kprobes_trampoline.S b/arch/csky/kernel/probes/kprobes_trampoline.S index b1fe3af24f03..ba48ad04a847 100644 --- a/arch/csky/kernel/probes/kprobes_trampoline.S +++ b/arch/csky/kernel/probes/kprobes_trampoline.S @@ -4,7 +4,7 @@ #include -ENTRY(kretprobe_trampoline) +ENTRY(__kretprobe_trampoline) SAVE_REGS_FTRACE mov a0, sp /* pt_regs */ @@ -16,4 +16,4 @@ ENTRY(kretprobe_trampoline) RESTORE_REGS_FTRACE rts -ENDPROC(kretprobe_trampoline) +ENDPROC(__kretprobe_trampoline) diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index 44c84c20b626..1a7bab1c5d7c 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -392,7 +392,7 @@ static void __kprobes set_current_kprobe(struct kprobe *p, __this_cpu_write(current_kprobe, p); } -void kretprobe_trampoline(void) +void __kretprobe_trampoline(void) { } @@ -414,7 +414,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, ri->fp = NULL; /* Replace the return addr with trampoline addr */ - regs->b0 = (unsigned long)dereference_function_descriptor(kretprobe_trampoline); + regs->b0 = (unsigned long)dereference_function_descriptor(__kretprobe_trampoline); } /* Check the instruction in the slot is break */ @@ -897,14 +897,14 @@ static struct kprobe trampoline_p = { int __init arch_init_kprobes(void) { trampoline_p.addr = - dereference_function_descriptor(kretprobe_trampoline); + dereference_function_descriptor(__kretprobe_trampoline); return register_kprobe(&trampoline_p); } int __kprobes arch_trampoline_kprobe(struct kprobe *p) { if (p->addr == - dereference_function_descriptor(kretprobe_trampoline)) + dereference_function_descriptor(__kretprobe_trampoline)) return 1; return 0; diff --git a/arch/mips/kernel/kprobes.c b/arch/mips/kernel/kprobes.c index b33bd2498651..6c7f3b143fdc 100644 --- a/arch/mips/kernel/kprobes.c +++ b/arch/mips/kernel/kprobes.c @@ -460,14 +460,14 @@ static void __used kretprobe_trampoline_holder(void) /* Keep the assembler from reordering and placing JR here. */ ".set noreorder\n\t" "nop\n\t" - ".global kretprobe_trampoline\n" - "kretprobe_trampoline:\n\t" + ".global __kretprobe_trampoline\n" + "__kretprobe_trampoline:\n\t" "nop\n\t" ".set pop" : : : "memory"); } -void kretprobe_trampoline(void); +void __kretprobe_trampoline(void); void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) @@ -476,7 +476,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, ri->fp = NULL; /* Replace the return addr with trampoline addr */ - regs->regs[31] = (unsigned long)kretprobe_trampoline; + regs->regs[31] = (unsigned long)__kretprobe_trampoline; } /* @@ -496,14 +496,14 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p, int __kprobes arch_trampoline_kprobe(struct kprobe *p) { - if (p->addr == (kprobe_opcode_t *)kretprobe_trampoline) + if (p->addr == (kprobe_opcode_t *)__kretprobe_trampoline) return 1; return 0; } static struct kprobe trampoline_p = { - .addr = (kprobe_opcode_t *)kretprobe_trampoline, + .addr = (kprobe_opcode_t *)__kretprobe_trampoline, .pre_handler = trampoline_probe_handler }; diff --git a/arch/parisc/kernel/kprobes.c b/arch/parisc/kernel/kprobes.c index 4a35ac6e2ca2..e2bdb5a5f93e 100644 --- a/arch/parisc/kernel/kprobes.c +++ b/arch/parisc/kernel/kprobes.c @@ -175,7 +175,7 @@ int __kprobes parisc_kprobe_ss_handler(struct pt_regs *regs) return 1; } -void kretprobe_trampoline(void) +void __kretprobe_trampoline(void) { asm volatile("nop"); asm volatile("nop"); @@ -217,6 +217,6 @@ int __kprobes arch_trampoline_kprobe(struct kprobe *p) int __init arch_init_kprobes(void) { trampoline_p.addr = (kprobe_opcode_t *) - dereference_function_descriptor(kretprobe_trampoline); + dereference_function_descriptor(__kretprobe_trampoline); return register_kprobe(&trampoline_p); } diff --git a/arch/powerpc/include/asm/kprobes.h b/arch/powerpc/include/asm/kprobes.h index 4fc0e15e23a5..bab364152b29 100644 --- a/arch/powerpc/include/asm/kprobes.h +++ b/arch/powerpc/include/asm/kprobes.h @@ -51,7 +51,7 @@ extern kprobe_opcode_t optprobe_template_end[]; #define flush_insn_slot(p) do { } while (0) #define kretprobe_blacklist_size 0 -void kretprobe_trampoline(void); +void __kretprobe_trampoline(void); extern void arch_remove_kprobe(struct kprobe *p); /* Architecture specific copy of original instruction */ diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 43c77142a262..86d77ff056a6 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -237,7 +237,7 @@ void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) ri->fp = NULL; /* Replace the return addr with trampoline addr */ - regs->link = (unsigned long)kretprobe_trampoline; + regs->link = (unsigned long)__kretprobe_trampoline; } NOKPROBE_SYMBOL(arch_prepare_kretprobe); @@ -403,12 +403,12 @@ NOKPROBE_SYMBOL(kprobe_handler); * - When the probed function returns, this probe * causes the handlers to fire */ -asm(".global kretprobe_trampoline\n" - ".type kretprobe_trampoline, @function\n" - "kretprobe_trampoline:\n" +asm(".global __kretprobe_trampoline\n" + ".type __kretprobe_trampoline, @function\n" + "__kretprobe_trampoline:\n" "nop\n" "blr\n" - ".size kretprobe_trampoline, .-kretprobe_trampoline\n"); + ".size __kretprobe_trampoline, .-__kretprobe_trampoline\n"); /* * Called when the probe at kretprobe trampoline is hit @@ -427,7 +427,7 @@ static int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) * as it is used to determine the return address from the trap. * For (2), since nip is not honoured with optprobes, we instead setup * the link register properly so that the subsequent 'blr' in - * kretprobe_trampoline jumps back to the right instruction. + * __kretprobe_trampoline jumps back to the right instruction. * * For nip, we should set the address to the previous instruction since * we end up emulating it in kprobe_handler(), which increments the nip @@ -543,7 +543,7 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr) NOKPROBE_SYMBOL(kprobe_fault_handler); static struct kprobe trampoline_p = { - .addr = (kprobe_opcode_t *) &kretprobe_trampoline, + .addr = (kprobe_opcode_t *) &__kretprobe_trampoline, .pre_handler = trampoline_probe_handler }; @@ -554,7 +554,7 @@ int __init arch_init_kprobes(void) int arch_trampoline_kprobe(struct kprobe *p) { - if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline) + if (p->addr == (kprobe_opcode_t *)&__kretprobe_trampoline) return 1; return 0; diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c index 325ba544883c..ce1903064031 100644 --- a/arch/powerpc/kernel/optprobes.c +++ b/arch/powerpc/kernel/optprobes.c @@ -56,7 +56,7 @@ static unsigned long can_optimize(struct kprobe *p) * has a 'nop' instruction, which can be emulated. * So further checks can be skipped. */ - if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline) + if (p->addr == (kprobe_opcode_t *)&__kretprobe_trampoline) return addr + sizeof(kprobe_opcode_t); /* diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index 9e4a4a7af380..a2443d61728e 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -155,7 +155,7 @@ int __no_sanitize_address arch_stack_walk_reliable(stack_trace_consume_fn consum * Mark stacktraces with kretprobed functions on them * as unreliable. */ - if (ip == (unsigned long)kretprobe_trampoline) + if (ip == (unsigned long)__kretprobe_trampoline) return -EINVAL; #endif diff --git a/arch/riscv/include/asm/kprobes.h b/arch/riscv/include/asm/kprobes.h index 9ea9b5ec3113..217ef89f22b9 100644 --- a/arch/riscv/include/asm/kprobes.h +++ b/arch/riscv/include/asm/kprobes.h @@ -40,7 +40,7 @@ void arch_remove_kprobe(struct kprobe *p); int kprobe_fault_handler(struct pt_regs *regs, unsigned int trapnr); bool kprobe_breakpoint_handler(struct pt_regs *regs); bool kprobe_single_step_handler(struct pt_regs *regs); -void kretprobe_trampoline(void); +void __kretprobe_trampoline(void); void __kprobes *trampoline_probe_handler(struct pt_regs *regs); #endif /* CONFIG_KPROBES */ diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c index 62d477cf11da..e6e950b7cf32 100644 --- a/arch/riscv/kernel/probes/kprobes.c +++ b/arch/riscv/kernel/probes/kprobes.c @@ -355,7 +355,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, { ri->ret_addr = (kprobe_opcode_t *)regs->ra; ri->fp = NULL; - regs->ra = (unsigned long) &kretprobe_trampoline; + regs->ra = (unsigned long) &__kretprobe_trampoline; } int __kprobes arch_trampoline_kprobe(struct kprobe *p) diff --git a/arch/riscv/kernel/probes/kprobes_trampoline.S b/arch/riscv/kernel/probes/kprobes_trampoline.S index 6e85d021e2a2..7bdb09ded39b 100644 --- a/arch/riscv/kernel/probes/kprobes_trampoline.S +++ b/arch/riscv/kernel/probes/kprobes_trampoline.S @@ -75,7 +75,7 @@ REG_L x31, PT_T6(sp) .endm -ENTRY(kretprobe_trampoline) +ENTRY(__kretprobe_trampoline) addi sp, sp, -(PT_SIZE_ON_STACK) save_all_base_regs @@ -90,4 +90,4 @@ ENTRY(kretprobe_trampoline) addi sp, sp, PT_SIZE_ON_STACK ret -ENDPROC(kretprobe_trampoline) +ENDPROC(__kretprobe_trampoline) diff --git a/arch/s390/include/asm/kprobes.h b/arch/s390/include/asm/kprobes.h index 09cdb632a490..5eb722c984e4 100644 --- a/arch/s390/include/asm/kprobes.h +++ b/arch/s390/include/asm/kprobes.h @@ -70,7 +70,7 @@ struct kprobe_ctlblk { }; void arch_remove_kprobe(struct kprobe *p); -void kretprobe_trampoline(void); +void __kretprobe_trampoline(void); int kprobe_fault_handler(struct pt_regs *regs, int trapnr); int kprobe_exceptions_notify(struct notifier_block *self, diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 5fa86e54f129..c505c0ee5f47 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -242,7 +242,7 @@ void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) ri->fp = NULL; /* Replace the return addr with trampoline addr */ - regs->gprs[14] = (unsigned long) &kretprobe_trampoline; + regs->gprs[14] = (unsigned long) &__kretprobe_trampoline; } NOKPROBE_SYMBOL(arch_prepare_kretprobe); @@ -334,8 +334,8 @@ NOKPROBE_SYMBOL(kprobe_handler); */ static void __used kretprobe_trampoline_holder(void) { - asm volatile(".global kretprobe_trampoline\n" - "kretprobe_trampoline: bcr 0,0\n"); + asm volatile(".global __kretprobe_trampoline\n" + "__kretprobe_trampoline: bcr 0,0\n"); } /* @@ -509,7 +509,7 @@ int kprobe_exceptions_notify(struct notifier_block *self, NOKPROBE_SYMBOL(kprobe_exceptions_notify); static struct kprobe trampoline = { - .addr = (kprobe_opcode_t *) &kretprobe_trampoline, + .addr = (kprobe_opcode_t *) &__kretprobe_trampoline, .pre_handler = trampoline_probe_handler }; @@ -520,6 +520,6 @@ int __init arch_init_kprobes(void) int arch_trampoline_kprobe(struct kprobe *p) { - return p->addr == (kprobe_opcode_t *) &kretprobe_trampoline; + return p->addr == (kprobe_opcode_t *) &__kretprobe_trampoline; } NOKPROBE_SYMBOL(arch_trampoline_kprobe); diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c index 101477b3e263..b7bb1981e9ee 100644 --- a/arch/s390/kernel/stacktrace.c +++ b/arch/s390/kernel/stacktrace.c @@ -46,7 +46,7 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, * Mark stacktraces with kretprobed functions on them * as unreliable. */ - if (state.ip == (unsigned long)kretprobe_trampoline) + if (state.ip == (unsigned long)__kretprobe_trampoline) return -EINVAL; #endif diff --git a/arch/sh/include/asm/kprobes.h b/arch/sh/include/asm/kprobes.h index 6171682f7798..eeba83e0a7d2 100644 --- a/arch/sh/include/asm/kprobes.h +++ b/arch/sh/include/asm/kprobes.h @@ -26,7 +26,7 @@ typedef insn_size_t kprobe_opcode_t; struct kprobe; void arch_remove_kprobe(struct kprobe *); -void kretprobe_trampoline(void); +void __kretprobe_trampoline(void); /* Architecture specific copy of original instruction*/ struct arch_specific_insn { diff --git a/arch/sh/kernel/kprobes.c b/arch/sh/kernel/kprobes.c index 8e76a35e6e33..aed1ea8e2c2f 100644 --- a/arch/sh/kernel/kprobes.c +++ b/arch/sh/kernel/kprobes.c @@ -207,7 +207,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, ri->fp = NULL; /* Replace the return addr with trampoline addr */ - regs->pr = (unsigned long)kretprobe_trampoline; + regs->pr = (unsigned long)__kretprobe_trampoline; } static int __kprobes kprobe_handler(struct pt_regs *regs) @@ -293,13 +293,13 @@ no_kprobe: */ static void __used kretprobe_trampoline_holder(void) { - asm volatile (".globl kretprobe_trampoline\n" - "kretprobe_trampoline:\n\t" + asm volatile (".globl __kretprobe_trampoline\n" + "__kretprobe_trampoline:\n\t" "nop\n"); } /* - * Called when we hit the probe point at kretprobe_trampoline + * Called when we hit the probe point at __kretprobe_trampoline */ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) { @@ -442,7 +442,7 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, } static struct kprobe trampoline_p = { - .addr = (kprobe_opcode_t *)&kretprobe_trampoline, + .addr = (kprobe_opcode_t *)&__kretprobe_trampoline, .pre_handler = trampoline_probe_handler }; diff --git a/arch/sparc/include/asm/kprobes.h b/arch/sparc/include/asm/kprobes.h index bfcaa6326c20..06c2bc767ef7 100644 --- a/arch/sparc/include/asm/kprobes.h +++ b/arch/sparc/include/asm/kprobes.h @@ -24,7 +24,7 @@ do { flushi(&(p)->ainsn.insn[0]); \ flushi(&(p)->ainsn.insn[1]); \ } while (0) -void kretprobe_trampoline(void); +void __kretprobe_trampoline(void); /* Architecture specific copy of original instruction*/ struct arch_specific_insn { diff --git a/arch/sparc/kernel/kprobes.c b/arch/sparc/kernel/kprobes.c index 401534236c2e..535c7b35cb59 100644 --- a/arch/sparc/kernel/kprobes.c +++ b/arch/sparc/kernel/kprobes.c @@ -440,7 +440,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, /* Replace the return addr with trampoline addr */ regs->u_regs[UREG_RETPC] = - ((unsigned long)kretprobe_trampoline) - 8; + ((unsigned long)__kretprobe_trampoline) - 8; } /* @@ -465,13 +465,13 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p, static void __used kretprobe_trampoline_holder(void) { - asm volatile(".global kretprobe_trampoline\n" - "kretprobe_trampoline:\n" + asm volatile(".global __kretprobe_trampoline\n" + "__kretprobe_trampoline:\n" "\tnop\n" "\tnop\n"); } static struct kprobe trampoline_p = { - .addr = (kprobe_opcode_t *) &kretprobe_trampoline, + .addr = (kprobe_opcode_t *) &__kretprobe_trampoline, .pre_handler = trampoline_probe_handler }; @@ -482,7 +482,7 @@ int __init arch_init_kprobes(void) int __kprobes arch_trampoline_kprobe(struct kprobe *p) { - if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline) + if (p->addr == (kprobe_opcode_t *)&__kretprobe_trampoline) return 1; return 0; diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 0c59ef5971de..79cd23dba5b5 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -809,7 +809,7 @@ void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) ri->fp = sara; /* Replace the return addr with trampoline addr */ - *sara = (unsigned long) &kretprobe_trampoline; + *sara = (unsigned long) &__kretprobe_trampoline; } NOKPROBE_SYMBOL(arch_prepare_kretprobe); @@ -1019,9 +1019,9 @@ NOKPROBE_SYMBOL(kprobe_int3_handler); */ asm( ".text\n" - ".global kretprobe_trampoline\n" - ".type kretprobe_trampoline, @function\n" - "kretprobe_trampoline:\n" + ".global __kretprobe_trampoline\n" + ".type __kretprobe_trampoline, @function\n" + "__kretprobe_trampoline:\n" /* We don't bother saving the ss register */ #ifdef CONFIG_X86_64 " pushq %rsp\n" @@ -1045,14 +1045,14 @@ asm( " popfl\n" #endif " ret\n" - ".size kretprobe_trampoline, .-kretprobe_trampoline\n" + ".size __kretprobe_trampoline, .-__kretprobe_trampoline\n" ); -NOKPROBE_SYMBOL(kretprobe_trampoline); -STACK_FRAME_NON_STANDARD(kretprobe_trampoline); +NOKPROBE_SYMBOL(__kretprobe_trampoline); +STACK_FRAME_NON_STANDARD(__kretprobe_trampoline); /* - * Called from kretprobe_trampoline + * Called from __kretprobe_trampoline */ __used __visible void *trampoline_handler(struct pt_regs *regs) { @@ -1061,7 +1061,7 @@ __used __visible void *trampoline_handler(struct pt_regs *regs) #ifdef CONFIG_X86_32 regs->gs = 0; #endif - regs->ip = (unsigned long)&kretprobe_trampoline; + regs->ip = (unsigned long)&__kretprobe_trampoline; regs->orig_ax = ~0UL; return (void *)kretprobe_trampoline_handler(regs, ®s->sp); diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 96f5df93e36e..b6b2370f4a4c 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -188,14 +188,14 @@ extern void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs); extern int arch_trampoline_kprobe(struct kprobe *p); -void kretprobe_trampoline(void); +void __kretprobe_trampoline(void); /* * Since some architecture uses structured function pointer, * use dereference_function_descriptor() to get real function address. */ static nokprobe_inline void *kretprobe_trampoline_addr(void) { - return dereference_kernel_function_descriptor(kretprobe_trampoline); + return dereference_kernel_function_descriptor(__kretprobe_trampoline); } /* If the trampoline handler called from a kprobe, use this version */ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index c2ca40e8595b..5a5949c659d0 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -349,7 +349,7 @@ EXPORT_SYMBOL_GPL(trace_output_call); #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) { - static const char tramp_name[] = "kretprobe_trampoline"; + static const char tramp_name[] = "__kretprobe_trampoline"; int size = sizeof(tramp_name); if (strncmp(tramp_name, name, size) == 0) From 03bac0df2886882c43e6d0bfff9dee84a184fc7e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:41:04 +0900 Subject: [PATCH 016/101] kprobes: Add kretprobe_find_ret_addr() for searching return address Introduce kretprobe_find_ret_addr() and is_kretprobe_trampoline(). These APIs will be used by the ORC stack unwinder and ftrace, so that they can check whether the given address points kretprobe trampoline code and query the correct return address in that case. Link: https://lkml.kernel.org/r/163163046461.489837.1044778356430293962.stgit@devnote2 Signed-off-by: Masami Hiramatsu Tested-by: Andrii Nakryiko Signed-off-by: Steven Rostedt (VMware) --- include/linux/kprobes.h | 22 ++++++++ kernel/kprobes.c | 113 ++++++++++++++++++++++++++++++---------- 2 files changed, 107 insertions(+), 28 deletions(-) diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index b6b2370f4a4c..6d47a9da1e0a 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -505,6 +505,28 @@ static inline bool is_kprobe_optinsn_slot(unsigned long addr) } #endif /* !CONFIG_OPTPROBES */ +#ifdef CONFIG_KRETPROBES +static nokprobe_inline bool is_kretprobe_trampoline(unsigned long addr) +{ + return (void *)addr == kretprobe_trampoline_addr(); +} + +unsigned long kretprobe_find_ret_addr(struct task_struct *tsk, void *fp, + struct llist_node **cur); +#else +static nokprobe_inline bool is_kretprobe_trampoline(unsigned long addr) +{ + return false; +} + +static nokprobe_inline +unsigned long kretprobe_find_ret_addr(struct task_struct *tsk, void *fp, + struct llist_node **cur) +{ + return 0; +} +#endif + /* Returns true if kprobes handled the fault */ static nokprobe_inline bool kprobe_page_fault(struct pt_regs *regs, unsigned int trap) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 6ed755111eea..833f07f33115 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1863,45 +1863,87 @@ static struct notifier_block kprobe_exceptions_nb = { #ifdef CONFIG_KRETPROBES +/* This assumes the 'tsk' is the current task or the is not running. */ +static kprobe_opcode_t *__kretprobe_find_ret_addr(struct task_struct *tsk, + struct llist_node **cur) +{ + struct kretprobe_instance *ri = NULL; + struct llist_node *node = *cur; + + if (!node) + node = tsk->kretprobe_instances.first; + else + node = node->next; + + while (node) { + ri = container_of(node, struct kretprobe_instance, llist); + if (ri->ret_addr != kretprobe_trampoline_addr()) { + *cur = node; + return ri->ret_addr; + } + node = node->next; + } + return NULL; +} +NOKPROBE_SYMBOL(__kretprobe_find_ret_addr); + +/** + * kretprobe_find_ret_addr -- Find correct return address modified by kretprobe + * @tsk: Target task + * @fp: A frame pointer + * @cur: a storage of the loop cursor llist_node pointer for next call + * + * Find the correct return address modified by a kretprobe on @tsk in unsigned + * long type. If it finds the return address, this returns that address value, + * or this returns 0. + * The @tsk must be 'current' or a task which is not running. @fp is a hint + * to get the currect return address - which is compared with the + * kretprobe_instance::fp field. The @cur is a loop cursor for searching the + * kretprobe return addresses on the @tsk. The '*@cur' should be NULL at the + * first call, but '@cur' itself must NOT NULL. + */ +unsigned long kretprobe_find_ret_addr(struct task_struct *tsk, void *fp, + struct llist_node **cur) +{ + struct kretprobe_instance *ri = NULL; + kprobe_opcode_t *ret; + + if (WARN_ON_ONCE(!cur)) + return 0; + + do { + ret = __kretprobe_find_ret_addr(tsk, cur); + if (!ret) + break; + ri = container_of(*cur, struct kretprobe_instance, llist); + } while (ri->fp != fp); + + return (unsigned long)ret; +} +NOKPROBE_SYMBOL(kretprobe_find_ret_addr); + unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs, void *frame_pointer) { kprobe_opcode_t *correct_ret_addr = NULL; struct kretprobe_instance *ri = NULL; - struct llist_node *first, *node; + struct llist_node *first, *node = NULL; struct kretprobe *rp; - /* Find all nodes for this frame. */ - first = node = current->kretprobe_instances.first; - while (node) { - ri = container_of(node, struct kretprobe_instance, llist); - - BUG_ON(ri->fp != frame_pointer); - - if (ri->ret_addr != kretprobe_trampoline_addr()) { - correct_ret_addr = ri->ret_addr; - /* - * This is the real return address. Any other - * instances associated with this task are for - * other calls deeper on the call stack - */ - goto found; - } - - node = node->next; + /* Find correct address and all nodes for this frame. */ + correct_ret_addr = __kretprobe_find_ret_addr(current, &node); + if (!correct_ret_addr) { + pr_err("kretprobe: Return address not found, not execute handler. Maybe there is a bug in the kernel.\n"); + BUG_ON(1); } - pr_err("kretprobe: Return address not found, not execute handler. Maybe there is a bug in the kernel.\n"); - BUG_ON(1); -found: - /* Unlink all nodes for this frame. */ - current->kretprobe_instances.first = node->next; - node->next = NULL; - - /* Run them.. */ + /* Run the user handler of the nodes. */ + first = current->kretprobe_instances.first; while (first) { ri = container_of(first, struct kretprobe_instance, llist); - first = first->next; + + if (WARN_ON_ONCE(ri->fp != frame_pointer)) + break; rp = get_kretprobe(ri); if (rp && rp->handler) { @@ -1912,6 +1954,21 @@ found: rp->handler(ri, regs); __this_cpu_write(current_kprobe, prev); } + if (first == node) + break; + + first = first->next; + } + + /* Unlink all nodes for this frame. */ + first = current->kretprobe_instances.first; + current->kretprobe_instances.first = node->next; + node->next = NULL; + + /* Recycle free instances. */ + while (first) { + ri = container_of(first, struct kretprobe_instance, llist); + first = first->next; recycle_rp_inst(ri); } From e028c4f7ac7ca8c96126fe46c54ab3d56ffe6a66 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 14 Sep 2021 23:41:13 +0900 Subject: [PATCH 017/101] objtool: Add frame-pointer-specific function ignore Add a CONFIG_FRAME_POINTER-specific version of STACK_FRAME_NON_STANDARD() for the case where a function is intentionally missing frame pointer setup, but otherwise needs objtool/ORC coverage when frame pointers are disabled. Link: https://lkml.kernel.org/r/163163047364.489837.17377799909553689661.stgit@devnote2 Signed-off-by: Josh Poimboeuf Reviewed-by: Masami Hiramatsu Tested-by: Masami Hiramatsu Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/objtool.h | 12 ++++++++++++ tools/include/linux/objtool.h | 12 ++++++++++++ 2 files changed, 24 insertions(+) diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 7e72d975cb76..aca52db2f3f3 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -66,6 +66,17 @@ struct unwind_hint { static void __used __section(".discard.func_stack_frame_non_standard") \ *__func_stack_frame_non_standard_##func = func +/* + * STACK_FRAME_NON_STANDARD_FP() is a frame-pointer-specific function ignore + * for the case where a function is intentionally missing frame pointer setup, + * but otherwise needs objtool/ORC coverage when frame pointers are disabled. + */ +#ifdef CONFIG_FRAME_POINTER +#define STACK_FRAME_NON_STANDARD_FP(func) STACK_FRAME_NON_STANDARD(func) +#else +#define STACK_FRAME_NON_STANDARD_FP(func) +#endif + #else /* __ASSEMBLY__ */ /* @@ -127,6 +138,7 @@ struct unwind_hint { #define UNWIND_HINT(sp_reg, sp_offset, type, end) \ "\n\t" #define STACK_FRAME_NON_STANDARD(func) +#define STACK_FRAME_NON_STANDARD_FP(func) #else #define ANNOTATE_INTRA_FUNCTION_CALL .macro UNWIND_HINT sp_reg:req sp_offset=0 type:req end=0 diff --git a/tools/include/linux/objtool.h b/tools/include/linux/objtool.h index 7e72d975cb76..aca52db2f3f3 100644 --- a/tools/include/linux/objtool.h +++ b/tools/include/linux/objtool.h @@ -66,6 +66,17 @@ struct unwind_hint { static void __used __section(".discard.func_stack_frame_non_standard") \ *__func_stack_frame_non_standard_##func = func +/* + * STACK_FRAME_NON_STANDARD_FP() is a frame-pointer-specific function ignore + * for the case where a function is intentionally missing frame pointer setup, + * but otherwise needs objtool/ORC coverage when frame pointers are disabled. + */ +#ifdef CONFIG_FRAME_POINTER +#define STACK_FRAME_NON_STANDARD_FP(func) STACK_FRAME_NON_STANDARD(func) +#else +#define STACK_FRAME_NON_STANDARD_FP(func) +#endif + #else /* __ASSEMBLY__ */ /* @@ -127,6 +138,7 @@ struct unwind_hint { #define UNWIND_HINT(sp_reg, sp_offset, type, end) \ "\n\t" #define STACK_FRAME_NON_STANDARD(func) +#define STACK_FRAME_NON_STANDARD_FP(func) #else #define ANNOTATE_INTRA_FUNCTION_CALL .macro UNWIND_HINT sp_reg:req sp_offset=0 type:req end=0 From 5b284b1933688ff18099b2cb8e83456bdd149e10 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 14 Sep 2021 23:41:23 +0900 Subject: [PATCH 018/101] objtool: Ignore unwind hints for ignored functions If a function is ignored, also ignore its hints. This is useful for the case where the function ignore is conditional on frame pointers, e.g. STACK_FRAME_NON_STANDARD_FP(). Link: https://lkml.kernel.org/r/163163048317.489837.10988954983369863209.stgit@devnote2 Signed-off-by: Josh Poimboeuf Reviewed-by: Masami Hiramatsu Tested-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- tools/objtool/check.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index e5947fbb9e7a..67cbdcfcabae 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2909,7 +2909,7 @@ static int validate_unwind_hints(struct objtool_file *file, struct section *sec) } while (&insn->list != &file->insn_list && (!sec || insn->sec == sec)) { - if (insn->hint && !insn->visited) { + if (insn->hint && !insn->visited && !insn->ignore) { ret = validate_branch(file, insn->func, insn, state); if (ret && backtrace) BT_FUNC("<=== (hint)", insn); From eb4a3f7d78c7cf03654dfebdb2df64bd00a7af10 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 14 Sep 2021 23:41:32 +0900 Subject: [PATCH 019/101] x86/kprobes: Add UNWIND_HINT_FUNC on kretprobe_trampoline() Add UNWIND_HINT_FUNC on __kretprobe_trampoline() code so that ORC information is generated on the __kretprobe_trampoline() correctly. Also, this uses STACK_FRAME_NON_STANDARD_FP(), CONFIG_FRAME_POINTER- -specific version of STACK_FRAME_NON_STANDARD(). Link: https://lkml.kernel.org/r/163163049242.489837.11970969750993364293.stgit@devnote2 Signed-off-by: Josh Poimboeuf Signed-off-by: Masami Hiramatsu Tested-by: Andrii Nakryiko Signed-off-by: Steven Rostedt (VMware) --- arch/x86/include/asm/unwind_hints.h | 5 +++++ arch/x86/kernel/kprobes/core.c | 13 +++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h index 8e574c0afef8..8b33674288ea 100644 --- a/arch/x86/include/asm/unwind_hints.h +++ b/arch/x86/include/asm/unwind_hints.h @@ -52,6 +52,11 @@ UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=8 type=UNWIND_HINT_TYPE_FUNC .endm +#else + +#define UNWIND_HINT_FUNC \ + UNWIND_HINT(ORC_REG_SP, 8, UNWIND_HINT_TYPE_FUNC, 0) + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_UNWIND_HINTS_H */ diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 79cd23dba5b5..d1436d7463fd 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -1025,6 +1025,7 @@ asm( /* We don't bother saving the ss register */ #ifdef CONFIG_X86_64 " pushq %rsp\n" + UNWIND_HINT_FUNC " pushfq\n" SAVE_REGS_STRING " movq %rsp, %rdi\n" @@ -1035,6 +1036,7 @@ asm( " popfq\n" #else " pushl %esp\n" + UNWIND_HINT_FUNC " pushfl\n" SAVE_REGS_STRING " movl %esp, %eax\n" @@ -1048,8 +1050,15 @@ asm( ".size __kretprobe_trampoline, .-__kretprobe_trampoline\n" ); NOKPROBE_SYMBOL(__kretprobe_trampoline); -STACK_FRAME_NON_STANDARD(__kretprobe_trampoline); - +/* + * __kretprobe_trampoline() skips updating frame pointer. The frame pointer + * saved in trampoline_handler() points to the real caller function's + * frame pointer. Thus the __kretprobe_trampoline() doesn't have a + * standard stack frame with CONFIG_FRAME_POINTER=y. + * Let's mark it non-standard function. Anyway, FP unwinder can correctly + * unwind without the hint. + */ +STACK_FRAME_NON_STANDARD_FP(__kretprobe_trampoline); /* * Called from __kretprobe_trampoline From bb6121b11c22912ae558f853036f8ac37eb45973 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:41:41 +0900 Subject: [PATCH 020/101] ARC: Add instruction_pointer_set() API Add instruction_pointer_set() API for arc. Link: https://lkml.kernel.org/r/163163050148.489837.15187799269793560256.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- arch/arc/include/asm/ptrace.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arc/include/asm/ptrace.h b/arch/arc/include/asm/ptrace.h index 4c3c9be5bd16..cca8d6583e31 100644 --- a/arch/arc/include/asm/ptrace.h +++ b/arch/arc/include/asm/ptrace.h @@ -149,6 +149,11 @@ static inline long regs_return_value(struct pt_regs *regs) return (long)regs->r0; } +static inline void instruction_pointer_set(struct pt_regs *regs, + unsigned long val) +{ + instruction_pointer(regs) = val; +} #endif /* !__ASSEMBLY__ */ #endif /* __ASM_PTRACE_H */ From c1f76fe58f6983205ad14045dbc303416d5e990a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:41:52 +0900 Subject: [PATCH 021/101] ia64: Add instruction_pointer_set() API Add instruction_pointer_set() API for ia64. Link: https://lkml.kernel.org/r/163163051195.489837.1039597819838213481.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- arch/ia64/include/asm/ptrace.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/ia64/include/asm/ptrace.h b/arch/ia64/include/asm/ptrace.h index 08179135905c..8a2d0f72b324 100644 --- a/arch/ia64/include/asm/ptrace.h +++ b/arch/ia64/include/asm/ptrace.h @@ -51,6 +51,11 @@ * the canonical representation by adding to instruction pointer. */ # define instruction_pointer(regs) ((regs)->cr_iip + ia64_psr(regs)->ri) +# define instruction_pointer_set(regs, val) \ +({ \ + ia64_psr(regs)->ri = (val & 0xf); \ + regs->cr_iip = (val & ~0xfULL); \ +}) static inline unsigned long user_stack_pointer(struct pt_regs *regs) { From 7391dd19027cec4e0edf81b7c27079ae0ecd2d6b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:42:02 +0900 Subject: [PATCH 022/101] arm: kprobes: Make space for instruction pointer on stack Since arm's __kretprobe_trampoline() saves partial 'pt_regs' on the stack, 'regs->ARM_pc' (instruction pointer) is not accessible from the kretprobe handler. This means if instruction_pointer_set() is used from kretprobe handler, it will break the data on the stack. Make space for instruction pointer (ARM_pc) on the stack in the __kretprobe_trampoline() for fixing this problem. Link: https://lkml.kernel.org/r/163163052262.489837.10327621053231461255.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- arch/arm/probes/kprobes/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/probes/kprobes/core.c b/arch/arm/probes/kprobes/core.c index 67ce7eb8f285..95f23b47ba27 100644 --- a/arch/arm/probes/kprobes/core.c +++ b/arch/arm/probes/kprobes/core.c @@ -376,11 +376,13 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, void __naked __kprobes __kretprobe_trampoline(void) { __asm__ __volatile__ ( + "sub sp, sp, #16 \n\t" "stmdb sp!, {r0 - r11} \n\t" "mov r0, sp \n\t" "bl trampoline_handler \n\t" "mov lr, r0 \n\t" "ldmia sp!, {r0 - r11} \n\t" + "add sp, sp, #16 \n\t" #ifdef CONFIG_THUMB2_KERNEL "bx lr \n\t" #else From df91c5bccb0c2cb868b54bd68a6ddf1fcbede6b1 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:42:12 +0900 Subject: [PATCH 023/101] kprobes: Enable stacktrace from pt_regs in kretprobe handler Since the ORC unwinder from pt_regs requires setting up regs->ip correctly, set the correct return address to the regs->ip before calling user kretprobe handler. This allows the kretrprobe handler to trace stack from the kretprobe's pt_regs by stack_trace_save_regs() (eBPF will do this), instead of stack tracing from the handler context by stack_trace_save() (ftrace will do this). Link: https://lkml.kernel.org/r/163163053237.489837.4272653874525136832.stgit@devnote2 Suggested-by: Josh Poimboeuf Signed-off-by: Masami Hiramatsu Tested-by: Andrii Nakryiko Acked-by: Josh Poimboeuf Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 833f07f33115..ebc587b9a346 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1937,6 +1937,13 @@ unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs, BUG_ON(1); } + /* + * Set the return address as the instruction pointer, because if the + * user handler calls stack_trace_save_regs() with this 'regs', + * the stack trace will start from the instruction pointer. + */ + instruction_pointer_set(regs, (unsigned long)correct_ret_addr); + /* Run the user handler of the nodes. */ first = current->kretprobe_instances.first; while (first) { From 1f36839308cf8d7d9d35586029f8ae4322e18ef5 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:42:22 +0900 Subject: [PATCH 024/101] x86/kprobes: Push a fake return address at kretprobe_trampoline Change __kretprobe_trampoline() to push the address of the __kretprobe_trampoline() as a fake return address at the bottom of the stack frame. This fake return address will be replaced with the correct return address in the trampoline_handler(). With this change, the ORC unwinder can check whether the return address is modified by kretprobes or not. Link: https://lkml.kernel.org/r/163163054185.489837.14338744048957727386.stgit@devnote2 Signed-off-by: Masami Hiramatsu Suggested-by: Josh Poimboeuf Tested-by: Andrii Nakryiko Acked-by: Josh Poimboeuf Signed-off-by: Steven Rostedt (VMware) --- arch/x86/kernel/kprobes/core.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index d1436d7463fd..7e1111c19605 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -1022,28 +1022,33 @@ asm( ".global __kretprobe_trampoline\n" ".type __kretprobe_trampoline, @function\n" "__kretprobe_trampoline:\n" - /* We don't bother saving the ss register */ #ifdef CONFIG_X86_64 - " pushq %rsp\n" + /* Push a fake return address to tell the unwinder it's a kretprobe. */ + " pushq $__kretprobe_trampoline\n" UNWIND_HINT_FUNC + /* Save the 'sp - 8', this will be fixed later. */ + " pushq %rsp\n" " pushfq\n" SAVE_REGS_STRING " movq %rsp, %rdi\n" " call trampoline_handler\n" - /* Replace saved sp with true return address. */ - " movq %rax, 19*8(%rsp)\n" RESTORE_REGS_STRING + /* In trampoline_handler(), 'regs->flags' is copied to 'regs->sp'. */ + " addq $8, %rsp\n" " popfq\n" #else - " pushl %esp\n" + /* Push a fake return address to tell the unwinder it's a kretprobe. */ + " pushl $__kretprobe_trampoline\n" UNWIND_HINT_FUNC + /* Save the 'sp - 4', this will be fixed later. */ + " pushl %esp\n" " pushfl\n" SAVE_REGS_STRING " movl %esp, %eax\n" " call trampoline_handler\n" - /* Replace saved sp with true return address. */ - " movl %eax, 15*4(%esp)\n" RESTORE_REGS_STRING + /* In trampoline_handler(), 'regs->flags' is copied to 'regs->sp'. */ + " addl $4, %esp\n" " popfl\n" #endif " ret\n" @@ -1063,8 +1068,10 @@ STACK_FRAME_NON_STANDARD_FP(__kretprobe_trampoline); /* * Called from __kretprobe_trampoline */ -__used __visible void *trampoline_handler(struct pt_regs *regs) +__used __visible void trampoline_handler(struct pt_regs *regs) { + unsigned long *frame_pointer; + /* fixup registers */ regs->cs = __KERNEL_CS; #ifdef CONFIG_X86_32 @@ -1072,8 +1079,17 @@ __used __visible void *trampoline_handler(struct pt_regs *regs) #endif regs->ip = (unsigned long)&__kretprobe_trampoline; regs->orig_ax = ~0UL; + regs->sp += sizeof(long); + frame_pointer = ®s->sp + 1; - return (void *)kretprobe_trampoline_handler(regs, ®s->sp); + /* Replace fake return address with real one. */ + *frame_pointer = kretprobe_trampoline_handler(regs, frame_pointer); + + /* + * Copy FLAGS to 'pt_regs::sp' so that __kretprobe_trapmoline() + * can do RET right after POPF. + */ + regs->sp = regs->flags; } NOKPROBE_SYMBOL(trampoline_handler); From 19138af1bd880d52318bbb164de72a482e59a45c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:42:31 +0900 Subject: [PATCH 025/101] x86/unwind: Recover kretprobe trampoline entry Since the kretprobe replaces the function return address with the kretprobe_trampoline on the stack, x86 unwinders can not continue the stack unwinding at that point, or record kretprobe_trampoline instead of correct return address. To fix this issue, find the correct return address from task's kretprobe_instances as like as function-graph tracer does. With this fix, the unwinder can correctly unwind the stack from kretprobe event on x86, as below. <...>-135 [003] ...1 6.722338: r_full_proxy_read_0: (vfs_read+0xab/0x1a0 <- full_proxy_read) <...>-135 [003] ...1 6.722377: => kretprobe_trace_func+0x209/0x2f0 => kretprobe_dispatcher+0x4a/0x70 => __kretprobe_trampoline_handler+0xca/0x150 => trampoline_handler+0x44/0x70 => kretprobe_trampoline+0x2a/0x50 => vfs_read+0xab/0x1a0 => ksys_read+0x5f/0xe0 => do_syscall_64+0x33/0x40 => entry_SYSCALL_64_after_hwframe+0x44/0xae Link: https://lkml.kernel.org/r/163163055130.489837.5161749078833497255.stgit@devnote2 Reported-by: Daniel Xu Signed-off-by: Masami Hiramatsu Suggested-by: Josh Poimboeuf Tested-by: Andrii Nakryiko Acked-by: Josh Poimboeuf Signed-off-by: Steven Rostedt (VMware) --- arch/x86/include/asm/unwind.h | 23 +++++++++++++++++++++++ arch/x86/kernel/unwind_frame.c | 3 +-- arch/x86/kernel/unwind_guess.c | 3 +-- arch/x86/kernel/unwind_orc.c | 21 +++++++++++++++++---- 4 files changed, 42 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index 70fc159ebe69..fca2e783e3ce 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -15,6 +16,7 @@ struct unwind_state { unsigned long stack_mask; struct task_struct *task; int graph_idx; + struct llist_node *kr_cur; bool error; #if defined(CONFIG_UNWINDER_ORC) bool signal, full_regs; @@ -99,6 +101,27 @@ void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, void *orc, size_t orc_size) {} #endif +static inline +unsigned long unwind_recover_kretprobe(struct unwind_state *state, + unsigned long addr, unsigned long *addr_p) +{ + return is_kretprobe_trampoline(addr) ? + kretprobe_find_ret_addr(state->task, addr_p, &state->kr_cur) : + addr; +} + +/* Recover the return address modified by kretprobe and ftrace_graph. */ +static inline +unsigned long unwind_recover_ret_addr(struct unwind_state *state, + unsigned long addr, unsigned long *addr_p) +{ + unsigned long ret; + + ret = ftrace_graph_ret_addr(state->task, &state->graph_idx, + addr, addr_p); + return unwind_recover_kretprobe(state, ret, addr_p); +} + /* * This disables KASAN checking when reading a value from another task's stack, * since the other task could be running on another CPU and could have poisoned diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index d7c44b257f7f..8e1c50c86e5d 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -240,8 +240,7 @@ static bool update_stack_state(struct unwind_state *state, else { addr_p = unwind_get_return_address_ptr(state); addr = READ_ONCE_TASK_STACK(state->task, *addr_p); - state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, - addr, addr_p); + state->ip = unwind_recover_ret_addr(state, addr, addr_p); } /* Save the original stack pointer for unwind_dump(): */ diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c index c49f10ffd8cd..884d68a6e714 100644 --- a/arch/x86/kernel/unwind_guess.c +++ b/arch/x86/kernel/unwind_guess.c @@ -15,8 +15,7 @@ unsigned long unwind_get_return_address(struct unwind_state *state) addr = READ_ONCE_NOCHECK(*state->sp); - return ftrace_graph_ret_addr(state->task, &state->graph_idx, - addr, state->sp); + return unwind_recover_ret_addr(state, addr, state->sp); } EXPORT_SYMBOL_GPL(unwind_get_return_address); diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index a1202536fc57..e6f7592790af 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -534,9 +534,8 @@ bool unwind_next_frame(struct unwind_state *state) if (!deref_stack_reg(state, ip_p, &state->ip)) goto err; - state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, - state->ip, (void *)ip_p); - + state->ip = unwind_recover_ret_addr(state, state->ip, + (unsigned long *)ip_p); state->sp = sp; state->regs = NULL; state->prev_regs = NULL; @@ -549,7 +548,18 @@ bool unwind_next_frame(struct unwind_state *state) (void *)orig_ip); goto err; } - + /* + * There is a small chance to interrupt at the entry of + * __kretprobe_trampoline() where the ORC info doesn't exist. + * That point is right after the RET to __kretprobe_trampoline() + * which was modified return address. + * At that point, the @addr_p of the unwind_recover_kretprobe() + * (this has to point the address of the stack entry storing + * the modified return address) must be "SP - (a stack entry)" + * because SP is incremented by the RET. + */ + state->ip = unwind_recover_kretprobe(state, state->ip, + (unsigned long *)(state->sp - sizeof(long))); state->regs = (struct pt_regs *)sp; state->prev_regs = NULL; state->full_regs = true; @@ -562,6 +572,9 @@ bool unwind_next_frame(struct unwind_state *state) (void *)orig_ip); goto err; } + /* See UNWIND_HINT_TYPE_REGS case comment. */ + state->ip = unwind_recover_kretprobe(state, state->ip, + (unsigned long *)(state->sp - sizeof(long))); if (state->full_regs) state->prev_regs = state->regs; From 7da89495d500d6a1e6fe1019587c3b611c7bd217 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:42:40 +0900 Subject: [PATCH 026/101] tracing: Show kretprobe unknown indicator only for kretprobe_trampoline ftrace shows "[unknown/kretprobe'd]" indicator all addresses in the kretprobe_trampoline, but the modified address by kretprobe should be only kretprobe_trampoline+0. Link: https://lkml.kernel.org/r/163163056044.489837.794883849706638013.stgit@devnote2 Signed-off-by: Masami Hiramatsu Acked-by: Steven Rostedt (VMware) Tested-by: Andrii Nakryiko Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_output.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 5a5949c659d0..3547e7176ff7 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -346,22 +347,12 @@ int trace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) } EXPORT_SYMBOL_GPL(trace_output_call); -#ifdef CONFIG_KRETPROBES -static inline const char *kretprobed(const char *name) +static inline const char *kretprobed(const char *name, unsigned long addr) { - static const char tramp_name[] = "__kretprobe_trampoline"; - int size = sizeof(tramp_name); - - if (strncmp(tramp_name, name, size) == 0) + if (is_kretprobe_trampoline(addr)) return "[unknown/kretprobe'd]"; return name; } -#else -static inline const char *kretprobed(const char *name) -{ - return name; -} -#endif /* CONFIG_KRETPROBES */ void trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset) @@ -374,7 +365,7 @@ trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset) sprint_symbol(str, address); else kallsyms_lookup(address, NULL, NULL, NULL, str); - name = kretprobed(str); + name = kretprobed(str, address); if (name && strlen(name)) { trace_seq_puts(s, name); From bf094cffea2a6503ce84062f9f0243bef77c58f9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:42:51 +0900 Subject: [PATCH 027/101] x86/kprobes: Fixup return address in generic trampoline handler In x86, the fake return address on the stack saved by __kretprobe_trampoline() will be replaced with the real return address after returning from trampoline_handler(). Before fixing the return address, the real return address can be found in the 'current->kretprobe_instances'. However, since there is a window between updating the 'current->kretprobe_instances' and fixing the address on the stack, if an interrupt happens at that timing and the interrupt handler does stacktrace, it may fail to unwind because it can not get the correct return address from 'current->kretprobe_instances'. This will eliminate that window by fixing the return address right before updating 'current->kretprobe_instances'. Link: https://lkml.kernel.org/r/163163057094.489837.9044470370440745866.stgit@devnote2 Signed-off-by: Masami Hiramatsu Tested-by: Andrii Nakryiko Signed-off-by: Steven Rostedt (VMware) --- arch/x86/kernel/kprobes/core.c | 18 ++++++++++++++++-- include/linux/kprobes.h | 3 +++ kernel/kprobes.c | 11 +++++++++++ 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 7e1111c19605..fce99e249d61 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -1065,6 +1065,16 @@ NOKPROBE_SYMBOL(__kretprobe_trampoline); */ STACK_FRAME_NON_STANDARD_FP(__kretprobe_trampoline); +/* This is called from kretprobe_trampoline_handler(). */ +void arch_kretprobe_fixup_return(struct pt_regs *regs, + kprobe_opcode_t *correct_ret_addr) +{ + unsigned long *frame_pointer = ®s->sp + 1; + + /* Replace fake return address with real one. */ + *frame_pointer = (unsigned long)correct_ret_addr; +} + /* * Called from __kretprobe_trampoline */ @@ -1082,8 +1092,12 @@ __used __visible void trampoline_handler(struct pt_regs *regs) regs->sp += sizeof(long); frame_pointer = ®s->sp + 1; - /* Replace fake return address with real one. */ - *frame_pointer = kretprobe_trampoline_handler(regs, frame_pointer); + /* + * The return address at 'frame_pointer' is recovered by the + * arch_kretprobe_fixup_return() which called from the + * kretprobe_trampoline_handler(). + */ + kretprobe_trampoline_handler(regs, frame_pointer); /* * Copy FLAGS to 'pt_regs::sp' so that __kretprobe_trapmoline() diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 6d47a9da1e0a..e974caf39d3e 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -188,6 +188,9 @@ extern void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs); extern int arch_trampoline_kprobe(struct kprobe *p); +void arch_kretprobe_fixup_return(struct pt_regs *regs, + kprobe_opcode_t *correct_ret_addr); + void __kretprobe_trampoline(void); /* * Since some architecture uses structured function pointer, diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ebc587b9a346..b62af9fc3607 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1922,6 +1922,15 @@ unsigned long kretprobe_find_ret_addr(struct task_struct *tsk, void *fp, } NOKPROBE_SYMBOL(kretprobe_find_ret_addr); +void __weak arch_kretprobe_fixup_return(struct pt_regs *regs, + kprobe_opcode_t *correct_ret_addr) +{ + /* + * Do nothing by default. Please fill this to update the fake return + * address on the stack with the correct one on each arch if possible. + */ +} + unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs, void *frame_pointer) { @@ -1967,6 +1976,8 @@ unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs, first = first->next; } + arch_kretprobe_fixup_return(regs, correct_ret_addr); + /* Unlink all nodes for this frame. */ first = current->kretprobe_instances.first; current->kretprobe_instances.first = node->next; From 6954e415264eeb5ee6be0d22d789ad12c995ee64 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 23 Sep 2021 21:03:49 -0400 Subject: [PATCH 028/101] tracing: Place trace_pid_list logic into abstract functions Instead of having the logic that does trace_pid_list open coded, wrap it in abstract functions. This will allow a rewrite of the logic that implements the trace_pid_list without affecting the users. Note, this causes a change in behavior. Every time a pid is written into the set_*_pid file, it creates a new list and uses RCU to update it. If pid_max is lowered, but there was a pid currently in the list that was higher than pid_max, those pids will now be removed on updating the list. The old behavior kept that from happening. The rewrite of the pid_list logic will no longer depend on pid_max, and will return the old behavior. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Makefile | 1 + kernel/trace/ftrace.c | 6 +- kernel/trace/pid_list.c | 160 ++++++++++++++++++++++++++++++++++++ kernel/trace/pid_list.h | 13 +++ kernel/trace/trace.c | 78 ++++++------------ kernel/trace/trace.h | 14 +++- kernel/trace/trace_events.c | 6 +- 7 files changed, 217 insertions(+), 61 deletions(-) create mode 100644 kernel/trace/pid_list.c create mode 100644 kernel/trace/pid_list.h diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 6de5d4d63165..bedc5caceec7 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -47,6 +47,7 @@ obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_seq.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_TRACING) += trace_printk.o +obj-$(CONFIG_TRACING) += pid_list.o obj-$(CONFIG_TRACING_MAP) += tracing_map.o obj-$(CONFIG_PREEMPTIRQ_DELAY_TEST) += preemptirq_delay_test.o obj-$(CONFIG_SYNTH_EVENT_GEN_TEST) += synth_event_gen_test.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7efbc8aaf7f6..3eec6792f115 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7184,10 +7184,10 @@ static void clear_ftrace_pids(struct trace_array *tr, int type) synchronize_rcu(); if ((type & TRACE_PIDS) && pid_list) - trace_free_pid_list(pid_list); + trace_pid_list_free(pid_list); if ((type & TRACE_NO_PIDS) && no_pid_list) - trace_free_pid_list(no_pid_list); + trace_pid_list_free(no_pid_list); } void ftrace_clear_pids(struct trace_array *tr) @@ -7428,7 +7428,7 @@ pid_write(struct file *filp, const char __user *ubuf, if (filtered_pids) { synchronize_rcu(); - trace_free_pid_list(filtered_pids); + trace_pid_list_free(filtered_pids); } else if (pid_list && !other_pids) { /* Register a probe to set whether to ignore the tracing of a task */ register_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr); diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c new file mode 100644 index 000000000000..4483ef70b562 --- /dev/null +++ b/kernel/trace/pid_list.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 VMware Inc, Steven Rostedt + */ +#include +#include +#include "trace.h" + +/** + * trace_pid_list_is_set - test if the pid is set in the list + * @pid_list: The pid list to test + * @pid: The pid to to see if set in the list. + * + * Tests if @pid is is set in the @pid_list. This is usually called + * from the scheduler when a task is scheduled. Its pid is checked + * if it should be traced or not. + * + * Return true if the pid is in the list, false otherwise. + */ +bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid) +{ + /* + * If pid_max changed after filtered_pids was created, we + * by default ignore all pids greater than the previous pid_max. + */ + if (pid >= pid_list->pid_max) + return false; + + return test_bit(pid, pid_list->pids); +} + +/** + * trace_pid_list_set - add a pid to the list + * @pid_list: The pid list to add the @pid to. + * @pid: The pid to add. + * + * Adds @pid to @pid_list. This is usually done explicitly by a user + * adding a task to be traced, or indirectly by the fork function + * when children should be traced and a task's pid is in the list. + * + * Return 0 on success, negative otherwise. + */ +int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid) +{ + /* Sorry, but we don't support pid_max changing after setting */ + if (pid >= pid_list->pid_max) + return -EINVAL; + + set_bit(pid, pid_list->pids); + + return 0; +} + +/** + * trace_pid_list_clear - remove a pid from the list + * @pid_list: The pid list to remove the @pid from. + * @pid: The pid to remove. + * + * Removes @pid from @pid_list. This is usually done explicitly by a user + * removing tasks from tracing, or indirectly by the exit function + * when a task that is set to be traced exits. + * + * Return 0 on success, negative otherwise. + */ +int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid) +{ + /* Sorry, but we don't support pid_max changing after setting */ + if (pid >= pid_list->pid_max) + return -EINVAL; + + clear_bit(pid, pid_list->pids); + + return 0; +} + +/** + * trace_pid_list_next - return the next pid in the list + * @pid_list: The pid list to examine. + * @pid: The pid to start from + * @next: The pointer to place the pid that is set starting from @pid. + * + * Looks for the next consecutive pid that is in @pid_list starting + * at the pid specified by @pid. If one is set (including @pid), then + * that pid is placed into @next. + * + * Return 0 when a pid is found, -1 if there are no more pids included. + */ +int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid, + unsigned int *next) +{ + pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); + + if (pid < pid_list->pid_max) { + *next = pid; + return 0; + } + return -1; +} + +/** + * trace_pid_list_first - return the first pid in the list + * @pid_list: The pid list to examine. + * @pid: The pointer to place the pid first found pid that is set. + * + * Looks for the first pid that is set in @pid_list, and places it + * into @pid if found. + * + * Return 0 when a pid is found, -1 if there are no pids set. + */ +int trace_pid_list_first(struct trace_pid_list *pid_list, unsigned int *pid) +{ + unsigned int first; + + first = find_first_bit(pid_list->pids, pid_list->pid_max); + + if (first < pid_list->pid_max) { + *pid = first; + return 0; + } + return -1; +} + +/** + * trace_pid_list_alloc - create a new pid_list + * + * Allocates a new pid_list to store pids into. + * + * Returns the pid_list on success, NULL otherwise. + */ +struct trace_pid_list *trace_pid_list_alloc(void) +{ + struct trace_pid_list *pid_list; + + pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); + if (!pid_list) + return NULL; + + pid_list->pid_max = READ_ONCE(pid_max); + + pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); + if (!pid_list->pids) { + kfree(pid_list); + return NULL; + } + return pid_list; +} + +/** + * trace_pid_list_free - Frees an allocated pid_list. + * + * Frees the memory for a pid_list that was allocated. + */ +void trace_pid_list_free(struct trace_pid_list *pid_list) +{ + if (!pid_list) + return; + + vfree(pid_list->pids); + kfree(pid_list); +} diff --git a/kernel/trace/pid_list.h b/kernel/trace/pid_list.h new file mode 100644 index 000000000000..80d0ecfe1536 --- /dev/null +++ b/kernel/trace/pid_list.h @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* Do not include this file directly. */ + +#ifndef _TRACE_INTERNAL_PID_LIST_H +#define _TRACE_INTERNAL_PID_LIST_H + +struct trace_pid_list { + int pid_max; + unsigned long *pids; +}; + +#endif /* _TRACE_INTERNAL_PID_LIST_H */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7896d30d90f7..dcced07a45e6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -512,12 +512,6 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec, return 0; } -void trace_free_pid_list(struct trace_pid_list *pid_list) -{ - vfree(pid_list->pids); - kfree(pid_list); -} - /** * trace_find_filtered_pid - check if a pid exists in a filtered_pid list * @filtered_pids: The list of pids to check @@ -528,14 +522,7 @@ void trace_free_pid_list(struct trace_pid_list *pid_list) bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) { - /* - * If pid_max changed after filtered_pids was created, we - * by default ignore all pids greater than the previous pid_max. - */ - if (search_pid >= filtered_pids->pid_max) - return false; - - return test_bit(search_pid, filtered_pids->pids); + return trace_pid_list_is_set(filtered_pids, search_pid); } /** @@ -592,15 +579,11 @@ void trace_filter_add_remove_task(struct trace_pid_list *pid_list, return; } - /* Sorry, but we don't support pid_max changing after setting */ - if (task->pid >= pid_list->pid_max) - return; - /* "self" is set for forks, and NULL for exits */ if (self) - set_bit(task->pid, pid_list->pids); + trace_pid_list_set(pid_list, task->pid); else - clear_bit(task->pid, pid_list->pids); + trace_pid_list_clear(pid_list, task->pid); } /** @@ -617,18 +600,19 @@ void trace_filter_add_remove_task(struct trace_pid_list *pid_list, */ void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos) { - unsigned long pid = (unsigned long)v; + long pid = (unsigned long)v; + unsigned int next; (*pos)++; /* pid already is +1 of the actual previous bit */ - pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); + if (trace_pid_list_next(pid_list, pid, &next) < 0) + return NULL; + + pid = next; /* Return pid + 1 to allow zero to be represented */ - if (pid < pid_list->pid_max) - return (void *)(pid + 1); - - return NULL; + return (void *)(pid + 1); } /** @@ -645,12 +629,14 @@ void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos) void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos) { unsigned long pid; + unsigned int first; loff_t l = 0; - pid = find_first_bit(pid_list->pids, pid_list->pid_max); - if (pid >= pid_list->pid_max) + if (trace_pid_list_first(pid_list, &first) < 0) return NULL; + pid = first; + /* Return pid + 1 so that zero can be the exit value */ for (pid++; pid && l < *pos; pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l)) @@ -686,7 +672,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, unsigned long val; int nr_pids = 0; ssize_t read = 0; - ssize_t ret = 0; + ssize_t ret; loff_t pos; pid_t pid; @@ -699,34 +685,23 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, * the user. If the operation fails, then the current list is * not modified. */ - pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); + pid_list = trace_pid_list_alloc(); if (!pid_list) { trace_parser_put(&parser); return -ENOMEM; } - pid_list->pid_max = READ_ONCE(pid_max); - - /* Only truncating will shrink pid_max */ - if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max) - pid_list->pid_max = filtered_pids->pid_max; - - pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); - if (!pid_list->pids) { - trace_parser_put(&parser); - kfree(pid_list); - return -ENOMEM; - } - if (filtered_pids) { /* copy the current bits to the new max */ - for_each_set_bit(pid, filtered_pids->pids, - filtered_pids->pid_max) { - set_bit(pid, pid_list->pids); + ret = trace_pid_list_first(filtered_pids, &pid); + while (!ret) { + trace_pid_list_set(pid_list, pid); + ret = trace_pid_list_next(filtered_pids, pid + 1, &pid); nr_pids++; } } + ret = 0; while (cnt > 0) { pos = 0; @@ -742,12 +717,13 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, ret = -EINVAL; if (kstrtoul(parser.buffer, 0, &val)) break; - if (val >= pid_list->pid_max) - break; pid = (pid_t)val; - set_bit(pid, pid_list->pids); + if (trace_pid_list_set(pid_list, pid) < 0) { + ret = -1; + break; + } nr_pids++; trace_parser_clear(&parser); @@ -756,13 +732,13 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, trace_parser_put(&parser); if (ret < 0) { - trace_free_pid_list(pid_list); + trace_pid_list_free(pid_list); return ret; } if (!nr_pids) { /* Cleared the list of pids */ - trace_free_pid_list(pid_list); + trace_pid_list_free(pid_list); read = ret; pid_list = NULL; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b7c0f8e160fb..2ec500186992 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -22,6 +22,8 @@ #include #include +#include "pid_list.h" + #ifdef CONFIG_FTRACE_SYSCALLS #include /* For NR_SYSCALLS */ #include /* some archs define it here */ @@ -188,10 +190,14 @@ struct trace_options { struct trace_option_dentry *topts; }; -struct trace_pid_list { - int pid_max; - unsigned long *pids; -}; +struct trace_pid_list *trace_pid_list_alloc(void); +void trace_pid_list_free(struct trace_pid_list *pid_list); +bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid); +int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid); +int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid); +int trace_pid_list_first(struct trace_pid_list *pid_list, unsigned int *pid); +int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid, + unsigned int *next); enum { TRACE_PIDS = BIT(0), diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 830b3b9940f4..bf54d2803261 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -885,10 +885,10 @@ static void __ftrace_clear_event_pids(struct trace_array *tr, int type) tracepoint_synchronize_unregister(); if ((type & TRACE_PIDS) && pid_list) - trace_free_pid_list(pid_list); + trace_pid_list_free(pid_list); if ((type & TRACE_NO_PIDS) && no_pid_list) - trace_free_pid_list(no_pid_list); + trace_pid_list_free(no_pid_list); } static void ftrace_clear_event_pids(struct trace_array *tr, int type) @@ -1967,7 +1967,7 @@ event_pid_write(struct file *filp, const char __user *ubuf, if (filtered_pids) { tracepoint_synchronize_unregister(); - trace_free_pid_list(filtered_pids); + trace_pid_list_free(filtered_pids); } else if (pid_list && !other_pids) { register_pid_events(tr); } From 8d6e90983ade25ec7925211ac31d9ccaf64b7edf Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 23 Sep 2021 22:20:57 -0400 Subject: [PATCH 029/101] tracing: Create a sparse bitmask for pid filtering When the trace_pid_list was created, the default pid max was 32768. Creating a bitmask that can hold one bit for all 32768 took up 4096 (one page). Having a one page bitmask was not much of a problem, and that was used for mapping pids. But today, systems are bigger and can run more tasks, and now the default pid_max is usually set to 4194304. Which means to handle that many pids requires 524288 bytes. Worse yet, the pid_max can be set to 2^30 (1073741824 or 1G) which would take 134217728 (128M) of memory to store this array. Since the pid_list array is very sparsely populated, it is a huge waste of memory to store all possible bits for each pid when most will not be set. Instead, use a page table scheme to store the array, and allow this to handle up to 30 bit pids. The pid_mask will start out with 256 entries for the first 8 MSB bits. This will cost 1K for 32 bit architectures and 2K for 64 bit. Each of these will have a 256 array to store the next 8 bits of the pid (another 1 or 2K). These will hold an 2K byte bitmask (which will cover the LSB 14 bits or 16384 pids). When the trace_pid_list is allocated, it will have the 1/2K upper bits allocated, and then it will allocate a cache for the next upper chunks and the lower chunks (default 6 of each). Then when a bit is "set", these chunks will be pulled from the free list and added to the array. If the free list gets down to a lever (default 2), it will trigger an irqwork that will refill the cache back up. On clearing a bit, if the clear causes the bitmask to be zero, that chunk will then be placed back into the free cache for later use, keeping the need to allocate more down to a minimum. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/pid_list.c | 401 ++++++++++++++++++++++++++++++++++++---- kernel/trace/pid_list.h | 79 +++++++- 2 files changed, 445 insertions(+), 35 deletions(-) diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c index 4483ef70b562..cbf8031b2b99 100644 --- a/kernel/trace/pid_list.c +++ b/kernel/trace/pid_list.c @@ -2,10 +2,119 @@ /* * Copyright (C) 2021 VMware Inc, Steven Rostedt */ -#include +#include +#include #include #include "trace.h" +/* See pid_list.h for details */ + +static inline union lower_chunk *get_lower_chunk(struct trace_pid_list *pid_list) +{ + union lower_chunk *chunk; + + lockdep_assert_held(&pid_list->lock); + + if (!pid_list->lower_list) + return NULL; + + chunk = pid_list->lower_list; + pid_list->lower_list = chunk->next; + pid_list->free_lower_chunks--; + WARN_ON_ONCE(pid_list->free_lower_chunks < 0); + chunk->next = NULL; + /* + * If a refill needs to happen, it can not happen here + * as the scheduler run queue locks are held. + */ + if (pid_list->free_lower_chunks <= CHUNK_REALLOC) + irq_work_queue(&pid_list->refill_irqwork); + + return chunk; +} + +static inline union upper_chunk *get_upper_chunk(struct trace_pid_list *pid_list) +{ + union upper_chunk *chunk; + + lockdep_assert_held(&pid_list->lock); + + if (!pid_list->upper_list) + return NULL; + + chunk = pid_list->upper_list; + pid_list->upper_list = chunk->next; + pid_list->free_upper_chunks--; + WARN_ON_ONCE(pid_list->free_upper_chunks < 0); + chunk->next = NULL; + /* + * If a refill needs to happen, it can not happen here + * as the scheduler run queue locks are held. + */ + if (pid_list->free_upper_chunks <= CHUNK_REALLOC) + irq_work_queue(&pid_list->refill_irqwork); + + return chunk; +} + +static inline void put_lower_chunk(struct trace_pid_list *pid_list, + union lower_chunk *chunk) +{ + lockdep_assert_held(&pid_list->lock); + + chunk->next = pid_list->lower_list; + pid_list->lower_list = chunk; + pid_list->free_lower_chunks++; +} + +static inline void put_upper_chunk(struct trace_pid_list *pid_list, + union upper_chunk *chunk) +{ + lockdep_assert_held(&pid_list->lock); + + chunk->next = pid_list->upper_list; + pid_list->upper_list = chunk; + pid_list->free_upper_chunks++; +} + +static inline bool upper_empty(union upper_chunk *chunk) +{ + /* + * If chunk->data has no lower chunks, it will be the same + * as a zeroed bitmask. Use find_first_bit() to test it + * and if it doesn't find any bits set, then the array + * is empty. + */ + int bit = find_first_bit((unsigned long *)chunk->data, + sizeof(chunk->data) * 8); + return bit >= sizeof(chunk->data) * 8; +} + +static inline int pid_split(unsigned int pid, unsigned int *upper1, + unsigned int *upper2, unsigned int *lower) +{ + /* MAX_PID should cover all pids */ + BUILD_BUG_ON(MAX_PID < PID_MAX_LIMIT); + + /* In case a bad pid is passed in, then fail */ + if (unlikely(pid >= MAX_PID)) + return -1; + + *upper1 = (pid >> UPPER1_SHIFT) & UPPER_MASK; + *upper2 = (pid >> UPPER2_SHIFT) & UPPER_MASK; + *lower = pid & LOWER_MASK; + + return 0; +} + +static inline unsigned int pid_join(unsigned int upper1, + unsigned int upper2, unsigned int lower) +{ + return ((upper1 & UPPER_MASK) << UPPER1_SHIFT) | + ((upper2 & UPPER_MASK) << UPPER2_SHIFT) | + (lower & LOWER_MASK); +} + /** * trace_pid_list_is_set - test if the pid is set in the list * @pid_list: The pid list to test @@ -19,14 +128,30 @@ */ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid) { - /* - * If pid_max changed after filtered_pids was created, we - * by default ignore all pids greater than the previous pid_max. - */ - if (pid >= pid_list->pid_max) + union upper_chunk *upper_chunk; + union lower_chunk *lower_chunk; + unsigned long flags; + unsigned int upper1; + unsigned int upper2; + unsigned int lower; + bool ret = false; + + if (!pid_list) return false; - return test_bit(pid, pid_list->pids); + if (pid_split(pid, &upper1, &upper2, &lower) < 0) + return false; + + raw_spin_lock_irqsave(&pid_list->lock, flags); + upper_chunk = pid_list->upper[upper1]; + if (upper_chunk) { + lower_chunk = upper_chunk->data[upper2]; + if (lower_chunk) + ret = test_bit(lower, lower_chunk->data); + } + raw_spin_unlock_irqrestore(&pid_list->lock, flags); + + return ret; } /** @@ -42,13 +167,44 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid) */ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid) { - /* Sorry, but we don't support pid_max changing after setting */ - if (pid >= pid_list->pid_max) + union upper_chunk *upper_chunk; + union lower_chunk *lower_chunk; + unsigned long flags; + unsigned int upper1; + unsigned int upper2; + unsigned int lower; + int ret; + + if (!pid_list) + return -ENODEV; + + if (pid_split(pid, &upper1, &upper2, &lower) < 0) return -EINVAL; - set_bit(pid, pid_list->pids); - - return 0; + raw_spin_lock_irqsave(&pid_list->lock, flags); + upper_chunk = pid_list->upper[upper1]; + if (!upper_chunk) { + upper_chunk = get_upper_chunk(pid_list); + if (!upper_chunk) { + ret = -ENOMEM; + goto out; + } + pid_list->upper[upper1] = upper_chunk; + } + lower_chunk = upper_chunk->data[upper2]; + if (!lower_chunk) { + lower_chunk = get_lower_chunk(pid_list); + if (!lower_chunk) { + ret = -ENOMEM; + goto out; + } + upper_chunk->data[upper2] = lower_chunk; + } + set_bit(lower, lower_chunk->data); + ret = 0; + out: + raw_spin_unlock_irqrestore(&pid_list->lock, flags); + return ret; } /** @@ -64,12 +220,41 @@ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid) */ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid) { - /* Sorry, but we don't support pid_max changing after setting */ - if (pid >= pid_list->pid_max) + union upper_chunk *upper_chunk; + union lower_chunk *lower_chunk; + unsigned long flags; + unsigned int upper1; + unsigned int upper2; + unsigned int lower; + + if (!pid_list) + return -ENODEV; + + if (pid_split(pid, &upper1, &upper2, &lower) < 0) return -EINVAL; - clear_bit(pid, pid_list->pids); + raw_spin_lock_irqsave(&pid_list->lock, flags); + upper_chunk = pid_list->upper[upper1]; + if (!upper_chunk) + goto out; + lower_chunk = upper_chunk->data[upper2]; + if (!lower_chunk) + goto out; + + clear_bit(lower, lower_chunk->data); + + /* if there's no more bits set, add it to the free list */ + if (find_first_bit(lower_chunk->data, LOWER_MAX) >= LOWER_MAX) { + put_lower_chunk(pid_list, lower_chunk); + upper_chunk->data[upper2] = NULL; + if (upper_empty(upper_chunk)) { + put_upper_chunk(pid_list, upper_chunk); + pid_list->upper[upper1] = NULL; + } + } + out: + raw_spin_unlock_irqrestore(&pid_list->lock, flags); return 0; } @@ -88,13 +273,45 @@ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid) int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid, unsigned int *next) { - pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); + union upper_chunk *upper_chunk; + union lower_chunk *lower_chunk; + unsigned long flags; + unsigned int upper1; + unsigned int upper2; + unsigned int lower; - if (pid < pid_list->pid_max) { - *next = pid; - return 0; + if (!pid_list) + return -ENODEV; + + if (pid_split(pid, &upper1, &upper2, &lower) < 0) + return -EINVAL; + + raw_spin_lock_irqsave(&pid_list->lock, flags); + for (; upper1 <= UPPER_MASK; upper1++, upper2 = 0) { + upper_chunk = pid_list->upper[upper1]; + + if (!upper_chunk) + continue; + + for (; upper2 <= UPPER_MASK; upper2++, lower = 0) { + lower_chunk = upper_chunk->data[upper2]; + if (!lower_chunk) + continue; + + lower = find_next_bit(lower_chunk->data, LOWER_MAX, + lower); + if (lower < LOWER_MAX) + goto found; + } } - return -1; + + found: + raw_spin_unlock_irqrestore(&pid_list->lock, flags); + if (upper1 > UPPER_MASK) + return -1; + + *next = pid_join(upper1, upper2, lower); + return 0; } /** @@ -109,15 +326,79 @@ int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid, */ int trace_pid_list_first(struct trace_pid_list *pid_list, unsigned int *pid) { - unsigned int first; + return trace_pid_list_next(pid_list, 0, pid); +} - first = find_first_bit(pid_list->pids, pid_list->pid_max); +static void pid_list_refill_irq(struct irq_work *iwork) +{ + struct trace_pid_list *pid_list = container_of(iwork, struct trace_pid_list, + refill_irqwork); + union upper_chunk *upper; + union lower_chunk *lower; + union upper_chunk **upper_next = &upper; + union lower_chunk **lower_next = &lower; + int upper_count; + int lower_count; + int ucnt = 0; + int lcnt = 0; - if (first < pid_list->pid_max) { - *pid = first; - return 0; + again: + raw_spin_lock(&pid_list->lock); + upper_count = CHUNK_ALLOC - pid_list->free_upper_chunks; + lower_count = CHUNK_ALLOC - pid_list->free_lower_chunks; + raw_spin_unlock(&pid_list->lock); + + if (upper_count <= 0 && lower_count <= 0) + return; + + while (upper_count-- > 0) { + union upper_chunk *chunk; + + chunk = kzalloc(sizeof(*chunk), GFP_KERNEL); + if (!chunk) + break; + *upper_next = chunk; + upper_next = &chunk->next; + ucnt++; } - return -1; + + while (lower_count-- > 0) { + union lower_chunk *chunk; + + chunk = kzalloc(sizeof(*chunk), GFP_KERNEL); + if (!chunk) + break; + *lower_next = chunk; + lower_next = &chunk->next; + lcnt++; + } + + raw_spin_lock(&pid_list->lock); + if (upper) { + *upper_next = pid_list->upper_list; + pid_list->upper_list = upper; + pid_list->free_upper_chunks += ucnt; + } + if (lower) { + *lower_next = pid_list->lower_list; + pid_list->lower_list = lower; + pid_list->free_lower_chunks += lcnt; + } + raw_spin_unlock(&pid_list->lock); + + /* + * On success of allocating all the chunks, both counters + * will be less than zero. If they are not, then an allocation + * failed, and we should not try again. + */ + if (upper_count >= 0 || lower_count >= 0) + return; + /* + * When the locks were released, free chunks could have + * been used and allocation needs to be done again. Might as + * well allocate it now. + */ + goto again; } /** @@ -130,18 +411,41 @@ int trace_pid_list_first(struct trace_pid_list *pid_list, unsigned int *pid) struct trace_pid_list *trace_pid_list_alloc(void) { struct trace_pid_list *pid_list; + int i; - pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); + /* According to linux/thread.h, pids can be no bigger that 30 bits */ + WARN_ON_ONCE(pid_max > (1 << 30)); + + pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL); if (!pid_list) return NULL; - pid_list->pid_max = READ_ONCE(pid_max); + init_irq_work(&pid_list->refill_irqwork, pid_list_refill_irq); - pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); - if (!pid_list->pids) { - kfree(pid_list); - return NULL; + raw_spin_lock_init(&pid_list->lock); + + for (i = 0; i < CHUNK_ALLOC; i++) { + union upper_chunk *chunk; + + chunk = kzalloc(sizeof(*chunk), GFP_KERNEL); + if (!chunk) + break; + chunk->next = pid_list->upper_list; + pid_list->upper_list = chunk; + pid_list->free_upper_chunks++; } + + for (i = 0; i < CHUNK_ALLOC; i++) { + union lower_chunk *chunk; + + chunk = kzalloc(sizeof(*chunk), GFP_KERNEL); + if (!chunk) + break; + chunk->next = pid_list->lower_list; + pid_list->lower_list = chunk; + pid_list->free_lower_chunks++; + } + return pid_list; } @@ -152,9 +456,40 @@ struct trace_pid_list *trace_pid_list_alloc(void) */ void trace_pid_list_free(struct trace_pid_list *pid_list) { + union upper_chunk *upper; + union lower_chunk *lower; + int i, j; + if (!pid_list) return; - vfree(pid_list->pids); + irq_work_sync(&pid_list->refill_irqwork); + + while (pid_list->lower_list) { + union lower_chunk *chunk; + + chunk = pid_list->lower_list; + pid_list->lower_list = pid_list->lower_list->next; + kfree(chunk); + } + + while (pid_list->upper_list) { + union upper_chunk *chunk; + + chunk = pid_list->upper_list; + pid_list->upper_list = pid_list->upper_list->next; + kfree(chunk); + } + + for (i = 0; i < UPPER1_SIZE; i++) { + upper = pid_list->upper[i]; + if (upper) { + for (j = 0; j < UPPER2_SIZE; j++) { + lower = upper->data[j]; + kfree(lower); + } + kfree(upper); + } + } kfree(pid_list); } diff --git a/kernel/trace/pid_list.h b/kernel/trace/pid_list.h index 80d0ecfe1536..62e73f1ac85f 100644 --- a/kernel/trace/pid_list.h +++ b/kernel/trace/pid_list.h @@ -5,9 +5,84 @@ #ifndef _TRACE_INTERNAL_PID_LIST_H #define _TRACE_INTERNAL_PID_LIST_H +/* + * In order to keep track of what pids to trace, a tree is created much + * like page tables are used. This creates a sparse bit map, where + * the tree is filled in when needed. A PID is at most 30 bits (see + * linux/thread.h), and is broken up into 3 sections based on the bit map + * of the bits. The 8 MSB is the "upper1" section. The next 8 MSB is the + * "upper2" section and the 14 LSB is the "lower" section. + * + * A trace_pid_list structure holds the "upper1" section, in an + * array of 256 pointers (1 or 2K in size) to "upper_chunk" unions, where + * each has an array of 256 pointers (1 or 2K in size) to the "lower_chunk" + * structures, where each has an array of size 2K bytes representing a bitmask + * of the 14 LSB of the PID (256 * 8 = 2048) + * + * When a trace_pid_list is allocated, it includes the 256 pointer array + * of the upper1 unions. Then a "cache" of upper and lower is allocated + * where these will be assigned as needed. + * + * When a bit is set in the pid_list bitmask, the pid to use has + * the 8 MSB masked, and this is used to index the array in the + * pid_list to find the next upper union. If the element is NULL, + * then one is retrieved from the upper_list cache. If none is + * available, then -ENOMEM is returned. + * + * The next 8 MSB is used to index into the "upper2" section. If this + * element is NULL, then it is retrieved from the lower_list cache. + * Again, if one is not available -ENOMEM is returned. + * + * Finally the 14 LSB of the PID is used to set the bit in the 16384 + * bitmask (made up of 2K bytes). + * + * When the second upper section or the lower section has their last + * bit cleared, they are added back to the free list to be reused + * when needed. + */ + +#define UPPER_BITS 8 +#define UPPER_MAX (1 << UPPER_BITS) +#define UPPER1_SIZE (1 << UPPER_BITS) +#define UPPER2_SIZE (1 << UPPER_BITS) + +#define LOWER_BITS 14 +#define LOWER_MAX (1 << LOWER_BITS) +#define LOWER_SIZE (LOWER_MAX / BITS_PER_LONG) + +#define UPPER1_SHIFT (LOWER_BITS + UPPER_BITS) +#define UPPER2_SHIFT LOWER_BITS +#define LOWER_MASK (LOWER_MAX - 1) + +#define UPPER_MASK (UPPER_MAX - 1) + +/* According to linux/thread.h pids can not be bigger than or equal to 1 << 30 */ +#define MAX_PID (1 << 30) + +/* Just keep 6 chunks of both upper and lower in the cache on alloc */ +#define CHUNK_ALLOC 6 + +/* Have 2 chunks free, trigger a refill of the cache */ +#define CHUNK_REALLOC 2 + +union lower_chunk { + union lower_chunk *next; + unsigned long data[LOWER_SIZE]; // 2K in size +}; + +union upper_chunk { + union upper_chunk *next; + union lower_chunk *data[UPPER2_SIZE]; // 1 or 2K in size +}; + struct trace_pid_list { - int pid_max; - unsigned long *pids; + raw_spinlock_t lock; + struct irq_work refill_irqwork; + union upper_chunk *upper[UPPER1_SIZE]; // 1 or 2K in size + union upper_chunk *upper_list; + union lower_chunk *lower_list; + int free_upper_chunks; + int free_lower_chunks; }; #endif /* _TRACE_INTERNAL_PID_LIST_H */ From b30a779d5c557e99b93917f33d441948c9aead97 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 7 Oct 2021 09:53:53 -0400 Subject: [PATCH 030/101] tracing: Initialize upper and lower vars in pid_list_refill_irq() The upper and lower variables are set as link lists to add into the sparse array. If they are NULL, after the needed allocations are done, then there is nothing to add. But they need to be initialized to NULL for this to work. Link: https://lore.kernel.org/all/221bc7ba-a475-1cb9-1bbe-730bb9c2d448@canonical.com/ Fixes: 8d6e90983ade ("tracing: Create a sparse bitmask for pid filtering") Reported-by: Colin Ian King Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/pid_list.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c index cbf8031b2b99..a2ef1d18126a 100644 --- a/kernel/trace/pid_list.c +++ b/kernel/trace/pid_list.c @@ -333,8 +333,8 @@ static void pid_list_refill_irq(struct irq_work *iwork) { struct trace_pid_list *pid_list = container_of(iwork, struct trace_pid_list, refill_irqwork); - union upper_chunk *upper; - union lower_chunk *lower; + union upper_chunk *upper = NULL; + union lower_chunk *lower = NULL; union upper_chunk **upper_next = &upper; union lower_chunk **lower_next = &lower; int upper_count; From 49d67e445742bbcb03106b735b2ab39f6e5c56bc Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 18 Aug 2021 11:24:50 -0400 Subject: [PATCH 031/101] tracefs: Have tracefs directories not set OTH permission bits by default The tracefs file system is by default mounted such that only root user can access it. But there are legitimate reasons to create a group and allow those added to the group to have access to tracing. By changing the permissions of the tracefs mount point to allow access, it will allow group access to the tracefs directory. There should not be any real reason to allow all access to the tracefs directory as it contains sensitive information. Have the default permission of directories being created not have any OTH (other) bits set, such that an admin that wants to give permission to a group has to first disable all OTH bits in the file system. Link: https://lkml.kernel.org/r/20210818153038.664127804@goodmis.org Signed-off-by: Steven Rostedt (VMware) --- fs/tracefs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 1261e8b41edb..925a621b432e 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -432,7 +432,8 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent, if (unlikely(!inode)) return failed_creating(dentry); - inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; + /* Do not set bits for OTH */ + inode->i_mode = S_IFDIR | S_IRWXU | S_IRUSR| S_IRGRP | S_IXUSR | S_IXGRP; inode->i_op = ops; inode->i_fop = &simple_dir_operations; From 21ccc9cd72116289469e5519b6159c675a2fa58f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 18 Aug 2021 11:24:51 -0400 Subject: [PATCH 032/101] tracing: Disable "other" permission bits in the tracefs files When building the files in the tracefs file system, do not by default set any permissions for OTH (other). This will make it easier for admins who want to define a group for accessing tracefs and not having to first disable all the permission bits for "other" in the file system. As tracing can leak sensitive information, it should never by default allowing all users access. An admin can still set the permission bits for others to have access, which may be useful for creating a honeypot and seeing who takes advantage of it and roots the machine. Link: https://lkml.kernel.org/r/20210818153038.864149276@goodmis.org Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 23 +++++---- kernel/trace/trace.c | 73 ++++++++++++++------------- kernel/trace/trace.h | 3 ++ kernel/trace/trace_dynevent.c | 2 +- kernel/trace/trace_events.c | 42 +++++++-------- kernel/trace/trace_events_synth.c | 4 +- kernel/trace/trace_functions_graph.c | 2 +- kernel/trace/trace_hwlat.c | 6 +-- kernel/trace/trace_kprobe.c | 8 +-- kernel/trace/trace_osnoise.c | 14 ++--- kernel/trace/trace_printk.c | 2 +- kernel/trace/trace_recursion_record.c | 4 +- kernel/trace/trace_stack.c | 6 +-- kernel/trace/trace_stat.c | 6 +-- kernel/trace/trace_uprobe.c | 4 +- 15 files changed, 103 insertions(+), 96 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3eec6792f115..0a0dbc2d411b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -988,8 +988,9 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) } } - entry = tracefs_create_file("function_profile_enabled", 0644, - d_tracer, NULL, &ftrace_profile_fops); + entry = tracefs_create_file("function_profile_enabled", + TRACE_MODE_WRITE, d_tracer, NULL, + &ftrace_profile_fops); if (!entry) pr_warn("Could not create tracefs 'function_profile_enabled' entry\n"); } @@ -6109,10 +6110,10 @@ void ftrace_create_filter_files(struct ftrace_ops *ops, struct dentry *parent) { - trace_create_file("set_ftrace_filter", 0644, parent, + trace_create_file("set_ftrace_filter", TRACE_MODE_WRITE, parent, ops, &ftrace_filter_fops); - trace_create_file("set_ftrace_notrace", 0644, parent, + trace_create_file("set_ftrace_notrace", TRACE_MODE_WRITE, parent, ops, &ftrace_notrace_fops); } @@ -6139,19 +6140,19 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops) static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { - trace_create_file("available_filter_functions", 0444, + trace_create_file("available_filter_functions", TRACE_MODE_READ, d_tracer, NULL, &ftrace_avail_fops); - trace_create_file("enabled_functions", 0444, + trace_create_file("enabled_functions", TRACE_MODE_READ, d_tracer, NULL, &ftrace_enabled_fops); ftrace_create_filter_files(&global_ops, d_tracer); #ifdef CONFIG_FUNCTION_GRAPH_TRACER - trace_create_file("set_graph_function", 0644, d_tracer, + trace_create_file("set_graph_function", TRACE_MODE_WRITE, d_tracer, NULL, &ftrace_graph_fops); - trace_create_file("set_graph_notrace", 0644, d_tracer, + trace_create_file("set_graph_notrace", TRACE_MODE_WRITE, d_tracer, NULL, &ftrace_graph_notrace_fops); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -7494,10 +7495,10 @@ static const struct file_operations ftrace_no_pid_fops = { void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer) { - trace_create_file("set_ftrace_pid", 0644, d_tracer, + trace_create_file("set_ftrace_pid", TRACE_MODE_WRITE, d_tracer, tr, &ftrace_pid_fops); - trace_create_file("set_ftrace_notrace_pid", 0644, d_tracer, - tr, &ftrace_no_pid_fops); + trace_create_file("set_ftrace_notrace_pid", TRACE_MODE_WRITE, + d_tracer, tr, &ftrace_no_pid_fops); } void __init ftrace_init_tracefs_toplevel(struct trace_array *tr, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dcced07a45e6..985390cb8441 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1690,7 +1690,8 @@ static void trace_create_maxlat_file(struct trace_array *tr, { INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn); init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq); - tr->d_max_latency = trace_create_file("tracing_max_latency", 0644, + tr->d_max_latency = trace_create_file("tracing_max_latency", + TRACE_MODE_WRITE, d_tracer, &tr->max_latency, &tracing_max_lat_fops); } @@ -1727,8 +1728,8 @@ void latency_fsnotify(struct trace_array *tr) #else #define trace_create_maxlat_file(tr, d_tracer) \ - trace_create_file("tracing_max_latency", 0644, d_tracer, \ - &tr->max_latency, &tracing_max_lat_fops) + trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, \ + d_tracer, &tr->max_latency, &tracing_max_lat_fops) #endif @@ -6054,7 +6055,7 @@ trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, static void trace_create_eval_file(struct dentry *d_tracer) { - trace_create_file("eval_map", 0444, d_tracer, + trace_create_file("eval_map", TRACE_MODE_READ, d_tracer, NULL, &tracing_eval_map_fops); } @@ -8567,27 +8568,27 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) } /* per cpu trace_pipe */ - trace_create_cpu_file("trace_pipe", 0444, d_cpu, + trace_create_cpu_file("trace_pipe", TRACE_MODE_READ, d_cpu, tr, cpu, &tracing_pipe_fops); /* per cpu trace */ - trace_create_cpu_file("trace", 0644, d_cpu, + trace_create_cpu_file("trace", TRACE_MODE_WRITE, d_cpu, tr, cpu, &tracing_fops); - trace_create_cpu_file("trace_pipe_raw", 0444, d_cpu, + trace_create_cpu_file("trace_pipe_raw", TRACE_MODE_READ, d_cpu, tr, cpu, &tracing_buffers_fops); - trace_create_cpu_file("stats", 0444, d_cpu, + trace_create_cpu_file("stats", TRACE_MODE_READ, d_cpu, tr, cpu, &tracing_stats_fops); - trace_create_cpu_file("buffer_size_kb", 0444, d_cpu, + trace_create_cpu_file("buffer_size_kb", TRACE_MODE_READ, d_cpu, tr, cpu, &tracing_entries_fops); #ifdef CONFIG_TRACER_SNAPSHOT - trace_create_cpu_file("snapshot", 0644, d_cpu, + trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu, tr, cpu, &snapshot_fops); - trace_create_cpu_file("snapshot_raw", 0444, d_cpu, + trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, d_cpu, tr, cpu, &snapshot_raw_fops); #endif } @@ -8793,8 +8794,8 @@ create_trace_option_file(struct trace_array *tr, topt->opt = opt; topt->tr = tr; - topt->entry = trace_create_file(opt->name, 0644, t_options, topt, - &trace_options_fops); + topt->entry = trace_create_file(opt->name, TRACE_MODE_WRITE, + t_options, topt, &trace_options_fops); } @@ -8869,7 +8870,7 @@ create_trace_option_core_file(struct trace_array *tr, if (!t_options) return NULL; - return trace_create_file(option, 0644, t_options, + return trace_create_file(option, TRACE_MODE_WRITE, t_options, (void *)&tr->trace_flags_index[index], &trace_options_core_fops); } @@ -9394,28 +9395,28 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) struct trace_event_file *file; int cpu; - trace_create_file("available_tracers", 0444, d_tracer, + trace_create_file("available_tracers", TRACE_MODE_READ, d_tracer, tr, &show_traces_fops); - trace_create_file("current_tracer", 0644, d_tracer, + trace_create_file("current_tracer", TRACE_MODE_WRITE, d_tracer, tr, &set_tracer_fops); - trace_create_file("tracing_cpumask", 0644, d_tracer, + trace_create_file("tracing_cpumask", TRACE_MODE_WRITE, d_tracer, tr, &tracing_cpumask_fops); - trace_create_file("trace_options", 0644, d_tracer, + trace_create_file("trace_options", TRACE_MODE_WRITE, d_tracer, tr, &tracing_iter_fops); - trace_create_file("trace", 0644, d_tracer, + trace_create_file("trace", TRACE_MODE_WRITE, d_tracer, tr, &tracing_fops); - trace_create_file("trace_pipe", 0444, d_tracer, + trace_create_file("trace_pipe", TRACE_MODE_READ, d_tracer, tr, &tracing_pipe_fops); - trace_create_file("buffer_size_kb", 0644, d_tracer, + trace_create_file("buffer_size_kb", TRACE_MODE_WRITE, d_tracer, tr, &tracing_entries_fops); - trace_create_file("buffer_total_size_kb", 0444, d_tracer, + trace_create_file("buffer_total_size_kb", TRACE_MODE_READ, d_tracer, tr, &tracing_total_entries_fops); trace_create_file("free_buffer", 0200, d_tracer, @@ -9426,25 +9427,25 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) file = __find_event_file(tr, "ftrace", "print"); if (file && file->dir) - trace_create_file("trigger", 0644, file->dir, file, - &event_trigger_fops); + trace_create_file("trigger", TRACE_MODE_WRITE, file->dir, + file, &event_trigger_fops); tr->trace_marker_file = file; trace_create_file("trace_marker_raw", 0220, d_tracer, tr, &tracing_mark_raw_fops); - trace_create_file("trace_clock", 0644, d_tracer, tr, + trace_create_file("trace_clock", TRACE_MODE_WRITE, d_tracer, tr, &trace_clock_fops); - trace_create_file("tracing_on", 0644, d_tracer, + trace_create_file("tracing_on", TRACE_MODE_WRITE, d_tracer, tr, &rb_simple_fops); - trace_create_file("timestamp_mode", 0444, d_tracer, tr, + trace_create_file("timestamp_mode", TRACE_MODE_READ, d_tracer, tr, &trace_time_stamp_mode_fops); tr->buffer_percent = 50; - trace_create_file("buffer_percent", 0444, d_tracer, + trace_create_file("buffer_percent", TRACE_MODE_READ, d_tracer, tr, &buffer_percent_fops); create_trace_options_dir(tr); @@ -9457,11 +9458,11 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) MEM_FAIL(1, "Could not allocate function filter files"); #ifdef CONFIG_TRACER_SNAPSHOT - trace_create_file("snapshot", 0644, d_tracer, + trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer, tr, &snapshot_fops); #endif - trace_create_file("error_log", 0644, d_tracer, + trace_create_file("error_log", TRACE_MODE_WRITE, d_tracer, tr, &tracing_err_log_fops); for_each_tracing_cpu(cpu) @@ -9654,19 +9655,19 @@ static __init int tracer_init_tracefs(void) init_tracer_tracefs(&global_trace, NULL); ftrace_init_tracefs_toplevel(&global_trace, NULL); - trace_create_file("tracing_thresh", 0644, NULL, + trace_create_file("tracing_thresh", TRACE_MODE_WRITE, NULL, &global_trace, &tracing_thresh_fops); - trace_create_file("README", 0444, NULL, + trace_create_file("README", TRACE_MODE_READ, NULL, NULL, &tracing_readme_fops); - trace_create_file("saved_cmdlines", 0444, NULL, + trace_create_file("saved_cmdlines", TRACE_MODE_READ, NULL, NULL, &tracing_saved_cmdlines_fops); - trace_create_file("saved_cmdlines_size", 0644, NULL, + trace_create_file("saved_cmdlines_size", TRACE_MODE_WRITE, NULL, NULL, &tracing_saved_cmdlines_size_fops); - trace_create_file("saved_tgids", 0444, NULL, + trace_create_file("saved_tgids", TRACE_MODE_READ, NULL, NULL, &tracing_saved_tgids_fops); trace_eval_init(); @@ -9678,7 +9679,7 @@ static __init int tracer_init_tracefs(void) #endif #ifdef CONFIG_DYNAMIC_FTRACE - trace_create_file("dyn_ftrace_total_info", 0444, NULL, + trace_create_file("dyn_ftrace_total_info", TRACE_MODE_READ, NULL, NULL, &tracing_dyn_info_fops); #endif diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2ec500186992..6c3808132b16 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -29,6 +29,9 @@ #include /* some archs define it here */ #endif +#define TRACE_MODE_WRITE 0640 +#define TRACE_MODE_READ 0440 + enum trace_type { __TRACE_FIRST_TYPE = 0, diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 1110112e55bd..e34e8182ee4b 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -262,7 +262,7 @@ static __init int init_dynamic_event(void) if (ret) return 0; - entry = tracefs_create_file("dynamic_events", 0644, NULL, + entry = tracefs_create_file("dynamic_events", TRACE_MODE_WRITE, NULL, NULL, &dynamic_events_ops); /* Event list interface */ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index bf54d2803261..4021b9a79f93 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2312,7 +2312,8 @@ event_subsystem_dir(struct trace_array *tr, const char *name, /* the ftrace system is special, do not create enable or filter files */ if (strcmp(name, "ftrace") != 0) { - entry = tracefs_create_file("filter", 0644, dir->entry, dir, + entry = tracefs_create_file("filter", TRACE_MODE_WRITE, + dir->entry, dir, &ftrace_subsystem_filter_fops); if (!entry) { kfree(system->filter); @@ -2320,7 +2321,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, pr_warn("Could not create tracefs '%s/filter' entry\n", name); } - trace_create_file("enable", 0644, dir->entry, dir, + trace_create_file("enable", TRACE_MODE_WRITE, dir->entry, dir, &ftrace_system_enable_fops); } @@ -2402,12 +2403,12 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) } if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) - trace_create_file("enable", 0644, file->dir, file, + trace_create_file("enable", TRACE_MODE_WRITE, file->dir, file, &ftrace_enable_fops); #ifdef CONFIG_PERF_EVENTS if (call->event.type && call->class->reg) - trace_create_file("id", 0444, file->dir, + trace_create_file("id", TRACE_MODE_READ, file->dir, (void *)(long)call->event.type, &ftrace_event_id_fops); #endif @@ -2423,22 +2424,22 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) * triggers or filters. */ if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) { - trace_create_file("filter", 0644, file->dir, file, - &ftrace_event_filter_fops); + trace_create_file("filter", TRACE_MODE_WRITE, file->dir, + file, &ftrace_event_filter_fops); - trace_create_file("trigger", 0644, file->dir, file, - &event_trigger_fops); + trace_create_file("trigger", TRACE_MODE_WRITE, file->dir, + file, &event_trigger_fops); } #ifdef CONFIG_HIST_TRIGGERS - trace_create_file("hist", 0444, file->dir, file, + trace_create_file("hist", TRACE_MODE_READ, file->dir, file, &event_hist_fops); #endif #ifdef CONFIG_HIST_TRIGGERS_DEBUG - trace_create_file("hist_debug", 0444, file->dir, file, + trace_create_file("hist_debug", TRACE_MODE_READ, file->dir, file, &event_hist_debug_fops); #endif - trace_create_file("format", 0444, file->dir, call, + trace_create_file("format", TRACE_MODE_READ, file->dir, call, &ftrace_event_format_fops); #ifdef CONFIG_TRACE_EVENT_INJECT @@ -3433,7 +3434,7 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) struct dentry *d_events; struct dentry *entry; - entry = tracefs_create_file("set_event", 0644, parent, + entry = tracefs_create_file("set_event", TRACE_MODE_WRITE, parent, tr, &ftrace_set_event_fops); if (!entry) { pr_warn("Could not create tracefs 'set_event' entry\n"); @@ -3446,7 +3447,7 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) return -ENOMEM; } - entry = trace_create_file("enable", 0644, d_events, + entry = trace_create_file("enable", TRACE_MODE_WRITE, d_events, tr, &ftrace_tr_enable_fops); if (!entry) { pr_warn("Could not create tracefs 'enable' entry\n"); @@ -3455,24 +3456,25 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) /* There are not as crucial, just warn if they are not created */ - entry = tracefs_create_file("set_event_pid", 0644, parent, + entry = tracefs_create_file("set_event_pid", TRACE_MODE_WRITE, parent, tr, &ftrace_set_event_pid_fops); if (!entry) pr_warn("Could not create tracefs 'set_event_pid' entry\n"); - entry = tracefs_create_file("set_event_notrace_pid", 0644, parent, - tr, &ftrace_set_event_notrace_pid_fops); + entry = tracefs_create_file("set_event_notrace_pid", + TRACE_MODE_WRITE, parent, tr, + &ftrace_set_event_notrace_pid_fops); if (!entry) pr_warn("Could not create tracefs 'set_event_notrace_pid' entry\n"); /* ring buffer internal formats */ - entry = trace_create_file("header_page", 0444, d_events, + entry = trace_create_file("header_page", TRACE_MODE_READ, d_events, ring_buffer_print_page_header, &ftrace_show_header_fops); if (!entry) pr_warn("Could not create tracefs 'header_page' entry\n"); - entry = trace_create_file("header_event", 0444, d_events, + entry = trace_create_file("header_event", TRACE_MODE_READ, d_events, ring_buffer_print_entry_header, &ftrace_show_header_fops); if (!entry) @@ -3689,8 +3691,8 @@ __init int event_trace_init(void) if (!tr) return -ENODEV; - entry = tracefs_create_file("available_events", 0444, NULL, - tr, &ftrace_avail_fops); + entry = tracefs_create_file("available_events", TRACE_MODE_READ, + NULL, tr, &ftrace_avail_fops); if (!entry) pr_warn("Could not create tracefs 'available_events' entry\n"); diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index d54094b7a9d7..22db3ce95e74 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -2227,8 +2227,8 @@ static __init int trace_events_synth_init(void) if (err) goto err; - entry = tracefs_create_file("synthetic_events", 0644, NULL, - NULL, &synth_events_fops); + entry = tracefs_create_file("synthetic_events", TRACE_MODE_WRITE, + NULL, NULL, &synth_events_fops); if (!entry) { err = -ENODEV; goto err; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 0de6837722da..6b5ff3ba4251 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1340,7 +1340,7 @@ static __init int init_graph_tracefs(void) if (ret) return 0; - trace_create_file("max_graph_depth", 0644, NULL, + trace_create_file("max_graph_depth", TRACE_MODE_WRITE, NULL, NULL, &graph_depth_fops); return 0; diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 1b83d75eb103..d0a730d99a33 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -782,21 +782,21 @@ static int init_tracefs(void) if (!top_dir) return -ENOMEM; - hwlat_sample_window = tracefs_create_file("window", 0640, + hwlat_sample_window = tracefs_create_file("window", TRACE_MODE_WRITE, top_dir, &hwlat_window, &trace_min_max_fops); if (!hwlat_sample_window) goto err; - hwlat_sample_width = tracefs_create_file("width", 0644, + hwlat_sample_width = tracefs_create_file("width", TRACE_MODE_WRITE, top_dir, &hwlat_width, &trace_min_max_fops); if (!hwlat_sample_width) goto err; - hwlat_thread_mode = trace_create_file("mode", 0644, + hwlat_thread_mode = trace_create_file("mode", TRACE_MODE_WRITE, top_dir, NULL, &thread_mode_fops); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 0e1e7ce5f7ed..33272a7b6912 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1925,16 +1925,16 @@ static __init int init_kprobe_trace(void) if (ret) return 0; - entry = tracefs_create_file("kprobe_events", 0644, NULL, - NULL, &kprobe_events_ops); + entry = tracefs_create_file("kprobe_events", TRACE_MODE_WRITE, + NULL, NULL, &kprobe_events_ops); /* Event list interface */ if (!entry) pr_warn("Could not create tracefs 'kprobe_events' entry\n"); /* Profile interface */ - entry = tracefs_create_file("kprobe_profile", 0444, NULL, - NULL, &kprobe_profile_ops); + entry = tracefs_create_file("kprobe_profile", TRACE_MODE_READ, + NULL, NULL, &kprobe_profile_ops); if (!entry) pr_warn("Could not create tracefs 'kprobe_profile' entry\n"); diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index ce053619f289..c4f14fb98aaa 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1856,38 +1856,38 @@ static int init_tracefs(void) if (!top_dir) return 0; - tmp = tracefs_create_file("period_us", 0640, top_dir, + tmp = tracefs_create_file("period_us", TRACE_MODE_WRITE, top_dir, &osnoise_period, &trace_min_max_fops); if (!tmp) goto err; - tmp = tracefs_create_file("runtime_us", 0644, top_dir, + tmp = tracefs_create_file("runtime_us", TRACE_MODE_WRITE, top_dir, &osnoise_runtime, &trace_min_max_fops); if (!tmp) goto err; - tmp = tracefs_create_file("stop_tracing_us", 0640, top_dir, + tmp = tracefs_create_file("stop_tracing_us", TRACE_MODE_WRITE, top_dir, &osnoise_stop_tracing_in, &trace_min_max_fops); if (!tmp) goto err; - tmp = tracefs_create_file("stop_tracing_total_us", 0640, top_dir, + tmp = tracefs_create_file("stop_tracing_total_us", TRACE_MODE_WRITE, top_dir, &osnoise_stop_tracing_total, &trace_min_max_fops); if (!tmp) goto err; - tmp = trace_create_file("cpus", 0644, top_dir, NULL, &cpus_fops); + tmp = trace_create_file("cpus", TRACE_MODE_WRITE, top_dir, NULL, &cpus_fops); if (!tmp) goto err; #ifdef CONFIG_TIMERLAT_TRACER #ifdef CONFIG_STACKTRACE - tmp = tracefs_create_file("print_stack", 0640, top_dir, + tmp = tracefs_create_file("print_stack", TRACE_MODE_WRITE, top_dir, &osnoise_print_stack, &trace_min_max_fops); if (!tmp) goto err; #endif - tmp = tracefs_create_file("timerlat_period_us", 0640, top_dir, + tmp = tracefs_create_file("timerlat_period_us", TRACE_MODE_WRITE, top_dir, &timerlat_period, &trace_min_max_fops); if (!tmp) goto err; diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 4b320fe7df70..29f6e95439b6 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -384,7 +384,7 @@ static __init int init_trace_printk_function_export(void) if (ret) return 0; - trace_create_file("printk_formats", 0444, NULL, + trace_create_file("printk_formats", TRACE_MODE_READ, NULL, NULL, &ftrace_formats_fops); return 0; diff --git a/kernel/trace/trace_recursion_record.c b/kernel/trace/trace_recursion_record.c index b2edac1fe156..4d4b78c8ca25 100644 --- a/kernel/trace/trace_recursion_record.c +++ b/kernel/trace/trace_recursion_record.c @@ -226,8 +226,8 @@ __init static int create_recursed_functions(void) { struct dentry *dentry; - dentry = trace_create_file("recursed_functions", 0644, NULL, NULL, - &recursed_functions_fops); + dentry = trace_create_file("recursed_functions", TRACE_MODE_WRITE, + NULL, NULL, &recursed_functions_fops); if (!dentry) pr_warn("WARNING: Failed to create recursed_functions\n"); return 0; diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 63c285042051..5a48dba912ea 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -559,14 +559,14 @@ static __init int stack_trace_init(void) if (ret) return 0; - trace_create_file("stack_max_size", 0644, NULL, + trace_create_file("stack_max_size", TRACE_MODE_WRITE, NULL, &stack_trace_max_size, &stack_max_size_fops); - trace_create_file("stack_trace", 0444, NULL, + trace_create_file("stack_trace", TRACE_MODE_READ, NULL, NULL, &stack_trace_fops); #ifdef CONFIG_DYNAMIC_FTRACE - trace_create_file("stack_trace_filter", 0644, NULL, + trace_create_file("stack_trace_filter", TRACE_MODE_WRITE, NULL, &trace_ops, &stack_trace_filter_fops); #endif diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 8d141c3825a9..bb247beec447 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -297,9 +297,9 @@ static int init_stat_file(struct stat_session *session) if (!stat_dir && (ret = tracing_stat_init())) return ret; - session->file = tracefs_create_file(session->ts->name, 0644, - stat_dir, - session, &tracing_stat_fops); + session->file = tracefs_create_file(session->ts->name, TRACE_MODE_WRITE, + stat_dir, session, + &tracing_stat_fops); if (!session->file) return -ENOMEM; return 0; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 225ce569bf8f..0a5c0db3137e 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1655,10 +1655,10 @@ static __init int init_uprobe_trace(void) if (ret) return 0; - trace_create_file("uprobe_events", 0644, NULL, + trace_create_file("uprobe_events", TRACE_MODE_WRITE, NULL, NULL, &uprobe_events_ops); /* Profile interface */ - trace_create_file("uprobe_profile", 0444, NULL, + trace_create_file("uprobe_profile", TRACE_MODE_READ, NULL, NULL, &uprobe_profile_ops); return 0; } From 6644c654ea70e0d8b8d5111e1272f8f29df00f21 Mon Sep 17 00:00:00 2001 From: Weizhao Ouyang Date: Thu, 9 Sep 2021 17:02:16 +0800 Subject: [PATCH 033/101] ftrace: Cleanup ftrace_dyn_arch_init() Most of ARCHs use empty ftrace_dyn_arch_init(), introduce a weak common ftrace_dyn_arch_init() to cleanup them. Link: https://lkml.kernel.org/r/20210909090216.1955240-1-o451686892@gmail.com Acked-by: Heiko Carstens (s390) Acked-by: Helge Deller (parisc) Signed-off-by: Weizhao Ouyang Signed-off-by: Steven Rostedt (VMware) --- arch/arm/kernel/ftrace.c | 5 ----- arch/arm64/kernel/ftrace.c | 5 ----- arch/csky/kernel/ftrace.c | 5 ----- arch/ia64/kernel/ftrace.c | 6 ------ arch/microblaze/kernel/ftrace.c | 5 ----- arch/nds32/kernel/ftrace.c | 5 ----- arch/parisc/kernel/ftrace.c | 5 ----- arch/riscv/kernel/ftrace.c | 5 ----- arch/s390/kernel/ftrace.c | 5 ----- arch/sh/kernel/ftrace.c | 5 ----- arch/sparc/kernel/ftrace.c | 5 ----- arch/x86/kernel/ftrace.c | 5 ----- kernel/trace/ftrace.c | 5 +++++ 13 files changed, 5 insertions(+), 61 deletions(-) diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c index 3c83b5d29697..a006585e1c09 100644 --- a/arch/arm/kernel/ftrace.c +++ b/arch/arm/kernel/ftrace.c @@ -193,11 +193,6 @@ int ftrace_make_nop(struct module *mod, return ret; } - -int __init ftrace_dyn_arch_init(void) -{ - return 0; -} #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index 7f467bd9db7a..fc62dfe73f93 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -236,11 +236,6 @@ void arch_ftrace_update_code(int command) command |= FTRACE_MAY_SLEEP; ftrace_modify_all_code(command); } - -int __init ftrace_dyn_arch_init(void) -{ - return 0; -} #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER diff --git a/arch/csky/kernel/ftrace.c b/arch/csky/kernel/ftrace.c index b4a7ec1517ff..50bfcf129078 100644 --- a/arch/csky/kernel/ftrace.c +++ b/arch/csky/kernel/ftrace.c @@ -133,11 +133,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func) (unsigned long)func, true, true); return ret; } - -int __init ftrace_dyn_arch_init(void) -{ - return 0; -} #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS diff --git a/arch/ia64/kernel/ftrace.c b/arch/ia64/kernel/ftrace.c index b2ab2d58fb30..d6360fd404ab 100644 --- a/arch/ia64/kernel/ftrace.c +++ b/arch/ia64/kernel/ftrace.c @@ -194,9 +194,3 @@ int ftrace_update_ftrace_func(ftrace_func_t func) flush_icache_range(addr, addr + 16); return 0; } - -/* run from kstop_machine */ -int __init ftrace_dyn_arch_init(void) -{ - return 0; -} diff --git a/arch/microblaze/kernel/ftrace.c b/arch/microblaze/kernel/ftrace.c index 224eea40e1ee..188749d62709 100644 --- a/arch/microblaze/kernel/ftrace.c +++ b/arch/microblaze/kernel/ftrace.c @@ -163,11 +163,6 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) return ret; } -int __init ftrace_dyn_arch_init(void) -{ - return 0; -} - int ftrace_update_ftrace_func(ftrace_func_t func) { unsigned long ip = (unsigned long)(&ftrace_call); diff --git a/arch/nds32/kernel/ftrace.c b/arch/nds32/kernel/ftrace.c index 0e23e3a8df6b..f0ef4842d191 100644 --- a/arch/nds32/kernel/ftrace.c +++ b/arch/nds32/kernel/ftrace.c @@ -84,11 +84,6 @@ void _ftrace_caller(unsigned long parent_ip) /* restore all state needed by the compiler epilogue */ } -int __init ftrace_dyn_arch_init(void) -{ - return 0; -} - static unsigned long gen_sethi_insn(unsigned long addr) { unsigned long opcode = 0x46000000; diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c index 0a1e75af5382..01581f715737 100644 --- a/arch/parisc/kernel/ftrace.c +++ b/arch/parisc/kernel/ftrace.c @@ -94,11 +94,6 @@ int ftrace_disable_ftrace_graph_caller(void) #endif #ifdef CONFIG_DYNAMIC_FTRACE - -int __init ftrace_dyn_arch_init(void) -{ - return 0; -} int ftrace_update_ftrace_func(ftrace_func_t func) { return 0; diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c index 7f1e5203de88..4716f4cdc038 100644 --- a/arch/riscv/kernel/ftrace.c +++ b/arch/riscv/kernel/ftrace.c @@ -154,11 +154,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func) return ret; } - -int __init ftrace_dyn_arch_init(void) -{ - return 0; -} #endif #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 1d94ffdf347b..5165bf344f95 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -262,11 +262,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func) return 0; } -int __init ftrace_dyn_arch_init(void) -{ - return 0; -} - void arch_ftrace_update_code(int command) { if (ftrace_shared_hotpatch_trampoline(NULL)) diff --git a/arch/sh/kernel/ftrace.c b/arch/sh/kernel/ftrace.c index 295c43315bbe..930001bb8c6a 100644 --- a/arch/sh/kernel/ftrace.c +++ b/arch/sh/kernel/ftrace.c @@ -252,11 +252,6 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) return ftrace_modify_code(rec->ip, old, new); } - -int __init ftrace_dyn_arch_init(void) -{ - return 0; -} #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER diff --git a/arch/sparc/kernel/ftrace.c b/arch/sparc/kernel/ftrace.c index 684b84ce397f..eaead3da8e03 100644 --- a/arch/sparc/kernel/ftrace.c +++ b/arch/sparc/kernel/ftrace.c @@ -82,11 +82,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func) new = ftrace_call_replace(ip, (unsigned long)func); return ftrace_modify_code(ip, old, new); } - -int __init ftrace_dyn_arch_init(void) -{ - return 0; -} #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 1b3ce3b4a2a2..23d221a9a3cd 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -252,11 +252,6 @@ void arch_ftrace_update_code(int command) ftrace_modify_all_code(command); } -int __init ftrace_dyn_arch_init(void) -{ - return 0; -} - /* Currently only x86_64 supports dynamic trampolines */ #ifdef CONFIG_X86_64 diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0a0dbc2d411b..2c3e9760df7f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6847,6 +6847,11 @@ void __init ftrace_free_init_mem(void) ftrace_free_mem(NULL, start, end); } +int __init __weak ftrace_dyn_arch_init(void) +{ + return 0; +} + void __init ftrace_init(void) { extern unsigned long __start_mcount_loc[]; From bdac5c2b243f68ec15f8203c3348ae79fee8e8d8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 16 Sep 2021 15:23:20 +0900 Subject: [PATCH 034/101] bootconfig: Allocate xbc_data inside xbc_init() Allocate 'xbc_data' in the xbc_init() so that it does not need to care about the ownership of the copied data. Link: https://lkml.kernel.org/r/163177339986.682366.898762699429769117.stgit@devnote2 Suggested-by: Steven Rostedt Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/bootconfig.h | 2 +- init/main.c | 13 ++----------- lib/bootconfig.c | 33 +++++++++++++++++++++------------ tools/bootconfig/main.c | 6 +++--- 4 files changed, 27 insertions(+), 27 deletions(-) diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index 537e1b991f11..62e09b788172 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -271,7 +271,7 @@ static inline int __init xbc_node_compose_key(struct xbc_node *node, } /* XBC node initializer */ -int __init xbc_init(char *buf, const char **emsg, int *epos); +int __init xbc_init(const char *buf, size_t size, const char **emsg, int *epos); /* XBC cleanup data structures */ diff --git a/init/main.c b/init/main.c index 81a79a77db46..d894989d86bc 100644 --- a/init/main.c +++ b/init/main.c @@ -409,7 +409,7 @@ static void __init setup_boot_config(void) const char *msg; int pos; u32 size, csum; - char *data, *copy, *err; + char *data, *err; int ret; /* Cut out the bootconfig data even if we have no bootconfig option */ @@ -442,16 +442,7 @@ static void __init setup_boot_config(void) return; } - copy = memblock_alloc(size + 1, SMP_CACHE_BYTES); - if (!copy) { - pr_err("Failed to allocate memory for bootconfig\n"); - return; - } - - memcpy(copy, data, size); - copy[size] = '\0'; - - ret = xbc_init(copy, &msg, &pos); + ret = xbc_init(data, size, &msg, &pos); if (ret < 0) { if (pos < 0) pr_err("Failed to init bootconfig: %s.\n", msg); diff --git a/lib/bootconfig.c b/lib/bootconfig.c index 5ae248b29373..66b02fddfea8 100644 --- a/lib/bootconfig.c +++ b/lib/bootconfig.c @@ -789,6 +789,7 @@ static int __init xbc_verify_tree(void) */ void __init xbc_destroy_all(void) { + memblock_free_ptr(xbc_data, xbc_data_size); xbc_data = NULL; xbc_data_size = 0; xbc_node_num = 0; @@ -799,19 +800,20 @@ void __init xbc_destroy_all(void) /** * xbc_init() - Parse given XBC file and build XBC internal tree - * @buf: boot config text + * @data: The boot config text original data + * @size: The size of @data * @emsg: A pointer of const char * to store the error message * @epos: A pointer of int to store the error position * - * This parses the boot config text in @buf. @buf must be a - * null terminated string and smaller than XBC_DATA_MAX. + * This parses the boot config text in @data. @size must be smaller + * than XBC_DATA_MAX. * Return the number of stored nodes (>0) if succeeded, or -errno * if there is any error. * In error cases, @emsg will be updated with an error message and * @epos will be updated with the error position which is the byte offset * of @buf. If the error is not a parser error, @epos will be -1. */ -int __init xbc_init(char *buf, const char **emsg, int *epos) +int __init xbc_init(const char *data, size_t size, const char **emsg, int *epos) { char *p, *q; int ret, c; @@ -824,28 +826,35 @@ int __init xbc_init(char *buf, const char **emsg, int *epos) *emsg = "Bootconfig is already initialized"; return -EBUSY; } - - ret = strlen(buf); - if (ret > XBC_DATA_MAX - 1 || ret == 0) { + if (size > XBC_DATA_MAX || size == 0) { if (emsg) - *emsg = ret ? "Config data is too big" : + *emsg = size ? "Config data is too big" : "Config data is empty"; return -ERANGE; } + xbc_data = memblock_alloc(size + 1, SMP_CACHE_BYTES); + if (!xbc_data) { + if (emsg) + *emsg = "Failed to allocate bootconfig data"; + return -ENOMEM; + } + memcpy(xbc_data, data, size); + xbc_data[size] = '\0'; + xbc_data_size = size + 1; + xbc_nodes = memblock_alloc(sizeof(struct xbc_node) * XBC_NODE_MAX, SMP_CACHE_BYTES); if (!xbc_nodes) { if (emsg) *emsg = "Failed to allocate bootconfig nodes"; + xbc_destroy_all(); return -ENOMEM; } memset(xbc_nodes, 0, sizeof(struct xbc_node) * XBC_NODE_MAX); - xbc_data = buf; - xbc_data_size = ret + 1; - last_parent = NULL; - p = buf; + last_parent = NULL; + p = xbc_data; do { q = strpbrk(p, "{}=+;:\n#"); if (!q) { diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c index fd67496a947f..7269c9e35335 100644 --- a/tools/bootconfig/main.c +++ b/tools/bootconfig/main.c @@ -229,7 +229,7 @@ static int load_xbc_from_initrd(int fd, char **buf) return -EINVAL; } - ret = xbc_init(*buf, &msg, NULL); + ret = xbc_init(*buf, size, &msg, NULL); /* Wrong data */ if (ret < 0) { pr_err("parse error: %s.\n", msg); @@ -269,7 +269,7 @@ static int init_xbc_with_error(char *buf, int len) if (!copy) return -ENOMEM; - ret = xbc_init(buf, &msg, &pos); + ret = xbc_init(buf, len, &msg, &pos); if (ret < 0) show_xbc_error(copy, msg, pos); free(copy); @@ -382,7 +382,7 @@ static int apply_xbc(const char *path, const char *xbc_path) memcpy(data, buf, size); /* Check the data format */ - ret = xbc_init(buf, &msg, &pos); + ret = xbc_init(buf, size, &msg, &pos); if (ret < 0) { show_xbc_error(data, msg, pos); free(data); From e306220cb7b7c2948f191414ab06851e143b54c1 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 16 Sep 2021 15:23:29 +0900 Subject: [PATCH 035/101] bootconfig: Add xbc_get_info() for the node information Add xbc_get_info() API which allows user to get the number of used xbc_nodes and the size of bootconfig data. This is also useful for checking the bootconfig is initialized or not. Link: https://lkml.kernel.org/r/163177340877.682366.4360676589783197627.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/bootconfig.h | 2 ++ init/main.c | 1 + lib/bootconfig.c | 21 +++++++++++++++++++++ tools/bootconfig/main.c | 1 + 4 files changed, 25 insertions(+) diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index 62e09b788172..f955bb7eabbb 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -273,6 +273,8 @@ static inline int __init xbc_node_compose_key(struct xbc_node *node, /* XBC node initializer */ int __init xbc_init(const char *buf, size_t size, const char **emsg, int *epos); +/* XBC node and size information */ +int __init xbc_get_info(int *node_size, size_t *data_size); /* XBC cleanup data structures */ void __init xbc_destroy_all(void); diff --git a/init/main.c b/init/main.c index d894989d86bc..afaed805de19 100644 --- a/init/main.c +++ b/init/main.c @@ -450,6 +450,7 @@ static void __init setup_boot_config(void) pr_err("Failed to parse bootconfig: %s at %d.\n", msg, pos); } else { + xbc_get_info(&ret, NULL); pr_info("Load bootconfig: %d bytes %d nodes\n", size, ret); /* keys starting with "kernel." are passed via cmdline */ extra_command_line = xbc_make_cmdline("kernel"); diff --git a/lib/bootconfig.c b/lib/bootconfig.c index 66b02fddfea8..b088fe5c0001 100644 --- a/lib/bootconfig.c +++ b/lib/bootconfig.c @@ -34,6 +34,27 @@ static int xbc_err_pos __initdata; static int open_brace[XBC_DEPTH_MAX] __initdata; static int brace_index __initdata; +/** + * xbc_get_info() - Get the information of loaded boot config + * node_size: A pointer to store the number of nodes. + * data_size: A pointer to store the size of bootconfig data. + * + * Get the number of used nodes in @node_size if it is not NULL, + * and the size of bootconfig data in @data_size if it is not NULL. + * Return 0 if the boot config is initialized, or return -ENODEV. + */ +int __init xbc_get_info(int *node_size, size_t *data_size) +{ + if (!xbc_data) + return -ENODEV; + + if (node_size) + *node_size = xbc_node_num; + if (data_size) + *data_size = xbc_data_size; + return 0; +} + static int __init xbc_parse_error(const char *msg, const char *p) { xbc_err_msg = msg; diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c index 7269c9e35335..4f2a8d884745 100644 --- a/tools/bootconfig/main.c +++ b/tools/bootconfig/main.c @@ -391,6 +391,7 @@ static int apply_xbc(const char *path, const char *xbc_path) return ret; } printf("Apply %s to %s\n", xbc_path, path); + xbc_get_info(&ret, NULL); printf("\tNumber of nodes: %d\n", ret); printf("\tSize: %u bytes\n", (unsigned int)size); printf("\tChecksum: %d\n", (unsigned int)csum); From f30f00cc9664e6c6c9dc31846db5d8c5134fadd8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 17 Sep 2021 19:02:32 +0900 Subject: [PATCH 036/101] tools/bootconfig: Run test script when build all Run the bootconfig test script when build all target so that user can notice any issue when build it. Link: https://lkml.kernel.org/r/163187295173.2366983.18295281097397499118.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- tools/bootconfig/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bootconfig/Makefile b/tools/bootconfig/Makefile index da5975775337..f1eec3ccbe18 100644 --- a/tools/bootconfig/Makefile +++ b/tools/bootconfig/Makefile @@ -15,7 +15,7 @@ CFLAGS = -Wall -g -I$(CURDIR)/include ALL_TARGETS := bootconfig ALL_PROGRAMS := $(patsubst %,$(OUTPUT)%,$(ALL_TARGETS)) -all: $(ALL_PROGRAMS) +all: $(ALL_PROGRAMS) test $(OUTPUT)bootconfig: main.c $(LIBSRC) $(CC) $(filter %.c,$^) $(CFLAGS) -o $@ From 115d4d08aeb942133d025a425dd611092893d774 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 17 Sep 2021 19:02:39 +0900 Subject: [PATCH 037/101] bootconfig: Rename xbc_destroy_all() to xbc_exit() Avoid using this noisy name and use more calm one. This is just a name change. No functional change. Link: https://lkml.kernel.org/r/163187295918.2366983.5231840238429996027.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/bootconfig.h | 2 +- init/main.c | 2 +- lib/bootconfig.c | 8 ++++---- tools/bootconfig/main.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index f955bb7eabbb..7eb7a7f8ade7 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -277,7 +277,7 @@ int __init xbc_init(const char *buf, size_t size, const char **emsg, int *epos); int __init xbc_get_info(int *node_size, size_t *data_size); /* XBC cleanup data structures */ -void __init xbc_destroy_all(void); +void __init xbc_exit(void); /* Debug dump functions */ void __init xbc_debug_dump(void); diff --git a/init/main.c b/init/main.c index afaed805de19..f1428234e1e4 100644 --- a/init/main.c +++ b/init/main.c @@ -462,7 +462,7 @@ static void __init setup_boot_config(void) static void __init exit_boot_config(void) { - xbc_destroy_all(); + xbc_exit(); } #else /* !CONFIG_BOOT_CONFIG */ diff --git a/lib/bootconfig.c b/lib/bootconfig.c index b088fe5c0001..a3ce5a0c3ca4 100644 --- a/lib/bootconfig.c +++ b/lib/bootconfig.c @@ -802,13 +802,13 @@ static int __init xbc_verify_tree(void) } /** - * xbc_destroy_all() - Clean up all parsed bootconfig + * xbc_exit() - Clean up all parsed bootconfig * * This clears all data structures of parsed bootconfig on memory. * If you need to reuse xbc_init() with new boot config, you can * use this. */ -void __init xbc_destroy_all(void) +void __init xbc_exit(void) { memblock_free_ptr(xbc_data, xbc_data_size); xbc_data = NULL; @@ -869,7 +869,7 @@ int __init xbc_init(const char *data, size_t size, const char **emsg, int *epos) if (!xbc_nodes) { if (emsg) *emsg = "Failed to allocate bootconfig nodes"; - xbc_destroy_all(); + xbc_exit(); return -ENOMEM; } memset(xbc_nodes, 0, sizeof(struct xbc_node) * XBC_NODE_MAX); @@ -925,7 +925,7 @@ int __init xbc_init(const char *data, size_t size, const char **emsg, int *epos) *epos = xbc_err_pos; if (emsg) *emsg = xbc_err_msg; - xbc_destroy_all(); + xbc_exit(); } else ret = xbc_node_num; diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c index 4f2a8d884745..4252c23bd35d 100644 --- a/tools/bootconfig/main.c +++ b/tools/bootconfig/main.c @@ -397,7 +397,7 @@ static int apply_xbc(const char *path, const char *xbc_path) printf("\tChecksum: %d\n", (unsigned int)csum); /* TODO: Check the options by schema */ - xbc_destroy_all(); + xbc_exit(); free(buf); /* Remove old boot config if exists */ From f3668cde8562997b47a9edbc915da32279d4a743 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 17 Sep 2021 19:02:46 +0900 Subject: [PATCH 038/101] bootconfig: Split parse-tree part from xbc_init Split bootconfig data parser to build tree code from xbc_init(). This is an internal cosmetic change. Link: https://lkml.kernel.org/r/163187296647.2366983.15590065167920474865.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- lib/bootconfig.c | 147 +++++++++++++++++++++++++---------------------- 1 file changed, 78 insertions(+), 69 deletions(-) diff --git a/lib/bootconfig.c b/lib/bootconfig.c index a3ce5a0c3ca4..b7e5a32b30d3 100644 --- a/lib/bootconfig.c +++ b/lib/bootconfig.c @@ -801,79 +801,12 @@ static int __init xbc_verify_tree(void) return 0; } -/** - * xbc_exit() - Clean up all parsed bootconfig - * - * This clears all data structures of parsed bootconfig on memory. - * If you need to reuse xbc_init() with new boot config, you can - * use this. - */ -void __init xbc_exit(void) -{ - memblock_free_ptr(xbc_data, xbc_data_size); - xbc_data = NULL; - xbc_data_size = 0; - xbc_node_num = 0; - memblock_free_ptr(xbc_nodes, sizeof(struct xbc_node) * XBC_NODE_MAX); - xbc_nodes = NULL; - brace_index = 0; -} - -/** - * xbc_init() - Parse given XBC file and build XBC internal tree - * @data: The boot config text original data - * @size: The size of @data - * @emsg: A pointer of const char * to store the error message - * @epos: A pointer of int to store the error position - * - * This parses the boot config text in @data. @size must be smaller - * than XBC_DATA_MAX. - * Return the number of stored nodes (>0) if succeeded, or -errno - * if there is any error. - * In error cases, @emsg will be updated with an error message and - * @epos will be updated with the error position which is the byte offset - * of @buf. If the error is not a parser error, @epos will be -1. - */ -int __init xbc_init(const char *data, size_t size, const char **emsg, int *epos) +/* Need to setup xbc_data and xbc_nodes before call this. */ +static int __init xbc_parse_tree(void) { char *p, *q; int ret, c; - if (epos) - *epos = -1; - - if (xbc_data) { - if (emsg) - *emsg = "Bootconfig is already initialized"; - return -EBUSY; - } - if (size > XBC_DATA_MAX || size == 0) { - if (emsg) - *emsg = size ? "Config data is too big" : - "Config data is empty"; - return -ERANGE; - } - - xbc_data = memblock_alloc(size + 1, SMP_CACHE_BYTES); - if (!xbc_data) { - if (emsg) - *emsg = "Failed to allocate bootconfig data"; - return -ENOMEM; - } - memcpy(xbc_data, data, size); - xbc_data[size] = '\0'; - xbc_data_size = size + 1; - - xbc_nodes = memblock_alloc(sizeof(struct xbc_node) * XBC_NODE_MAX, - SMP_CACHE_BYTES); - if (!xbc_nodes) { - if (emsg) - *emsg = "Failed to allocate bootconfig nodes"; - xbc_exit(); - return -ENOMEM; - } - memset(xbc_nodes, 0, sizeof(struct xbc_node) * XBC_NODE_MAX); - last_parent = NULL; p = xbc_data; do { @@ -917,6 +850,82 @@ int __init xbc_init(const char *data, size_t size, const char **emsg, int *epos) } } while (!ret); + return ret; +} + +/** + * xbc_exit() - Clean up all parsed bootconfig + * + * This clears all data structures of parsed bootconfig on memory. + * If you need to reuse xbc_init() with new boot config, you can + * use this. + */ +void __init xbc_exit(void) +{ + memblock_free_ptr(xbc_data, xbc_data_size); + xbc_data = NULL; + xbc_data_size = 0; + xbc_node_num = 0; + memblock_free_ptr(xbc_nodes, sizeof(struct xbc_node) * XBC_NODE_MAX); + xbc_nodes = NULL; + brace_index = 0; +} + +/** + * xbc_init() - Parse given XBC file and build XBC internal tree + * @data: The boot config text original data + * @size: The size of @data + * @emsg: A pointer of const char * to store the error message + * @epos: A pointer of int to store the error position + * + * This parses the boot config text in @data. @size must be smaller + * than XBC_DATA_MAX. + * Return the number of stored nodes (>0) if succeeded, or -errno + * if there is any error. + * In error cases, @emsg will be updated with an error message and + * @epos will be updated with the error position which is the byte offset + * of @buf. If the error is not a parser error, @epos will be -1. + */ +int __init xbc_init(const char *data, size_t size, const char **emsg, int *epos) +{ + int ret; + + if (epos) + *epos = -1; + + if (xbc_data) { + if (emsg) + *emsg = "Bootconfig is already initialized"; + return -EBUSY; + } + if (size > XBC_DATA_MAX || size == 0) { + if (emsg) + *emsg = size ? "Config data is too big" : + "Config data is empty"; + return -ERANGE; + } + + xbc_data = memblock_alloc(size + 1, SMP_CACHE_BYTES); + if (!xbc_data) { + if (emsg) + *emsg = "Failed to allocate bootconfig data"; + return -ENOMEM; + } + memcpy(xbc_data, data, size); + xbc_data[size] = '\0'; + xbc_data_size = size + 1; + + xbc_nodes = memblock_alloc(sizeof(struct xbc_node) * XBC_NODE_MAX, + SMP_CACHE_BYTES); + if (!xbc_nodes) { + if (emsg) + *emsg = "Failed to allocate bootconfig nodes"; + xbc_exit(); + return -ENOMEM; + } + memset(xbc_nodes, 0, sizeof(struct xbc_node) * XBC_NODE_MAX); + + ret = xbc_parse_tree(); if (!ret) ret = xbc_verify_tree(); From 9b81c9bfff4651abb28bfa6d83c8b879e467963b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 17 Sep 2021 19:02:53 +0900 Subject: [PATCH 039/101] bootconfig: Remove unused debug function Remove unused xbc_debug_dump() from bootconfig for clean up the code. Link: https://lkml.kernel.org/r/163187297371.2366983.12943349701785875450.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/bootconfig.h | 3 --- lib/bootconfig.c | 21 --------------------- 2 files changed, 24 deletions(-) diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index 7eb7a7f8ade7..85cdfd381877 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -279,7 +279,4 @@ int __init xbc_get_info(int *node_size, size_t *data_size); /* XBC cleanup data structures */ void __init xbc_exit(void); -/* Debug dump functions */ -void __init xbc_debug_dump(void); - #endif diff --git a/lib/bootconfig.c b/lib/bootconfig.c index b7e5a32b30d3..953789171858 100644 --- a/lib/bootconfig.c +++ b/lib/bootconfig.c @@ -4,15 +4,12 @@ * Masami Hiramatsu */ -#define pr_fmt(fmt) "bootconfig: " fmt - #include #include #include #include #include #include -#include #include /* @@ -940,21 +937,3 @@ int __init xbc_init(const char *data, size_t size, const char **emsg, int *epos) return ret; } - -/** - * xbc_debug_dump() - Dump current XBC node list - * - * Dump the current XBC node list on printk buffer for debug. - */ -void __init xbc_debug_dump(void) -{ - int i; - - for (i = 0; i < xbc_node_num; i++) { - pr_debug("[%d] %s (%s) .next=%d, .child=%d .parent=%d\n", i, - xbc_node_get_data(xbc_nodes + i), - xbc_node_is_value(xbc_nodes + i) ? "value" : "key", - xbc_nodes[i].next, xbc_nodes[i].child, - xbc_nodes[i].parent); - } -} From 160321b2602ff8e42ea77a09f2e3f1b35e3acfaf Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 17 Sep 2021 19:03:01 +0900 Subject: [PATCH 040/101] tools/bootconfig: Print all error message in stderr Print all error message in stderr. This also removes unneeded tools/bootconfig/include/linux/printk.h. Link: https://lkml.kernel.org/r/163187298106.2366983.15210300267326257397.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- tools/bootconfig/include/linux/kernel.h | 2 -- tools/bootconfig/include/linux/printk.h | 14 -------------- tools/bootconfig/main.c | 2 ++ 3 files changed, 2 insertions(+), 16 deletions(-) delete mode 100644 tools/bootconfig/include/linux/printk.h diff --git a/tools/bootconfig/include/linux/kernel.h b/tools/bootconfig/include/linux/kernel.h index 2d93320aa374..c4854b8e7023 100644 --- a/tools/bootconfig/include/linux/kernel.h +++ b/tools/bootconfig/include/linux/kernel.h @@ -5,8 +5,6 @@ #include #include -#include - typedef unsigned short u16; typedef unsigned int u32; diff --git a/tools/bootconfig/include/linux/printk.h b/tools/bootconfig/include/linux/printk.h deleted file mode 100644 index 036e667596eb..000000000000 --- a/tools/bootconfig/include/linux/printk.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _SKC_LINUX_PRINTK_H -#define _SKC_LINUX_PRINTK_H - -#include - -#define printk(fmt, ...) printf(fmt, ##__VA_ARGS__) - -#define pr_err printk -#define pr_warn printk -#define pr_info printk -#define pr_debug printk - -#endif diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c index 4252c23bd35d..adc6c6e73fa9 100644 --- a/tools/bootconfig/main.c +++ b/tools/bootconfig/main.c @@ -15,6 +15,8 @@ #include #include +#define pr_err(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__) + static int xbc_show_value(struct xbc_node *node, bool semicolon) { const char *val, *eol; From 4f292c4886bfdfc2a7191ef4ff3f7aac69d1cc3f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 17 Sep 2021 19:03:08 +0900 Subject: [PATCH 041/101] bootconfig: Replace u16 and u32 with uint16_t and uint32_t Replace u16 and u32 with uint16_t and uint32_t so that the tools/bootconfig only needs . Link: https://lkml.kernel.org/r/163187298835.2366983.9838262576854319669.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/bootconfig.h | 12 ++++++------ lib/bootconfig.c | 16 ++++++++-------- tools/bootconfig/include/linux/kernel.h | 4 +--- tools/bootconfig/main.c | 20 ++++++++++---------- 4 files changed, 25 insertions(+), 27 deletions(-) diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index 85cdfd381877..a6f8dc51f168 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -25,10 +25,10 @@ * The checksum will be used with the BOOTCONFIG_MAGIC and the size for * embedding the bootconfig in the initrd image. */ -static inline __init u32 xbc_calc_checksum(void *data, u32 size) +static inline __init uint32_t xbc_calc_checksum(void *data, uint32_t size) { unsigned char *p = data; - u32 ret = 0; + uint32_t ret = 0; while (size--) ret += *p++; @@ -38,10 +38,10 @@ static inline __init u32 xbc_calc_checksum(void *data, u32 size) /* XBC tree node */ struct xbc_node { - u16 next; - u16 child; - u16 parent; - u16 data; + uint16_t next; + uint16_t child; + uint16_t parent; + uint16_t data; } __attribute__ ((__packed__)); #define XBC_KEY 0 diff --git a/lib/bootconfig.c b/lib/bootconfig.c index 953789171858..a2f5f582181d 100644 --- a/lib/bootconfig.c +++ b/lib/bootconfig.c @@ -244,7 +244,7 @@ int __init xbc_node_compose_key_after(struct xbc_node *root, struct xbc_node *node, char *buf, size_t size) { - u16 keys[XBC_DEPTH_MAX]; + uint16_t keys[XBC_DEPTH_MAX]; int depth = 0, ret = 0, total = 0; if (!node || node == root) @@ -359,21 +359,21 @@ const char * __init xbc_node_find_next_key_value(struct xbc_node *root, /* XBC parse and tree build */ -static int __init xbc_init_node(struct xbc_node *node, char *data, u32 flag) +static int __init xbc_init_node(struct xbc_node *node, char *data, uint32_t flag) { unsigned long offset = data - xbc_data; if (WARN_ON(offset >= XBC_DATA_MAX)) return -EINVAL; - node->data = (u16)offset | flag; + node->data = (uint16_t)offset | flag; node->child = 0; node->next = 0; return 0; } -static struct xbc_node * __init xbc_add_node(char *data, u32 flag) +static struct xbc_node * __init xbc_add_node(char *data, uint32_t flag) { struct xbc_node *node; @@ -403,7 +403,7 @@ static inline __init struct xbc_node *xbc_last_child(struct xbc_node *node) return node; } -static struct xbc_node * __init __xbc_add_sibling(char *data, u32 flag, bool head) +static struct xbc_node * __init __xbc_add_sibling(char *data, uint32_t flag, bool head) { struct xbc_node *sib, *node = xbc_add_node(data, flag); @@ -430,17 +430,17 @@ static struct xbc_node * __init __xbc_add_sibling(char *data, u32 flag, bool hea return node; } -static inline struct xbc_node * __init xbc_add_sibling(char *data, u32 flag) +static inline struct xbc_node * __init xbc_add_sibling(char *data, uint32_t flag) { return __xbc_add_sibling(data, flag, false); } -static inline struct xbc_node * __init xbc_add_head_sibling(char *data, u32 flag) +static inline struct xbc_node * __init xbc_add_head_sibling(char *data, uint32_t flag) { return __xbc_add_sibling(data, flag, true); } -static inline __init struct xbc_node *xbc_add_child(char *data, u32 flag) +static inline __init struct xbc_node *xbc_add_child(char *data, uint32_t flag) { struct xbc_node *node = xbc_add_sibling(data, flag); diff --git a/tools/bootconfig/include/linux/kernel.h b/tools/bootconfig/include/linux/kernel.h index c4854b8e7023..39f306c18dd0 100644 --- a/tools/bootconfig/include/linux/kernel.h +++ b/tools/bootconfig/include/linux/kernel.h @@ -3,11 +3,9 @@ #define _SKC_LINUX_KERNEL_H #include +#include #include -typedef unsigned short u16; -typedef unsigned int u32; - #define unlikely(cond) (cond) #define __init diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c index adc6c6e73fa9..fb7c9fb953d7 100644 --- a/tools/bootconfig/main.c +++ b/tools/bootconfig/main.c @@ -178,7 +178,7 @@ static int load_xbc_from_initrd(int fd, char **buf) { struct stat stat; int ret; - u32 size = 0, csum = 0, rcsum; + uint32_t size = 0, csum = 0, rcsum; char magic[BOOTCONFIG_MAGIC_LEN]; const char *msg; @@ -202,11 +202,11 @@ static int load_xbc_from_initrd(int fd, char **buf) if (lseek(fd, -(8 + BOOTCONFIG_MAGIC_LEN), SEEK_END) < 0) return pr_errno("Failed to lseek for size", -errno); - if (read(fd, &size, sizeof(u32)) < 0) + if (read(fd, &size, sizeof(uint32_t)) < 0) return pr_errno("Failed to read size", -errno); size = le32toh(size); - if (read(fd, &csum, sizeof(u32)) < 0) + if (read(fd, &csum, sizeof(uint32_t)) < 0) return pr_errno("Failed to read checksum", -errno); csum = le32toh(csum); @@ -364,7 +364,7 @@ static int apply_xbc(const char *path, const char *xbc_path) size_t total_size; struct stat stat; const char *msg; - u32 size, csum; + uint32_t size, csum; int pos, pad; int ret, fd; @@ -378,7 +378,7 @@ static int apply_xbc(const char *path, const char *xbc_path) /* Backup the bootconfig data */ data = calloc(size + BOOTCONFIG_ALIGN + - sizeof(u32) + sizeof(u32) + BOOTCONFIG_MAGIC_LEN, 1); + sizeof(uint32_t) + sizeof(uint32_t) + BOOTCONFIG_MAGIC_LEN, 1); if (!data) return -ENOMEM; memcpy(data, buf, size); @@ -426,17 +426,17 @@ static int apply_xbc(const char *path, const char *xbc_path) } /* To align up the total size to BOOTCONFIG_ALIGN, get padding size */ - total_size = stat.st_size + size + sizeof(u32) * 2 + BOOTCONFIG_MAGIC_LEN; + total_size = stat.st_size + size + sizeof(uint32_t) * 2 + BOOTCONFIG_MAGIC_LEN; pad = ((total_size + BOOTCONFIG_ALIGN - 1) & (~BOOTCONFIG_ALIGN_MASK)) - total_size; size += pad; /* Add a footer */ p = data + size; - *(u32 *)p = htole32(size); - p += sizeof(u32); + *(uint32_t *)p = htole32(size); + p += sizeof(uint32_t); - *(u32 *)p = htole32(csum); - p += sizeof(u32); + *(uint32_t *)p = htole32(csum); + p += sizeof(uint32_t); memcpy(p, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN); p += BOOTCONFIG_MAGIC_LEN; From 4ee1b4cac236650979a5d9b745bb0a83efde3a46 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 17 Sep 2021 19:03:16 +0900 Subject: [PATCH 042/101] bootconfig: Cleanup dummy headers in tools/bootconfig Cleanup dummy headers in tools/bootconfig/include except for tools/bootconfig/include/linux/bootconfig.h. For this change, I use __KERNEL__ macro to split kernel header #include and introduce xbc_alloc_mem() and xbc_free_mem(). Link: https://lkml.kernel.org/r/163187299574.2366983.18371329724128746091.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/bootconfig.h | 10 +++++ lib/bootconfig.c | 43 +++++++++++++++++--- tools/bootconfig/Makefile | 2 +- tools/bootconfig/include/linux/bootconfig.h | 45 ++++++++++++++++++++- tools/bootconfig/include/linux/bug.h | 12 ------ tools/bootconfig/include/linux/ctype.h | 7 ---- tools/bootconfig/include/linux/errno.h | 7 ---- tools/bootconfig/include/linux/kernel.h | 14 ------- tools/bootconfig/include/linux/memblock.h | 11 ----- tools/bootconfig/include/linux/string.h | 32 --------------- tools/bootconfig/main.c | 1 - 11 files changed, 93 insertions(+), 91 deletions(-) delete mode 100644 tools/bootconfig/include/linux/bug.h delete mode 100644 tools/bootconfig/include/linux/ctype.h delete mode 100644 tools/bootconfig/include/linux/errno.h delete mode 100644 tools/bootconfig/include/linux/kernel.h delete mode 100644 tools/bootconfig/include/linux/memblock.h delete mode 100644 tools/bootconfig/include/linux/string.h diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index a6f8dc51f168..a4665c7ab07c 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -7,8 +7,18 @@ * Author: Masami Hiramatsu */ +#ifdef __KERNEL__ #include #include +#else /* !__KERNEL__ */ +/* + * NOTE: This is only for tools/bootconfig, because tools/bootconfig will + * run the parser sanity test. + * This does NOT mean linux/bootconfig.h is available in the user space. + * However, if you change this file, please make sure the tools/bootconfig + * has no issue on building and running. + */ +#endif #define BOOTCONFIG_MAGIC "#BOOTCONFIG\n" #define BOOTCONFIG_MAGIC_LEN 12 diff --git a/lib/bootconfig.c b/lib/bootconfig.c index a2f5f582181d..a056ae137750 100644 --- a/lib/bootconfig.c +++ b/lib/bootconfig.c @@ -4,6 +4,7 @@ * Masami Hiramatsu */ +#ifdef __KERNEL__ #include #include #include @@ -11,6 +12,16 @@ #include #include #include +#else /* !__KERNEL__ */ +/* + * NOTE: This is only for tools/bootconfig, because tools/bootconfig will + * run the parser sanity test. + * This does NOT mean lib/bootconfig.c is available in the user space. + * However, if you change this file, please make sure the tools/bootconfig + * has no issue on building and running. + */ +#include +#endif /* * Extra Boot Config (XBC) is given as tree-structured ascii text of @@ -31,6 +42,29 @@ static int xbc_err_pos __initdata; static int open_brace[XBC_DEPTH_MAX] __initdata; static int brace_index __initdata; +#ifdef __KERNEL__ +static inline void *xbc_alloc_mem(size_t size) +{ + return memblock_alloc(size, SMP_CACHE_BYTES); +} + +static inline void xbc_free_mem(void *addr, size_t size) +{ + memblock_free_ptr(addr, size); +} + +#else /* !__KERNEL__ */ + +static inline void *xbc_alloc_mem(size_t size) +{ + return malloc(size); +} + +static inline void xbc_free_mem(void *addr, size_t size) +{ + free(addr); +} +#endif /** * xbc_get_info() - Get the information of loaded boot config * node_size: A pointer to store the number of nodes. @@ -859,11 +893,11 @@ static int __init xbc_parse_tree(void) */ void __init xbc_exit(void) { - memblock_free_ptr(xbc_data, xbc_data_size); + xbc_free_mem(xbc_data, xbc_data_size); xbc_data = NULL; xbc_data_size = 0; xbc_node_num = 0; - memblock_free_ptr(xbc_nodes, sizeof(struct xbc_node) * XBC_NODE_MAX); + xbc_free_mem(xbc_nodes, sizeof(struct xbc_node) * XBC_NODE_MAX); xbc_nodes = NULL; brace_index = 0; } @@ -902,7 +936,7 @@ int __init xbc_init(const char *data, size_t size, const char **emsg, int *epos) return -ERANGE; } - xbc_data = memblock_alloc(size + 1, SMP_CACHE_BYTES); + xbc_data = xbc_alloc_mem(size + 1); if (!xbc_data) { if (emsg) *emsg = "Failed to allocate bootconfig data"; @@ -912,8 +946,7 @@ int __init xbc_init(const char *data, size_t size, const char **emsg, int *epos) xbc_data[size] = '\0'; xbc_data_size = size + 1; - xbc_nodes = memblock_alloc(sizeof(struct xbc_node) * XBC_NODE_MAX, - SMP_CACHE_BYTES); + xbc_nodes = xbc_alloc_mem(sizeof(struct xbc_node) * XBC_NODE_MAX); if (!xbc_nodes) { if (emsg) *emsg = "Failed to allocate bootconfig nodes"; diff --git a/tools/bootconfig/Makefile b/tools/bootconfig/Makefile index f1eec3ccbe18..566c3e0ee561 100644 --- a/tools/bootconfig/Makefile +++ b/tools/bootconfig/Makefile @@ -17,7 +17,7 @@ ALL_PROGRAMS := $(patsubst %,$(OUTPUT)%,$(ALL_TARGETS)) all: $(ALL_PROGRAMS) test -$(OUTPUT)bootconfig: main.c $(LIBSRC) +$(OUTPUT)bootconfig: main.c include/linux/bootconfig.h $(LIBSRC) $(CC) $(filter %.c,$^) $(CFLAGS) -o $@ test: $(ALL_PROGRAMS) test-bootconfig.sh diff --git a/tools/bootconfig/include/linux/bootconfig.h b/tools/bootconfig/include/linux/bootconfig.h index de7f30f99af3..6784296a0692 100644 --- a/tools/bootconfig/include/linux/bootconfig.h +++ b/tools/bootconfig/include/linux/bootconfig.h @@ -2,10 +2,53 @@ #ifndef _BOOTCONFIG_LINUX_BOOTCONFIG_H #define _BOOTCONFIG_LINUX_BOOTCONFIG_H -#include "../../../../include/linux/bootconfig.h" +#include +#include +#include +#include +#include +#include +#include + #ifndef fallthrough # define fallthrough #endif +#define WARN_ON(cond) \ + ((cond) ? printf("Internal warning(%s:%d, %s): %s\n", \ + __FILE__, __LINE__, __func__, #cond) : 0) + +#define unlikely(cond) (cond) + +/* Copied from lib/string.c */ +static inline char *skip_spaces(const char *str) +{ + while (isspace(*str)) + ++str; + return (char *)str; +} + +static inline char *strim(char *s) +{ + size_t size; + char *end; + + size = strlen(s); + if (!size) + return s; + + end = s + size - 1; + while (end >= s && isspace(*end)) + end--; + *(end + 1) = '\0'; + + return skip_spaces(s); +} + +#define __init +#define __initdata + +#include "../../../../include/linux/bootconfig.h" + #endif diff --git a/tools/bootconfig/include/linux/bug.h b/tools/bootconfig/include/linux/bug.h deleted file mode 100644 index 7b65a389c0dd..000000000000 --- a/tools/bootconfig/include/linux/bug.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _SKC_LINUX_BUG_H -#define _SKC_LINUX_BUG_H - -#include -#include - -#define WARN_ON(cond) \ - ((cond) ? printf("Internal warning(%s:%d, %s): %s\n", \ - __FILE__, __LINE__, __func__, #cond) : 0) - -#endif diff --git a/tools/bootconfig/include/linux/ctype.h b/tools/bootconfig/include/linux/ctype.h deleted file mode 100644 index c56ecc136448..000000000000 --- a/tools/bootconfig/include/linux/ctype.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _SKC_LINUX_CTYPE_H -#define _SKC_LINUX_CTYPE_H - -#include - -#endif diff --git a/tools/bootconfig/include/linux/errno.h b/tools/bootconfig/include/linux/errno.h deleted file mode 100644 index 5d9f91ec2fda..000000000000 --- a/tools/bootconfig/include/linux/errno.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _SKC_LINUX_ERRNO_H -#define _SKC_LINUX_ERRNO_H - -#include - -#endif diff --git a/tools/bootconfig/include/linux/kernel.h b/tools/bootconfig/include/linux/kernel.h deleted file mode 100644 index 39f306c18dd0..000000000000 --- a/tools/bootconfig/include/linux/kernel.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _SKC_LINUX_KERNEL_H -#define _SKC_LINUX_KERNEL_H - -#include -#include -#include - -#define unlikely(cond) (cond) - -#define __init -#define __initdata - -#endif diff --git a/tools/bootconfig/include/linux/memblock.h b/tools/bootconfig/include/linux/memblock.h deleted file mode 100644 index f2e506f7d57f..000000000000 --- a/tools/bootconfig/include/linux/memblock.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _XBC_LINUX_MEMBLOCK_H -#define _XBC_LINUX_MEMBLOCK_H - -#include - -#define SMP_CACHE_BYTES 0 -#define memblock_alloc(size, align) malloc(size) -#define memblock_free_ptr(paddr, size) free(paddr) - -#endif diff --git a/tools/bootconfig/include/linux/string.h b/tools/bootconfig/include/linux/string.h deleted file mode 100644 index 8267af75153a..000000000000 --- a/tools/bootconfig/include/linux/string.h +++ /dev/null @@ -1,32 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _SKC_LINUX_STRING_H -#define _SKC_LINUX_STRING_H - -#include - -/* Copied from lib/string.c */ -static inline char *skip_spaces(const char *str) -{ - while (isspace(*str)) - ++str; - return (char *)str; -} - -static inline char *strim(char *s) -{ - size_t size; - char *end; - - size = strlen(s); - if (!size) - return s; - - end = s + size - 1; - while (end >= s && isspace(*end)) - end--; - *(end + 1) = '\0'; - - return skip_spaces(s); -} - -#endif diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c index fb7c9fb953d7..156b62a163c5 100644 --- a/tools/bootconfig/main.c +++ b/tools/bootconfig/main.c @@ -12,7 +12,6 @@ #include #include -#include #include #define pr_err(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__) From 43c9dd8ddf4efdce126e0a0b176d729c72445b0f Mon Sep 17 00:00:00 2001 From: Carles Pey Date: Sat, 18 Sep 2021 19:30:43 +0400 Subject: [PATCH 043/101] ftrace: Add unit test for removing trace function A self test is provided for the trace function removal functionality. Link: https://lkml.kernel.org/r/20210918153043.318016-2-carles.pey@gmail.com Signed-off-by: Carles Pey Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_selftest.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index adf7ef194005..875b4f1a0476 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -287,6 +287,40 @@ static int trace_selftest_ops(struct trace_array *tr, int cnt) if (trace_selftest_test_probe3_cnt != 4) goto out_free; + /* Remove trace function from probe 3 */ + func1_name = "!" __stringify(DYN_FTRACE_TEST_NAME); + len1 = strlen(func1_name); + + ftrace_set_filter(&test_probe3, func1_name, len1, 0); + + DYN_FTRACE_TEST_NAME(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 3) + goto out_free; + if (trace_selftest_test_probe2_cnt != 2) + goto out_free; + if (trace_selftest_test_probe3_cnt != 4) + goto out_free; + if (cnt > 1) { + if (trace_selftest_test_global_cnt == 0) + goto out_free; + } + if (trace_selftest_test_dyn_cnt == 0) + goto out_free; + + DYN_FTRACE_TEST_NAME2(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 3) + goto out_free; + if (trace_selftest_test_probe2_cnt != 3) + goto out_free; + if (trace_selftest_test_probe3_cnt != 5) + goto out_free; + ret = 0; out_free: unregister_ftrace_function(dyn_ops); From affc659246293df42ba2d184c674cc959c05aa02 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Thu, 30 Sep 2021 08:03:42 +0800 Subject: [PATCH 044/101] tracing: in_irq() cleanup Replace the obsolete and ambiguos macro in_irq() with new macro in_hardirq(). Link: https://lkml.kernel.org/r/20210930000342.6016-1-changbin.du@gmail.com Reviewed-by: Petr Mladek Signed-off-by: Changbin Du Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 2 +- kernel/trace/trace_functions_graph.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6c3808132b16..6b60ab9475ed 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -890,7 +890,7 @@ static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace) * is set, and called by an interrupt handler, we still * want to trace it. */ - if (in_irq()) + if (in_hardirq()) trace_recursion_set(TRACE_IRQ_BIT); else trace_recursion_clear(TRACE_IRQ_BIT); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 6b5ff3ba4251..203204cadf92 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -120,7 +120,7 @@ static inline int ftrace_graph_ignore_irqs(void) if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT)) return 0; - return in_irq(); + return in_hardirq(); } int trace_graph_entry(struct ftrace_graph_ent *trace) From 34cdd18b8d245f3e901e5325313c27de727ab80d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 17 Jun 2020 16:56:16 -0400 Subject: [PATCH 045/101] tracing: Use linker magic instead of recasting ftrace_ops_list_func() In an effort to enable -Wcast-function-type in the top-level Makefile to support Control Flow Integrity builds, all function casts need to be removed. This means that ftrace_ops_list_func() can no longer be defined as ftrace_ops_no_ops(). The reason for ftrace_ops_no_ops() is to use that when an architecture calls ftrace_ops_list_func() with only two parameters (called from assembly). And to make sure there's no C side-effects, those archs call ftrace_ops_no_ops() which only has two parameters, as ftrace_ops_list_func() has four parameters. Instead of a typecast, use vmlinux.lds.h to define ftrace_ops_list_func() to arch_ftrace_ops_list_func() that will define the proper set of parameters. Link: https://lore.kernel.org/r/20200614070154.6039-1-oscar.carter@gmx.com Link: https://lkml.kernel.org/r/20200617165616.52241bde@oasis.local.home Link: https://lore.kernel.org/all/20211005053922.GA702049@embeddedor/ Requested-by: Oscar Carter Reported-by: kernel test robot Signed-off-by: Steven Rostedt (VMware) --- include/asm-generic/vmlinux.lds.h | 10 ++++++++-- include/linux/ftrace.h | 12 ++++++++++-- kernel/trace/ftrace.c | 23 ++++++++++------------- 3 files changed, 28 insertions(+), 17 deletions(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index f2984af2b85b..8771c435f34b 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -164,16 +164,22 @@ * Need to also make ftrace_stub_graph point to ftrace_stub * so that the same stub location may have different protocols * and not mess up with C verifiers. + * + * ftrace_ops_list_func will be defined as arch_ftrace_ops_list_func + * as some archs will have a different prototype for that function + * but ftrace_ops_list_func() will have a single prototype. */ #define MCOUNT_REC() . = ALIGN(8); \ __start_mcount_loc = .; \ KEEP(*(__mcount_loc)) \ KEEP(*(__patchable_function_entries)) \ __stop_mcount_loc = .; \ - ftrace_stub_graph = ftrace_stub; + ftrace_stub_graph = ftrace_stub; \ + ftrace_ops_list_func = arch_ftrace_ops_list_func; #else # ifdef CONFIG_FUNCTION_TRACER -# define MCOUNT_REC() ftrace_stub_graph = ftrace_stub; +# define MCOUNT_REC() ftrace_stub_graph = ftrace_stub; \ + ftrace_ops_list_func = arch_ftrace_ops_list_func; # else # define MCOUNT_REC() # endif diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 832e65f06754..12fcfa2d23ea 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -30,16 +30,26 @@ #define ARCH_SUPPORTS_FTRACE_OPS 0 #endif +#ifdef CONFIG_FUNCTION_TRACER +struct ftrace_ops; +struct ftrace_regs; /* * If the arch's mcount caller does not support all of ftrace's * features, then it must call an indirect function that * does. Or at least does enough to prevent any unwelcome side effects. + * + * Also define the function prototype that these architectures use + * to call the ftrace_ops_list_func(). */ #if !ARCH_SUPPORTS_FTRACE_OPS # define FTRACE_FORCE_LIST_FUNC 1 +void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); #else # define FTRACE_FORCE_LIST_FUNC 0 +void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs); #endif +#endif /* CONFIG_FUNCTION_TRACER */ /* Main tracing buffer and events set up */ #ifdef CONFIG_TRACING @@ -88,8 +98,6 @@ extern int ftrace_enable_sysctl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -struct ftrace_ops; - #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS struct ftrace_regs { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2c3e9760df7f..8b5801881271 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -119,14 +119,9 @@ struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; struct ftrace_ops global_ops; -#if ARCH_SUPPORTS_FTRACE_OPS -static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct ftrace_regs *fregs); -#else -/* See comment below, where ftrace_ops_list_func is defined */ -static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); -#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) -#endif +/* Defined by vmlinux.lds.h see the commment above arch_ftrace_ops_list_func for details */ +void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs); static inline void ftrace_ops_init(struct ftrace_ops *ops) { @@ -7032,21 +7027,23 @@ out: * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved. * An architecture can pass partial regs with ftrace_ops and still * set the ARCH_SUPPORTS_FTRACE_OPS. + * + * In vmlinux.lds.h, ftrace_ops_list_func() is defined to be + * arch_ftrace_ops_list_func. */ #if ARCH_SUPPORTS_FTRACE_OPS -static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct ftrace_regs *fregs) +void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) { __ftrace_ops_list_func(ip, parent_ip, NULL, fregs); } -NOKPROBE_SYMBOL(ftrace_ops_list_func); #else -static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) +void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) { __ftrace_ops_list_func(ip, parent_ip, NULL, NULL); } -NOKPROBE_SYMBOL(ftrace_ops_no_ops); #endif +NOKPROBE_SYMBOL(arch_ftrace_ops_list_func); /* * If there's only one function registered but it does not support From 7ce1bb83a14019f8c396d57ec704d19478747716 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Wed, 13 Oct 2021 21:52:17 -0700 Subject: [PATCH 046/101] tracing/cfi: Fix cmp_entries_* functions signature mismatch If CONFIG_CFI_CLANG=y, attempting to read an event histogram will cause the kernel to panic due to failed CFI check. 1. echo 'hist:keys=common_pid' >> events/sched/sched_switch/trigger 2. cat events/sched/sched_switch/hist 3. kernel panics on attempting to read hist This happens because the sort() function expects a generic int (*)(const void *, const void *) pointer for the compare function. To prevent this CFI failure, change tracing map cmp_entries_* function signatures to match this. Also, fix the build error reported by the kernel test robot [1]. [1] https://lore.kernel.org/r/202110141140.zzi4dRh4-lkp@intel.com/ Link: https://lkml.kernel.org/r/20211014045217.3265162-1-kaleshsingh@google.com Signed-off-by: Kalesh Singh Reported-by: kernel test robot Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/tracing_map.c | 40 ++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index d6bddb157ef2..39bb56d2dcbe 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -834,29 +834,35 @@ int tracing_map_init(struct tracing_map *map) return err; } -static int cmp_entries_dup(const struct tracing_map_sort_entry **a, - const struct tracing_map_sort_entry **b) +static int cmp_entries_dup(const void *A, const void *B) { + const struct tracing_map_sort_entry *a, *b; int ret = 0; - if (memcmp((*a)->key, (*b)->key, (*a)->elt->map->key_size)) + a = *(const struct tracing_map_sort_entry **)A; + b = *(const struct tracing_map_sort_entry **)B; + + if (memcmp(a->key, b->key, a->elt->map->key_size)) ret = 1; return ret; } -static int cmp_entries_sum(const struct tracing_map_sort_entry **a, - const struct tracing_map_sort_entry **b) +static int cmp_entries_sum(const void *A, const void *B) { const struct tracing_map_elt *elt_a, *elt_b; + const struct tracing_map_sort_entry *a, *b; struct tracing_map_sort_key *sort_key; struct tracing_map_field *field; tracing_map_cmp_fn_t cmp_fn; void *val_a, *val_b; int ret = 0; - elt_a = (*a)->elt; - elt_b = (*b)->elt; + a = *(const struct tracing_map_sort_entry **)A; + b = *(const struct tracing_map_sort_entry **)B; + + elt_a = a->elt; + elt_b = b->elt; sort_key = &elt_a->map->sort_key; @@ -873,18 +879,21 @@ static int cmp_entries_sum(const struct tracing_map_sort_entry **a, return ret; } -static int cmp_entries_key(const struct tracing_map_sort_entry **a, - const struct tracing_map_sort_entry **b) +static int cmp_entries_key(const void *A, const void *B) { const struct tracing_map_elt *elt_a, *elt_b; + const struct tracing_map_sort_entry *a, *b; struct tracing_map_sort_key *sort_key; struct tracing_map_field *field; tracing_map_cmp_fn_t cmp_fn; void *val_a, *val_b; int ret = 0; - elt_a = (*a)->elt; - elt_b = (*b)->elt; + a = *(const struct tracing_map_sort_entry **)A; + b = *(const struct tracing_map_sort_entry **)B; + + elt_a = a->elt; + elt_b = b->elt; sort_key = &elt_a->map->sort_key; @@ -989,10 +998,8 @@ static void sort_secondary(struct tracing_map *map, struct tracing_map_sort_key *primary_key, struct tracing_map_sort_key *secondary_key) { - int (*primary_fn)(const struct tracing_map_sort_entry **, - const struct tracing_map_sort_entry **); - int (*secondary_fn)(const struct tracing_map_sort_entry **, - const struct tracing_map_sort_entry **); + int (*primary_fn)(const void *, const void *); + int (*secondary_fn)(const void *, const void *); unsigned i, start = 0, n_sub = 1; if (is_key(map, primary_key->field_idx)) @@ -1061,8 +1068,7 @@ int tracing_map_sort_entries(struct tracing_map *map, unsigned int n_sort_keys, struct tracing_map_sort_entry ***sort_entries) { - int (*cmp_entries_fn)(const struct tracing_map_sort_entry **, - const struct tracing_map_sort_entry **); + int (*cmp_entries_fn)(const void *, const void *); struct tracing_map_sort_entry *sort_entry, **entries; int i, n_entries, ret; From 9b84fadc444de5456ab5f5487e2108311c724c3f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 15 Oct 2021 13:42:40 -0400 Subject: [PATCH 047/101] tracing: Reuse logic from perf's get_recursion_context() Instead of having branches that adds noise to the branch prediction, use the addition logic to set the bit for the level of interrupt context that the state is currently in. This copies the logic from perf's get_recursion_context() function. Link: https://lore.kernel.org/all/20211015161702.GF174703@worktop.programming.kicks-ass.net/ Suggested-by: Peter Zijlstra Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_recursion.h | 11 ++++++----- kernel/trace/ring_buffer.c | 12 ++++++------ 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index a9f9c5714e65..f6da7a03bff0 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -137,12 +137,13 @@ enum { static __always_inline int trace_get_context_bit(void) { unsigned long pc = preempt_count(); + unsigned char bit = 0; - if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) - return TRACE_CTX_NORMAL; - else - return pc & NMI_MASK ? TRACE_CTX_NMI : - pc & HARDIRQ_MASK ? TRACE_CTX_IRQ : TRACE_CTX_SOFTIRQ; + bit += !!(pc & (NMI_MASK)); + bit += !!(pc & (NMI_MASK | HARDIRQ_MASK)); + bit += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); + + return TRACE_CTX_NORMAL - bit; } #ifdef CONFIG_FTRACE_RECORD_RECURSION diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c5a3fbf19617..15d4380006e3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3168,13 +3168,13 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) { unsigned int val = cpu_buffer->current_context; unsigned long pc = preempt_count(); - int bit; + int bit = 0; - if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) - bit = RB_CTX_NORMAL; - else - bit = pc & NMI_MASK ? RB_CTX_NMI : - pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ; + bit += !!(pc & (NMI_MASK)); + bit += !!(pc & (NMI_MASK | HARDIRQ_MASK)); + bit += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); + + bit = RB_CTX_NORMAL - bit; if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) { /* From 91ebe8bcbff9d2ff21303e73bf7434f39a98b255 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 15 Oct 2021 15:01:19 -0400 Subject: [PATCH 048/101] tracing/perf: Add interrupt_context_level() helper Now that there are three different instances of doing the addition trick to the preempt_count() and NMI_MASK, HARDIRQ_MASK and SOFTIRQ_OFFSET macros, it deserves a helper function defined in the preempt.h header. Add the interrupt_context_level() helper and replace the three instances that do that logic with it. Link: https://lore.kernel.org/all/20211015142541.4badd8a9@gandalf.local.home/ Signed-off-by: Steven Rostedt (VMware) --- include/linux/preempt.h | 21 +++++++++++++++++++++ include/linux/trace_recursion.h | 7 +------ kernel/events/internal.h | 7 +------ kernel/trace/ring_buffer.c | 7 +------ 4 files changed, 24 insertions(+), 18 deletions(-) diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 4d244e295e85..b32e3dabe28b 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -77,6 +77,27 @@ /* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */ #include +/** + * interrupt_context_level - return interrupt context level + * + * Returns the current interrupt context level. + * 0 - normal context + * 1 - softirq context + * 2 - hardirq context + * 3 - NMI context + */ +static __always_inline unsigned char interrupt_context_level(void) +{ + unsigned long pc = preempt_count(); + unsigned char level = 0; + + level += !!(pc & (NMI_MASK)); + level += !!(pc & (NMI_MASK | HARDIRQ_MASK)); + level += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); + + return level; +} + #define nmi_count() (preempt_count() & NMI_MASK) #define hardirq_count() (preempt_count() & HARDIRQ_MASK) #ifdef CONFIG_PREEMPT_RT diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index f6da7a03bff0..1d8cce02c3fb 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -136,12 +136,7 @@ enum { static __always_inline int trace_get_context_bit(void) { - unsigned long pc = preempt_count(); - unsigned char bit = 0; - - bit += !!(pc & (NMI_MASK)); - bit += !!(pc & (NMI_MASK | HARDIRQ_MASK)); - bit += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); + unsigned char bit = interrupt_context_level(); return TRACE_CTX_NORMAL - bit; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 228801e20788..082832738c8f 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -205,12 +205,7 @@ DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) static inline int get_recursion_context(int *recursion) { - unsigned int pc = preempt_count(); - unsigned char rctx = 0; - - rctx += !!(pc & (NMI_MASK)); - rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK)); - rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); + unsigned char rctx = interrupt_context_level(); if (recursion[rctx]) return -1; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 15d4380006e3..f6520d0a4c8c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3167,12 +3167,7 @@ static __always_inline int trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) { unsigned int val = cpu_buffer->current_context; - unsigned long pc = preempt_count(); - int bit = 0; - - bit += !!(pc & (NMI_MASK)); - bit += !!(pc & (NMI_MASK | HARDIRQ_MASK)); - bit += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); + int bit = interrupt_context_level(); bit = RB_CTX_NORMAL - bit; From 1e85010e17c1d72627eaf14d75d22e4d693abf70 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Oct 2021 11:13:29 +0200 Subject: [PATCH 049/101] x86/ftrace: Remove extra orig rax move There's identical move 2 lines earlier. Link: https://lkml.kernel.org/r/20211008091336.33616-2-jolsa@kernel.org Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- arch/x86/kernel/ftrace_64.S | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 7c273846c687..a8eb084a7a9a 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -251,7 +251,6 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) * If ORIG_RAX is anything but zero, make this a call to that. * See arch_ftrace_set_direct_caller(). */ - movq ORIG_RAX(%rsp), %rax testq %rax, %rax SYM_INNER_LABEL(ftrace_regs_caller_jmp, SYM_L_GLOBAL) jnz 1f From 8646698aefad7547dc7acee8f8c2099d7653dc70 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Oct 2021 11:13:30 +0200 Subject: [PATCH 050/101] x86/ftrace: Remove fault protection code in prepare_ftrace_return Removing the fault protection code when writing return_hooker to stack. As Steven noted: > That protection was there from the beginning due to being "paranoid", > considering ftrace was bricking network cards. But that protection > would not have even protected against that. Link: https://lkml.kernel.org/r/20211008091336.33616-3-jolsa@kernel.org Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- arch/x86/kernel/ftrace.c | 38 +++----------------------------------- 1 file changed, 3 insertions(+), 35 deletions(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 23d221a9a3cd..7f12eacdf1ae 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -620,12 +620,10 @@ int ftrace_disable_ftrace_graph_caller(void) * Hook the return address and push it in the stack of return addrs * in current thread info. */ -void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, +void prepare_ftrace_return(unsigned long ip, unsigned long *parent, unsigned long frame_pointer) { unsigned long return_hooker = (unsigned long)&return_to_handler; - unsigned long old; - int faulted; /* * When resuming from suspend-to-ram, this function can be indirectly @@ -645,37 +643,7 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, if (unlikely(atomic_read(¤t->tracing_graph_pause))) return; - /* - * Protect against fault, even if it shouldn't - * happen. This tool is too much intrusive to - * ignore such a protection. - */ - asm volatile( - "1: " _ASM_MOV " (%[parent]), %[old]\n" - "2: " _ASM_MOV " %[return_hooker], (%[parent])\n" - " movl $0, %[faulted]\n" - "3:\n" - - ".section .fixup, \"ax\"\n" - "4: movl $1, %[faulted]\n" - " jmp 3b\n" - ".previous\n" - - _ASM_EXTABLE(1b, 4b) - _ASM_EXTABLE(2b, 4b) - - : [old] "=&r" (old), [faulted] "=r" (faulted) - : [parent] "r" (parent), [return_hooker] "r" (return_hooker) - : "memory" - ); - - if (unlikely(faulted)) { - ftrace_graph_stop(); - WARN_ON(1); - return; - } - - if (function_graph_enter(old, self_addr, frame_pointer, parent)) - *parent = old; + if (!function_graph_enter(*parent, ip, frame_pointer, parent)) + *parent = return_hooker; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ From 4a30e4c9305142ba40ab09a02a2e8ff3c1f3751f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 20 Oct 2021 23:35:55 -0400 Subject: [PATCH 051/101] ftrace/x86_64: Have function graph tracer depend on DYNAMIC_FTRACE The function graph tracer is going to now depend on ARCH_SUPPORTS_FTRACE_OPS, as that also means that it can support ftrace args. Since ARCH_SUPPORTS_FTRACE_OPS depends on DYNAMIC_FTRACE, this means that the function graph tracer for x86_64 will need to depend on DYNAMIC_FTRACE. Link: https://lkml.kernel.org/r/20211020233555.16b0dbf2@rorschach.local.home Signed-off-by: Steven Rostedt (VMware) --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ab83c22d274e..e39e2bd6acc5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -197,7 +197,7 @@ config X86 select HAVE_FAST_GUP select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD - select HAVE_FUNCTION_GRAPH_TRACER + select HAVE_FUNCTION_GRAPH_TRACER if X86_32 || (X86_64 && DYNAMIC_FTRACE) select HAVE_FUNCTION_TRACER select HAVE_GCC_PLUGINS select HAVE_HW_BREAKPOINT From 0c0593b45c9b4e5b212ffb3fb28bb8d3c0ec0dc8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Oct 2021 11:13:31 +0200 Subject: [PATCH 052/101] x86/ftrace: Make function graph use ftrace directly We don't need special hook for graph tracer entry point, but instead we can use graph_ops::func function to install the return_hooker. This moves the graph tracing setup _before_ the direct trampoline prepares the stack, so the return_hooker will be called when the direct trampoline is finished. This simplifies the code, because we don't need to take into account the direct trampoline setup when preparing the graph tracer hooker and we can allow function graph tracer on entries registered with direct trampoline. Link: https://lkml.kernel.org/r/20211008091336.33616-4-jolsa@kernel.org [fixed compile error reported by kernel test robot ] Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- arch/x86/include/asm/ftrace.h | 9 +++++++-- arch/x86/kernel/ftrace.c | 37 ++++++++++++++++++++++++++++++++--- arch/x86/kernel/ftrace_64.S | 29 +-------------------------- include/linux/ftrace.h | 9 +++++++++ kernel/trace/fgraph.c | 6 ++++-- 5 files changed, 55 insertions(+), 35 deletions(-) diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 9f3130f40807..024d9797646e 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -57,6 +57,13 @@ arch_ftrace_get_regs(struct ftrace_regs *fregs) #define ftrace_instruction_pointer_set(fregs, _ip) \ do { (fregs)->regs.ip = (_ip); } while (0) + +struct ftrace_ops; +#define ftrace_graph_func ftrace_graph_func +void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs); +#else +#define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR #endif #ifdef CONFIG_DYNAMIC_FTRACE @@ -65,8 +72,6 @@ struct dyn_arch_ftrace { /* No extra data needed for x86 */ }; -#define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR - #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* __ASSEMBLY__ */ #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 7f12eacdf1ae..c39f906cdc4e 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -522,7 +522,7 @@ static void *addr_from_call(void *ptr) return ptr + CALL_INSN_SIZE + call.disp; } -void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, +void prepare_ftrace_return(unsigned long ip, unsigned long *parent, unsigned long frame_pointer); /* @@ -536,7 +536,8 @@ static void *static_tramp_func(struct ftrace_ops *ops, struct dyn_ftrace *rec) void *ptr; if (ops && ops->trampoline) { -#ifdef CONFIG_FUNCTION_GRAPH_TRACER +#if !defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) && \ + defined(CONFIG_FUNCTION_GRAPH_TRACER) /* * We only know about function graph tracer setting as static * trampoline. @@ -584,8 +585,9 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops) #ifdef CONFIG_FUNCTION_GRAPH_TRACER #ifdef CONFIG_DYNAMIC_FTRACE -extern void ftrace_graph_call(void); +#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS +extern void ftrace_graph_call(void); static const char *ftrace_jmp_replace(unsigned long ip, unsigned long addr) { return text_gen_insn(JMP32_INSN_OPCODE, (void *)ip, (void *)addr); @@ -613,7 +615,17 @@ int ftrace_disable_ftrace_graph_caller(void) return ftrace_mod_jmp(ip, &ftrace_stub); } +#else /* !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ +int ftrace_enable_ftrace_graph_caller(void) +{ + return 0; +} +int ftrace_disable_ftrace_graph_caller(void) +{ + return 0; +} +#endif /* CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ #endif /* !CONFIG_DYNAMIC_FTRACE */ /* @@ -624,6 +636,7 @@ void prepare_ftrace_return(unsigned long ip, unsigned long *parent, unsigned long frame_pointer) { unsigned long return_hooker = (unsigned long)&return_to_handler; + int bit; /* * When resuming from suspend-to-ram, this function can be indirectly @@ -643,7 +656,25 @@ void prepare_ftrace_return(unsigned long ip, unsigned long *parent, if (unlikely(atomic_read(¤t->tracing_graph_pause))) return; + bit = ftrace_test_recursion_trylock(ip, *parent); + if (bit < 0) + return; + if (!function_graph_enter(*parent, ip, frame_pointer, parent)) *parent = return_hooker; + + ftrace_test_recursion_unlock(bit); } + +#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS +void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) +{ + struct pt_regs *regs = &fregs->regs; + unsigned long *stack = (unsigned long *)kernel_stack_pointer(regs); + + prepare_ftrace_return(ip, (unsigned long *)stack, 0); +} +#endif + #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index a8eb084a7a9a..7a879901f103 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -174,11 +174,6 @@ SYM_INNER_LABEL(ftrace_caller_end, SYM_L_GLOBAL) SYM_FUNC_END(ftrace_caller); SYM_FUNC_START(ftrace_epilogue) -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL) - jmp ftrace_stub -#endif - /* * This is weak to keep gas from relaxing the jumps. * It is also used to copy the retq for trampolines. @@ -288,15 +283,6 @@ SYM_FUNC_START(__fentry__) cmpq $ftrace_stub, ftrace_trace_function jnz trace -fgraph_trace: -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - cmpq $ftrace_stub, ftrace_graph_return - jnz ftrace_graph_caller - - cmpq $ftrace_graph_entry_stub, ftrace_graph_entry - jnz ftrace_graph_caller -#endif - SYM_INNER_LABEL(ftrace_stub, SYM_L_GLOBAL) retq @@ -314,25 +300,12 @@ trace: CALL_NOSPEC r8 restore_mcount_regs - jmp fgraph_trace + jmp ftrace_stub SYM_FUNC_END(__fentry__) EXPORT_SYMBOL(__fentry__) #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER -SYM_FUNC_START(ftrace_graph_caller) - /* Saves rbp into %rdx and fills first parameter */ - save_mcount_regs - - leaq MCOUNT_REG_SIZE+8(%rsp), %rsi - movq $0, %rdx /* No framepointers needed */ - call prepare_ftrace_return - - restore_mcount_regs - - retq -SYM_FUNC_END(ftrace_graph_caller) - SYM_FUNC_START(return_to_handler) subq $24, %rsp diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 12fcfa2d23ea..16a7baaba702 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -803,6 +803,15 @@ static inline bool is_ftrace_trampoline(unsigned long addr) } #endif /* CONFIG_DYNAMIC_FTRACE */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +#ifndef ftrace_graph_func +#define ftrace_graph_func ftrace_stub +#define FTRACE_OPS_GRAPH_STUB FTRACE_OPS_FL_STUB +#else +#define FTRACE_OPS_GRAPH_STUB 0 +#endif +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + /* totally disable ftrace - can not re-enable after this */ void ftrace_kill(void); diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index b8a0d1d564fb..22061d38fc00 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -115,6 +115,7 @@ int function_graph_enter(unsigned long ret, unsigned long func, { struct ftrace_graph_ent trace; +#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS /* * Skip graph tracing if the return location is served by direct trampoline, * since call sequence and return addresses are unpredictable anyway. @@ -124,6 +125,7 @@ int function_graph_enter(unsigned long ret, unsigned long func, if (ftrace_direct_func_count && ftrace_find_rec_direct(ret - MCOUNT_INSN_SIZE)) return -EBUSY; +#endif trace.func = func; trace.depth = ++current->curr_ret_depth; @@ -333,10 +335,10 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, #endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ static struct ftrace_ops graph_ops = { - .func = ftrace_stub, + .func = ftrace_graph_func, .flags = FTRACE_OPS_FL_INITIALIZED | FTRACE_OPS_FL_PID | - FTRACE_OPS_FL_STUB, + FTRACE_OPS_GRAPH_STUB, #ifdef FTRACE_GRAPH_TRAMP_ADDR .trampoline = FTRACE_GRAPH_TRAMP_ADDR, /* trampoline_size is only needed for dynamically allocated tramps */ From 130c08065848a98163b243b55e99f66c24609efb Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Oct 2021 11:13:32 +0200 Subject: [PATCH 053/101] tracing: Add trampoline/graph selftest Adding selftest for checking that direct trampoline can co-exist together with graph tracer on same function. This is supported for CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS config option, which is defined only for x86_64 for now. Link: https://lkml.kernel.org/r/20211008091336.33616-5-jolsa@kernel.org Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_selftest.c | 54 ++++++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 875b4f1a0476..3404a245417e 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -784,6 +784,8 @@ static struct fgraph_ops fgraph_ops __initdata = { .retfunc = &trace_graph_return, }; +noinline __noclone static void trace_direct_tramp(void) { } + /* * Pretty much the same than for the function tracer from which the selftest * has been borrowed. @@ -794,6 +796,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, { int ret; unsigned long count; + char *func_name __maybe_unused; #ifdef CONFIG_DYNAMIC_FTRACE if (ftrace_filter_param) { @@ -842,8 +845,57 @@ trace_selftest_startup_function_graph(struct tracer *trace, goto out; } - /* Don't test dynamic tracing, the function tracer already did */ +#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS + tracing_reset_online_cpus(&tr->array_buffer); + set_graph_array(tr); + /* + * Some archs *cough*PowerPC*cough* add characters to the + * start of the function names. We simply put a '*' to + * accommodate them. + */ + func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); + ftrace_set_global_filter(func_name, strlen(func_name), 1); + + /* + * Register direct function together with graph tracer + * and make sure we get graph trace. + */ + ret = register_ftrace_direct((unsigned long) DYN_FTRACE_TEST_NAME, + (unsigned long) trace_direct_tramp); + if (ret) + goto out; + + ret = register_ftrace_graph(&fgraph_ops); + if (ret) { + warn_failed_init_tracer(trace, ret); + goto out; + } + + DYN_FTRACE_TEST_NAME(); + + count = 0; + + tracing_stop(); + /* check the trace buffer */ + ret = trace_test_buffer(&tr->array_buffer, &count); + + unregister_ftrace_graph(&fgraph_ops); + + ret = unregister_ftrace_direct((unsigned long) DYN_FTRACE_TEST_NAME, + (unsigned long) trace_direct_tramp); + if (ret) + goto out; + + tracing_start(); + + if (!ret && !count) { + ret = -1; + goto out; + } +#endif + + /* Don't test dynamic tracing, the function tracer already did */ out: /* Stop it if we failed */ if (ret) From 4e341cad6b7a58376bfc6d1c8347727d094a6274 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 21 Oct 2021 13:43:57 -0400 Subject: [PATCH 054/101] tracing: Fix selftest config check for function graph start up test There's a new test in trace_selftest_startup_function_graph() that requires the use of ftrace args being supported as well does some tricks with dynamic tracing. Although this code checks HAVE_DYNAMIC_FTRACE_WITH_ARGS it fails to check DYNAMIC_FTRACE, and the kernel fails to build due to that dependency. Also only define the prototype of trace_direct_tramp() if it is used. Link: https://lkml.kernel.org/r/20211021134357.7f48e173@gandalf.local.home Acked-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_selftest.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 3404a245417e..afd937a46496 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -784,7 +784,11 @@ static struct fgraph_ops fgraph_ops __initdata = { .retfunc = &trace_graph_return, }; +#if defined(CONFIG_DYNAMIC_FTRACE) && \ + defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) +#define TEST_DIRECT_TRAMP noinline __noclone static void trace_direct_tramp(void) { } +#endif /* * Pretty much the same than for the function tracer from which the selftest @@ -845,7 +849,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, goto out; } -#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS +#ifdef TEST_DIRECT_TRAMP tracing_reset_online_cpus(&tr->array_buffer); set_graph_array(tr); From 1904a8144598031af85406873c5fbec806ee3fd7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Oct 2021 11:13:33 +0200 Subject: [PATCH 055/101] ftrace: Add ftrace_add_rec_direct function Factor out the code that adds (ip, addr) tuple to direct_functions hash in new ftrace_add_rec_direct function. It will be used in following patches. Link: https://lkml.kernel.org/r/20211008091336.33616-6-jolsa@kernel.org Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 64 +++++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 27 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8b5801881271..ccbd8377e580 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2390,6 +2390,39 @@ unsigned long ftrace_find_rec_direct(unsigned long ip) return entry->direct; } +static struct ftrace_func_entry* +ftrace_add_rec_direct(unsigned long ip, unsigned long addr, + struct ftrace_hash **free_hash) +{ + struct ftrace_func_entry *entry; + + if (ftrace_hash_empty(direct_functions) || + direct_functions->count > 2 * (1 << direct_functions->size_bits)) { + struct ftrace_hash *new_hash; + int size = ftrace_hash_empty(direct_functions) ? 0 : + direct_functions->count + 1; + + if (size < 32) + size = 32; + + new_hash = dup_hash(direct_functions, size); + if (!new_hash) + return NULL; + + *free_hash = direct_functions; + direct_functions = new_hash; + } + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return NULL; + + entry->ip = ip; + entry->direct = addr; + __add_hash_entry(direct_functions, entry); + return entry; +} + static void call_direct_funcs(unsigned long ip, unsigned long pip, struct ftrace_ops *ops, struct ftrace_regs *fregs) { @@ -5106,39 +5139,16 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) } ret = -ENOMEM; - if (ftrace_hash_empty(direct_functions) || - direct_functions->count > 2 * (1 << direct_functions->size_bits)) { - struct ftrace_hash *new_hash; - int size = ftrace_hash_empty(direct_functions) ? 0 : - direct_functions->count + 1; - - if (size < 32) - size = 32; - - new_hash = dup_hash(direct_functions, size); - if (!new_hash) - goto out_unlock; - - free_hash = direct_functions; - direct_functions = new_hash; - } - - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - goto out_unlock; - direct = ftrace_find_direct_func(addr); if (!direct) { direct = ftrace_alloc_direct_func(addr); - if (!direct) { - kfree(entry); + if (!direct) goto out_unlock; - } } - entry->ip = ip; - entry->direct = addr; - __add_hash_entry(direct_functions, entry); + entry = ftrace_add_rec_direct(ip, addr, &free_hash); + if (!entry) + goto out_unlock; ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0); if (ret) From f64dd4627ec6edc39bf1430fe6dbc923d2300a88 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Oct 2021 11:13:34 +0200 Subject: [PATCH 056/101] ftrace: Add multi direct register/unregister interface Adding interface to register multiple direct functions within single call. Adding following functions: register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) The register_ftrace_direct_multi registers direct function (addr) with all functions in ops filter. The ops filter can be updated before with ftrace_set_filter_ip calls. All requested functions must not have direct function currently registered, otherwise register_ftrace_direct_multi will fail. The unregister_ftrace_direct_multi unregisters ops related direct functions. Link: https://lkml.kernel.org/r/20211008091336.33616-7-jolsa@kernel.org Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 11 ++++ kernel/trace/ftrace.c | 142 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 153 insertions(+) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 16a7baaba702..0158261cac9f 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -324,7 +324,10 @@ int ftrace_modify_direct_caller(struct ftrace_func_entry *entry, unsigned long old_addr, unsigned long new_addr); unsigned long ftrace_find_rec_direct(unsigned long ip); +int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr); +int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr); #else +struct ftrace_ops; # define ftrace_direct_func_count 0 static inline int register_ftrace_direct(unsigned long ip, unsigned long addr) { @@ -354,6 +357,14 @@ static inline unsigned long ftrace_find_rec_direct(unsigned long ip) { return 0; } +static inline int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) +{ + return -ENODEV; +} +static inline int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) +{ + return -ENODEV; +} #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ccbd8377e580..a05b25fb77d8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5401,6 +5401,148 @@ int modify_ftrace_direct(unsigned long ip, return ret; } EXPORT_SYMBOL_GPL(modify_ftrace_direct); + +#define MULTI_FLAGS (FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_DIRECT | \ + FTRACE_OPS_FL_SAVE_REGS) + +static int check_direct_multi(struct ftrace_ops *ops) +{ + if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) + return -EINVAL; + if ((ops->flags & MULTI_FLAGS) != MULTI_FLAGS) + return -EINVAL; + return 0; +} + +static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long addr) +{ + struct ftrace_func_entry *entry, *del; + int size, i; + + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + del = __ftrace_lookup_ip(direct_functions, entry->ip); + if (del && del->direct == addr) { + remove_hash_entry(direct_functions, del); + kfree(del); + } + } + } +} + +/** + * register_ftrace_direct_multi - Call a custom trampoline directly + * for multiple functions registered in @ops + * @ops: The address of the struct ftrace_ops object + * @addr: The address of the trampoline to call at @ops functions + * + * This is used to connect a direct calls to @addr from the nop locations + * of the functions registered in @ops (with by ftrace_set_filter_ip + * function). + * + * The location that it calls (@addr) must be able to handle a direct call, + * and save the parameters of the function being traced, and restore them + * (or inject new ones if needed), before returning. + * + * Returns: + * 0 on success + * -EINVAL - The @ops object was already registered with this call or + * when there are no functions in @ops object. + * -EBUSY - Another direct function is already attached (there can be only one) + * -ENODEV - @ip does not point to a ftrace nop location (or not supported) + * -ENOMEM - There was an allocation failure. + */ +int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) +{ + struct ftrace_hash *hash, *free_hash = NULL; + struct ftrace_func_entry *entry, *new; + int err = -EBUSY, size, i; + + if (ops->func || ops->trampoline) + return -EINVAL; + if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) + return -EINVAL; + if (ops->flags & FTRACE_OPS_FL_ENABLED) + return -EINVAL; + + hash = ops->func_hash->filter_hash; + if (ftrace_hash_empty(hash)) + return -EINVAL; + + mutex_lock(&direct_mutex); + + /* Make sure requested entries are not already registered.. */ + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + if (ftrace_find_rec_direct(entry->ip)) + goto out_unlock; + } + } + + /* ... and insert them to direct_functions hash. */ + err = -ENOMEM; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + new = ftrace_add_rec_direct(entry->ip, addr, &free_hash); + if (!new) + goto out_remove; + entry->direct = addr; + } + } + + ops->func = call_direct_funcs; + ops->flags = MULTI_FLAGS; + ops->trampoline = FTRACE_REGS_ADDR; + + err = register_ftrace_function(ops); + + out_remove: + if (err) + remove_direct_functions_hash(hash, addr); + + out_unlock: + mutex_unlock(&direct_mutex); + + if (free_hash) { + synchronize_rcu_tasks(); + free_ftrace_hash(free_hash); + } + return err; +} +EXPORT_SYMBOL_GPL(register_ftrace_direct_multi); + +/** + * unregister_ftrace_direct_multi - Remove calls to custom trampoline + * previously registered by register_ftrace_direct_multi for @ops object. + * @ops: The address of the struct ftrace_ops object + * + * This is used to remove a direct calls to @addr from the nop locations + * of the functions registered in @ops (with by ftrace_set_filter_ip + * function). + * + * Returns: + * 0 on success + * -EINVAL - The @ops object was not properly registered. + */ +int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) +{ + struct ftrace_hash *hash = ops->func_hash->filter_hash; + int err; + + if (check_direct_multi(ops)) + return -EINVAL; + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return -EINVAL; + + mutex_lock(&direct_mutex); + err = unregister_ftrace_function(ops); + remove_direct_functions_hash(hash, addr); + mutex_unlock(&direct_mutex); + return err; +} +EXPORT_SYMBOL_GPL(unregister_ftrace_direct_multi); #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ /** From ccf5a89efd6f0a9483cea8acd4a0822b1a47e59a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Oct 2021 11:13:35 +0200 Subject: [PATCH 057/101] ftrace: Add multi direct modify interface Adding interface to modify registered direct function for ftrace_ops. Adding following function: modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) The function changes the currently registered direct function for all attached functions. Link: https://lkml.kernel.org/r/20211008091336.33616-8-jolsa@kernel.org Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 6 ++++ kernel/trace/ftrace.c | 62 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 0158261cac9f..9999e29187de 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -326,6 +326,8 @@ int ftrace_modify_direct_caller(struct ftrace_func_entry *entry, unsigned long ftrace_find_rec_direct(unsigned long ip); int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr); int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr); +int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr); + #else struct ftrace_ops; # define ftrace_direct_func_count 0 @@ -365,6 +367,10 @@ static inline int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigne { return -ENODEV; } +static inline int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) +{ + return -ENODEV; +} #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a05b25fb77d8..30120342176e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5543,6 +5543,68 @@ int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) return err; } EXPORT_SYMBOL_GPL(unregister_ftrace_direct_multi); + +/** + * modify_ftrace_direct_multi - Modify an existing direct 'multi' call + * to call something else + * @ops: The address of the struct ftrace_ops object + * @addr: The address of the new trampoline to call at @ops functions + * + * This is used to unregister currently registered direct caller and + * register new one @addr on functions registered in @ops object. + * + * Note there's window between ftrace_shutdown and ftrace_startup calls + * where there will be no callbacks called. + * + * Returns: zero on success. Non zero on error, which includes: + * -EINVAL - The @ops object was not properly registered. + */ +int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) +{ + struct ftrace_hash *hash = ops->func_hash->filter_hash; + struct ftrace_func_entry *entry, *iter; + int i, size; + int err; + + if (check_direct_multi(ops)) + return -EINVAL; + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return -EINVAL; + + mutex_lock(&direct_mutex); + mutex_lock(&ftrace_lock); + + /* + * Shutdown the ops, change 'direct' pointer for each + * ops entry in direct_functions hash and startup the + * ops back again. + * + * Note there is no callback called for @ops object after + * this ftrace_shutdown call until ftrace_startup is called + * later on. + */ + err = ftrace_shutdown(ops, 0); + if (err) + goto out_unlock; + + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(iter, &hash->buckets[i], hlist) { + entry = __ftrace_lookup_ip(direct_functions, iter->ip); + if (!entry) + continue; + entry->direct = addr; + } + } + + err = ftrace_startup(ops, 0); + + out_unlock: + mutex_unlock(&ftrace_lock); + mutex_unlock(&direct_mutex); + return err; +} +EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi); #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ /** From 5fae941b9a6f95773df644e7cf304bf199707876 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Oct 2021 11:13:36 +0200 Subject: [PATCH 058/101] ftrace/samples: Add multi direct interface test module Adding simple module that uses multi direct interface: register_ftrace_direct_multi unregister_ftrace_direct_multi The init function registers trampoline for 2 functions, and exit function unregisters them. Link: https://lkml.kernel.org/r/20211008091336.33616-9-jolsa@kernel.org Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- samples/ftrace/Makefile | 1 + samples/ftrace/ftrace-direct-multi.c | 52 ++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 samples/ftrace/ftrace-direct-multi.c diff --git a/samples/ftrace/Makefile b/samples/ftrace/Makefile index 4ce896e10b2e..ab1d1c05c288 100644 --- a/samples/ftrace/Makefile +++ b/samples/ftrace/Makefile @@ -3,6 +3,7 @@ obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct.o obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-too.o obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-modify.o +obj-$(CONFIG_SAMPLE_FTRACE_DIRECT) += ftrace-direct-multi.o CFLAGS_sample-trace-array.o := -I$(src) obj-$(CONFIG_SAMPLE_TRACE_ARRAY) += sample-trace-array.o diff --git a/samples/ftrace/ftrace-direct-multi.c b/samples/ftrace/ftrace-direct-multi.c new file mode 100644 index 000000000000..2a5b1fb7ac14 --- /dev/null +++ b/samples/ftrace/ftrace-direct-multi.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include + +#include /* for handle_mm_fault() */ +#include +#include + +void my_direct_func(unsigned long ip) +{ + trace_printk("ip %lx\n", ip); +} + +extern void my_tramp(void *); + +asm ( +" .pushsection .text, \"ax\", @progbits\n" +" .type my_tramp, @function\n" +" .globl my_tramp\n" +" my_tramp:" +" pushq %rbp\n" +" movq %rsp, %rbp\n" +" pushq %rdi\n" +" movq 8(%rbp), %rdi\n" +" call my_direct_func\n" +" popq %rdi\n" +" leave\n" +" ret\n" +" .size my_tramp, .-my_tramp\n" +" .popsection\n" +); + +static struct ftrace_ops direct; + +static int __init ftrace_direct_multi_init(void) +{ + ftrace_set_filter_ip(&direct, (unsigned long) wake_up_process, 0, 0); + ftrace_set_filter_ip(&direct, (unsigned long) schedule, 0, 0); + + return register_ftrace_direct_multi(&direct, (unsigned long) my_tramp); +} + +static void __exit ftrace_direct_multi_exit(void) +{ + unregister_ftrace_direct_multi(&direct, (unsigned long) my_tramp); +} + +module_init(ftrace_direct_multi_init); +module_exit(ftrace_direct_multi_exit); + +MODULE_AUTHOR("Jiri Olsa"); +MODULE_DESCRIPTION("Example use case of using register_ftrace_direct_multi()"); +MODULE_LICENSE("GPL"); From ed29271894aa92826d308231593b7ee7ac5a4932 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 14 Oct 2021 16:11:14 -0400 Subject: [PATCH 059/101] ftrace/direct: Do not disable when switching direct callers Currently to switch a set of "multi" direct trampolines from one trampoline to another, a full shutdown of the current set needs to be done, followed by an update to what trampoline the direct callers would call, and then re-enabling the callers. This leaves a time when the functions will not be calling anything, and events may be missed. Instead, use a trick to allow all the functions with direct trampolines attached will always call either the new or old trampoline while the switch is happening. To do this, first attach a "dummy" callback via ftrace to all the functions that the current direct trampoline is attached to. This will cause the functions to call the "list func" instead of the direct trampoline. The list function will call the direct trampoline "helper" that will set the function it should call as it returns back to the ftrace trampoline. At this moment, the direct caller descriptor can safely update the direct call trampoline. The list function will pick either the new or old function (depending on the memory coherency model of the architecture). Now removing the dummy function from each of the locations of the direct trampoline caller, will put back the direct call, but now to the new trampoline. A better visual is: [ Changing direct call from my_direct_1 to my_direct_2 ] : call my_direct_1 |||||||||||||||||||| vvvvvvvvvvvvvvvvvvvv : call ftrace_caller : [..] call ftrace_ops_list_func ftrace_ops_list_func() { ops->func() -> direct_helper -> set rax to my_direct_1 or my_direct_2 } call rax (to either my_direct_1 or my_direct_2 |||||||||||||||||||| vvvvvvvvvvvvvvvvvvvv : call my_direct_2 Link: https://lore.kernel.org/all/20211014162819.5c85618b@gandalf.local.home/ Acked-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 30120342176e..f90ed00a6d5b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5561,8 +5561,12 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_direct_multi); */ int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) { - struct ftrace_hash *hash = ops->func_hash->filter_hash; + struct ftrace_hash *hash; struct ftrace_func_entry *entry, *iter; + static struct ftrace_ops tmp_ops = { + .func = ftrace_stub, + .flags = FTRACE_OPS_FL_STUB, + }; int i, size; int err; @@ -5572,21 +5576,22 @@ int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) return -EINVAL; mutex_lock(&direct_mutex); - mutex_lock(&ftrace_lock); + + /* Enable the tmp_ops to have the same functions as the direct ops */ + ftrace_ops_init(&tmp_ops); + tmp_ops.func_hash = ops->func_hash; + + err = register_ftrace_function(&tmp_ops); + if (err) + goto out_direct; /* - * Shutdown the ops, change 'direct' pointer for each - * ops entry in direct_functions hash and startup the - * ops back again. - * - * Note there is no callback called for @ops object after - * this ftrace_shutdown call until ftrace_startup is called - * later on. + * Now the ftrace_ops_list_func() is called to do the direct callers. + * We can safely change the direct functions attached to each entry. */ - err = ftrace_shutdown(ops, 0); - if (err) - goto out_unlock; + mutex_lock(&ftrace_lock); + hash = ops->func_hash->filter_hash; size = 1 << hash->size_bits; for (i = 0; i < size; i++) { hlist_for_each_entry(iter, &hash->buckets[i], hlist) { @@ -5597,10 +5602,11 @@ int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) } } - err = ftrace_startup(ops, 0); + /* Removing the tmp_ops will add the updated direct callers to the functions */ + unregister_ftrace_function(&tmp_ops); - out_unlock: mutex_unlock(&ftrace_lock); + out_direct: mutex_unlock(&direct_mutex); return err; } From bce5c81cb31f7f124ce231ec79df9e85a8bac132 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 19 Oct 2021 09:25:20 -0400 Subject: [PATCH 060/101] tracing: Explain the trace recursion transition bit better The current text of the explanation of the transition bit in the trace recursion protection is not very clear. Improve the text, so that when all the archs no longer have the issue of tracing between a start of a new (interrupt) context and updating the preempt_count to reflect the new context, that it may be removed. Link: https://lore.kernel.org/all/20211018220203.064a42ed@gandalf.local.home/ Suggested-by: Petr Mladek Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_recursion.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index 1d8cce02c3fb..24f284eb55a7 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -168,8 +168,12 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign bit = trace_get_context_bit() + start; if (unlikely(val & (1 << bit))) { /* - * It could be that preempt_count has not been updated during - * a switch between contexts. Allow for a single recursion. + * If an interrupt occurs during a trace, and another trace + * happens in that interrupt but before the preempt_count is + * updated to reflect the new interrupt context, then this + * will think a recursion occurred, and the event will be dropped. + * Let a single instance happen via the TRANSITION_BIT to + * not drop those events. */ bit = TRACE_TRANSITION_BIT; if (val & (1 << bit)) { From 8720aeecc246837bc6da64c5118dc3177c162e14 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 19 Oct 2021 17:33:13 +0200 Subject: [PATCH 061/101] tracing: use %ps format string to print symbols clang started warning about excessive stack usage in hist_trigger_print_key() kernel/trace/trace_events_hist.c:4723:13: error: stack frame size (1336) exceeds limit (1024) in function 'hist_trigger_print_key' [-Werror,-Wframe-larger-than] The problem is that there are two 512-byte arrays on the stack if hist_trigger_stacktrace_print() gets inlined. I don't think this has changed in the past five years, but something probably changed the inlining decisions made by the compiler, so the problem is now made more obvious. Rather than printing the symbol names into separate buffers, it seems we can simply use the special %ps format string modifier to print the pointers symbolically and get rid of both buffers. Marking hist_trigger_stacktrace_print() would be a simpler way of avoiding the warning, but that would not address the excessive stack usage. Link: https://lkml.kernel.org/r/20211019153337.294790-1-arnd@kernel.org Fixes: 69a0200c2e25 ("tracing: Add hist trigger support for stacktraces as keys") Link: https://lore.kernel.org/all/20211015095704.49a99859@gandalf.local.home/ Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Arnd Bergmann Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index a6061a69aa84..b64aed538628 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -4706,7 +4706,6 @@ static void hist_trigger_stacktrace_print(struct seq_file *m, unsigned long *stacktrace_entries, unsigned int max_entries) { - char str[KSYM_SYMBOL_LEN]; unsigned int spaces = 8; unsigned int i; @@ -4715,8 +4714,7 @@ static void hist_trigger_stacktrace_print(struct seq_file *m, return; seq_printf(m, "%*c", 1 + spaces, ' '); - sprint_symbol(str, stacktrace_entries[i]); - seq_printf(m, "%s\n", str); + seq_printf(m, "%pS\n", (void*)stacktrace_entries[i]); } } @@ -4726,7 +4724,6 @@ static void hist_trigger_print_key(struct seq_file *m, struct tracing_map_elt *elt) { struct hist_field *key_field; - char str[KSYM_SYMBOL_LEN]; bool multiline = false; const char *field_name; unsigned int i; @@ -4747,14 +4744,12 @@ static void hist_trigger_print_key(struct seq_file *m, seq_printf(m, "%s: %llx", field_name, uval); } else if (key_field->flags & HIST_FIELD_FL_SYM) { uval = *(u64 *)(key + key_field->offset); - sprint_symbol_no_offset(str, uval); - seq_printf(m, "%s: [%llx] %-45s", field_name, - uval, str); + seq_printf(m, "%s: [%llx] %-45ps", field_name, + uval, (void *)(uintptr_t)uval); } else if (key_field->flags & HIST_FIELD_FL_SYM_OFFSET) { uval = *(u64 *)(key + key_field->offset); - sprint_symbol(str, uval); - seq_printf(m, "%s: [%llx] %-55s", field_name, - uval, str); + seq_printf(m, "%s: [%llx] %-55pS", field_name, + uval, (void *)(uintptr_t)uval); } else if (key_field->flags & HIST_FIELD_FL_EXECNAME) { struct hist_elt_data *elt_data = elt->private_data; char *comm; From e44e81c5b90f698025eadceb7eef8661eda117d5 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Thu, 21 Oct 2021 09:54:24 +0900 Subject: [PATCH 062/101] kprobes: convert tests to kunit This converts the kprobes testcases to use the kunit framework. It adds a dependency on CONFIG_KUNIT, and the output will change to TAP: TAP version 14 1..1 # Subtest: kprobes_test 1..4 random: crng init done ok 1 - test_kprobe ok 2 - test_kprobes ok 3 - test_kretprobe ok 4 - test_kretprobes ok 1 - kprobes_test Note that the kprobes testcases are no longer run immediately after kprobes initialization, but as a late initcall when kunit is initialized. kprobes itself is initialized with an early initcall, so the order is still correct. Signed-off-by: Sven Schnelle Acked-by: Masami Hiramatsu Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 3 - kernel/test_kprobes.c | 234 ++++++++++++------------------------------ lib/Kconfig.debug | 3 +- 3 files changed, 67 insertions(+), 173 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b62af9fc3607..4676627cb066 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2581,9 +2581,6 @@ static int __init init_kprobes(void) err = register_module_notifier(&kprobe_module_nb); kprobes_initialized = (err == 0); - - if (!err) - init_test_probes(); return err; } early_initcall(init_kprobes); diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 76c997fdbc9d..e78f18144145 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -5,18 +5,17 @@ * Copyright IBM Corp. 2008 */ -#define pr_fmt(fmt) "Kprobe smoke test: " fmt - #include #include #include +#include #define div_factor 3 static u32 rand1, preh_val, posth_val; -static int errors, handler_errors, num_tests; static u32 (*target)(u32 value); static u32 (*target2)(u32 value); +static struct kunit *current_test; static noinline u32 kprobe_target(u32 value) { @@ -25,10 +24,7 @@ static noinline u32 kprobe_target(u32 value) static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs) { - if (preemptible()) { - handler_errors++; - pr_err("pre-handler is preemptible\n"); - } + KUNIT_EXPECT_FALSE(current_test, preemptible()); preh_val = (rand1 / div_factor); return 0; } @@ -36,14 +32,8 @@ static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs) static void kp_post_handler(struct kprobe *p, struct pt_regs *regs, unsigned long flags) { - if (preemptible()) { - handler_errors++; - pr_err("post-handler is preemptible\n"); - } - if (preh_val != (rand1 / div_factor)) { - handler_errors++; - pr_err("incorrect value in post_handler\n"); - } + KUNIT_EXPECT_FALSE(current_test, preemptible()); + KUNIT_EXPECT_EQ(current_test, preh_val, (rand1 / div_factor)); posth_val = preh_val + div_factor; } @@ -53,30 +43,14 @@ static struct kprobe kp = { .post_handler = kp_post_handler }; -static int test_kprobe(void) +static void test_kprobe(struct kunit *test) { - int ret; - - ret = register_kprobe(&kp); - if (ret < 0) { - pr_err("register_kprobe returned %d\n", ret); - return ret; - } - - ret = target(rand1); + current_test = test; + KUNIT_EXPECT_EQ(test, 0, register_kprobe(&kp)); + target(rand1); unregister_kprobe(&kp); - - if (preh_val == 0) { - pr_err("kprobe pre_handler not called\n"); - handler_errors++; - } - - if (posth_val == 0) { - pr_err("kprobe post_handler not called\n"); - handler_errors++; - } - - return 0; + KUNIT_EXPECT_NE(test, 0, preh_val); + KUNIT_EXPECT_NE(test, 0, posth_val); } static noinline u32 kprobe_target2(u32 value) @@ -93,10 +67,7 @@ static int kp_pre_handler2(struct kprobe *p, struct pt_regs *regs) static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs, unsigned long flags) { - if (preh_val != (rand1 / div_factor) + 1) { - handler_errors++; - pr_err("incorrect value in post_handler2\n"); - } + KUNIT_EXPECT_EQ(current_test, preh_val, (rand1 / div_factor) + 1); posth_val = preh_val + div_factor; } @@ -106,51 +77,31 @@ static struct kprobe kp2 = { .post_handler = kp_post_handler2 }; -static int test_kprobes(void) +static void test_kprobes(struct kunit *test) { - int ret; struct kprobe *kps[2] = {&kp, &kp2}; + current_test = test; + /* addr and flags should be cleard for reusing kprobe. */ kp.addr = NULL; kp.flags = 0; - ret = register_kprobes(kps, 2); - if (ret < 0) { - pr_err("register_kprobes returned %d\n", ret); - return ret; - } + + KUNIT_EXPECT_EQ(test, 0, register_kprobes(kps, 2)); + preh_val = 0; + posth_val = 0; + target(rand1); + + KUNIT_EXPECT_NE(test, 0, preh_val); + KUNIT_EXPECT_NE(test, 0, posth_val); preh_val = 0; posth_val = 0; - ret = target(rand1); - - if (preh_val == 0) { - pr_err("kprobe pre_handler not called\n"); - handler_errors++; - } - - if (posth_val == 0) { - pr_err("kprobe post_handler not called\n"); - handler_errors++; - } - - preh_val = 0; - posth_val = 0; - ret = target2(rand1); - - if (preh_val == 0) { - pr_err("kprobe pre_handler2 not called\n"); - handler_errors++; - } - - if (posth_val == 0) { - pr_err("kprobe post_handler2 not called\n"); - handler_errors++; - } + target2(rand1); + KUNIT_EXPECT_NE(test, 0, preh_val); + KUNIT_EXPECT_NE(test, 0, posth_val); unregister_kprobes(kps, 2); - return 0; - } #ifdef CONFIG_KRETPROBES @@ -158,10 +109,7 @@ static u32 krph_val; static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs) { - if (preemptible()) { - handler_errors++; - pr_err("kretprobe entry handler is preemptible\n"); - } + KUNIT_EXPECT_FALSE(current_test, preemptible()); krph_val = (rand1 / div_factor); return 0; } @@ -170,19 +118,9 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) { unsigned long ret = regs_return_value(regs); - if (preemptible()) { - handler_errors++; - pr_err("kretprobe return handler is preemptible\n"); - } - if (ret != (rand1 / div_factor)) { - handler_errors++; - pr_err("incorrect value in kretprobe handler\n"); - } - if (krph_val == 0) { - handler_errors++; - pr_err("call to kretprobe entry handler failed\n"); - } - + KUNIT_EXPECT_FALSE(current_test, preemptible()); + KUNIT_EXPECT_EQ(current_test, ret, rand1 / div_factor); + KUNIT_EXPECT_NE(current_test, krph_val, 0); krph_val = rand1; return 0; } @@ -193,39 +131,21 @@ static struct kretprobe rp = { .kp.symbol_name = "kprobe_target" }; -static int test_kretprobe(void) +static void test_kretprobe(struct kunit *test) { - int ret; - - ret = register_kretprobe(&rp); - if (ret < 0) { - pr_err("register_kretprobe returned %d\n", ret); - return ret; - } - - ret = target(rand1); + current_test = test; + KUNIT_EXPECT_EQ(test, 0, register_kretprobe(&rp)); + target(rand1); unregister_kretprobe(&rp); - if (krph_val != rand1) { - pr_err("kretprobe handler not called\n"); - handler_errors++; - } - - return 0; + KUNIT_EXPECT_EQ(test, krph_val, rand1); } static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs) { unsigned long ret = regs_return_value(regs); - if (ret != (rand1 / div_factor) + 1) { - handler_errors++; - pr_err("incorrect value in kretprobe handler2\n"); - } - if (krph_val == 0) { - handler_errors++; - pr_err("call to kretprobe entry handler failed\n"); - } - + KUNIT_EXPECT_EQ(current_test, ret, (rand1 / div_factor) + 1); + KUNIT_EXPECT_NE(current_test, krph_val, 0); krph_val = rand1; return 0; } @@ -236,78 +156,54 @@ static struct kretprobe rp2 = { .kp.symbol_name = "kprobe_target2" }; -static int test_kretprobes(void) +static void test_kretprobes(struct kunit *test) { - int ret; struct kretprobe *rps[2] = {&rp, &rp2}; + current_test = test; /* addr and flags should be cleard for reusing kprobe. */ rp.kp.addr = NULL; rp.kp.flags = 0; - ret = register_kretprobes(rps, 2); - if (ret < 0) { - pr_err("register_kretprobe returned %d\n", ret); - return ret; - } + KUNIT_EXPECT_EQ(test, 0, register_kretprobes(rps, 2)); krph_val = 0; - ret = target(rand1); - if (krph_val != rand1) { - pr_err("kretprobe handler not called\n"); - handler_errors++; - } + target(rand1); + KUNIT_EXPECT_EQ(test, krph_val, rand1); krph_val = 0; - ret = target2(rand1); - if (krph_val != rand1) { - pr_err("kretprobe handler2 not called\n"); - handler_errors++; - } + target2(rand1); + KUNIT_EXPECT_EQ(test, krph_val, rand1); unregister_kretprobes(rps, 2); - return 0; } #endif /* CONFIG_KRETPROBES */ -int init_test_probes(void) +static int kprobes_test_init(struct kunit *test) { - int ret; - target = kprobe_target; target2 = kprobe_target2; do { rand1 = prandom_u32(); } while (rand1 <= div_factor); - - pr_info("started\n"); - num_tests++; - ret = test_kprobe(); - if (ret < 0) - errors++; - - num_tests++; - ret = test_kprobes(); - if (ret < 0) - errors++; - -#ifdef CONFIG_KRETPROBES - num_tests++; - ret = test_kretprobe(); - if (ret < 0) - errors++; - - num_tests++; - ret = test_kretprobes(); - if (ret < 0) - errors++; -#endif /* CONFIG_KRETPROBES */ - - if (errors) - pr_err("BUG: %d out of %d tests failed\n", errors, num_tests); - else if (handler_errors) - pr_err("BUG: %d error(s) running handlers\n", handler_errors); - else - pr_info("passed successfully\n"); - return 0; } + +static struct kunit_case kprobes_testcases[] = { + KUNIT_CASE(test_kprobe), + KUNIT_CASE(test_kprobes), +#ifdef CONFIG_KRETPROBES + KUNIT_CASE(test_kretprobe), + KUNIT_CASE(test_kretprobes), +#endif + {} +}; + +static struct kunit_suite kprobes_test_suite = { + .name = "kprobes_test", + .init = kprobes_test_init, + .test_cases = kprobes_testcases, +}; + +kunit_test_suites(&kprobes_test_suite); + +MODULE_LICENSE("GPL"); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 2a9b6dcdac4f..6ceb11a43e4c 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2080,9 +2080,10 @@ config TEST_DIV64 If unsure, say N. config KPROBES_SANITY_TEST - bool "Kprobes sanity tests" + tristate "Kprobes sanity tests" depends on DEBUG_KERNEL depends on KPROBES + depends on KUNIT help This option provides for testing basic kprobes functionality on boot. Samples of kprobe and kretprobe are inserted and From 811b93ffaa488a4733270d8c8bc6c773334ab351 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 21 Oct 2021 09:54:42 +0900 Subject: [PATCH 063/101] x86/unwind: Compile kretprobe fixup code only if CONFIG_KRETPROBES=y Compile kretprobe related stacktrace entry recovery code and unwind_state::kr_cur field only when CONFIG_KRETPROBES=y. Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- arch/x86/include/asm/unwind.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index fca2e783e3ce..2a1f8734416d 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -16,7 +16,9 @@ struct unwind_state { unsigned long stack_mask; struct task_struct *task; int graph_idx; +#ifdef CONFIG_KRETPROBES struct llist_node *kr_cur; +#endif bool error; #if defined(CONFIG_UNWINDER_ORC) bool signal, full_regs; @@ -105,9 +107,13 @@ static inline unsigned long unwind_recover_kretprobe(struct unwind_state *state, unsigned long addr, unsigned long *addr_p) { +#ifdef CONFIG_KRETPROBES return is_kretprobe_trampoline(addr) ? kretprobe_find_ret_addr(state->task, addr_p, &state->kr_cur) : addr; +#else + return addr; +#endif } /* Recover the return address modified by kretprobe and ftrace_graph. */ From f8717410621571b182d3f2c45ffc52796c418787 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 21 Oct 2021 09:54:51 +0900 Subject: [PATCH 064/101] arm64: kprobes: Record frame pointer with kretprobe instance Record the frame pointer instead of stack address with kretprobe instance as the identifier on the instance list. Since arm64 always enable CONFIG_FRAME_POINTER, we can use the actual frame pointer (x29). This will allow the stacktrace code to find the original return address from the FP alone. Signed-off-by: Masami Hiramatsu Acked-by: Will Deacon Acked-by: Mark Rutland Signed-off-by: Steven Rostedt (VMware) --- arch/arm64/kernel/probes/kprobes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index e7ad6da980e8..d9dfa82c1f18 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -401,14 +401,14 @@ int __init arch_populate_kprobe_blacklist(void) void __kprobes __used *trampoline_probe_handler(struct pt_regs *regs) { - return (void *)kretprobe_trampoline_handler(regs, (void *)kernel_stack_pointer(regs)); + return (void *)kretprobe_trampoline_handler(regs, (void *)regs->regs[29]); } void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) { ri->ret_addr = (kprobe_opcode_t *)regs->regs[30]; - ri->fp = (void *)kernel_stack_pointer(regs); + ri->fp = (void *)regs->regs[29]; /* replace return addr (x30) with trampoline */ regs->regs[30] = (long)&__kretprobe_trampoline; From fc6d647638a8412800dfd10ad687709cb4aee373 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 21 Oct 2021 09:55:00 +0900 Subject: [PATCH 065/101] arm64: kprobes: Make a frame pointer on __kretprobe_trampoline Make a frame pointer (make the x29 register points the address of pt_regs->regs[29]) on __kretprobe_trampoline. This frame pointer will be used by the stacktracer when it is called from the kretprobe handlers. In this case, the stack tracer will unwind stack to trampoline_probe_handler() and find the next frame pointer in the stack frame of the __kretprobe_trampoline(). Signed-off-by: Masami Hiramatsu Acked-by: Will Deacon Signed-off-by: Steven Rostedt (VMware) --- arch/arm64/kernel/probes/kprobes_trampoline.S | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/kernel/probes/kprobes_trampoline.S b/arch/arm64/kernel/probes/kprobes_trampoline.S index 520ee8711db1..9a6499bed58b 100644 --- a/arch/arm64/kernel/probes/kprobes_trampoline.S +++ b/arch/arm64/kernel/probes/kprobes_trampoline.S @@ -66,6 +66,9 @@ SYM_CODE_START(__kretprobe_trampoline) save_all_base_regs + /* Setup a frame pointer. */ + add x29, sp, #S_FP + mov x0, sp bl trampoline_probe_handler /* @@ -74,6 +77,7 @@ SYM_CODE_START(__kretprobe_trampoline) */ mov lr, x0 + /* The frame pointer (x29) is restored with other registers. */ restore_all_base_regs add sp, sp, #PT_REGS_SIZE From cd9bc2c9258816dc934b300705076519d7375b81 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 21 Oct 2021 09:55:09 +0900 Subject: [PATCH 066/101] arm64: Recover kretprobe modified return address in stacktrace Since the kretprobe replaces the function return address with the kretprobe_trampoline on the stack, stack unwinder shows it instead of the correct return address. This checks whether the next return address is the __kretprobe_trampoline(), and if so, try to find the correct return address from the kretprobe instance list. For this purpose this adds 'kr_cur' loop cursor to memorize the current kretprobe instance. With this fix, now arm64 can enable CONFIG_ARCH_CORRECT_STACKTRACE_ON_KRETPROBE, and pass the kprobe self tests. Signed-off-by: Masami Hiramatsu Acked-by: Will Deacon Signed-off-by: Steven Rostedt (VMware) --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/stacktrace.h | 4 ++++ arch/arm64/kernel/stacktrace.c | 7 +++++++ 3 files changed, 12 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 5c7ae4c3954b..edde5171ffb2 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -11,6 +11,7 @@ config ARM64 select ACPI_PPTT if ACPI select ARCH_HAS_DEBUG_WX select ARCH_BINFMT_ELF_STATE + select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION select ARCH_ENABLE_MEMORY_HOTPLUG select ARCH_ENABLE_MEMORY_HOTREMOVE diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index 8aebc00c1718..a4e046ef4568 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -59,6 +60,9 @@ struct stackframe { #ifdef CONFIG_FUNCTION_GRAPH_TRACER int graph; #endif +#ifdef CONFIG_KRETPROBES + struct llist_node *kr_cur; +#endif }; extern int unwind_frame(struct task_struct *tsk, struct stackframe *frame); diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 8982a2b78acf..c30624fff6ac 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -41,6 +41,9 @@ void start_backtrace(struct stackframe *frame, unsigned long fp, #ifdef CONFIG_FUNCTION_GRAPH_TRACER frame->graph = 0; #endif +#ifdef CONFIG_KRETPROBES + frame->kr_cur = NULL; +#endif /* * Prime the first unwind. @@ -129,6 +132,10 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) frame->pc = ret_stack->ret; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ +#ifdef CONFIG_KRETPROBES + if (is_kretprobe_trampoline(frame->pc)) + frame->pc = kretprobe_find_ret_addr(tsk, (void *)frame->fp, &frame->kr_cur); +#endif frame->pc = ptrauth_strip_insn_pac(frame->pc); From b3ea5d56f212ad81328c82454829a736197ebccc Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 21 Oct 2021 09:55:17 +0900 Subject: [PATCH 067/101] ARM: clang: Do not rely on lr register for stacktrace Currently the stacktrace on clang compiled arm kernel uses the 'lr' register to find the first frame address from pt_regs. However, that is wrong after calling another function, because the 'lr' register is used by 'bl' instruction and never be recovered. As same as gcc arm kernel, directly use the frame pointer (r11) of the pt_regs to find the first frame address. Note that this fixes kretprobe stacktrace issue only with CONFIG_UNWINDER_FRAME_POINTER=y. For the CONFIG_UNWINDER_ARM, we need another fix. Signed-off-by: Masami Hiramatsu Reviewed-by: Nick Desaulniers Signed-off-by: Steven Rostedt (VMware) --- arch/arm/kernel/stacktrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm/kernel/stacktrace.c b/arch/arm/kernel/stacktrace.c index 76ea4178a55c..db798eac7431 100644 --- a/arch/arm/kernel/stacktrace.c +++ b/arch/arm/kernel/stacktrace.c @@ -54,8 +54,7 @@ int notrace unwind_frame(struct stackframe *frame) frame->sp = frame->fp; frame->fp = *(unsigned long *)(fp); - frame->pc = frame->lr; - frame->lr = *(unsigned long *)(fp + 4); + frame->pc = *(unsigned long *)(fp + 4); #else /* check current frame pointer is within bounds */ if (fp < low + 12 || fp > high - 4) From 7e9bf33b812471ee57a03ec7f9b544ca437cc706 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 21 Oct 2021 09:55:26 +0900 Subject: [PATCH 068/101] ARM: kprobes: Make a frame pointer on __kretprobe_trampoline Currently kretprobe on ARM just fills r0-r11 of pt_regs, but that is not enough for the stacktrace. Moreover, from the user kretprobe handler, stacktrace needs a frame pointer on the __kretprobe_trampoline. This adds a frame pointer on __kretprobe_trampoline for both gcc and clang case. Those have different frame pointer so we need different but similar stack on pt_regs. Gcc makes the frame pointer (fp) to point the 'pc' address of the {fp, ip (=sp), lr, pc}, this means {r11, r13, r14, r15}. Thus if we save the r11 (fp) on pt_regs->r12, we can make this set on the end of pt_regs. On the other hand, Clang makes the frame pointer to point the 'fp' address of {fp, lr} on stack. Since the next to the pt_regs->lr is pt_regs->sp, I reused the pair of pt_regs->fp and pt_regs->ip. So this stores the 'lr' on pt_regs->ip and make the fp to point pt_regs->fp. For both cases, saves __kretprobe_trampoline address to pt_regs->lr, so that the stack tracer can identify this frame pointer has been made by the __kretprobe_trampoline. Note that if the CONFIG_FRAME_POINTER is not set, this keeps fp as is. Signed-off-by: Masami Hiramatsu Reviewed-by: Nick Desaulniers Signed-off-by: Steven Rostedt (VMware) --- arch/arm/probes/kprobes/core.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/arch/arm/probes/kprobes/core.c b/arch/arm/probes/kprobes/core.c index 95f23b47ba27..4848404ba51b 100644 --- a/arch/arm/probes/kprobes/core.c +++ b/arch/arm/probes/kprobes/core.c @@ -368,16 +368,36 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, /* * When a retprobed function returns, trampoline_handler() is called, * calling the kretprobe's handler. We construct a struct pt_regs to - * give a view of registers r0-r11 to the user return-handler. This is - * not a complete pt_regs structure, but that should be plenty sufficient - * for kretprobe handlers which should normally be interested in r0 only - * anyway. + * give a view of registers r0-r11, sp, lr, and pc to the user + * return-handler. This is not a complete pt_regs structure, but that + * should be enough for stacktrace from the return handler with or + * without pt_regs. */ void __naked __kprobes __kretprobe_trampoline(void) { __asm__ __volatile__ ( +#ifdef CONFIG_FRAME_POINTER + "ldr lr, =__kretprobe_trampoline \n\t" + /* __kretprobe_trampoline makes a framepointer on pt_regs. */ +#ifdef CONFIG_CC_IS_CLANG + "stmdb sp, {sp, lr, pc} \n\t" + "sub sp, sp, #12 \n\t" + /* In clang case, pt_regs->ip = lr. */ + "stmdb sp!, {r0 - r11, lr} \n\t" + /* fp points regs->r11 (fp) */ + "add fp, sp, #44 \n\t" +#else /* !CONFIG_CC_IS_CLANG */ + /* In gcc case, pt_regs->ip = fp. */ + "stmdb sp, {fp, sp, lr, pc} \n\t" "sub sp, sp, #16 \n\t" "stmdb sp!, {r0 - r11} \n\t" + /* fp points regs->r15 (pc) */ + "add fp, sp, #60 \n\t" +#endif /* CONFIG_CC_IS_CLANG */ +#else /* !CONFIG_FRAME_POINTER */ + "sub sp, sp, #16 \n\t" + "stmdb sp!, {r0 - r11} \n\t" +#endif /* CONFIG_FRAME_POINTER */ "mov r0, sp \n\t" "bl trampoline_handler \n\t" "mov lr, r0 \n\t" From fed240d9c9743815fcbc0ca5c0913292ce1f25e2 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 21 Oct 2021 09:55:35 +0900 Subject: [PATCH 069/101] ARM: Recover kretprobe modified return address in stacktrace Since the kretprobe replaces the function return address with the kretprobe_trampoline on the stack, arm unwinder shows it instead of the correct return address. This finds the correct return address from the per-task kretprobe_instances list and verify it is in between the caller fp and callee fp. Note that this supports both GCC and clang if CONFIG_FRAME_POINTER=y and CONFIG_ARM_UNWIND=n. For the ARM unwinder, this is still not working correctly. Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- arch/arm/Kconfig | 1 + arch/arm/include/asm/stacktrace.h | 9 +++++++++ arch/arm/kernel/return_address.c | 4 ++++ arch/arm/kernel/stacktrace.c | 14 ++++++++++++++ 4 files changed, 28 insertions(+) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index fc196421b2ce..bb4f1872967c 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -3,6 +3,7 @@ config ARM bool default y select ARCH_32BIT_OFF_T + select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE if HAVE_KRETPROBES && FRAME_POINTER && !ARM_UNWIND select ARCH_HAS_BINFMT_FLAT select ARCH_HAS_DEBUG_VIRTUAL if MMU select ARCH_HAS_DMA_WRITE_COMBINE if !ARM_DMA_MEM_BUFFERABLE diff --git a/arch/arm/include/asm/stacktrace.h b/arch/arm/include/asm/stacktrace.h index 2d76a2e29f05..8f54f9ad8a9b 100644 --- a/arch/arm/include/asm/stacktrace.h +++ b/arch/arm/include/asm/stacktrace.h @@ -3,6 +3,7 @@ #define __ASM_STACKTRACE_H #include +#include struct stackframe { /* @@ -13,6 +14,10 @@ struct stackframe { unsigned long sp; unsigned long lr; unsigned long pc; +#ifdef CONFIG_KRETPROBES + struct llist_node *kr_cur; + struct task_struct *tsk; +#endif }; static __always_inline @@ -22,6 +27,10 @@ void arm_get_current_stackframe(struct pt_regs *regs, struct stackframe *frame) frame->sp = regs->ARM_sp; frame->lr = regs->ARM_lr; frame->pc = regs->ARM_pc; +#ifdef CONFIG_KRETPROBES + frame->kr_cur = NULL; + frame->tsk = current; +#endif } extern int unwind_frame(struct stackframe *frame); diff --git a/arch/arm/kernel/return_address.c b/arch/arm/kernel/return_address.c index 7b42ac010fdf..00c11579406c 100644 --- a/arch/arm/kernel/return_address.c +++ b/arch/arm/kernel/return_address.c @@ -42,6 +42,10 @@ void *return_address(unsigned int level) frame.sp = current_stack_pointer; frame.lr = (unsigned long)__builtin_return_address(0); frame.pc = (unsigned long)return_address; +#ifdef CONFIG_KRETPROBES + frame.kr_cur = NULL; + frame.tsk = current; +#endif walk_stackframe(&frame, save_return_addr, &data); diff --git a/arch/arm/kernel/stacktrace.c b/arch/arm/kernel/stacktrace.c index db798eac7431..75e905508f27 100644 --- a/arch/arm/kernel/stacktrace.c +++ b/arch/arm/kernel/stacktrace.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only #include +#include #include #include #include @@ -65,6 +66,11 @@ int notrace unwind_frame(struct stackframe *frame) frame->sp = *(unsigned long *)(fp - 8); frame->pc = *(unsigned long *)(fp - 4); #endif +#ifdef CONFIG_KRETPROBES + if (is_kretprobe_trampoline(frame->pc)) + frame->pc = kretprobe_find_ret_addr(frame->tsk, + (void *)frame->fp, &frame->kr_cur); +#endif return 0; } @@ -156,6 +162,10 @@ static noinline void __save_stack_trace(struct task_struct *tsk, frame.lr = (unsigned long)__builtin_return_address(0); frame.pc = (unsigned long)__save_stack_trace; } +#ifdef CONFIG_KRETPROBES + frame.kr_cur = NULL; + frame.tsk = tsk; +#endif walk_stackframe(&frame, save_trace, &data); } @@ -173,6 +183,10 @@ void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) frame.sp = regs->ARM_sp; frame.lr = regs->ARM_lr; frame.pc = regs->ARM_pc; +#ifdef CONFIG_KRETPROBES + frame.kr_cur = NULL; + frame.tsk = current; +#endif walk_stackframe(&frame, save_trace, &data); } From 172f7ba9772cae12f099fc563352e905dc9a1921 Mon Sep 17 00:00:00 2001 From: chongjiapeng Date: Tue, 19 Oct 2021 18:48:54 +0800 Subject: [PATCH 070/101] ftrace: Make ftrace_profile_pages_init static This symbol is not used outside of ftrace.c, so marks it static. Fixes the following sparse warning: kernel/trace/ftrace.c:579:5: warning: symbol 'ftrace_profile_pages_init' was not declared. Should it be static? Link: https://lkml.kernel.org/r/1634640534-18280-1-git-send-email-jiapeng.chong@linux.alibaba.com Reported-by: Abaci Robot Fixes: cafb168a1c92 ("tracing: make the function profiler per cpu") Signed-off-by: chongjiapeng Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f90ed00a6d5b..2057ad363772 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -576,7 +576,7 @@ static void ftrace_profile_reset(struct ftrace_profile_stat *stat) FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head)); } -int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) +static int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) { struct ftrace_profile_page *pg; int functions; From f604de20c0a47e0e9518940a1810193678c92fa8 Mon Sep 17 00:00:00 2001 From: Viktor Rosendahl Date: Tue, 19 Oct 2021 18:07:01 +0200 Subject: [PATCH 071/101] tools/latency-collector: Use correct size when writing queue_full_warning queue_full_warning is a pointer, so it is wrong to use sizeof to calculate the number of characters of the string it points to. The effect is that we only print out the first few characters of the warning string. The correct way is to use strlen(). We don't need to add 1 to the strlen() because we don't want to write the terminating null character to stdout. Link: https://lkml.kernel.org/r/20211019160701.15587-1-Viktor.Rosendahl@bmw.de Link: https://lore.kernel.org/r/8fd4bb65ef3da67feac9ce3258cdbe9824752cf1.1629198502.git.jing.yangyang@zte.com.cn Link: https://lore.kernel.org/r/20211012025424.180781-1-davidcomponentone@gmail.com Reported-by: Zeal Robot Signed-off-by: Viktor Rosendahl Signed-off-by: Steven Rostedt (VMware) --- tools/tracing/latency/latency-collector.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/tracing/latency/latency-collector.c b/tools/tracing/latency/latency-collector.c index 3a2e6bb781a8..59a7f2346eab 100644 --- a/tools/tracing/latency/latency-collector.c +++ b/tools/tracing/latency/latency-collector.c @@ -1538,7 +1538,7 @@ static void tracing_loop(void) mutex_lock(&print_mtx); check_signals(); write_or_die(fd_stdout, queue_full_warning, - sizeof(queue_full_warning)); + strlen(queue_full_warning)); mutex_unlock(&print_mtx); } modified--; From 9e20028b529dfc26666b3f527d34ea4d36f60f66 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Oct 2021 14:07:32 -0700 Subject: [PATCH 072/101] perf/core: allow ftrace for functions in kernel/event/core.c It is useful to trace functions in kernel/event/core.c. Allow ftrace for them by removing $(CC_FLAGS_FTRACE) from Makefile. Link: https://lkml.kernel.org/r/20211006210732.2826289-1-songliubraving@fb.com Cc: Peter Zijlstra Cc: Andrii Nakryiko Cc: KP Singh Signed-off-by: Song Liu Signed-off-by: Steven Rostedt (VMware) --- kernel/events/Makefile | 5 ----- 1 file changed, 5 deletions(-) diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 3c022e33c109..8591c180b52b 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile @@ -1,10 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -ifdef CONFIG_FUNCTION_TRACER -CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE) -endif - obj-y := core.o ring_buffer.o callchain.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_UPROBES) += uprobes.o - From 9bd985766a43ac0115f13f67783d381ebcba70c6 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 15 Oct 2021 17:07:48 +0200 Subject: [PATCH 073/101] trace/osnoise: Fix an ifdef comment s/CONFIG_OSNOISE_TRAECR/CONFIG_OSNOISE_TRACER/ No functional changes. Link: https://lkml.kernel.org/r/33924a16f6e5559ce24952ca7d62561604bfd94a.1634308385.git.bristot@kernel.org Cc: Daniel Bristot de Oliveira Cc: Jonathan Corbet Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Sebastian Andrzej Siewior Cc: x86@kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- arch/x86/kernel/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/trace.c b/arch/x86/kernel/trace.c index 6b73b6f92ad3..8322e8352777 100644 --- a/arch/x86/kernel/trace.c +++ b/arch/x86/kernel/trace.c @@ -231,4 +231,4 @@ void osnoise_arch_unregister(void) unregister_trace_local_timer_exit(trace_intel_irq_exit, "local_timer"); unregister_trace_local_timer_entry(trace_intel_irq_entry, NULL); } -#endif /* CONFIG_OSNOISE_TRAECR && CONFIG_X86_LOCAL_APIC */ +#endif /* CONFIG_OSNOISE_TRACER && CONFIG_X86_LOCAL_APIC */ From 4d4eac7b5af4c627cdab50c9e3b7bd19c4a144c6 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 15 Oct 2021 17:07:49 +0200 Subject: [PATCH 074/101] tracing/doc: Fix typos on the timerlat tracer documentation Fixes a series of typos in the timerlat doc. Link: https://lkml.kernel.org/r/d3763eb376603890baab908141de6660ba18fff8.1634308385.git.bristot@kernel.org Cc: Daniel Bristot de Oliveira Cc: Jonathan Corbet Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Sebastian Andrzej Siewior Cc: x86@kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Fixes: a955d7eac177 ("trace: Add timerlat tracer") Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/timerlat-tracer.rst | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Documentation/trace/timerlat-tracer.rst b/Documentation/trace/timerlat-tracer.rst index c7cbb557aee7..64d1fe6e9b93 100644 --- a/Documentation/trace/timerlat-tracer.rst +++ b/Documentation/trace/timerlat-tracer.rst @@ -3,7 +3,7 @@ Timerlat tracer ############### The timerlat tracer aims to help the preemptive kernel developers to -find souces of wakeup latencies of real-time threads. Like cyclictest, +find sources of wakeup latencies of real-time threads. Like cyclictest, the tracer sets a periodic timer that wakes up a thread. The thread then computes a *wakeup latency* value as the difference between the *current time* and the *absolute time* that the timer was set to expire. The main @@ -50,14 +50,14 @@ The second is the *timer latency* observed by the thread. The ACTIVATION ID field serves to relate the *irq* execution to its respective *thread* execution. -The *irq*/*thread* splitting is important to clarify at which context +The *irq*/*thread* splitting is important to clarify in which context the unexpected high value is coming from. The *irq* context can be -delayed by hardware related actions, such as SMIs, NMIs, IRQs -or by a thread masking interrupts. Once the timer happens, the delay +delayed by hardware-related actions, such as SMIs, NMIs, IRQs, +or by thread masking interrupts. Once the timer happens, the delay can also be influenced by blocking caused by threads. For example, by -postponing the scheduler execution via preempt_disable(), by the -scheduler execution, or by masking interrupts. Threads can -also be delayed by the interference from other threads and IRQs. +postponing the scheduler execution via preempt_disable(), scheduler +execution, or masking interrupts. Threads can also be delayed by the +interference from other threads and IRQs. Tracer options --------------------- @@ -68,14 +68,14 @@ directory. The timerlat configs are: - cpus: CPUs at which a timerlat thread will execute. - timerlat_period_us: the period of the timerlat thread. - - osnoise/stop_tracing_us: stop the system tracing if a + - stop_tracing_us: stop the system tracing if a timer latency at the *irq* context higher than the configured value happens. Writing 0 disables this option. - stop_tracing_total_us: stop the system tracing if a - timer latency at the *thread* context higher than the configured + timer latency at the *thread* context is higher than the configured value happens. Writing 0 disables this option. - - print_stack: save the stack of the IRQ ocurrence, and print - it afte the *thread context* event". + - print_stack: save the stack of the IRQ occurrence, and print + it after the *thread context* event". timerlat and osnoise ---------------------------- @@ -95,7 +95,7 @@ For example:: timerlat/5-1035 [005] ....... 548.771104: #402268 context thread timer_latency 39960 ns In this case, the root cause of the timer latency does not point to a -single cause, but to multiple ones. Firstly, the timer IRQ was delayed +single cause but to multiple ones. Firstly, the timer IRQ was delayed for 13 us, which may point to a long IRQ disabled section (see IRQ stacktrace section). Then the timer interrupt that wakes up the timerlat thread took 7597 ns, and the qxl:21 device IRQ took 7139 ns. Finally, From e0f3b18be733ac4a3b6deb2ff586bc1936ad0368 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 15 Oct 2021 17:07:50 +0200 Subject: [PATCH 075/101] trace/osnoise: Add migrate-disabled field to the osnoise header Since "54357f0c9149 tracing: Add migrate-disabled counter to tracing output," the migrate disabled field is also printed in the !PREEMPR_RT kernel config. While this information was added to the vast majority of tracers, osnoise and timerlat were not updated (because they are new tracers). Fix osnoise header by adding the information about migrate disabled. Link: https://lkml.kernel.org/r/9cb3d54e29e0588dbba12e81486bd8a09adcd8ca.1634308385.git.bristot@kernel.org Cc: Daniel Bristot de Oliveira Cc: Jonathan Corbet Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Sebastian Andrzej Siewior Cc: x86@kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Fixes: 54357f0c9149 ("tracing: Add migrate-disabled counter to tracing output.") Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_osnoise.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index c4f14fb98aaa..34f26c632442 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -294,19 +294,19 @@ static void print_osnoise_headers(struct seq_file *s) seq_puts(s, "# _-----=> irqs-off\n"); seq_puts(s, "# / _----=> need-resched\n"); seq_puts(s, "# | / _---=> hardirq/softirq\n"); - seq_puts(s, "# || / _--=> preempt-depth "); - seq_puts(s, " MAX\n"); - - seq_puts(s, "# || / "); + seq_puts(s, "# || / _--=> preempt-depth\n"); + seq_puts(s, "# ||| / _-=> migrate-disable "); + seq_puts(s, " MAX\n"); + seq_puts(s, "# |||| / delay "); seq_puts(s, " SINGLE Interference counters:\n"); - seq_puts(s, "# |||| RUNTIME "); + seq_puts(s, "# ||||| RUNTIME "); seq_puts(s, " NOISE %% OF CPU NOISE +-----------------------------+\n"); - seq_puts(s, "# TASK-PID CPU# |||| TIMESTAMP IN US "); + seq_puts(s, "# TASK-PID CPU# ||||| TIMESTAMP IN US "); seq_puts(s, " IN US AVAILABLE IN US HW NMI IRQ SIRQ THREAD\n"); - seq_puts(s, "# | | | |||| | | "); + seq_puts(s, "# | | | ||||| | | "); seq_puts(s, " | | | | | | | |\n"); } #endif /* CONFIG_PREEMPT_RT */ From aeafcb82d99c97ff5c6054a4091eeb12aefca9ab Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 15 Oct 2021 17:07:51 +0200 Subject: [PATCH 076/101] trace/timerlat: Add migrate-disabled field to the timerlat header Since "54357f0c9149 tracing: Add migrate-disabled counter to tracing output," the migrate disabled field is also printed in the !PREEMPR_RT kernel config. While this information was added to the vast majority of tracers, osnoise and timerlat were not updated (because they are new tracers). Fix timerlat header by adding the information about migrate disabled. Link: https://lkml.kernel.org/r/bc0c234ab49946cdd63effa6584e1d5e8662cb44.1634308385.git.bristot@kernel.org Cc: Daniel Bristot de Oliveira Cc: Jonathan Corbet Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Sebastian Andrzej Siewior Cc: x86@kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Fixes: 54357f0c9149 ("tracing: Add migrate-disabled counter to tracing output.") Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_osnoise.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 34f26c632442..d11b41784fac 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -378,11 +378,12 @@ static void print_timerlat_headers(struct seq_file *s) seq_puts(s, "# / _----=> need-resched\n"); seq_puts(s, "# | / _---=> hardirq/softirq\n"); seq_puts(s, "# || / _--=> preempt-depth\n"); - seq_puts(s, "# || /\n"); - seq_puts(s, "# |||| ACTIVATION\n"); - seq_puts(s, "# TASK-PID CPU# |||| TIMESTAMP ID "); - seq_puts(s, " CONTEXT LATENCY\n"); - seq_puts(s, "# | | | |||| | | "); + seq_puts(s, "# ||| / _-=> migrate-disable\n"); + seq_puts(s, "# |||| / delay\n"); + seq_puts(s, "# ||||| ACTIVATION\n"); + seq_puts(s, "# TASK-PID CPU# ||||| TIMESTAMP ID "); + seq_puts(s, " CONTEXT LATENCY\n"); + seq_puts(s, "# | | | ||||| | | "); seq_puts(s, " | |\n"); } #endif /* CONFIG_PREEMPT_RT */ From 3c20bd3af535d64771b193bb4dd41ed662c464ce Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 15 Oct 2021 15:55:50 -0400 Subject: [PATCH 077/101] tracing: Fix missing trace_boot_init_histograms kstrdup NULL checks trace_boot_init_histograms misses NULL pointer checks for kstrdup failure. Link: https://lkml.kernel.org/r/20211015195550.22742-1-mathieu.desnoyers@efficios.com Fixes: 64dc7f6958ef5 ("tracing/boot: Show correct histogram error command") Acked-by: Masami Hiramatsu Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 8d252f63cd78..0580287d7a0d 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -430,6 +430,8 @@ trace_boot_init_histograms(struct trace_event_file *file, /* All digit started node should be instances. */ if (trace_boot_compose_hist_cmd(node, buf, size) == 0) { tmp = kstrdup(buf, GFP_KERNEL); + if (!tmp) + return; if (trigger_process_regex(file, buf) < 0) pr_err("Failed to apply hist trigger: %s\n", tmp); kfree(tmp); @@ -439,6 +441,8 @@ trace_boot_init_histograms(struct trace_event_file *file, if (xbc_node_find_subkey(hnode, "keys")) { if (trace_boot_compose_hist_cmd(hnode, buf, size) == 0) { tmp = kstrdup(buf, GFP_KERNEL); + if (!tmp) + return; if (trigger_process_regex(file, buf) < 0) pr_err("Failed to apply hist trigger: %s\n", tmp); kfree(tmp); From 1d6288914264a22c0efdfb3a5748c101c0d12baa Mon Sep 17 00:00:00 2001 From: Wang ShaoBo Date: Thu, 21 Oct 2021 11:52:25 +0800 Subject: [PATCH 078/101] tracing/hwlat: Make some internal symbols static The sparse tool complains as follows: kernel/trace/trace_hwlat.c:82:27: warning: symbol 'hwlat_single_cpu_data' was not declared. Should it be static? kernel/trace/trace_hwlat.c:83:1: warning: symbol '__pcpu_scope_hwlat_per_cpu_data' was not declared. Should it be static? This symbol is not used outside of trace_hwlat.c, so this commit marks it static. Link: https://lkml.kernel.org/r/20211021035225.1050685-1-bobo.shaobowang@huawei.com Signed-off-by: Wang ShaoBo Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_hwlat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index d0a730d99a33..56bb7b890578 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -79,8 +79,8 @@ struct hwlat_kthread_data { int nmi_cpu; }; -struct hwlat_kthread_data hwlat_single_cpu_data; -DEFINE_PER_CPU(struct hwlat_kthread_data, hwlat_per_cpu_data); +static struct hwlat_kthread_data hwlat_single_cpu_data; +static DEFINE_PER_CPU(struct hwlat_kthread_data, hwlat_per_cpu_data); /* Tells NMIs to call back to the hwlat tracer to record timestamps */ bool trace_hwlat_callback_enabled; From 17b251a290ba84a0c2c5c82df9596cb2e7207ca6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 26 Oct 2021 17:23:36 -0400 Subject: [PATCH 079/101] ftrace/sh: Add arch_ftrace_ops_list_func stub to have compressed image still link Using the linker script to fix an issue where some archs call the function tracer with just the ip (instruction pointer) and pip (parent instruction pointer) where as more up to date archs also pass in the associated ftrace_ops and the ftrace_regs pointer, the generic code will be called either with two parameters or four. To avoid any C undefined behavior of calling two parameters to four or four to two parameter function, two functions are created, where a preprocessor macro uses the one that matches the architecture. As the function pointers for them may be different, a typecast is used. But this triggers issues with newer compilers that will fail due to -Werror. A linker trick is now used to map the generic function to the function that is used (note the generic function is only used to set the default function callback). The linker trick defines ftrace_ops_list_func (the generic function) to arch_ftrace_ops_list_func (the arch defined one). Link: https://lore.kernel.org/all/20200617165616.52241bde@oasis.local.home/ But this fails sh arch because their linker script is included in their compressed image that does not define arch_ftrace_ops_list_func at all sh4-linux-ld:arch/sh/boot/compressed/../../kernel/vmlinux.lds:32: undefined symbol `arch_ftrace_ops_list_func' referenced in expression Included a stub by that name in the misc.c to allow the code to compile and link, even though it's not used. This is similar to what was done for ftrace_stub: b83b43ffc6e4b ("fgraph: Fix function type mismatches of ftrace_graph_return using ftrace_stub") Link: https://lkml.kernel.org/r/20211021221627.5d7270de@rorschach.local.home Cc: Yoshinori Sato Cc: Rich Felker Cc: linux-sh@vger.kernel.org Signed-off-by: Steven Rostedt (VMware) --- arch/sh/boot/compressed/misc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/sh/boot/compressed/misc.c b/arch/sh/boot/compressed/misc.c index a03b6680a9d9..ca05c99a3d5b 100644 --- a/arch/sh/boot/compressed/misc.c +++ b/arch/sh/boot/compressed/misc.c @@ -115,6 +115,9 @@ void __stack_chk_fail(void) void ftrace_stub(void) { } +void arch_ftrace_ops_list_func(void) +{ +} #define stackalign 4 From 4d1c92a4f5ad8454259cfc711c210da6d4cfe8cc Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 25 Oct 2021 17:32:37 +0900 Subject: [PATCH 080/101] lib/bootconfig: Make xbc_alloc_mem() and xbc_free_mem() as __init function Since the xbc_alloc_mem() and xbc_free_mem() are used from the __init functions and memblock_alloc() is __init function, make them __init functions too. Link: https://lkml.kernel.org/r/163515075747.547467.5746167540626712819.stgit@devnote2 Signed-off-by: Masami Hiramatsu Fixes: 4ee1b4cac236 ("bootconfig: Cleanup dummy headers in tools/bootconfig") Reported-by: Stephen Rothwell Signed-off-by: Steven Rostedt (VMware) --- lib/bootconfig.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/bootconfig.c b/lib/bootconfig.c index a056ae137750..3276675b25e1 100644 --- a/lib/bootconfig.c +++ b/lib/bootconfig.c @@ -43,12 +43,12 @@ static int open_brace[XBC_DEPTH_MAX] __initdata; static int brace_index __initdata; #ifdef __KERNEL__ -static inline void *xbc_alloc_mem(size_t size) +static inline void * __init xbc_alloc_mem(size_t size) { return memblock_alloc(size, SMP_CACHE_BYTES); } -static inline void xbc_free_mem(void *addr, size_t size) +static inline void __init xbc_free_mem(void *addr, size_t size) { memblock_free_ptr(addr, size); } From 1f6d3a8f5e397f5d31afbc58d84e1dc68318b874 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 25 Oct 2021 20:41:52 +0900 Subject: [PATCH 081/101] kprobes: Add a test case for stacktrace from kretprobe handler Add a test case for stacktrace from kretprobe handler and nested kretprobe handlers. This test checks both of stack trace inside kretprobe handler and stack trace from pt_regs. Those stack trace must include actual function return address instead of kretprobe trampoline. The nested kretprobe stacktrace test checks whether the unwinder can correctly unwind the call frame on the stack which has been modified by the kretprobe. Since the stacktrace on kretprobe is correctly fixed only on x86, this introduces a meta kconfig ARCH_CORRECT_STACKTRACE_ON_KRETPROBE which tells user that the stacktrace on kretprobe is correct or not. The test results will be shown like below; TAP version 14 1..1 # Subtest: kprobes_test 1..6 ok 1 - test_kprobe ok 2 - test_kprobes ok 3 - test_kretprobe ok 4 - test_kretprobes ok 5 - test_stacktrace_on_kretprobe ok 6 - test_stacktrace_on_nested_kretprobe # kprobes_test: pass:6 fail:0 skip:0 total:6 # Totals: pass:6 fail:0 skip:0 total:6 ok 1 - kprobes_test Link: https://lkml.kernel.org/r/163516211244.604541.18350507860972214415.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- arch/Kconfig | 8 +++ arch/x86/Kconfig | 1 + kernel/test_kprobes.c | 162 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 171 insertions(+) diff --git a/arch/Kconfig b/arch/Kconfig index 8df1c7102643..8378f83b462c 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -191,6 +191,14 @@ config HAVE_OPTPROBES config HAVE_KPROBES_ON_FTRACE bool +config ARCH_CORRECT_STACKTRACE_ON_KRETPROBE + bool + help + Since kretprobes modifies return address on the stack, the + stacktrace may see the kretprobe trampoline address instead + of correct one. If the architecture stacktrace code and + unwinder can adjust such entries, select this configuration. + config HAVE_FUNCTION_ERROR_INJECTION bool diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e39e2bd6acc5..cc9ba6798848 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -61,6 +61,7 @@ config X86 select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI select ARCH_32BIT_OFF_T if X86_32 select ARCH_CLOCKSOURCE_INIT + select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE select ARCH_ENABLE_HUGEPAGE_MIGRATION if X86_64 && HUGETLB_PAGE && MIGRATION select ARCH_ENABLE_MEMORY_HOTPLUG if X86_64 || (X86_32 && HIGHMEM) select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index e78f18144145..a5edc2ebc947 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -17,6 +17,11 @@ static u32 (*target)(u32 value); static u32 (*target2)(u32 value); static struct kunit *current_test; +static unsigned long (*internal_target)(void); +static unsigned long (*stacktrace_target)(void); +static unsigned long (*stacktrace_driver)(void); +static unsigned long target_return_address[2]; + static noinline u32 kprobe_target(u32 value) { return (value / div_factor); @@ -58,6 +63,33 @@ static noinline u32 kprobe_target2(u32 value) return (value / div_factor) + 1; } +static noinline unsigned long kprobe_stacktrace_internal_target(void) +{ + if (!target_return_address[0]) + target_return_address[0] = (unsigned long)__builtin_return_address(0); + return target_return_address[0]; +} + +static noinline unsigned long kprobe_stacktrace_target(void) +{ + if (!target_return_address[1]) + target_return_address[1] = (unsigned long)__builtin_return_address(0); + + if (internal_target) + internal_target(); + + return target_return_address[1]; +} + +static noinline unsigned long kprobe_stacktrace_driver(void) +{ + if (stacktrace_target) + stacktrace_target(); + + /* This is for preventing inlining the function */ + return (unsigned long)__builtin_return_address(0); +} + static int kp_pre_handler2(struct kprobe *p, struct pt_regs *regs) { preh_val = (rand1 / div_factor) + 1; @@ -175,12 +207,138 @@ static void test_kretprobes(struct kunit *test) KUNIT_EXPECT_EQ(test, krph_val, rand1); unregister_kretprobes(rps, 2); } + +#ifdef CONFIG_ARCH_CORRECT_STACKTRACE_ON_KRETPROBE +#define STACK_BUF_SIZE 16 +static unsigned long stack_buf[STACK_BUF_SIZE]; + +static int stacktrace_return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + unsigned long retval = regs_return_value(regs); + int i, ret; + + KUNIT_EXPECT_FALSE(current_test, preemptible()); + KUNIT_EXPECT_EQ(current_test, retval, target_return_address[1]); + + /* + * Test stacktrace inside the kretprobe handler, this will involves + * kretprobe trampoline, but must include correct return address + * of the target function. + */ + ret = stack_trace_save(stack_buf, STACK_BUF_SIZE, 0); + KUNIT_EXPECT_NE(current_test, ret, 0); + + for (i = 0; i < ret; i++) { + if (stack_buf[i] == target_return_address[1]) + break; + } + KUNIT_EXPECT_NE(current_test, i, ret); + +#if !IS_MODULE(CONFIG_KPROBES_SANITY_TEST) + /* + * Test stacktrace from pt_regs at the return address. Thus the stack + * trace must start from the target return address. + */ + ret = stack_trace_save_regs(regs, stack_buf, STACK_BUF_SIZE, 0); + KUNIT_EXPECT_NE(current_test, ret, 0); + KUNIT_EXPECT_EQ(current_test, stack_buf[0], target_return_address[1]); +#endif + + return 0; +} + +static struct kretprobe rp3 = { + .handler = stacktrace_return_handler, + .kp.symbol_name = "kprobe_stacktrace_target" +}; + +static void test_stacktrace_on_kretprobe(struct kunit *test) +{ + unsigned long myretaddr = (unsigned long)__builtin_return_address(0); + + current_test = test; + rp3.kp.addr = NULL; + rp3.kp.flags = 0; + + /* + * Run the stacktrace_driver() to record correct return address in + * stacktrace_target() and ensure stacktrace_driver() call is not + * inlined by checking the return address of stacktrace_driver() + * and the return address of this function is different. + */ + KUNIT_ASSERT_NE(test, myretaddr, stacktrace_driver()); + + KUNIT_ASSERT_EQ(test, 0, register_kretprobe(&rp3)); + KUNIT_ASSERT_NE(test, myretaddr, stacktrace_driver()); + unregister_kretprobe(&rp3); +} + +static int stacktrace_internal_return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + unsigned long retval = regs_return_value(regs); + int i, ret; + + KUNIT_EXPECT_FALSE(current_test, preemptible()); + KUNIT_EXPECT_EQ(current_test, retval, target_return_address[0]); + + /* + * Test stacktrace inside the kretprobe handler for nested case. + * The unwinder will find the kretprobe_trampoline address on the + * return address, and kretprobe must solve that. + */ + ret = stack_trace_save(stack_buf, STACK_BUF_SIZE, 0); + KUNIT_EXPECT_NE(current_test, ret, 0); + + for (i = 0; i < ret - 1; i++) { + if (stack_buf[i] == target_return_address[0]) { + KUNIT_EXPECT_EQ(current_test, stack_buf[i + 1], target_return_address[1]); + break; + } + } + KUNIT_EXPECT_NE(current_test, i, ret); + +#if !IS_MODULE(CONFIG_KPROBES_SANITY_TEST) + /* Ditto for the regs version. */ + ret = stack_trace_save_regs(regs, stack_buf, STACK_BUF_SIZE, 0); + KUNIT_EXPECT_NE(current_test, ret, 0); + KUNIT_EXPECT_EQ(current_test, stack_buf[0], target_return_address[0]); + KUNIT_EXPECT_EQ(current_test, stack_buf[1], target_return_address[1]); +#endif + + return 0; +} + +static struct kretprobe rp4 = { + .handler = stacktrace_internal_return_handler, + .kp.symbol_name = "kprobe_stacktrace_internal_target" +}; + +static void test_stacktrace_on_nested_kretprobe(struct kunit *test) +{ + unsigned long myretaddr = (unsigned long)__builtin_return_address(0); + struct kretprobe *rps[2] = {&rp3, &rp4}; + + current_test = test; + rp3.kp.addr = NULL; + rp3.kp.flags = 0; + + //KUNIT_ASSERT_NE(test, myretaddr, stacktrace_driver()); + + KUNIT_ASSERT_EQ(test, 0, register_kretprobes(rps, 2)); + KUNIT_ASSERT_NE(test, myretaddr, stacktrace_driver()); + unregister_kretprobes(rps, 2); +} +#endif /* CONFIG_ARCH_CORRECT_STACKTRACE_ON_KRETPROBE */ + #endif /* CONFIG_KRETPROBES */ static int kprobes_test_init(struct kunit *test) { target = kprobe_target; target2 = kprobe_target2; + stacktrace_target = kprobe_stacktrace_target; + internal_target = kprobe_stacktrace_internal_target; + stacktrace_driver = kprobe_stacktrace_driver; do { rand1 = prandom_u32(); @@ -194,6 +352,10 @@ static struct kunit_case kprobes_testcases[] = { #ifdef CONFIG_KRETPROBES KUNIT_CASE(test_kretprobe), KUNIT_CASE(test_kretprobes), +#ifdef CONFIG_ARCH_CORRECT_STACKTRACE_ON_KRETPROBE + KUNIT_CASE(test_stacktrace_on_kretprobe), + KUNIT_CASE(test_stacktrace_on_nested_kretprobe), +#endif #endif {} }; From 010db091b6879786b5d935555b9e19c41e504f71 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 26 Oct 2021 21:21:07 +0900 Subject: [PATCH 082/101] lib/bootconfig: Fix the xbc_get_info kerneldoc Fix the kernel doc of xbc_get_info() to add '@' to the parameters. Link: https://lkml.kernel.org/r/163525086738.676803.15352231787913236933.stgit@devnote2 Fixes: e306220cb7b7 ("bootconfig: Add xbc_get_info() for the node information") Reported-by: Stephen Rothwell Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- lib/bootconfig.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/bootconfig.c b/lib/bootconfig.c index 3276675b25e1..a10ab25f6fcc 100644 --- a/lib/bootconfig.c +++ b/lib/bootconfig.c @@ -67,8 +67,8 @@ static inline void xbc_free_mem(void *addr, size_t size) #endif /** * xbc_get_info() - Get the information of loaded boot config - * node_size: A pointer to store the number of nodes. - * data_size: A pointer to store the size of bootconfig data. + * @node_size: A pointer to store the number of nodes. + * @data_size: A pointer to store the size of bootconfig data. * * Get the number of used nodes in @node_size if it is not NULL, * and the size of bootconfig data in @data_size if it is not NULL. From f76fbbbb5061fe14824ba5807c44bd7400a6b4e1 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 26 Oct 2021 09:51:28 +0800 Subject: [PATCH 083/101] samples/kretprobes: Fix return value if register_kretprobe() failed Use the actual return value instead of always -1 if register_kretprobe() failed. E.g. without this patch: # insmod samples/kprobes/kretprobe_example.ko func=no_such_func insmod: ERROR: could not insert module samples/kprobes/kretprobe_example.ko: Operation not permitted With this patch: # insmod samples/kprobes/kretprobe_example.ko func=no_such_func insmod: ERROR: could not insert module samples/kprobes/kretprobe_example.ko: Unknown symbol in module Link: https://lkml.kernel.org/r/1635213091-24387-2-git-send-email-yangtiezhu@loongson.cn Fixes: 804defea1c02 ("Kprobes: move kprobe examples to samples/") Signed-off-by: Tiezhu Yang Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- samples/kprobes/kretprobe_example.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/kprobes/kretprobe_example.c b/samples/kprobes/kretprobe_example.c index 5dc1bf3baa98..228321ecb161 100644 --- a/samples/kprobes/kretprobe_example.c +++ b/samples/kprobes/kretprobe_example.c @@ -86,7 +86,7 @@ static int __init kretprobe_init(void) ret = register_kretprobe(&my_kretprobe); if (ret < 0) { pr_err("register_kretprobe failed, returned %d\n", ret); - return -1; + return ret; } pr_info("Planted return probe at %s: %p\n", my_kretprobe.kp.symbol_name, my_kretprobe.kp.addr); From 438697a39f0692d65a7a96e4debca139fa1be9fe Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 26 Oct 2021 09:51:29 +0800 Subject: [PATCH 084/101] docs, kprobes: Remove invalid URL and add new reference The following reference is invalid, remove it. https://www.ibm.com/developerworks/library/l-kprobes/index.html Add the following new reference "An introduction to KProbes": https://lwn.net/Articles/132196/ Link: https://lkml.kernel.org/r/1635213091-24387-3-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Tiezhu Yang Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/kprobes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/trace/kprobes.rst b/Documentation/trace/kprobes.rst index 998149ce2fd9..f318bceda1e6 100644 --- a/Documentation/trace/kprobes.rst +++ b/Documentation/trace/kprobes.rst @@ -784,6 +784,6 @@ References For additional information on Kprobes, refer to the following URLs: -- https://www.ibm.com/developerworks/library/l-kprobes/index.html +- https://lwn.net/Articles/132196/ - https://www.kernel.org/doc/ols/2006/ols2006v2-pages-109-124.pdf From b9e94a7bb6fad880ba1ec0d58897480c62f587cf Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 26 Oct 2021 09:51:30 +0800 Subject: [PATCH 085/101] test_kprobes: Move it from kernel/ to lib/ Since config KPROBES_SANITY_TEST is in lib/Kconfig.debug, it is better to let test_kprobes.c in lib/, just like other similar tests found in lib/. Link: https://lkml.kernel.org/r/1635213091-24387-4-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Tiezhu Yang Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/Makefile | 1 - lib/Makefile | 1 + {kernel => lib}/test_kprobes.c | 0 3 files changed, 1 insertion(+), 1 deletion(-) rename {kernel => lib}/test_kprobes.c (100%) diff --git a/kernel/Makefile b/kernel/Makefile index 4df609be42d0..9e4d33dce8a5 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -85,7 +85,6 @@ obj-$(CONFIG_PID_NS) += pid_namespace.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_IKHEADERS) += kheaders.o obj-$(CONFIG_SMP) += stop_machine.o -obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o audit_watch.o audit_fsnotify.o audit_tree.o obj-$(CONFIG_GCOV_KERNEL) += gcov/ diff --git a/lib/Makefile b/lib/Makefile index 5efd1b435a37..864ff515814d 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -100,6 +100,7 @@ obj-$(CONFIG_TEST_MEMINIT) += test_meminit.o obj-$(CONFIG_TEST_LOCKUP) += test_lockup.o obj-$(CONFIG_TEST_HMM) += test_hmm.o obj-$(CONFIG_TEST_FREE_PAGES) += test_free_pages.o +obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o # # CFLAGS for compiling floating point code inside the kernel. x86/Makefile turns diff --git a/kernel/test_kprobes.c b/lib/test_kprobes.c similarity index 100% rename from kernel/test_kprobes.c rename to lib/test_kprobes.c From 5c03d8fb04fbf2cf73dd0eacb0912fde59aaa8ed Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 26 Oct 2021 09:51:31 +0800 Subject: [PATCH 086/101] MAINTAINERS: Update KPROBES and TRACING entries There is no git tree for KPROBES in MAINTAINERS, it is not convinent to rebase, lib/test_kprobes.c and samples/kprobes belong to kprobe, so add git tree and missing files for KPROBES, and also use linux-trace.git for TRACING to avoid confusing. Link: https://lkml.kernel.org/r/1635213091-24387-5-git-send-email-yangtiezhu@loongson.cn Signed-off-by: Tiezhu Yang Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- MAINTAINERS | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 5b33791bb8e9..8d26693cc317 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10437,10 +10437,13 @@ M: Anil S Keshavamurthy M: "David S. Miller" M: Masami Hiramatsu S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace.git F: Documentation/trace/kprobes.rst F: include/asm-generic/kprobes.h F: include/linux/kprobes.h F: kernel/kprobes.c +F: lib/test_kprobes.c +F: samples/kprobes KS0108 LCD CONTROLLER DRIVER M: Miguel Ojeda @@ -18967,7 +18970,7 @@ TRACING M: Steven Rostedt M: Ingo Molnar S: Maintained -T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git perf/core +T: git git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace.git F: Documentation/trace/ftrace.rst F: arch/*/*/*/ftrace.h F: arch/*/kernel/ftrace.c From 25b95138728028dd0f576d9f252bc8f0b6c609fa Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 27 Oct 2021 08:22:11 +0900 Subject: [PATCH 087/101] selftests/ftrace: Stop tracing while reading the trace file by default Stop tracing while reading the trace file by default, to prevent the test results while checking it and to avoid taking a long time to check the result. If there is any testcase which wants to test the tracing while reading the trace file, please override this setting inside the test case. This also recovers the pause-on-trace when clean it up. Link: https://lkml.kernel.org/r/163529053143.690749.15365238954175942026.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- tools/testing/selftests/ftrace/ftracetest | 2 +- tools/testing/selftests/ftrace/test.d/functions | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest index 8ec1922e974e..c3311c8c4089 100755 --- a/tools/testing/selftests/ftrace/ftracetest +++ b/tools/testing/selftests/ftrace/ftracetest @@ -428,7 +428,7 @@ for t in $TEST_CASES; do exit 1 fi done -(cd $TRACING_DIR; initialize_ftrace) # for cleanup +(cd $TRACING_DIR; finish_ftrace) # for cleanup prlog "" prlog "# of passed: " `echo $PASSED_CASES | wc -w` diff --git a/tools/testing/selftests/ftrace/test.d/functions b/tools/testing/selftests/ftrace/test.d/functions index 000fd05e84b1..5f6cbec847fc 100644 --- a/tools/testing/selftests/ftrace/test.d/functions +++ b/tools/testing/selftests/ftrace/test.d/functions @@ -124,10 +124,22 @@ initialize_ftrace() { # Reset ftrace to initial-state [ -f uprobe_events ] && echo > uprobe_events [ -f synthetic_events ] && echo > synthetic_events [ -f snapshot ] && echo 0 > snapshot + +# Stop tracing while reading the trace file by default, to prevent +# the test results while checking it and to avoid taking a long time +# to check the result. + [ -f options/pause-on-trace ] && echo 1 > options/pause-on-trace + clear_trace enable_tracing } +finish_ftrace() { + initialize_ftrace +# And recover it to default. + [ -f options/pause-on-trace ] && echo 0 > options/pause-on-trace +} + check_requires() { # Check required files and tracers for i in "$@" ; do r=${i%:README} From 52cfb373536a7fb744b0ec4b748518e5dc874fb7 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Mon, 25 Oct 2021 13:08:33 -0700 Subject: [PATCH 088/101] tracing: Add support for creating hist trigger variables from literal Currently hist trigger expressions don't support the use of numeric literals: e.g. echo 'hist:keys=common_pid:x=$y-1234' --> is not valid expression syntax Having the ability to use numeric constants in hist triggers supports a wider range of expressions for creating variables. Add support for creating trace event histogram variables from numeric literals. e.g. echo 'hist:keys=common_pid:x=1234,y=size-1024' >> event/trigger A negative numeric constant is created, using unary minus operator (parentheses are required). e.g. echo 'hist:keys=common_pid:z=-(2)' >> event/trigger Constants can be used with division/multiplication (added in the next patch in this series) to implement granularity filters for frequent trace events. For instance we can limit emitting the rss_stat trace event to when there is a 512KB cross over in the rss size: # Create a synthetic event to monitor instead of the high frequency # rss_stat event echo 'rss_stat_throttled unsigned int mm_id; unsigned int curr; int member; long size' >> tracing/synthetic_events # Create a hist trigger that emits the synthetic rss_stat_throttled # event only when the rss size crosses a 512KB boundary. echo 'hist:keys=keys=mm_id,member:bucket=size/0x80000:onchange($bucket) .rss_stat_throttled(mm_id,curr,member,size)' >> events/kmem/rss_stat/trigger A use case for using constants with addition/subtraction is not yet known, but for completeness the use of constants are supported for all operators. Link: https://lkml.kernel.org/r/20211025200852.3002369-2-kaleshsingh@google.com Signed-off-by: Kalesh Singh Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 71 +++++++++++++++++++++++++++++++- 1 file changed, 70 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index b64aed538628..e6165e36d3b6 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -66,7 +66,8 @@ C(EMPTY_SORT_FIELD, "Empty sort field"), \ C(TOO_MANY_SORT_FIELDS, "Too many sort fields (Max = 2)"), \ C(INVALID_SORT_FIELD, "Sort field must be a key or a val"), \ - C(INVALID_STR_OPERAND, "String type can not be an operand in expression"), + C(INVALID_STR_OPERAND, "String type can not be an operand in expression"), \ + C(EXPECT_NUMBER, "Expecting numeric literal"), #undef C #define C(a, b) HIST_ERR_##a @@ -89,6 +90,7 @@ typedef u64 (*hist_field_fn_t) (struct hist_field *field, #define HIST_FIELD_OPERANDS_MAX 2 #define HIST_FIELDS_MAX (TRACING_MAP_FIELDS_MAX + TRACING_MAP_VARS_MAX) #define HIST_ACTIONS_MAX 8 +#define HIST_CONST_DIGITS_MAX 21 enum field_op_id { FIELD_OP_NONE, @@ -152,6 +154,9 @@ struct hist_field { bool read_once; unsigned int var_str_idx; + + /* Numeric literals are represented as u64 */ + u64 constant; }; static u64 hist_field_none(struct hist_field *field, @@ -163,6 +168,15 @@ static u64 hist_field_none(struct hist_field *field, return 0; } +static u64 hist_field_const(struct hist_field *field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + return field->constant; +} + static u64 hist_field_counter(struct hist_field *field, struct tracing_map_elt *elt, struct trace_buffer *buffer, @@ -341,6 +355,7 @@ enum hist_field_flags { HIST_FIELD_FL_CPU = 1 << 15, HIST_FIELD_FL_ALIAS = 1 << 16, HIST_FIELD_FL_BUCKET = 1 << 17, + HIST_FIELD_FL_CONST = 1 << 18, }; struct var_defs { @@ -1516,6 +1531,12 @@ static void expr_field_str(struct hist_field *field, char *expr) { if (field->flags & HIST_FIELD_FL_VAR_REF) strcat(expr, "$"); + else if (field->flags & HIST_FIELD_FL_CONST) { + char str[HIST_CONST_DIGITS_MAX]; + + snprintf(str, HIST_CONST_DIGITS_MAX, "%llu", field->constant); + strcat(expr, str); + } strcat(expr, hist_field_name(field, 0)); @@ -1689,6 +1710,15 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, goto out; } + if (flags & HIST_FIELD_FL_CONST) { + hist_field->fn = hist_field_const; + hist_field->size = sizeof(u64); + hist_field->type = kstrdup("u64", GFP_KERNEL); + if (!hist_field->type) + goto free; + goto out; + } + if (flags & HIST_FIELD_FL_STACKTRACE) { hist_field->fn = hist_field_none; goto out; @@ -2090,6 +2120,29 @@ static struct hist_field *create_alias(struct hist_trigger_data *hist_data, return alias; } +static struct hist_field *parse_const(struct hist_trigger_data *hist_data, + char *str, char *var_name, + unsigned long *flags) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct hist_field *field = NULL; + u64 constant; + + if (kstrtoull(str, 0, &constant)) { + hist_err(tr, HIST_ERR_EXPECT_NUMBER, errpos(str)); + return NULL; + } + + *flags |= HIST_FIELD_FL_CONST; + field = create_hist_field(hist_data, NULL, *flags, var_name); + if (!field) + return NULL; + + field->constant = constant; + + return field; +} + static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *str, unsigned long *flags, char *var_name) @@ -2100,6 +2153,15 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, unsigned long buckets = 0; int ret = 0; + if (isdigit(str[0])) { + hist_field = parse_const(hist_data, str, var_name, flags); + if (!hist_field) { + ret = -EINVAL; + goto out; + } + return hist_field; + } + s = strchr(str, '.'); if (s) { s = strchr(++s, '.'); @@ -4945,6 +5007,8 @@ static void hist_field_debug_show_flags(struct seq_file *m, if (flags & HIST_FIELD_FL_ALIAS) seq_puts(m, " HIST_FIELD_FL_ALIAS\n"); + else if (flags & HIST_FIELD_FL_CONST) + seq_puts(m, " HIST_FIELD_FL_CONST\n"); } static int hist_field_debug_show(struct seq_file *m, @@ -4966,6 +5030,9 @@ static int hist_field_debug_show(struct seq_file *m, field->var.idx); } + if (field->flags & HIST_FIELD_FL_CONST) + seq_printf(m, " constant: %llu\n", field->constant); + if (field->flags & HIST_FIELD_FL_ALIAS) seq_printf(m, " var_ref_idx (into hist_data->var_refs[]): %u\n", field->var_ref_idx); @@ -5208,6 +5275,8 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) if (hist_field->flags & HIST_FIELD_FL_CPU) seq_puts(m, "common_cpu"); + else if (hist_field->flags & HIST_FIELD_FL_CONST) + seq_printf(m, "%llu", hist_field->constant); else if (field_name) { if (hist_field->flags & HIST_FIELD_FL_VAR_REF || hist_field->flags & HIST_FIELD_FL_ALIAS) From bcef044150320217e2a00c65050114e509c222b8 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Mon, 25 Oct 2021 13:08:34 -0700 Subject: [PATCH 089/101] tracing: Add division and multiplication support for hist triggers Adds basic support for division and multiplication operations for hist trigger variable expressions. For simplicity this patch only supports, division and multiplication for a single operation expression (e.g. x=$a/$b), as currently expressions are always evaluated right to left. This can lead to some incorrect results: e.g. echo 'hist:keys=common_pid:x=8-4-2' >> event/trigger 8-4-2 should evaluate to 2 i.e. (8-4)-2 but currently x evaluate to 6 i.e. 8-(4-2) Multiplication and division in sub-expressions will work correctly, once correct operator precedence support is added (See next patch in this series). For the undefined case of division by 0, the histogram expression evaluates to (u64)(-1). Since this cannot be detected when the expression is created, it is the responsibility of the user to be aware and account for this possibility. Examples: echo 'hist:keys=common_pid:a=8,b=4,x=$a/$b' \ >> event/trigger echo 'hist:keys=common_pid:y=5*$b' \ >> event/trigger Link: https://lkml.kernel.org/r/20211025200852.3002369-3-kaleshsingh@google.com Signed-off-by: Kalesh Singh Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 72 +++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index e6165e36d3b6..1edec5d471c1 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -97,6 +97,8 @@ enum field_op_id { FIELD_OP_PLUS, FIELD_OP_MINUS, FIELD_OP_UNARY_MINUS, + FIELD_OP_DIV, + FIELD_OP_MULT, }; /* @@ -285,6 +287,40 @@ static u64 hist_field_minus(struct hist_field *hist_field, return val1 - val2; } +static u64 hist_field_div(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_field *operand1 = hist_field->operands[0]; + struct hist_field *operand2 = hist_field->operands[1]; + + u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); + u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event); + + /* Return -1 for the undefined case */ + if (!val2) + return -1; + + return div64_u64(val1, val2); +} + +static u64 hist_field_mult(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_field *operand1 = hist_field->operands[0]; + struct hist_field *operand2 = hist_field->operands[1]; + + u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); + u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event); + + return val1 * val2; +} + static u64 hist_field_unary_minus(struct hist_field *hist_field, struct tracing_map_elt *elt, struct trace_buffer *buffer, @@ -1592,6 +1628,12 @@ static char *expr_str(struct hist_field *field, unsigned int level) case FIELD_OP_PLUS: strcat(expr, "+"); break; + case FIELD_OP_DIV: + strcat(expr, "/"); + break; + case FIELD_OP_MULT: + strcat(expr, "*"); + break; default: kfree(expr); return NULL; @@ -1607,7 +1649,7 @@ static int contains_operator(char *str) enum field_op_id field_op = FIELD_OP_NONE; char *op; - op = strpbrk(str, "+-"); + op = strpbrk(str, "+-/*"); if (!op) return FIELD_OP_NONE; @@ -1628,6 +1670,12 @@ static int contains_operator(char *str) case '+': field_op = FIELD_OP_PLUS; break; + case '/': + field_op = FIELD_OP_DIV; + break; + case '*': + field_op = FIELD_OP_MULT; + break; default: break; } @@ -2361,10 +2409,26 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, case FIELD_OP_PLUS: sep = "+"; break; + case FIELD_OP_DIV: + sep = "/"; + break; + case FIELD_OP_MULT: + sep = "*"; + break; default: goto free; } + /* + * Multiplication and division are only supported in single operator + * expressions, since the expression is always evaluated from right + * to left. + */ + if ((field_op == FIELD_OP_DIV || field_op == FIELD_OP_MULT) && level > 0) { + hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); + return ERR_PTR(-EINVAL); + } + operand1_str = strsep(&str, sep); if (!operand1_str || !str) goto free; @@ -2436,6 +2500,12 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, case FIELD_OP_PLUS: expr->fn = hist_field_plus; break; + case FIELD_OP_DIV: + expr->fn = hist_field_div; + break; + case FIELD_OP_MULT: + expr->fn = hist_field_mult; + break; default: ret = -EINVAL; goto free; From 9710b2f341a0d96f35b911580639853cfda4677d Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Mon, 25 Oct 2021 13:08:35 -0700 Subject: [PATCH 090/101] tracing: Fix operator precedence for hist triggers expression The current histogram expression evaluation logic evaluates the expression from right to left. This can lead to incorrect results if the operations are not associative (as is the case for subtraction and, the now added, division operators). e.g. 16-8-4-2 should be 2 not 10 --> 16-8-4-2 = ((16-8)-4)-2 64/8/4/2 should be 1 not 16 --> 64/8/4/2 = ((64/8)/4)/2 Division and multiplication are currently limited to single operation expression due to operator precedence support not yet implemented. Rework the expression parsing to support the correct evaluation of expressions containing operators of different precedences; and fix the associativity error by evaluating expressions with operators of the same precedence from left to right. Examples: (1) echo 'hist:keys=common_pid:a=8,b=4,c=2,d=1,w=$a-$b-$c-$d' \ >> event/trigger (2) echo 'hist:keys=common_pid:x=$a/$b/3/2' >> event/trigger (3) echo 'hist:keys=common_pid:y=$a+10/$c*1024' >> event/trigger (4) echo 'hist:keys=common_pid:z=$a/$b+$c*$d' >> event/trigger Link: https://lkml.kernel.org/r/20211025200852.3002369-4-kaleshsingh@google.com Signed-off-by: Kalesh Singh Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 210 ++++++++++++++++++++----------- 1 file changed, 140 insertions(+), 70 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1edec5d471c1..7a50ea2ac6b1 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -67,7 +67,9 @@ C(TOO_MANY_SORT_FIELDS, "Too many sort fields (Max = 2)"), \ C(INVALID_SORT_FIELD, "Sort field must be a key or a val"), \ C(INVALID_STR_OPERAND, "String type can not be an operand in expression"), \ - C(EXPECT_NUMBER, "Expecting numeric literal"), + C(EXPECT_NUMBER, "Expecting numeric literal"), \ + C(UNARY_MINUS_SUBEXPR, "Unary minus not supported in sub-expressions"), \ + C(SYM_OFFSET_SUBEXPR, ".sym-offset not supported in sub-expressions"), #undef C #define C(a, b) HIST_ERR_##a @@ -1644,40 +1646,96 @@ static char *expr_str(struct hist_field *field, unsigned int level) return expr; } -static int contains_operator(char *str) +/* + * If field_op != FIELD_OP_NONE, *sep points to the root operator + * of the expression tree to be evaluated. + */ +static int contains_operator(char *str, char **sep) { enum field_op_id field_op = FIELD_OP_NONE; - char *op; + char *minus_op, *plus_op, *div_op, *mult_op; - op = strpbrk(str, "+-/*"); - if (!op) - return FIELD_OP_NONE; - switch (*op) { - case '-': + /* + * Report the last occurrence of the operators first, so that the + * expression is evaluated left to right. This is important since + * subtraction and division are not associative. + * + * e.g + * 64/8/4/2 is 1, i.e 64/8/4/2 = ((64/8)/4)/2 + * 14-7-5-2 is 0, i.e 14-7-5-2 = ((14-7)-5)-2 + */ + + /* + * First, find lower precedence addition and subtraction + * since the expression will be evaluated recursively. + */ + minus_op = strrchr(str, '-'); + if (minus_op) { + /* Unfortunately, the modifier ".sym-offset" can confuse things. */ + if (minus_op - str >= 4 && !strncmp(minus_op - 4, ".sym-offset", 11)) + goto out; + /* - * Unfortunately, the modifier ".sym-offset" - * can confuse things. + * Unary minus is not supported in sub-expressions. If + * present, it is always the next root operator. */ - if (op - str >= 4 && !strncmp(op - 4, ".sym-offset", 11)) - return FIELD_OP_NONE; - - if (*str == '-') + if (minus_op == str) { field_op = FIELD_OP_UNARY_MINUS; - else - field_op = FIELD_OP_MINUS; - break; - case '+': - field_op = FIELD_OP_PLUS; - break; - case '/': + goto out; + } + + field_op = FIELD_OP_MINUS; + } + + plus_op = strrchr(str, '+'); + if (plus_op || minus_op) { + /* + * For operators of the same precedence use to rightmost as the + * root, so that the expression is evaluated left to right. + */ + if (plus_op > minus_op) + field_op = FIELD_OP_PLUS; + goto out; + } + + /* + * Multiplication and division have higher precedence than addition and + * subtraction. + */ + div_op = strrchr(str, '/'); + if (div_op) field_op = FIELD_OP_DIV; - break; - case '*': + + mult_op = strrchr(str, '*'); + /* + * For operators of the same precedence use to rightmost as the + * root, so that the expression is evaluated left to right. + */ + if (mult_op > div_op) field_op = FIELD_OP_MULT; - break; - default: - break; + +out: + if (sep) { + switch (field_op) { + case FIELD_OP_UNARY_MINUS: + case FIELD_OP_MINUS: + *sep = minus_op; + break; + case FIELD_OP_PLUS: + *sep = plus_op; + break; + case FIELD_OP_DIV: + *sep = div_op; + break; + case FIELD_OP_MULT: + *sep = mult_op; + break; + case FIELD_OP_NONE: + default: + *sep = NULL; + break; + } } return field_op; @@ -2003,7 +2061,7 @@ static char *field_name_from_var(struct hist_trigger_data *hist_data, if (strcmp(var_name, name) == 0) { field = hist_data->attrs->var_defs.expr[i]; - if (contains_operator(field) || is_var_ref(field)) + if (contains_operator(field, NULL) || is_var_ref(field)) continue; return field; } @@ -2266,21 +2324,24 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *str, unsigned long flags, - char *var_name, unsigned int level); + char *var_name, unsigned int *n_subexprs); static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *str, unsigned long flags, - char *var_name, unsigned int level) + char *var_name, unsigned int *n_subexprs) { struct hist_field *operand1, *expr = NULL; unsigned long operand_flags; int ret = 0; char *s; + /* Unary minus operator, increment n_subexprs */ + ++*n_subexprs; + /* we support only -(xxx) i.e. explicit parens required */ - if (level > 3) { + if (*n_subexprs > 3) { hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); ret = -EINVAL; goto free; @@ -2297,8 +2358,16 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, } s = strrchr(str, ')'); - if (s) + if (s) { + /* unary minus not supported in sub-expressions */ + if (*(s+1) != '\0') { + hist_err(file->tr, HIST_ERR_UNARY_MINUS_SUBEXPR, + errpos(str)); + ret = -EINVAL; + goto free; + } *s = '\0'; + } else { ret = -EINVAL; /* no closing ')' */ goto free; @@ -2312,7 +2381,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, } operand_flags = 0; - operand1 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level); + operand1 = parse_expr(hist_data, file, str, operand_flags, NULL, n_subexprs); if (IS_ERR(operand1)) { ret = PTR_ERR(operand1); goto free; @@ -2382,60 +2451,61 @@ static int check_expr_operands(struct trace_array *tr, static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *str, unsigned long flags, - char *var_name, unsigned int level) + char *var_name, unsigned int *n_subexprs) { struct hist_field *operand1 = NULL, *operand2 = NULL, *expr = NULL; unsigned long operand_flags; int field_op, ret = -EINVAL; char *sep, *operand1_str; - if (level > 3) { + if (*n_subexprs > 3) { hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); return ERR_PTR(-EINVAL); } - field_op = contains_operator(str); + /* + * ".sym-offset" in expressions has no effect on their evaluation, + * but can confuse operator parsing. + */ + if (*n_subexprs == 0) { + sep = strstr(str, ".sym-offset"); + if (sep) { + *sep = '\0'; + if (strpbrk(str, "+-/*") || strpbrk(sep + 11, "+-/*")) { + *sep = '.'; + hist_err(file->tr, HIST_ERR_SYM_OFFSET_SUBEXPR, + errpos(sep)); + return ERR_PTR(-EINVAL); + } + *sep = '.'; + } + } + + field_op = contains_operator(str, &sep); if (field_op == FIELD_OP_NONE) return parse_atom(hist_data, file, str, &flags, var_name); if (field_op == FIELD_OP_UNARY_MINUS) - return parse_unary(hist_data, file, str, flags, var_name, ++level); + return parse_unary(hist_data, file, str, flags, var_name, n_subexprs); - switch (field_op) { - case FIELD_OP_MINUS: - sep = "-"; - break; - case FIELD_OP_PLUS: - sep = "+"; - break; - case FIELD_OP_DIV: - sep = "/"; - break; - case FIELD_OP_MULT: - sep = "*"; - break; - default: + /* Binary operator found, increment n_subexprs */ + ++*n_subexprs; + + /* Split the expression string at the root operator */ + if (!sep) goto free; - } + *sep = '\0'; + operand1_str = str; + str = sep+1; - /* - * Multiplication and division are only supported in single operator - * expressions, since the expression is always evaluated from right - * to left. - */ - if ((field_op == FIELD_OP_DIV || field_op == FIELD_OP_MULT) && level > 0) { - hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); - return ERR_PTR(-EINVAL); - } - - operand1_str = strsep(&str, sep); if (!operand1_str || !str) goto free; operand_flags = 0; - operand1 = parse_atom(hist_data, file, operand1_str, - &operand_flags, NULL); + + /* LHS of string is an expression e.g. a+b in a+b+c */ + operand1 = parse_expr(hist_data, file, operand1_str, operand_flags, NULL, n_subexprs); if (IS_ERR(operand1)) { ret = PTR_ERR(operand1); operand1 = NULL; @@ -2447,9 +2517,9 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, goto free; } - /* rest of string could be another expression e.g. b+c in a+b+c */ + /* RHS of string is another expression e.g. c in a+b+c */ operand_flags = 0; - operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level); + operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, n_subexprs); if (IS_ERR(operand2)) { ret = PTR_ERR(operand2); operand2 = NULL; @@ -3883,9 +3953,9 @@ static int __create_val_field(struct hist_trigger_data *hist_data, unsigned long flags) { struct hist_field *hist_field; - int ret = 0; + int ret = 0, n_subexprs = 0; - hist_field = parse_expr(hist_data, file, field_str, flags, var_name, 0); + hist_field = parse_expr(hist_data, file, field_str, flags, var_name, &n_subexprs); if (IS_ERR(hist_field)) { ret = PTR_ERR(hist_field); goto out; @@ -4026,7 +4096,7 @@ static int create_key_field(struct hist_trigger_data *hist_data, struct hist_field *hist_field = NULL; unsigned long flags = 0; unsigned int key_size; - int ret = 0; + int ret = 0, n_subexprs = 0; if (WARN_ON(key_idx >= HIST_FIELDS_MAX)) return -EINVAL; @@ -4039,7 +4109,7 @@ static int create_key_field(struct hist_trigger_data *hist_data, hist_field = create_hist_field(hist_data, NULL, flags, NULL); } else { hist_field = parse_expr(hist_data, file, field_str, flags, - NULL, 0); + NULL, &n_subexprs); if (IS_ERR(hist_field)) { ret = PTR_ERR(hist_field); goto out; From c5eac6ee8bc5d32e48b3845472b547574061f49f Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Mon, 25 Oct 2021 13:08:36 -0700 Subject: [PATCH 091/101] tracing/histogram: Simplify handling of .sym-offset in expressions The '-' in .sym-offset can confuse the hist trigger arithmetic expression parsing. Simplify the handling of this by replacing the 'sym-offset' with 'symXoffset'. This allows us to correctly evaluate expressions where the user may have inadvertently added a .sym-offset modifier to one of the operands in an expression, instead of bailing out. In this case the .sym-offset has no effect on the evaluation of the expression. The only valid use of the .sym-offset is as a hist key modifier. Link: https://lkml.kernel.org/r/20211025200852.3002369-5-kaleshsingh@google.com Signed-off-by: Kalesh Singh Suggested-by: Steven Rostedt Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 43 +++++++++++++------------------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 7a50ea2ac6b1..bbaf2e16b7ae 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -68,8 +68,7 @@ C(INVALID_SORT_FIELD, "Sort field must be a key or a val"), \ C(INVALID_STR_OPERAND, "String type can not be an operand in expression"), \ C(EXPECT_NUMBER, "Expecting numeric literal"), \ - C(UNARY_MINUS_SUBEXPR, "Unary minus not supported in sub-expressions"), \ - C(SYM_OFFSET_SUBEXPR, ".sym-offset not supported in sub-expressions"), + C(UNARY_MINUS_SUBEXPR, "Unary minus not supported in sub-expressions"), #undef C #define C(a, b) HIST_ERR_##a @@ -1672,10 +1671,6 @@ static int contains_operator(char *str, char **sep) */ minus_op = strrchr(str, '-'); if (minus_op) { - /* Unfortunately, the modifier ".sym-offset" can confuse things. */ - if (minus_op - str >= 4 && !strncmp(minus_op - 4, ".sym-offset", 11)) - goto out; - /* * Unary minus is not supported in sub-expressions. If * present, it is always the next root operator. @@ -2138,7 +2133,11 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, *flags |= HIST_FIELD_FL_HEX; else if (strcmp(modifier, "sym") == 0) *flags |= HIST_FIELD_FL_SYM; - else if (strcmp(modifier, "sym-offset") == 0) + /* + * 'sym-offset' occurrences in the trigger string are modified + * to 'symXoffset' to simplify arithmetic expression parsing. + */ + else if (strcmp(modifier, "symXoffset") == 0) *flags |= HIST_FIELD_FL_SYM_OFFSET; else if ((strcmp(modifier, "execname") == 0) && (strcmp(field_name, "common_pid") == 0)) @@ -2463,24 +2462,6 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, return ERR_PTR(-EINVAL); } - /* - * ".sym-offset" in expressions has no effect on their evaluation, - * but can confuse operator parsing. - */ - if (*n_subexprs == 0) { - sep = strstr(str, ".sym-offset"); - if (sep) { - *sep = '\0'; - if (strpbrk(str, "+-/*") || strpbrk(sep + 11, "+-/*")) { - *sep = '.'; - hist_err(file->tr, HIST_ERR_SYM_OFFSET_SUBEXPR, - errpos(sep)); - return ERR_PTR(-EINVAL); - } - *sep = '.'; - } - } - field_op = contains_operator(str, &sep); if (field_op == FIELD_OP_NONE) @@ -5999,7 +5980,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, struct synth_event *se; const char *se_name; bool remove = false; - char *trigger, *p; + char *trigger, *p, *start; int ret = 0; lockdep_assert_held(&event_mutex); @@ -6047,6 +6028,16 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, trigger = strstrip(trigger); } + /* + * To simplify arithmetic expression parsing, replace occurrences of + * '.sym-offset' modifier with '.symXoffset' + */ + start = strstr(trigger, ".sym-offset"); + while (start) { + *(start + 4) = 'X'; + start = strstr(start + 11, ".sym-offset"); + }; + attrs = parse_hist_trigger_attrs(file->tr, trigger); if (IS_ERR(attrs)) return PTR_ERR(attrs); From f47716b7a955e40e2591b960d1eccb1fde967a70 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Mon, 25 Oct 2021 13:08:37 -0700 Subject: [PATCH 092/101] tracing/histogram: Covert expr to const if both operands are constants If both operands of a hist trigger expression are constants, convert the expression to a constant. This optimization avoids having to perform the same calculation multiple times and also saves on memory since the merged constants are represented by a single struct hist_field instead or multiple. Link: https://lkml.kernel.org/r/20211025200852.3002369-6-kaleshsingh@google.com Signed-off-by: Kalesh Singh Suggested-by: Steven Rostedt Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 104 ++++++++++++++++++++++--------- 1 file changed, 74 insertions(+), 30 deletions(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index bbaf2e16b7ae..71b453576d85 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2411,9 +2411,15 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, return ERR_PTR(ret); } +/* + * If the operands are var refs, return pointers the + * variable(s) referenced in var1 and var2, else NULL. + */ static int check_expr_operands(struct trace_array *tr, struct hist_field *operand1, - struct hist_field *operand2) + struct hist_field *operand2, + struct hist_field **var1, + struct hist_field **var2) { unsigned long operand1_flags = operand1->flags; unsigned long operand2_flags = operand2->flags; @@ -2426,6 +2432,7 @@ static int check_expr_operands(struct trace_array *tr, if (!var) return -EINVAL; operand1_flags = var->flags; + *var1 = var; } if ((operand2_flags & HIST_FIELD_FL_VAR_REF) || @@ -2436,6 +2443,7 @@ static int check_expr_operands(struct trace_array *tr, if (!var) return -EINVAL; operand2_flags = var->flags; + *var2 = var; } if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) != @@ -2453,9 +2461,12 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, char *var_name, unsigned int *n_subexprs) { struct hist_field *operand1 = NULL, *operand2 = NULL, *expr = NULL; - unsigned long operand_flags; + struct hist_field *var1 = NULL, *var2 = NULL; + unsigned long operand_flags, operand2_flags; int field_op, ret = -EINVAL; char *sep, *operand1_str; + hist_field_fn_t op_fn; + bool combine_consts; if (*n_subexprs > 3) { hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); @@ -2512,11 +2523,38 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, goto free; } - ret = check_expr_operands(file->tr, operand1, operand2); + switch (field_op) { + case FIELD_OP_MINUS: + op_fn = hist_field_minus; + break; + case FIELD_OP_PLUS: + op_fn = hist_field_plus; + break; + case FIELD_OP_DIV: + op_fn = hist_field_div; + break; + case FIELD_OP_MULT: + op_fn = hist_field_mult; + break; + default: + ret = -EINVAL; + goto free; + } + + ret = check_expr_operands(file->tr, operand1, operand2, &var1, &var2); if (ret) goto free; - flags |= HIST_FIELD_FL_EXPR; + operand_flags = var1 ? var1->flags : operand1->flags; + operand2_flags = var2 ? var2->flags : operand2->flags; + + /* + * If both operands are constant, the expression can be + * collapsed to a single constant. + */ + combine_consts = operand_flags & operand2_flags & HIST_FIELD_FL_CONST; + + flags |= combine_consts ? HIST_FIELD_FL_CONST : HIST_FIELD_FL_EXPR; flags |= operand1->flags & (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS); @@ -2533,37 +2571,43 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, expr->operands[0] = operand1; expr->operands[1] = operand2; - /* The operand sizes should be the same, so just pick one */ - expr->size = operand1->size; + if (combine_consts) { + if (var1) + expr->operands[0] = var1; + if (var2) + expr->operands[1] = var2; - expr->operator = field_op; - expr->name = expr_str(expr, 0); - expr->type = kstrdup_const(operand1->type, GFP_KERNEL); - if (!expr->type) { - ret = -ENOMEM; - goto free; - } + expr->constant = op_fn(expr, NULL, NULL, NULL, NULL); - switch (field_op) { - case FIELD_OP_MINUS: - expr->fn = hist_field_minus; - break; - case FIELD_OP_PLUS: - expr->fn = hist_field_plus; - break; - case FIELD_OP_DIV: - expr->fn = hist_field_div; - break; - case FIELD_OP_MULT: - expr->fn = hist_field_mult; - break; - default: - ret = -EINVAL; - goto free; + expr->operands[0] = NULL; + expr->operands[1] = NULL; + + /* + * var refs won't be destroyed immediately + * See: destroy_hist_field() + */ + destroy_hist_field(operand2, 0); + destroy_hist_field(operand1, 0); + + expr->name = expr_str(expr, 0); + } else { + expr->fn = op_fn; + + /* The operand sizes should be the same, so just pick one */ + expr->size = operand1->size; + + expr->operator = field_op; + expr->type = kstrdup_const(operand1->type, GFP_KERNEL); + if (!expr->type) { + ret = -ENOMEM; + goto free; + } + + expr->name = expr_str(expr, 0); } return expr; - free: +free: destroy_hist_field(operand1, 0); destroy_hist_field(operand2, 0); destroy_hist_field(expr, 0); From 722eddaa4043acee8f031cf238ced5f7514ad638 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Mon, 25 Oct 2021 13:08:38 -0700 Subject: [PATCH 093/101] tracing/histogram: Optimize division by a power of 2 The division is a slow operation. If the divisor is a power of 2, use a shift instead. Results were obtained using Android's version of perf (simpleperf[1]) as described below: 1. hist_field_div() is modified to call 2 test functions: test_hist_field_div_[not]_optimized(); passing them the same args. Use noinline and volatile to ensure these are not optimized out by the compiler. 2. Create a hist event trigger that uses division: events/kmem/rss_stat$ echo 'hist:keys=common_pid:x=size/' >> trigger events/kmem/rss_stat$ echo 'hist:keys=common_pid:vals=$x' >> trigger 3. Run Android's lmkd_test[2] to generate rss_stat events, and record CPU samples with Android's simpleperf: simpleperf record -a --exclude-perf --post-unwind=yes -m 16384 -g -f 2000 -o perf.data == Results == Divisor is a power of 2 (divisor == 32): test_hist_field_div_not_optimized | 8,717,091 cpu-cycles test_hist_field_div_optimized | 1,643,137 cpu-cycles If the divisor is a power of 2, the optimized version is ~5.3x faster. Divisor is not a power of 2 (divisor == 33): test_hist_field_div_not_optimized | 4,444,324 cpu-cycles test_hist_field_div_optimized | 5,497,958 cpu-cycles If the divisor is not a power of 2, as expected, the optimized version is slightly slower (~24% slower). [1] https://android.googlesource.com/platform/system/extras/+/master/simpleperf/doc/README.md [2] https://cs.android.com/android/platform/superproject/+/master:system/memory/lmkd/tests/lmkd_test.cpp Link: https://lkml.kernel.org/r/20211025200852.3002369-7-kaleshsingh@google.com Signed-off-by: Kalesh Singh Suggested-by: Steven Rostedt Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 71b453576d85..452daad7cfb3 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -304,6 +304,10 @@ static u64 hist_field_div(struct hist_field *hist_field, if (!val2) return -1; + /* Use shift if the divisor is a power of 2 */ + if (!(val2 & (val2 - 1))) + return val1 >> __ffs64(val2); + return div64_u64(val1, val2); } From 2d2f6d4b8ce738ef43fc3436b43cecd2fea64152 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Mon, 25 Oct 2021 13:08:40 -0700 Subject: [PATCH 094/101] tracing/histogram: Document expression arithmetic and constants Histogram expressions now support division, and multiplication in addition to the already supported subtraction and addition operators. Numeric constants can also be used in a hist trigger expressions or assigned to a variable and used by refernce in an expression. Link: https://lkml.kernel.org/r/20211025200852.3002369-9-kaleshsingh@google.com Signed-off-by: Kalesh Singh Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/histogram.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/Documentation/trace/histogram.rst b/Documentation/trace/histogram.rst index 533415644c54..e12699abaee8 100644 --- a/Documentation/trace/histogram.rst +++ b/Documentation/trace/histogram.rst @@ -1763,6 +1763,20 @@ using the same key and variable from yet another event:: # echo 'hist:key=pid:wakeupswitch_lat=$wakeup_lat+$switchtime_lat ...' >> event3/trigger +Expressions support the use of addition, subtraction, multiplication and +division operators (+-*/). + +Note that division by zero always returns -1. + +Numeric constants can also be used directly in an expression:: + + # echo 'hist:keys=next_pid:timestamp_secs=common_timestamp/1000000 ...' >> event/trigger + +or assigned to a variable and referenced in a subsequent expression:: + + # echo 'hist:keys=next_pid:us_per_sec=1000000 ...' >> event/trigger + # echo 'hist:keys=next_pid:timestamp_secs=common_timestamp/$us_per_sec ...' >> event/trigger + 2.2.2 Synthetic Events ---------------------- From ce5e48036c9e76a2a5bd4d9079eac273087a533a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B4=87?= Date: Wed, 27 Oct 2021 11:14:44 +0800 Subject: [PATCH 095/101] ftrace: disable preemption when recursion locked As the documentation explained, ftrace_test_recursion_trylock() and ftrace_test_recursion_unlock() were supposed to disable and enable preemption properly, however currently this work is done outside of the function, which could be missing by mistake. And since the internal using of trace_test_and_set_recursion() and trace_clear_recursion() also require preemption disabled, we can just merge the logical. This patch will make sure the preemption has been disabled when trace_test_and_set_recursion() return bit >= 0, and trace_clear_recursion() will enable the preemption if previously enabled. Link: https://lkml.kernel.org/r/13bde807-779c-aa4c-0672-20515ae365ea@linux.alibaba.com CC: Petr Mladek Cc: Guo Ren Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Thomas Gleixner Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: Joe Lawrence Cc: Masami Hiramatsu Cc: Nicholas Piggin Cc: Jisheng Zhang CC: Steven Rostedt CC: Miroslav Benes Reported-by: Abaci Suggested-by: Peter Zijlstra Signed-off-by: Michael Wang [ Removed extra line in comment - SDR ] Signed-off-by: Steven Rostedt (VMware) --- arch/csky/kernel/probes/ftrace.c | 2 -- arch/parisc/kernel/ftrace.c | 2 -- arch/powerpc/kernel/kprobes-ftrace.c | 2 -- arch/riscv/kernel/probes/ftrace.c | 2 -- arch/x86/kernel/kprobes/ftrace.c | 2 -- include/linux/trace_recursion.h | 11 ++++++++++- kernel/livepatch/patch.c | 12 ++++++------ kernel/trace/ftrace.c | 15 +++++---------- kernel/trace/trace_functions.c | 5 ----- 9 files changed, 21 insertions(+), 32 deletions(-) diff --git a/arch/csky/kernel/probes/ftrace.c b/arch/csky/kernel/probes/ftrace.c index b388228abbf2..834cffcfbce3 100644 --- a/arch/csky/kernel/probes/ftrace.c +++ b/arch/csky/kernel/probes/ftrace.c @@ -17,7 +17,6 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, return; regs = ftrace_get_regs(fregs); - preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)ip); if (!p) { p = get_kprobe((kprobe_opcode_t *)(ip - MCOUNT_INSN_SIZE)); @@ -57,7 +56,6 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, __this_cpu_write(current_kprobe, NULL); } out: - preempt_enable_notrace(); ftrace_test_recursion_unlock(bit); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c index 01581f715737..b14011d3c2f1 100644 --- a/arch/parisc/kernel/ftrace.c +++ b/arch/parisc/kernel/ftrace.c @@ -211,7 +211,6 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, return; regs = ftrace_get_regs(fregs); - preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)ip); if (unlikely(!p) || kprobe_disabled(p)) goto out; @@ -240,7 +239,6 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, } __this_cpu_write(current_kprobe, NULL); out: - preempt_enable_notrace(); ftrace_test_recursion_unlock(bit); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c index 7154d58338cc..072ebe7f290b 100644 --- a/arch/powerpc/kernel/kprobes-ftrace.c +++ b/arch/powerpc/kernel/kprobes-ftrace.c @@ -26,7 +26,6 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, return; regs = ftrace_get_regs(fregs); - preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)nip); if (unlikely(!p) || kprobe_disabled(p)) goto out; @@ -61,7 +60,6 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, __this_cpu_write(current_kprobe, NULL); } out: - preempt_enable_notrace(); ftrace_test_recursion_unlock(bit); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/riscv/kernel/probes/ftrace.c b/arch/riscv/kernel/probes/ftrace.c index aab85a82f419..7142ec42e889 100644 --- a/arch/riscv/kernel/probes/ftrace.c +++ b/arch/riscv/kernel/probes/ftrace.c @@ -15,7 +15,6 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, if (bit < 0) return; - preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)ip); if (unlikely(!p) || kprobe_disabled(p)) goto out; @@ -52,7 +51,6 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, __this_cpu_write(current_kprobe, NULL); } out: - preempt_enable_notrace(); ftrace_test_recursion_unlock(bit); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 596de2f6d3a5..dd2ec14adb77 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -25,7 +25,6 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, if (bit < 0) return; - preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)ip); if (unlikely(!p) || kprobe_disabled(p)) goto out; @@ -59,7 +58,6 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, __this_cpu_write(current_kprobe, NULL); } out: - preempt_enable_notrace(); ftrace_test_recursion_unlock(bit); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index 24f284eb55a7..a13f23b04d73 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -155,6 +155,9 @@ extern void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip); # define do_ftrace_record_recursion(ip, pip) do { } while (0) #endif +/* + * Preemption is promised to be disabled when return bit >= 0. + */ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsigned long pip, int start, int max) { @@ -189,14 +192,20 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign current->trace_recursion = val; barrier(); + preempt_disable_notrace(); + return bit + 1; } +/* + * Preemption will be enabled (if it was previously enabled). + */ static __always_inline void trace_clear_recursion(int bit) { if (!bit) return; + preempt_enable_notrace(); barrier(); bit--; trace_recursion_clear(bit); @@ -209,7 +218,7 @@ static __always_inline void trace_clear_recursion(int bit) * tracing recursed in the same context (normal vs interrupt), * * Returns: -1 if a recursion happened. - * >= 0 if no recursion + * >= 0 if no recursion. */ static __always_inline int ftrace_test_recursion_trylock(unsigned long ip, unsigned long parent_ip) diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index e8029aea67f1..fe316c021d73 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -49,14 +49,15 @@ static void notrace klp_ftrace_handler(unsigned long ip, ops = container_of(fops, struct klp_ops, fops); + /* + * The ftrace_test_recursion_trylock() will disable preemption, + * which is required for the variant of synchronize_rcu() that is + * used to allow patching functions where RCU is not watching. + * See klp_synchronize_transition() for more details. + */ bit = ftrace_test_recursion_trylock(ip, parent_ip); if (WARN_ON_ONCE(bit < 0)) return; - /* - * A variant of synchronize_rcu() is used to allow patching functions - * where RCU is not watching, see klp_synchronize_transition(). - */ - preempt_disable_notrace(); func = list_first_or_null_rcu(&ops->func_stack, struct klp_func, stack_node); @@ -120,7 +121,6 @@ static void notrace klp_ftrace_handler(unsigned long ip, klp_arch_set_pc(fregs, (unsigned long)func->new_func); unlock: - preempt_enable_notrace(); ftrace_test_recursion_unlock(bit); } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2057ad363772..b4ed1a301232 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7198,16 +7198,15 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op; int bit; + /* + * The ftrace_test_and_set_recursion() will disable preemption, + * which is required since some of the ops may be dynamically + * allocated, they must be freed after a synchronize_rcu(). + */ bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START, TRACE_LIST_MAX); if (bit < 0) return; - /* - * Some of the ops may be dynamically allocated, - * they must be freed after a synchronize_rcu(). - */ - preempt_disable_notrace(); - do_for_each_ftrace_op(op, ftrace_ops_list) { /* Stub functions don't need to be called nor tested */ if (op->flags & FTRACE_OPS_FL_STUB) @@ -7231,7 +7230,6 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, } } while_for_each_ftrace_op(op); out: - preempt_enable_notrace(); trace_clear_recursion(bit); } @@ -7279,12 +7277,9 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, if (bit < 0) return; - preempt_disable_notrace(); - if (!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) op->func(ip, parent_ip, op, fregs); - preempt_enable_notrace(); trace_clear_recursion(bit); } NOKPROBE_SYMBOL(ftrace_ops_assist_func); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 1f0e63f5d1f9..9f1bfbe105e8 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -186,7 +186,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, return; trace_ctx = tracing_gen_ctx(); - preempt_disable_notrace(); cpu = smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); @@ -194,7 +193,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, trace_function(tr, ip, parent_ip, trace_ctx); ftrace_test_recursion_unlock(bit); - preempt_enable_notrace(); } #ifdef CONFIG_UNWINDER_ORC @@ -298,8 +296,6 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, if (bit < 0) return; - preempt_disable_notrace(); - cpu = smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); if (atomic_read(&data->disabled)) @@ -324,7 +320,6 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, out: ftrace_test_recursion_unlock(bit); - preempt_enable_notrace(); } static void From d33cc657372366a8959f099c619a208b4c5dc664 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B4=87?= Date: Wed, 27 Oct 2021 11:15:11 +0800 Subject: [PATCH 096/101] ftrace: do CPU checking after preemption disabled With CONFIG_DEBUG_PREEMPT we observed reports like: BUG: using smp_processor_id() in preemptible caller is perf_ftrace_function_call+0x6f/0x2e0 CPU: 1 PID: 680 Comm: a.out Not tainted Call Trace: dump_stack_lvl+0x8d/0xcf check_preemption_disabled+0x104/0x110 ? optimize_nops.isra.7+0x230/0x230 ? text_poke_bp_batch+0x9f/0x310 perf_ftrace_function_call+0x6f/0x2e0 ... __text_poke+0x5/0x620 text_poke_bp_batch+0x9f/0x310 This telling us the CPU could be changed after task is preempted, and the checking on CPU before preemption will be invalid. Since now ftrace_test_recursion_trylock() will help to disable the preemption, this patch just do the checking after trylock() to address the issue. Link: https://lkml.kernel.org/r/54880691-5fe2-33e7-d12f-1fa6136f5183@linux.alibaba.com CC: Steven Rostedt Cc: Guo Ren Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Thomas Gleixner Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: Miroslav Benes Cc: Petr Mladek Cc: Joe Lawrence Cc: Masami Hiramatsu Cc: "Peter Zijlstra (Intel)" Cc: Nicholas Piggin Cc: Jisheng Zhang Reported-by: Abaci Signed-off-by: Michael Wang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_event_perf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 6aed10e2f7ce..fba8cb77a73a 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -441,13 +441,13 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, if (!rcu_is_watching()) return; - if ((unsigned long)ops->private != smp_processor_id()) - return; - bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; + if ((unsigned long)ops->private != smp_processor_id()) + goto out; + event = container_of(ops, struct perf_event, ftrace_ops); /* From 39d9c1c103d3061ac94219ac12c04753860b337e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 27 Oct 2021 10:57:53 -0400 Subject: [PATCH 097/101] bootconfig: Initialize ret in xbc_parse_tree() The do while loop continues while ret is zero, but ret is never initialized. The check for ret in the loop at the while should always be initialized, but if an empty string were to be passed in, q would be NULL and p would be '\0', and it would break out of the loop without ever setting ret. Set ret to zero, and then xbc_verify_tree() would be called and catch that it is an empty tree and report the proper error. Link: https://lkml.kernel.org/r/20211027105753.6ab9da5f@gandalf.local.home Fixes: bdac5c2b243f ("bootconfig: Allocate xbc_data inside xbc_init()") Reported-by: kernel test robot Reported-by: Andrew Morton Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- lib/bootconfig.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/bootconfig.c b/lib/bootconfig.c index a10ab25f6fcc..70e0d52ffd24 100644 --- a/lib/bootconfig.c +++ b/lib/bootconfig.c @@ -836,7 +836,7 @@ static int __init xbc_verify_tree(void) static int __init xbc_parse_tree(void) { char *p, *q; - int ret, c; + int ret = 0, c; last_parent = NULL; p = xbc_data; From a90afe8d020da9298c98fddb19b7a6372e2feb45 Mon Sep 17 00:00:00 2001 From: "Robin H. Johnson" Date: Mon, 30 Aug 2021 21:37:22 -0700 Subject: [PATCH 098/101] tracing: Show size of requested perf buffer If the perf buffer isn't large enough, provide a hint about how large it needs to be for whatever is running. Link: https://lkml.kernel.org/r/20210831043723.13481-1-robbat2@gentoo.org Signed-off-by: Robin H. Johnson Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_event_perf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index fba8cb77a73a..a114549720d6 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -400,7 +400,8 @@ void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp) BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, - "perf buffer not large enough")) + "perf buffer not large enough, wanted %d, have %d", + size, PERF_MAX_TRACE_SIZE)) return NULL; *rctxp = rctx = perf_swevent_get_recursion_context(); From e531e90b5ab0f7ce5ff298e165214c1aec6ed187 Mon Sep 17 00:00:00 2001 From: "Robin H. Johnson" Date: Mon, 30 Aug 2021 21:37:23 -0700 Subject: [PATCH 099/101] tracing: Increase PERF_MAX_TRACE_SIZE to handle Sentinel1 and docker together Running endpoint security solutions like Sentinel1 that use perf-based tracing heavily lead to this repeated dump complaining about dockerd. The default value of 2048 is nowhere near not large enough. Using the prior patch "tracing: show size of requested buffer", we get "perf buffer not large enough, wanted 6644, have 6144", after repeated up-sizing (I did 2/4/6/8K). With 8K, the problem doesn't occur at all, so below is the trace for 6K. I'm wondering if this value should be selectable at boot time, but this is a good starting point. ``` ------------[ cut here ]------------ perf buffer not large enough, wanted 6644, have 6144 WARNING: CPU: 1 PID: 4997 at kernel/trace/trace_event_perf.c:402 perf_trace_buf_alloc+0x8c/0xa0 Modules linked in: [..] CPU: 1 PID: 4997 Comm: sh Tainted: G T 5.13.13-x86_64-00039-gb3959163488e #63 Hardware name: LENOVO 20KH002JUS/20KH002JUS, BIOS N23ET66W (1.41 ) 09/02/2019 RIP: 0010:perf_trace_buf_alloc+0x8c/0xa0 Code: 80 3d 43 97 d0 01 00 74 07 31 c0 5b 5d 41 5c c3 ba 00 18 00 00 89 ee 48 c7 c7 00 82 7d 91 c6 05 25 97 d0 01 01 e8 22 ee bc 00 <0f> 0b 31 c0 eb db 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 55 89 RSP: 0018:ffffb922026b7d58 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffff9da5ee012000 RCX: 0000000000000027 RDX: ffff9da881657828 RSI: 0000000000000001 RDI: ffff9da881657820 RBP: 00000000000019f4 R08: 0000000000000000 R09: ffffb922026b7b80 R10: ffffb922026b7b78 R11: ffffffff91dda688 R12: 000000000000000f R13: ffff9da5ee012108 R14: ffff9da8816570a0 R15: ffffb922026b7e30 FS: 00007f420db1a080(0000) GS:ffff9da881640000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000060 CR3: 00000002504a8006 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: kprobe_perf_func+0x11e/0x270 ? do_execveat_common.isra.0+0x1/0x1c0 ? do_execveat_common.isra.0+0x5/0x1c0 kprobe_ftrace_handler+0x10e/0x1d0 0xffffffffc03aa0c8 ? do_execveat_common.isra.0+0x1/0x1c0 do_execveat_common.isra.0+0x5/0x1c0 __x64_sys_execve+0x33/0x40 do_syscall_64+0x6b/0xc0 ? do_syscall_64+0x11/0xc0 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f420dc1db37 Code: ff ff 76 e7 f7 d8 64 41 89 00 eb df 0f 1f 80 00 00 00 00 f7 d8 64 41 89 00 eb dc 0f 1f 84 00 00 00 00 00 b8 3b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 01 43 0f 00 f7 d8 64 89 01 48 RSP: 002b:00007ffd4e8b4e38 EFLAGS: 00000246 ORIG_RAX: 000000000000003b RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f420dc1db37 RDX: 0000564338d1e740 RSI: 0000564338d32d50 RDI: 0000564338d28f00 RBP: 0000564338d28f00 R08: 0000564338d32d50 R09: 0000000000000020 R10: 00000000000001b6 R11: 0000000000000246 R12: 0000564338d28f00 R13: 0000564338d32d50 R14: 0000564338d1e740 R15: 0000564338d28c60 ---[ end trace 83ab3e8e16275e49 ]--- ``` Link: https://lkml.kernel.org/r/20210831043723.13481-2-robbat2@gentoo.org Signed-off-by: Robin H. Johnson Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 3e475eeb5a99..50453b287615 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -671,7 +671,7 @@ struct trace_event_file { } \ early_initcall(trace_init_perf_perm_##name); -#define PERF_MAX_TRACE_SIZE 2048 +#define PERF_MAX_TRACE_SIZE 8192 #define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */ From 93d76e4a0e0112b320c4f0e2a3930ad634628c58 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Thu, 28 Oct 2021 10:05:48 -0700 Subject: [PATCH 100/101] tracing/histogram: Fix documentation inline emphasis warning This fixes the warning: Documentation/trace/histogram.rst:1766: WARNING: Inline emphasis start-string without end-string The issue was caused by an unescaped '*' character. Link: https://lore.kernel.org/all/20211028170548.2597449-1-kaleshsingh@google.com/T/#m77da47432f5cc6521d4294ffdb9621949cc35d04 Link: https://lkml.kernel.org/r/20211028170548.2597449-1-kaleshsingh@google.com Fixes: 2d2f6d4b8ce7 ("tracing/histogram: Document expression arithmetic and constants") Reported-by: Stephen Rothwell Signed-off-by: Kalesh Singh Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/histogram.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/trace/histogram.rst b/Documentation/trace/histogram.rst index e12699abaee8..66ec972dfb78 100644 --- a/Documentation/trace/histogram.rst +++ b/Documentation/trace/histogram.rst @@ -1764,7 +1764,7 @@ using the same key and variable from yet another event:: # echo 'hist:key=pid:wakeupswitch_lat=$wakeup_lat+$switchtime_lat ...' >> event3/trigger Expressions support the use of addition, subtraction, multiplication and -division operators (+-*/). +division operators (+-\*/). Note that division by zero always returns -1. From feea69ec121f067073868cebe0cb9d003e64ad80 Mon Sep 17 00:00:00 2001 From: kernel test robot Date: Sat, 30 Oct 2021 08:56:15 +0800 Subject: [PATCH 101/101] tracing/histogram: Fix semicolon.cocci warnings kernel/trace/trace_events_hist.c:6039:2-3: Unneeded semicolon Remove unneeded semicolon. Generated by: scripts/coccinelle/misc/semicolon.cocci Link: https://lkml.kernel.org/r/20211030005615.GA41257@3074f0d39c61 Fixes: c5eac6ee8bc5 ("tracing/histogram: Simplify handling of .sym-offset in expressions") CC: Kalesh Singh Reported-by: kernel test robot Signed-off-by: kernel test robot Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 452daad7cfb3..682870d004c4 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -6084,7 +6084,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, while (start) { *(start + 4) = 'X'; start = strstr(start + 11, ".sym-offset"); - }; + } attrs = parse_hist_trigger_attrs(file->tr, trigger); if (IS_ERR(attrs))