From 8a6b173287bb94b3ef8360119020e856afb1c934 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 30 Mar 2014 18:56:22 +0200 Subject: [PATCH 01/15] uprobes: Kill UPROBE_SKIP_SSTEP and can_skip_sstep() UPROBE_COPY_INSN, UPROBE_SKIP_SSTEP, and uprobe->flags must die. This patch kills UPROBE_SKIP_SSTEP. I never understood why it was added; not only it doesn't help, it harms. It can only help to avoid arch_uprobe_skip_sstep() if it was already called before and failed. But this is ugly, if we want to know whether we can emulate this instruction or not we should do this analysis in arch_uprobe_analyze_insn(), not when we hit this probe for the first time. And in fact this logic is simply wrong. arch_uprobe_skip_sstep() can fail or not depending on the task/register state, if this insn can be emulated but, say, put_user() fails we need to xol it this time, but this doesn't mean we shouldn't try to emulate it when this or another thread hits this bp next time. And this is the actual reason for this change. We need to emulate the "call" insn, but push(return-address) can obviously fail. Per-arch notes: x86: __skip_sstep() can only emulate "rep;nop". With this change it will be called every time and most probably for no reason. This will be fixed by the next changes. We need to change this suboptimal code anyway. arm: Should not be affected. It has its own "bool simulate" flag checked in arch_uprobe_skip_sstep(). ppc: Looks like, it can emulate almost everything. Does it actually need to record the fact that emulate_step() failed? Hopefully not. But if yes, it can add the ppc- specific flag into arch_uprobe. TODO: rename arch_uprobe_skip_sstep() to arch_uprobe_emulate_insn(), Signed-off-by: Oleg Nesterov Reviewed-by: Masami Hiramatsu Reviewed-by: David A. Long Reviewed-by: Jim Keniston Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 04709b66369d..ea2a7c0728bb 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -60,8 +60,6 @@ static struct percpu_rw_semaphore dup_mmap_sem; /* Have a copy of original instruction */ #define UPROBE_COPY_INSN 0 -/* Can skip singlestep */ -#define UPROBE_SKIP_SSTEP 1 struct uprobe { struct rb_node rb_node; /* node in the rb tree */ @@ -491,12 +489,9 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) uprobe->offset = offset; init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); - /* For now assume that the instruction need not be single-stepped */ - __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); /* add to uprobes_tree, sorted on inode:offset */ cur_uprobe = insert_uprobe(uprobe); - /* a uprobe exists for this inode:offset combination */ if (cur_uprobe) { kfree(uprobe); @@ -1628,20 +1623,6 @@ bool uprobe_deny_signal(void) return true; } -/* - * Avoid singlestepping the original instruction if the original instruction - * is a NOP or can be emulated. - */ -static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) -{ - if (test_bit(UPROBE_SKIP_SSTEP, &uprobe->flags)) { - if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) - return true; - clear_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); - } - return false; -} - static void mmf_recalc_uprobes(struct mm_struct *mm) { struct vm_area_struct *vma; @@ -1868,13 +1849,13 @@ static void handle_swbp(struct pt_regs *regs) handler_chain(uprobe, regs); - if (can_skip_sstep(uprobe, regs)) + if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) goto out; if (!pre_ssout(uprobe, regs, bp_vaddr)) return; - /* can_skip_sstep() succeeded, or restart if can't singlestep */ + /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */ out: put_uprobe(uprobe); } From ddb69f276c4af8bb47ad4f24a72f72ddf58c228a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 31 Mar 2014 15:16:22 +0200 Subject: [PATCH 02/15] uprobes/x86: Fold prepare_fixups() into arch_uprobe_analyze_insn() No functional changes, preparation. Shift the code from prepare_fixups() to arch_uprobe_analyze_insn() with the following modifications: - Do not call insn_get_opcode() again, it was already called by validate_insn_bits(). - Move "case 0xea" up. This way "case 0xff" can fall through to default case. - change "case 0xff" to use the nested "switch (MODRM_REG)", this way the code looks a bit simpler. - Make the comments look consistent. While at it, kill the initialization of rip_rela_target_address and ->fixups, we can rely on kzalloc(). We will add the new members into arch_uprobe, it would be better to assume that everything is zero by default. TODO: cleanup/fix the mess in validate_insn_bits() paths: - validate_insn_64bits() and validate_insn_32bits() should be unified. - "ifdef" is not used consistently; if good_insns_64 depends on CONFIG_X86_64, then probably good_insns_32 should depend on CONFIG_X86_32/EMULATION - the usage of mm->context.ia32_compat looks wrong if the task is TIF_X32. Signed-off-by: Oleg Nesterov Reviewed-by: Masami Hiramatsu Reviewed-by: Jim Keniston Acked-by: Srikar Dronamraju --- arch/x86/kernel/uprobes.c | 110 ++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 63 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 2ed845928b5f..098e56ec7954 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -53,7 +53,7 @@ #define OPCODE1(insn) ((insn)->opcode.bytes[0]) #define OPCODE2(insn) ((insn)->opcode.bytes[1]) #define OPCODE3(insn) ((insn)->opcode.bytes[2]) -#define MODRM_REG(insn) X86_MODRM_REG(insn->modrm.value) +#define MODRM_REG(insn) X86_MODRM_REG((insn)->modrm.value) #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ @@ -229,63 +229,6 @@ static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn) return -ENOTSUPP; } -/* - * Figure out which fixups arch_uprobe_post_xol() will need to perform, and - * annotate arch_uprobe->fixups accordingly. To start with, - * arch_uprobe->fixups is either zero or it reflects rip-related fixups. - */ -static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn) -{ - bool fix_ip = true, fix_call = false; /* defaults */ - int reg; - - insn_get_opcode(insn); /* should be a nop */ - - switch (OPCODE1(insn)) { - case 0x9d: - /* popf */ - auprobe->fixups |= UPROBE_FIX_SETF; - break; - case 0xc3: /* ret/lret */ - case 0xcb: - case 0xc2: - case 0xca: - /* ip is correct */ - fix_ip = false; - break; - case 0xe8: /* call relative - Fix return addr */ - fix_call = true; - break; - case 0x9a: /* call absolute - Fix return addr, not ip */ - fix_call = true; - fix_ip = false; - break; - case 0xff: - insn_get_modrm(insn); - reg = MODRM_REG(insn); - if (reg == 2 || reg == 3) { - /* call or lcall, indirect */ - /* Fix return addr; ip is correct. */ - fix_call = true; - fix_ip = false; - } else if (reg == 4 || reg == 5) { - /* jmp or ljmp, indirect */ - /* ip is correct. */ - fix_ip = false; - } - break; - case 0xea: /* jmp absolute -- ip is correct */ - fix_ip = false; - break; - default: - break; - } - if (fix_ip) - auprobe->fixups |= UPROBE_FIX_IP; - if (fix_call) - auprobe->fixups |= UPROBE_FIX_CALL; -} - #ifdef CONFIG_X86_64 /* * If arch_uprobe->insn doesn't use rip-relative addressing, return @@ -318,7 +261,6 @@ handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct ins if (mm->context.ia32_compat) return; - auprobe->rip_rela_target_address = 0x0; if (!insn_rip_relative(insn)) return; @@ -421,16 +363,58 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, */ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) { - int ret; struct insn insn; + bool fix_ip = true, fix_call = false; + int ret; - auprobe->fixups = 0; ret = validate_insn_bits(auprobe, mm, &insn); - if (ret != 0) + if (ret) return ret; + /* + * Figure out which fixups arch_uprobe_post_xol() will need to perform, + * and annotate arch_uprobe->fixups accordingly. To start with, ->fixups + * is either zero or it reflects rip-related fixups. + */ handle_riprel_insn(auprobe, mm, &insn); - prepare_fixups(auprobe, &insn); + + switch (OPCODE1(&insn)) { + case 0x9d: /* popf */ + auprobe->fixups |= UPROBE_FIX_SETF; + break; + case 0xc3: /* ret or lret -- ip is correct */ + case 0xcb: + case 0xc2: + case 0xca: + fix_ip = false; + break; + case 0xe8: /* call relative - Fix return addr */ + fix_call = true; + break; + case 0x9a: /* call absolute - Fix return addr, not ip */ + fix_call = true; + fix_ip = false; + break; + case 0xea: /* jmp absolute -- ip is correct */ + fix_ip = false; + break; + case 0xff: + insn_get_modrm(&insn); + switch (MODRM_REG(&insn)) { + case 2: case 3: /* call or lcall, indirect */ + fix_call = true; + case 4: case 5: /* jmp or ljmp, indirect */ + fix_ip = false; + } + break; + default: + break; + } + + if (fix_ip) + auprobe->fixups |= UPROBE_FIX_IP; + if (fix_call) + auprobe->fixups |= UPROBE_FIX_CALL; return 0; } From 59078d4b96bb548f97d9fb429b929a289e4884d9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 31 Mar 2014 18:09:36 +0200 Subject: [PATCH 03/15] uprobes/x86: Kill the "ia32_compat" check in handle_riprel_insn(), remove "mm" arg Kill the "mm->context.ia32_compat" check in handle_riprel_insn(), if it is true insn_rip_relative() must return false. validate_insn_bits() passed "ia32_compat" as !x86_64 to insn_init(), and insn_rip_relative() checks insn->x86_64. Also, remove the no longer needed "struct mm_struct *mm" argument and the unnecessary "return" at the end. Signed-off-by: Oleg Nesterov Reviewed-by: Masami Hiramatsu Reviewed-by: Jim Keniston Acked-by: Srikar Dronamraju --- arch/x86/kernel/uprobes.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 098e56ec7954..963c121c0307 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -253,14 +253,11 @@ static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn) * - The displacement is always 4 bytes. */ static void -handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) +handle_riprel_insn(struct arch_uprobe *auprobe, struct insn *insn) { u8 *cursor; u8 reg; - if (mm->context.ia32_compat) - return; - if (!insn_rip_relative(insn)) return; @@ -314,7 +311,6 @@ handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct ins cursor++; memmove(cursor, cursor + insn->displacement.nbytes, insn->immediate.nbytes); } - return; } static int validate_insn_64bits(struct arch_uprobe *auprobe, struct insn *insn) @@ -343,7 +339,7 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, return validate_insn_64bits(auprobe, insn); } #else /* 32-bit: */ -static void handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) +static void handle_riprel_insn(struct arch_uprobe *auprobe, struct insn *insn) { /* No RIP-relative addressing on 32-bit */ } @@ -376,7 +372,7 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, * and annotate arch_uprobe->fixups accordingly. To start with, ->fixups * is either zero or it reflects rip-related fixups. */ - handle_riprel_insn(auprobe, mm, &insn); + handle_riprel_insn(auprobe, &insn); switch (OPCODE1(&insn)) { case 0x9d: /* popf */ From d20737c07a1063d681fe9fb86f3da369da1edab7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 31 Mar 2014 18:35:09 +0200 Subject: [PATCH 04/15] uprobes/x86: Gather "riprel" functions together Cosmetic. Move pre_xol_rip_insn() and handle_riprel_post_xol() up to the closely related handle_riprel_insn(). This way it is simpler to read and understand this code, and this lessens the number of ifdef's. While at it, update the comment in handle_riprel_post_xol() as Jim suggested. TODO: rename them somehow to make the naming consistent. Signed-off-by: Oleg Nesterov Reviewed-by: Masami Hiramatsu Reviewed-by: Jim Keniston --- arch/x86/kernel/uprobes.c | 118 +++++++++++++++++--------------------- 1 file changed, 53 insertions(+), 65 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 963c121c0307..c52c30fa7871 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -313,6 +313,48 @@ handle_riprel_insn(struct arch_uprobe *auprobe, struct insn *insn) } } +/* + * If we're emulating a rip-relative instruction, save the contents + * of the scratch register and store the target address in that register. + */ +static void +pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, + struct arch_uprobe_task *autask) +{ + if (auprobe->fixups & UPROBE_FIX_RIP_AX) { + autask->saved_scratch_register = regs->ax; + regs->ax = current->utask->vaddr; + regs->ax += auprobe->rip_rela_target_address; + } else if (auprobe->fixups & UPROBE_FIX_RIP_CX) { + autask->saved_scratch_register = regs->cx; + regs->cx = current->utask->vaddr; + regs->cx += auprobe->rip_rela_target_address; + } +} + +static void +handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) +{ + if (auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) { + struct arch_uprobe_task *autask; + + autask = ¤t->utask->autask; + if (auprobe->fixups & UPROBE_FIX_RIP_AX) + regs->ax = autask->saved_scratch_register; + else + regs->cx = autask->saved_scratch_register; + + /* + * The original instruction includes a displacement, and so + * is 4 bytes longer than what we've just single-stepped. + * Caller may need to apply other fixups to handle stuff + * like "jmpq *...(%rip)" and "callq *...(%rip)". + */ + if (correction) + *correction += 4; + } +} + static int validate_insn_64bits(struct arch_uprobe *auprobe, struct insn *insn) { insn_init(insn, auprobe->insn, true); @@ -339,9 +381,19 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, return validate_insn_64bits(auprobe, insn); } #else /* 32-bit: */ +/* + * No RIP-relative addressing on 32-bit + */ static void handle_riprel_insn(struct arch_uprobe *auprobe, struct insn *insn) { - /* No RIP-relative addressing on 32-bit */ +} +static void pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, + struct arch_uprobe_task *autask) +{ +} +static void handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, + long *correction) +{ } static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) @@ -415,34 +467,6 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, return 0; } -#ifdef CONFIG_X86_64 -/* - * If we're emulating a rip-relative instruction, save the contents - * of the scratch register and store the target address in that register. - */ -static void -pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, - struct arch_uprobe_task *autask) -{ - if (auprobe->fixups & UPROBE_FIX_RIP_AX) { - autask->saved_scratch_register = regs->ax; - regs->ax = current->utask->vaddr; - regs->ax += auprobe->rip_rela_target_address; - } else if (auprobe->fixups & UPROBE_FIX_RIP_CX) { - autask->saved_scratch_register = regs->cx; - regs->cx = current->utask->vaddr; - regs->cx += auprobe->rip_rela_target_address; - } -} -#else -static void -pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, - struct arch_uprobe_task *autask) -{ - /* No RIP-relative addressing on 32-bit */ -} -#endif - /* * arch_uprobe_pre_xol - prepare to execute out of line. * @auprobe: the probepoint information. @@ -492,42 +516,6 @@ static int adjust_ret_addr(unsigned long sp, long correction) return 0; } -#ifdef CONFIG_X86_64 -static bool is_riprel_insn(struct arch_uprobe *auprobe) -{ - return ((auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) != 0); -} - -static void -handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) -{ - if (is_riprel_insn(auprobe)) { - struct arch_uprobe_task *autask; - - autask = ¤t->utask->autask; - if (auprobe->fixups & UPROBE_FIX_RIP_AX) - regs->ax = autask->saved_scratch_register; - else - regs->cx = autask->saved_scratch_register; - - /* - * The original instruction includes a displacement, and so - * is 4 bytes longer than what we've just single-stepped. - * Fall through to handle stuff like "jmpq *...(%rip)" and - * "callq *...(%rip)". - */ - if (correction) - *correction += 4; - } -} -#else -static void -handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) -{ - /* No RIP-relative addressing on 32-bit */ -} -#endif - /* * If xol insn itself traps and generates a signal(Say, * SIGILL/SIGSEGV/etc), then detect the case where a singlestepped From 34e7317d6ae8f6111ac449444f22e14f4a14ebfd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 31 Mar 2014 19:38:09 +0200 Subject: [PATCH 05/15] uprobes/x86: move the UPROBE_FIX_{RIP,IP,CALL} code at the end of pre/post hooks No functional changes. Preparation to simplify the review of the next change. Just reorder the code in arch_uprobe_pre/post_xol() functions so that UPROBE_FIX_{RIP_*,IP,CALL} logic goes to the end. Also change arch_uprobe_pre_xol() to use utask instead of autask, to make the code more symmetrical with arch_uprobe_post_xol(). Signed-off-by: Oleg Nesterov Reviewed-by: Masami Hiramatsu Reviewed-by: Jim Keniston Acked-by: Srikar Dronamraju --- arch/x86/kernel/uprobes.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index c52c30fa7871..3bb4198aa588 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -474,19 +474,18 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, */ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { - struct arch_uprobe_task *autask; + struct uprobe_task *utask = current->utask; - autask = ¤t->utask->autask; - autask->saved_trap_nr = current->thread.trap_nr; + regs->ip = utask->xol_vaddr; + utask->autask.saved_trap_nr = current->thread.trap_nr; current->thread.trap_nr = UPROBE_TRAP_NR; - regs->ip = current->utask->xol_vaddr; - pre_xol_rip_insn(auprobe, regs, autask); - autask->saved_tf = !!(regs->flags & X86_EFLAGS_TF); + utask->autask.saved_tf = !!(regs->flags & X86_EFLAGS_TF); regs->flags |= X86_EFLAGS_TF; if (test_tsk_thread_flag(current, TIF_BLOCKSTEP)) set_task_blockstep(current, false); + pre_xol_rip_insn(auprobe, regs, &utask->autask); return 0; } @@ -560,22 +559,13 @@ bool arch_uprobe_xol_was_trapped(struct task_struct *t) */ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { - struct uprobe_task *utask; + struct uprobe_task *utask = current->utask; long correction; int result = 0; WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR); - utask = current->utask; current->thread.trap_nr = utask->autask.saved_trap_nr; - correction = (long)(utask->vaddr - utask->xol_vaddr); - handle_riprel_post_xol(auprobe, regs, &correction); - if (auprobe->fixups & UPROBE_FIX_IP) - regs->ip += correction; - - if (auprobe->fixups & UPROBE_FIX_CALL) - result = adjust_ret_addr(regs->sp, correction); - /* * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP * so we can get an extra SIGTRAP if we do not clear TF. We need @@ -586,6 +576,14 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) else if (!(auprobe->fixups & UPROBE_FIX_SETF)) regs->flags &= ~X86_EFLAGS_TF; + correction = (long)(utask->vaddr - utask->xol_vaddr); + handle_riprel_post_xol(auprobe, regs, &correction); + if (auprobe->fixups & UPROBE_FIX_IP) + regs->ip += correction; + + if (auprobe->fixups & UPROBE_FIX_CALL) + result = adjust_ret_addr(regs->sp, correction); + return result; } From 8ad8e9d3fd64f101eed6652964670672d699e563 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 31 Mar 2014 21:01:31 +0200 Subject: [PATCH 06/15] uprobes/x86: Introduce uprobe_xol_ops and arch_uprobe->ops Introduce arch_uprobe->ops pointing to the "struct uprobe_xol_ops", move the current UPROBE_FIX_{RIP*,IP,CALL} code into the default set of methods and change arch_uprobe_pre/post_xol() accordingly. This way we can add the new uprobe_xol_ops's to handle the insns which need the special processing (rip-relative jmp/call at least). Signed-off-by: Oleg Nesterov Reviewed-by: Jim Keniston Reviewed-by: Masami Hiramatsu --- arch/x86/include/asm/uprobes.h | 7 ++- arch/x86/kernel/uprobes.c | 107 +++++++++++++++++++++------------ 2 files changed, 74 insertions(+), 40 deletions(-) diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 3087ea9c5f2e..9f8210bcbb49 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -33,12 +33,17 @@ typedef u8 uprobe_opcode_t; #define UPROBE_SWBP_INSN 0xcc #define UPROBE_SWBP_INSN_SIZE 1 +struct uprobe_xol_ops; + struct arch_uprobe { - u16 fixups; union { u8 insn[MAX_UINSN_BYTES]; u8 ixol[MAX_UINSN_BYTES]; }; + + u16 fixups; + const struct uprobe_xol_ops *ops; + #ifdef CONFIG_X86_64 unsigned long rip_rela_target_address; #endif diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 3bb4198aa588..13ad8a38c2d9 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -402,6 +402,64 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, } #endif /* CONFIG_X86_64 */ +struct uprobe_xol_ops { + bool (*emulate)(struct arch_uprobe *, struct pt_regs *); + int (*pre_xol)(struct arch_uprobe *, struct pt_regs *); + int (*post_xol)(struct arch_uprobe *, struct pt_regs *); +}; + +static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + pre_xol_rip_insn(auprobe, regs, ¤t->utask->autask); + return 0; +} + +/* + * Adjust the return address pushed by a call insn executed out of line. + */ +static int adjust_ret_addr(unsigned long sp, long correction) +{ + int rasize, ncopied; + long ra = 0; + + if (is_ia32_task()) + rasize = 4; + else + rasize = 8; + + ncopied = copy_from_user(&ra, (void __user *)sp, rasize); + if (unlikely(ncopied)) + return -EFAULT; + + ra += correction; + ncopied = copy_to_user((void __user *)sp, &ra, rasize); + if (unlikely(ncopied)) + return -EFAULT; + + return 0; +} + +static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + long correction = (long)(utask->vaddr - utask->xol_vaddr); + int ret = 0; + + handle_riprel_post_xol(auprobe, regs, &correction); + if (auprobe->fixups & UPROBE_FIX_IP) + regs->ip += correction; + + if (auprobe->fixups & UPROBE_FIX_CALL) + ret = adjust_ret_addr(regs->sp, correction); + + return ret; +} + +static struct uprobe_xol_ops default_xol_ops = { + .pre_xol = default_pre_xol_op, + .post_xol = default_post_xol_op, +}; + /** * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. * @mm: the probed address space. @@ -464,6 +522,7 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, if (fix_call) auprobe->fixups |= UPROBE_FIX_CALL; + auprobe->ops = &default_xol_ops; return 0; } @@ -485,33 +544,8 @@ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) if (test_tsk_thread_flag(current, TIF_BLOCKSTEP)) set_task_blockstep(current, false); - pre_xol_rip_insn(auprobe, regs, &utask->autask); - return 0; -} - -/* - * This function is called by arch_uprobe_post_xol() to adjust the return - * address pushed by a call instruction executed out of line. - */ -static int adjust_ret_addr(unsigned long sp, long correction) -{ - int rasize, ncopied; - long ra = 0; - - if (is_ia32_task()) - rasize = 4; - else - rasize = 8; - - ncopied = copy_from_user(&ra, (void __user *)sp, rasize); - if (unlikely(ncopied)) - return -EFAULT; - - ra += correction; - ncopied = copy_to_user((void __user *)sp, &ra, rasize); - if (unlikely(ncopied)) - return -EFAULT; - + if (auprobe->ops->pre_xol) + return auprobe->ops->pre_xol(auprobe, regs); return 0; } @@ -560,11 +594,8 @@ bool arch_uprobe_xol_was_trapped(struct task_struct *t) int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { struct uprobe_task *utask = current->utask; - long correction; - int result = 0; WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR); - current->thread.trap_nr = utask->autask.saved_trap_nr; /* * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP @@ -576,15 +607,9 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) else if (!(auprobe->fixups & UPROBE_FIX_SETF)) regs->flags &= ~X86_EFLAGS_TF; - correction = (long)(utask->vaddr - utask->xol_vaddr); - handle_riprel_post_xol(auprobe, regs, &correction); - if (auprobe->fixups & UPROBE_FIX_IP) - regs->ip += correction; - - if (auprobe->fixups & UPROBE_FIX_CALL) - result = adjust_ret_addr(regs->sp, correction); - - return result; + if (auprobe->ops->post_xol) + return auprobe->ops->post_xol(auprobe, regs); + return 0; } /* callback routine for handling exceptions. */ @@ -642,6 +667,10 @@ static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) { int i; + if (auprobe->ops->emulate) + return auprobe->ops->emulate(auprobe, regs); + + /* TODO: move this code into ->emulate() hook */ for (i = 0; i < MAX_UINSN_BYTES; i++) { if (auprobe->insn[i] == 0x66) continue; From e55848a4f8ee52465771983e144f0c3337776eda Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 31 Mar 2014 17:24:14 +0200 Subject: [PATCH 07/15] uprobes/x86: Conditionalize the usage of handle_riprel_insn() arch_uprobe_analyze_insn() calls handle_riprel_insn() at the start, but only "0xff" and "default" cases need the UPROBE_FIX_RIP_ logic. Move the callsite into "default" case and change the "0xff" case to fall-through. We are going to add the various hooks to handle the rip-relative jmp/call instructions (and more), we need this change to enforce the fact that the new code can not conflict with is_riprel_insn() logic which, after this change, can only be used by default_xol_ops. Note: arch_uprobe_abort_xol() still calls handle_riprel_post_xol() directly. This is fine unless another _xol_ops we may add later will need to reuse "UPROBE_FIX_RIP_AX|UPROBE_FIX_RIP_CX" bits in ->fixup. In this case we can add uprobe_xol_ops->abort() hook, which (perhaps) we will need anyway in the long term. Signed-off-by: Oleg Nesterov Reviewed-by: Jim Keniston Reviewed-by: Masami Hiramatsu --- arch/x86/kernel/uprobes.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 13ad8a38c2d9..08cdb82815fe 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -482,8 +482,6 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, * and annotate arch_uprobe->fixups accordingly. To start with, ->fixups * is either zero or it reflects rip-related fixups. */ - handle_riprel_insn(auprobe, &insn); - switch (OPCODE1(&insn)) { case 0x9d: /* popf */ auprobe->fixups |= UPROBE_FIX_SETF; @@ -512,9 +510,9 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, case 4: case 5: /* jmp or ljmp, indirect */ fix_ip = false; } - break; + /* fall through */ default: - break; + handle_riprel_insn(auprobe, &insn); } if (fix_ip) From 014940bad8e46ca7bd0483f760f9cba60088a3d4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 3 Apr 2014 20:20:10 +0200 Subject: [PATCH 08/15] uprobes/x86: Send SIGILL if arch_uprobe_post_xol() fails Currently the error from arch_uprobe_post_xol() is silently ignored. This doesn't look good and this can lead to the hard-to-debug problems. 1. Change handle_singlestep() to loudly complain and send SIGILL. Note: this only affects x86, ppc/arm can't fail. 2. Change arch_uprobe_post_xol() to call arch_uprobe_abort_xol() and avoid TF games if it is going to return an error. This can help to to analyze the problem, if nothing else we should not report ->ip = xol_slot in the core-file. Note: this means that handle_riprel_post_xol() can be called twice, but this is fine because it is idempotent. Signed-off-by: Oleg Nesterov Reviewed-by: Masami Hiramatsu Reviewed-by: Jim Keniston --- arch/x86/kernel/uprobes.c | 16 ++++++++++++---- kernel/events/uprobes.c | 8 +++++++- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 08cdb82815fe..e72903eacd43 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -594,6 +594,15 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) struct uprobe_task *utask = current->utask; WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR); + + if (auprobe->ops->post_xol) { + int err = auprobe->ops->post_xol(auprobe, regs); + if (err) { + arch_uprobe_abort_xol(auprobe, regs); + return err; + } + } + current->thread.trap_nr = utask->autask.saved_trap_nr; /* * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP @@ -605,8 +614,6 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) else if (!(auprobe->fixups & UPROBE_FIX_SETF)) regs->flags &= ~X86_EFLAGS_TF; - if (auprobe->ops->post_xol) - return auprobe->ops->post_xol(auprobe, regs); return 0; } @@ -641,8 +648,9 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, /* * This function gets called when XOL instruction either gets trapped or - * the thread has a fatal signal, so reset the instruction pointer to its - * probed address. + * the thread has a fatal signal, or if arch_uprobe_post_xol() failed. + * Reset the instruction pointer to its probed address for the potential + * restart or for post mortem analysis. */ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ea2a7c0728bb..d1edc5e6fd03 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1867,10 +1867,11 @@ out: static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) { struct uprobe *uprobe; + int err = 0; uprobe = utask->active_uprobe; if (utask->state == UTASK_SSTEP_ACK) - arch_uprobe_post_xol(&uprobe->arch, regs); + err = arch_uprobe_post_xol(&uprobe->arch, regs); else if (utask->state == UTASK_SSTEP_TRAPPED) arch_uprobe_abort_xol(&uprobe->arch, regs); else @@ -1884,6 +1885,11 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) spin_lock_irq(¤t->sighand->siglock); recalc_sigpending(); /* see uprobe_deny_signal() */ spin_unlock_irq(¤t->sighand->siglock); + + if (unlikely(err)) { + uprobe_warn(current, "execute the probed insn, sending SIGILL."); + force_sig_info(SIGILL, SEND_SIG_FORCED, current); + } } /* From 75f9ef0b7f1aae33b7be7ba8d9c23c8cb48c2212 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 3 Apr 2014 20:52:19 +0200 Subject: [PATCH 09/15] uprobes/x86: Teach arch_uprobe_post_xol() to restart if possible SIGILL after the failed arch_uprobe_post_xol() should only be used as a last resort, we should try to restart the probed insn if possible. Currently only adjust_ret_addr() can fail, and this can only happen if another thread unmapped our stack after we executed "call" out-of-line. Most probably the application if buggy, but even in this case it can have a handler for SIGSEGV/etc. And in theory it can be even correct and do something non-trivial with its memory. Of course we can't restart unconditionally, so arch_uprobe_post_xol() does this only if ->post_xol() returns -ERESTART even if currently this is the only possible error. default_post_xol_op(UPROBE_FIX_CALL) can always restart, but as Jim pointed out it should not forget to pop off the return address pushed by this insn executed out-of-line. Note: this is not "perfect", we do not want the extra handler_chain() after restart, but I think this is the best solution we can realistically do without too much uglifications. Signed-off-by: Oleg Nesterov Reviewed-by: Masami Hiramatsu Reviewed-by: Jim Keniston --- arch/x86/kernel/uprobes.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index e72903eacd43..cdd6909affcb 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -443,16 +443,22 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs { struct uprobe_task *utask = current->utask; long correction = (long)(utask->vaddr - utask->xol_vaddr); - int ret = 0; handle_riprel_post_xol(auprobe, regs, &correction); if (auprobe->fixups & UPROBE_FIX_IP) regs->ip += correction; - if (auprobe->fixups & UPROBE_FIX_CALL) - ret = adjust_ret_addr(regs->sp, correction); + if (auprobe->fixups & UPROBE_FIX_CALL) { + if (adjust_ret_addr(regs->sp, correction)) { + if (is_ia32_task()) + regs->sp += 4; + else + regs->sp += 8; + return -ERESTART; + } + } - return ret; + return 0; } static struct uprobe_xol_ops default_xol_ops = { @@ -599,6 +605,12 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) int err = auprobe->ops->post_xol(auprobe, regs); if (err) { arch_uprobe_abort_xol(auprobe, regs); + /* + * Restart the probed insn. ->post_xol() must ensure + * this is really possible if it returns -ERESTART. + */ + if (err == -ERESTART) + return 0; return err; } } From 8faaed1b9f500d6cf32702716733a645c9b0727a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 6 Apr 2014 17:16:10 +0200 Subject: [PATCH 10/15] uprobes/x86: Introduce sizeof_long(), cleanup adjust_ret_addr() and arch_uretprobe_hijack_return_addr() 1. Add the trivial sizeof_long() helper and change other callers of is_ia32_task() to use it. TODO: is_ia32_task() is not what we actually want, TS_COMPAT does not necessarily mean 32bit. Fortunately syscall-like insns can't be probed so it actually works, but it would be better to rename and use is_ia32_frame(). 2. As Jim pointed out "ncopied" in arch_uretprobe_hijack_return_addr() and adjust_ret_addr() should be named "nleft". And in fact only the last copy_to_user() in arch_uretprobe_hijack_return_addr() actually needs to inspect the non-zero error code. TODO: adjust_ret_addr() should die. We can always calculate the value we need to write into *regs->sp, just UPROBE_FIX_CALL should record insn->length. Signed-off-by: Oleg Nesterov Reviewed-by: Jim Keniston --- arch/x86/kernel/uprobes.c | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index cdd6909affcb..aecc22054384 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -408,6 +408,11 @@ struct uprobe_xol_ops { int (*post_xol)(struct arch_uprobe *, struct pt_regs *); }; +static inline int sizeof_long(void) +{ + return is_ia32_task() ? 4 : 8; +} + static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { pre_xol_rip_insn(auprobe, regs, ¤t->utask->autask); @@ -419,21 +424,14 @@ static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) */ static int adjust_ret_addr(unsigned long sp, long correction) { - int rasize, ncopied; - long ra = 0; + int rasize = sizeof_long(); + long ra; - if (is_ia32_task()) - rasize = 4; - else - rasize = 8; - - ncopied = copy_from_user(&ra, (void __user *)sp, rasize); - if (unlikely(ncopied)) + if (copy_from_user(&ra, (void __user *)sp, rasize)) return -EFAULT; ra += correction; - ncopied = copy_to_user((void __user *)sp, &ra, rasize); - if (unlikely(ncopied)) + if (copy_to_user((void __user *)sp, &ra, rasize)) return -EFAULT; return 0; @@ -450,10 +448,7 @@ static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs if (auprobe->fixups & UPROBE_FIX_CALL) { if (adjust_ret_addr(regs->sp, correction)) { - if (is_ia32_task()) - regs->sp += 4; - else - regs->sp += 8; + regs->sp += sizeof_long(); return -ERESTART; } } @@ -714,23 +709,21 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs) { - int rasize, ncopied; + int rasize = sizeof_long(), nleft; unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */ - rasize = is_ia32_task() ? 4 : 8; - ncopied = copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize); - if (unlikely(ncopied)) + if (copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize)) return -1; /* check whether address has been already hijacked */ if (orig_ret_vaddr == trampoline_vaddr) return orig_ret_vaddr; - ncopied = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize); - if (likely(!ncopied)) + nleft = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize); + if (likely(!nleft)) return orig_ret_vaddr; - if (ncopied != rasize) { + if (nleft != rasize) { pr_err("uprobe: return address clobbered: pid=%d, %%sp=%#lx, " "%%ip=%#lx\n", current->pid, regs->sp, regs->ip); From 7ba6db2d688bdf83049a18c8e55b2d1e58e8b0bc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 5 Apr 2014 20:05:02 +0200 Subject: [PATCH 11/15] uprobes/x86: Emulate unconditional relative jmp's Currently we always execute all insns out-of-line, including relative jmp's and call's. This assumes that even if regs->ip points to nowhere after the single-step, default_post_xol_op(UPROBE_FIX_IP) logic will update it correctly. However, this doesn't work if this regs->ip == xol_vaddr + insn_offset is not canonical. In this case CPU generates #GP and general_protection() kills the task which tries to execute this insn out-of-line. Now that we have uprobe_xol_ops we can teach uprobes to emulate these insns and solve the problem. This patch adds branch_xol_ops which has a single branch_emulate_op() hook, so far it can only handle rel8/32 relative jmp's. TODO: move ->fixup into the union along with rip_rela_target_address. Signed-off-by: Oleg Nesterov Reported-by: Jonathan Lebon Reviewed-by: Jim Keniston --- arch/x86/include/asm/uprobes.h | 8 ++++++- arch/x86/kernel/uprobes.c | 38 ++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 9f8210bcbb49..e9fd4d5537ed 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -44,9 +44,15 @@ struct arch_uprobe { u16 fixups; const struct uprobe_xol_ops *ops; + union { #ifdef CONFIG_X86_64 - unsigned long rip_rela_target_address; + unsigned long rip_rela_target_address; #endif + struct { + s32 offs; + u8 ilen; + } branch; + }; }; struct arch_uprobe_task { diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index aecc22054384..c3baeaacf1b6 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -461,6 +461,40 @@ static struct uprobe_xol_ops default_xol_ops = { .post_xol = default_post_xol_op, }; +static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + regs->ip += auprobe->branch.ilen + auprobe->branch.offs; + return true; +} + +static struct uprobe_xol_ops branch_xol_ops = { + .emulate = branch_emulate_op, +}; + +/* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */ +static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) +{ + + switch (OPCODE1(insn)) { + case 0xeb: /* jmp 8 */ + case 0xe9: /* jmp 32 */ + break; + default: + return -ENOSYS; + } + + /* has the side-effect of processing the entire instruction */ + insn_get_length(insn); + if (WARN_ON_ONCE(!insn_complete(insn))) + return -ENOEXEC; + + auprobe->branch.ilen = insn->length; + auprobe->branch.offs = insn->immediate.value; + + auprobe->ops = &branch_xol_ops; + return 0; +} + /** * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. * @mm: the probed address space. @@ -478,6 +512,10 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, if (ret) return ret; + ret = branch_setup_xol_ops(auprobe, &insn); + if (ret != -ENOSYS) + return ret; + /* * Figure out which fixups arch_uprobe_post_xol() will need to perform, * and annotate arch_uprobe->fixups accordingly. To start with, ->fixups From d241006354c550c7d22f304e2fdf90137fb8eaab Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 5 Apr 2014 21:06:10 +0200 Subject: [PATCH 12/15] uprobes/x86: Emulate nop's using ops->emulate() Finally we can kill the ugly (and very limited) code in __skip_sstep(). Just change branch_setup_xol_ops() to treat "nop" as jmp to the next insn. Thanks to lib/insn.c, it is clever enough. OPCODE1() == 0x90 includes "(rep;)+ nop;" at least, and (afaics) much more. Signed-off-by: Oleg Nesterov Reviewed-by: Jim Keniston --- arch/x86/kernel/uprobes.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index c3baeaacf1b6..f3c4212f3819 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -478,6 +478,7 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) switch (OPCODE1(insn)) { case 0xeb: /* jmp 8 */ case 0xe9: /* jmp 32 */ + case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */ break; default: return -ENOSYS; @@ -710,29 +711,10 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) regs->flags &= ~X86_EFLAGS_TF; } -/* - * Skip these instructions as per the currently known x86 ISA. - * rep=0x66*; nop=0x90 - */ static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) { - int i; - if (auprobe->ops->emulate) return auprobe->ops->emulate(auprobe, regs); - - /* TODO: move this code into ->emulate() hook */ - for (i = 0; i < MAX_UINSN_BYTES; i++) { - if (auprobe->insn[i] == 0x66) - continue; - - if (auprobe->insn[i] == 0x90) { - regs->ip += i + 1; - return true; - } - - break; - } return false; } From 8e89c0be171b1a9ed2ba67168733ca811bb45d5c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 6 Apr 2014 18:11:02 +0200 Subject: [PATCH 13/15] uprobes/x86: Emulate relative call's See the previous "Emulate unconditional relative jmp's" which explains why we can not execute "jmp" out-of-line, the same applies to "call". Emulating of rip-relative call is trivial, we only need to additionally push the ret-address. If this fails, we execute this instruction out of line and this should trigger the trap, the probed application should die or the same insn will be restarted if a signal handler expands the stack. We do not even need ->post_xol() for this case. But there is a corner (and almost theoretical) case: another thread can expand the stack right before we execute this insn out of line. In this case it hit the same problem we are trying to solve. So we simply turn the probed insn into "call 1f; 1:" and add ->post_xol() which restores ->sp and restarts. Many thanks to Jonathan who finally found the standalone reproducer, otherwise I would never resolve the "random SIGSEGV's under systemtap" bug-report. Now that the problem is clear we can write the simplified test-case: void probe_func(void), callee(void); int failed = 1; asm ( ".text\n" ".align 4096\n" ".globl probe_func\n" "probe_func:\n" "call callee\n" "ret" ); /* * This assumes that: * * - &probe_func = 0x401000 + a_bit, aligned = 0x402000 * * - xol_vma->vm_start = TASK_SIZE_MAX - PAGE_SIZE = 0x7fffffffe000 * as xol_add_vma() asks; the 1st slot = 0x7fffffffe080 * * so we can target the non-canonical address from xol_vma using * the simple math below, 100 * 4096 is just the random offset */ asm (".org . + 0x800000000000 - 0x7fffffffe080 - 5 - 1 + 100 * 4096\n"); void callee(void) { failed = 0; } int main(void) { probe_func(); return failed; } It SIGSEGV's if you probe "probe_func" (although this is not very reliable, randomize_va_space/etc can change the placement of xol area). Note: as Denys Vlasenko pointed out, amd and intel treat "callw" (0x66 0xe8) differently. This patch relies on lib/insn.c and thus implements the intel's behaviour: 0x66 is simply ignored. Fortunately nothing sane should ever use this insn, so we postpone the fix until we decide what should we do; emulate or not, support or not, etc. Reported-by: Jonathan Lebon Signed-off-by: Oleg Nesterov Reviewed-by: Jim Keniston --- arch/x86/include/asm/uprobes.h | 1 + arch/x86/kernel/uprobes.c | 86 +++++++++++++++++++++++++++++----- 2 files changed, 74 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index e9fd4d5537ed..93bee7b93854 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -51,6 +51,7 @@ struct arch_uprobe { struct { s32 offs; u8 ilen; + u8 opc1; } branch; }; }; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index f3c4212f3819..0914435001f5 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -461,34 +461,97 @@ static struct uprobe_xol_ops default_xol_ops = { .post_xol = default_post_xol_op, }; +static bool branch_is_call(struct arch_uprobe *auprobe) +{ + return auprobe->branch.opc1 == 0xe8; +} + static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { - regs->ip += auprobe->branch.ilen + auprobe->branch.offs; + unsigned long new_ip = regs->ip += auprobe->branch.ilen; + + if (branch_is_call(auprobe)) { + unsigned long new_sp = regs->sp - sizeof_long(); + /* + * If it fails we execute this (mangled, see the comment in + * branch_clear_offset) insn out-of-line. In the likely case + * this should trigger the trap, and the probed application + * should die or restart the same insn after it handles the + * signal, arch_uprobe_post_xol() won't be even called. + * + * But there is corner case, see the comment in ->post_xol(). + */ + if (copy_to_user((void __user *)new_sp, &new_ip, sizeof_long())) + return false; + regs->sp = new_sp; + } + + regs->ip = new_ip + auprobe->branch.offs; return true; } +static int branch_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + BUG_ON(!branch_is_call(auprobe)); + /* + * We can only get here if branch_emulate_op() failed to push the ret + * address _and_ another thread expanded our stack before the (mangled) + * "call" insn was executed out-of-line. Just restore ->sp and restart. + * We could also restore ->ip and try to call branch_emulate_op() again. + */ + regs->sp += sizeof_long(); + return -ERESTART; +} + +static void branch_clear_offset(struct arch_uprobe *auprobe, struct insn *insn) +{ + /* + * Turn this insn into "call 1f; 1:", this is what we will execute + * out-of-line if ->emulate() fails. We only need this to generate + * a trap, so that the probed task receives the correct signal with + * the properly filled siginfo. + * + * But see the comment in ->post_xol(), in the unlikely case it can + * succeed. So we need to ensure that the new ->ip can not fall into + * the non-canonical area and trigger #GP. + * + * We could turn it into (say) "pushf", but then we would need to + * divorce ->insn[] and ->ixol[]. We need to preserve the 1st byte + * of ->insn[] for set_orig_insn(). + */ + memset(auprobe->insn + insn_offset_immediate(insn), + 0, insn->immediate.nbytes); +} + static struct uprobe_xol_ops branch_xol_ops = { .emulate = branch_emulate_op, + .post_xol = branch_post_xol_op, }; /* Returns -ENOSYS if branch_xol_ops doesn't handle this insn */ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) { - - switch (OPCODE1(insn)) { - case 0xeb: /* jmp 8 */ - case 0xe9: /* jmp 32 */ - case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */ - break; - default: - return -ENOSYS; - } + u8 opc1 = OPCODE1(insn); /* has the side-effect of processing the entire instruction */ insn_get_length(insn); if (WARN_ON_ONCE(!insn_complete(insn))) return -ENOEXEC; + switch (opc1) { + case 0xeb: /* jmp 8 */ + case 0xe9: /* jmp 32 */ + case 0x90: /* prefix* + nop; same as jmp with .offs = 0 */ + break; + + case 0xe8: /* call relative */ + branch_clear_offset(auprobe, insn); + break; + default: + return -ENOSYS; + } + + auprobe->branch.opc1 = opc1; auprobe->branch.ilen = insn->length; auprobe->branch.offs = insn->immediate.value; @@ -532,9 +595,6 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, case 0xca: fix_ip = false; break; - case 0xe8: /* call relative - Fix return addr */ - fix_call = true; - break; case 0x9a: /* call absolute - Fix return addr, not ip */ fix_call = true; fix_ip = false; From 8f95505bc18a026ef7d3dfdbce4e5b31b3e4fc1b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 6 Apr 2014 21:53:47 +0200 Subject: [PATCH 14/15] uprobes/x86: Emulate relative conditional "short" jmp's Teach branch_emulate_op() to emulate the conditional "short" jmp's which check regs->flags. Note: this doesn't support jcxz/jcexz, loope/loopz, and loopne/loopnz. They all are rel8 and thus they can't trigger the problem, but perhaps we will add the support in future just for completeness. Reported-by: Jonathan Lebon Signed-off-by: Oleg Nesterov Reviewed-by: Jim Keniston --- arch/x86/kernel/uprobes.c | 57 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 0914435001f5..0460d04f0acc 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -466,9 +466,58 @@ static bool branch_is_call(struct arch_uprobe *auprobe) return auprobe->branch.opc1 == 0xe8; } +#define CASE_COND \ + COND(70, 71, XF(OF)) \ + COND(72, 73, XF(CF)) \ + COND(74, 75, XF(ZF)) \ + COND(78, 79, XF(SF)) \ + COND(7a, 7b, XF(PF)) \ + COND(76, 77, XF(CF) || XF(ZF)) \ + COND(7c, 7d, XF(SF) != XF(OF)) \ + COND(7e, 7f, XF(ZF) || XF(SF) != XF(OF)) + +#define COND(op_y, op_n, expr) \ + case 0x ## op_y: DO((expr) != 0) \ + case 0x ## op_n: DO((expr) == 0) + +#define XF(xf) (!!(flags & X86_EFLAGS_ ## xf)) + +static bool is_cond_jmp_opcode(u8 opcode) +{ + switch (opcode) { + #define DO(expr) \ + return true; + CASE_COND + #undef DO + + default: + return false; + } +} + +static bool check_jmp_cond(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + unsigned long flags = regs->flags; + + switch (auprobe->branch.opc1) { + #define DO(expr) \ + return expr; + CASE_COND + #undef DO + + default: /* not a conditional jmp */ + return true; + } +} + +#undef XF +#undef COND +#undef CASE_COND + static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs) { unsigned long new_ip = regs->ip += auprobe->branch.ilen; + unsigned long offs = (long)auprobe->branch.offs; if (branch_is_call(auprobe)) { unsigned long new_sp = regs->sp - sizeof_long(); @@ -484,9 +533,11 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs) if (copy_to_user((void __user *)new_sp, &new_ip, sizeof_long())) return false; regs->sp = new_sp; + } else if (!check_jmp_cond(auprobe, regs)) { + offs = 0; } - regs->ip = new_ip + auprobe->branch.offs; + regs->ip = new_ip + offs; return true; } @@ -547,8 +598,10 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) case 0xe8: /* call relative */ branch_clear_offset(auprobe, insn); break; + default: - return -ENOSYS; + if (!is_cond_jmp_opcode(opc1)) + return -ENOSYS; } auprobe->branch.opc1 = opc1; From 6cc5e7ff2c38641060f20786a5caf2815edbca5f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Apr 2014 16:22:58 +0200 Subject: [PATCH 15/15] uprobes/x86: Emulate relative conditional "near" jmp's Change branch_setup_xol_ops() to simply use opc1 = OPCODE2(insn) - 0x10 if OPCODE1() == 0x0f; this matches the "short" jmp which checks the same condition. Thanks to lib/insn.c, it does the rest correctly. branch->ilen/offs are correct no matter if this jmp is "near" or "short". Reported-by: Jonathan Lebon Signed-off-by: Oleg Nesterov Reviewed-by: Jim Keniston --- arch/x86/kernel/uprobes.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 0460d04f0acc..ace22916ade3 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -599,6 +599,14 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) branch_clear_offset(auprobe, insn); break; + case 0x0f: + if (insn->opcode.nbytes != 2) + return -ENOSYS; + /* + * If it is a "near" conditional jmp, OPCODE2() - 0x10 matches + * OPCODE1() of the "short" jmp which checks the same condition. + */ + opc1 = OPCODE2(insn) - 0x10; default: if (!is_cond_jmp_opcode(opc1)) return -ENOSYS;