diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index bb3deb76c951..f0a9ff690881 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -116,6 +116,9 @@ typedef struct { /* Number of users of the external (Nest) MMU */ atomic_t copros; + /* Number of user space windows opened in process mm_context */ + atomic_t vas_windows; + struct hash_mm_context *hash_context; unsigned long vdso_base; diff --git a/arch/powerpc/include/asm/icswx.h b/arch/powerpc/include/asm/icswx.h index 9872f85d356f..965b1f39b2a5 100644 --- a/arch/powerpc/include/asm/icswx.h +++ b/arch/powerpc/include/asm/icswx.h @@ -108,6 +108,17 @@ struct data_descriptor_entry { __be64 address; } __packed __aligned(DDE_ALIGN); +/* 4.3.2 NX-stamped Fault CRB */ + +#define NX_STAMP_ALIGN (0x10) + +struct nx_fault_stamp { + __be64 fault_storage_addr; + __be16 reserved; + __u8 flags; + __u8 fault_status; + __be32 pswid; +} __packed __aligned(NX_STAMP_ALIGN); /* Chapter 6.5.2 Coprocessor-Request Block (CRB) */ @@ -135,10 +146,15 @@ struct coprocessor_request_block { struct coprocessor_completion_block ccb; - u8 reserved[48]; + union { + struct nx_fault_stamp nx; + u8 reserved[16]; + } stamp; + + u8 reserved[32]; struct coprocessor_status_block csb; -} __packed __aligned(CRB_ALIGN); +} __packed; /* RFC02167 Initiate Coprocessor Instructions document diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 360367c579de..1a474f6b1992 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -185,11 +185,41 @@ static inline void mm_context_remove_copro(struct mm_struct *mm) dec_mm_active_cpus(mm); } } + +/* + * vas_windows counter shows number of open windows in the mm + * context. During context switch, use this counter to clear the + * foreign real address mapping (CP_ABORT) for the thread / process + * that intend to use COPY/PASTE. When a process closes all windows, + * disable CP_ABORT which is expensive to run. + * + * For user context, register a copro so that TLBIs are seen by the + * nest MMU. mm_context_add/remove_vas_window() are used only for user + * space windows. + */ +static inline void mm_context_add_vas_window(struct mm_struct *mm) +{ + atomic_inc(&mm->context.vas_windows); + mm_context_add_copro(mm); +} + +static inline void mm_context_remove_vas_window(struct mm_struct *mm) +{ + int v; + + mm_context_remove_copro(mm); + v = atomic_dec_if_positive(&mm->context.vas_windows); + + /* Detect imbalance between add and remove */ + WARN_ON(v < 0); +} #else static inline void inc_mm_active_cpus(struct mm_struct *mm) { } static inline void dec_mm_active_cpus(struct mm_struct *mm) { } static inline void mm_context_add_copro(struct mm_struct *mm) { } static inline void mm_context_remove_copro(struct mm_struct *mm) { } +static inline void mm_context_add_vas_windows(struct mm_struct *mm) { } +static inline void mm_context_remove_vas_windows(struct mm_struct *mm) { } #endif diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index eedcbfb9a6ff..bfa336fbcfeb 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -272,7 +272,6 @@ struct thread_struct { unsigned mmcr0; unsigned used_ebb; - unsigned int used_vas; #endif }; diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h index b867b58b1093..fdab93428372 100644 --- a/arch/powerpc/include/asm/switch_to.h +++ b/arch/powerpc/include/asm/switch_to.h @@ -102,8 +102,6 @@ static inline void clear_task_ebb(struct task_struct *t) #endif } -extern int set_thread_uses_vas(void); - extern int set_thread_tidr(struct task_struct *t); #endif /* _ASM_POWERPC_SWITCH_TO_H */ diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h index 93f982dbb3d4..d08ea11b271c 100644 --- a/arch/powerpc/include/asm/xive.h +++ b/arch/powerpc/include/asm/xive.h @@ -5,6 +5,8 @@ #ifndef _ASM_POWERPC_XIVE_H #define _ASM_POWERPC_XIVE_H +#include + #define XIVE_INVALID_VP 0xffffffff #ifdef CONFIG_PPC_XIVE @@ -108,7 +110,6 @@ void xive_native_free_vp_block(u32 vp_base); int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data); void xive_cleanup_irq_data(struct xive_irq_data *xd); -u32 xive_native_alloc_irq(void); void xive_native_free_irq(u32 irq); int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq); @@ -137,6 +138,12 @@ int xive_native_set_queue_state(u32 vp_id, uint32_t prio, u32 qtoggle, u32 qindex); int xive_native_get_vp_state(u32 vp_id, u64 *out_state); bool xive_native_has_queue_state_support(void); +extern u32 xive_native_alloc_irq_on_chip(u32 chip_id); + +static inline u32 xive_native_alloc_irq(void) +{ + return xive_native_alloc_irq_on_chip(OPAL_XIVE_ANY_CHIP); +} #else diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 9c21288f8645..8479c762aef2 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1228,7 +1228,8 @@ struct task_struct *__switch_to(struct task_struct *prev, * mappings, we must issue a cp_abort to clear any state and * prevent snooping, corruption or a covert channel. */ - if (current->thread.used_vas) + if (current->mm && + atomic_read(¤t->mm->context.vas_windows)) asm volatile(PPC_CP_ABORT); } #endif /* CONFIG_PPC_BOOK3S_64 */ @@ -1467,27 +1468,6 @@ void arch_setup_new_exec(void) } #endif -int set_thread_uses_vas(void) -{ -#ifdef CONFIG_PPC_BOOK3S_64 - if (!cpu_has_feature(CPU_FTR_ARCH_300)) - return -EINVAL; - - current->thread.used_vas = 1; - - /* - * Even a process that has no foreign real address mapping can use - * an unpaired COPY instruction (to no real effect). Issue CP_ABORT - * to clear any pending COPY and prevent a covert channel. - * - * __switch_to() will issue CP_ABORT on future context switches. - */ - asm volatile(PPC_CP_ABORT); - -#endif /* CONFIG_PPC_BOOK3S_64 */ - return 0; -} - #ifdef CONFIG_PPC64 /** * Assign a TIDR (thread ID) for task @t and set it in the thread diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index c0f8120045c3..395789ffc482 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -17,7 +17,7 @@ obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o obj-$(CONFIG_OPAL_PRD) += opal-prd.o obj-$(CONFIG_PERF_EVENTS) += opal-imc.o obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o -obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o vas-debug.o +obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o vas-debug.o vas-fault.o obj-$(CONFIG_OCXL_BASE) += ocxl.o obj-$(CONFIG_SCOM_DEBUGFS) += opal-xscom.o obj-$(CONFIG_PPC_SECURE_BOOT) += opal-secvar.o diff --git a/arch/powerpc/platforms/powernv/vas-debug.c b/arch/powerpc/platforms/powernv/vas-debug.c index 44035a3d6414..41fa90d2f4ab 100644 --- a/arch/powerpc/platforms/powernv/vas-debug.c +++ b/arch/powerpc/platforms/powernv/vas-debug.c @@ -38,7 +38,7 @@ static int info_show(struct seq_file *s, void *private) seq_printf(s, "Type: %s, %s\n", cop_to_str(window->cop), window->tx_win ? "Send" : "Receive"); - seq_printf(s, "Pid : %d\n", window->pid); + seq_printf(s, "Pid : %d\n", vas_window_pid(window)); unlock: mutex_unlock(&vas_mutex); diff --git a/arch/powerpc/platforms/powernv/vas-fault.c b/arch/powerpc/platforms/powernv/vas-fault.c new file mode 100644 index 000000000000..25db70be4c9c --- /dev/null +++ b/arch/powerpc/platforms/powernv/vas-fault.c @@ -0,0 +1,382 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * VAS Fault handling. + * Copyright 2019, IBM Corporation + */ + +#define pr_fmt(fmt) "vas: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vas.h" + +/* + * The maximum FIFO size for fault window can be 8MB + * (VAS_RX_FIFO_SIZE_MAX). Using 4MB FIFO since each VAS + * instance will be having fault window. + * 8MB FIFO can be used if expects more faults for each VAS + * instance. + */ +#define VAS_FAULT_WIN_FIFO_SIZE (4 << 20) + +static void dump_crb(struct coprocessor_request_block *crb) +{ + struct data_descriptor_entry *dde; + struct nx_fault_stamp *nx; + + dde = &crb->source; + pr_devel("SrcDDE: addr 0x%llx, len %d, count %d, idx %d, flags %d\n", + be64_to_cpu(dde->address), be32_to_cpu(dde->length), + dde->count, dde->index, dde->flags); + + dde = &crb->target; + pr_devel("TgtDDE: addr 0x%llx, len %d, count %d, idx %d, flags %d\n", + be64_to_cpu(dde->address), be32_to_cpu(dde->length), + dde->count, dde->index, dde->flags); + + nx = &crb->stamp.nx; + pr_devel("NX Stamp: PSWID 0x%x, FSA 0x%llx, flags 0x%x, FS 0x%x\n", + be32_to_cpu(nx->pswid), + be64_to_cpu(crb->stamp.nx.fault_storage_addr), + nx->flags, nx->fault_status); +} + +/* + * Update the CSB to indicate a translation error. + * + * User space will be polling on CSB after the request is issued. + * If NX can handle the request without any issues, it updates CSB. + * Whereas if NX encounters page fault, the kernel will handle the + * fault and update CSB with translation error. + * + * If we are unable to update the CSB means copy_to_user failed due to + * invalid csb_addr, send a signal to the process. + */ +static void update_csb(struct vas_window *window, + struct coprocessor_request_block *crb) +{ + struct coprocessor_status_block csb; + struct kernel_siginfo info; + struct task_struct *tsk; + void __user *csb_addr; + struct pid *pid; + int rc; + + /* + * NX user space windows can not be opened for task->mm=NULL + * and faults will not be generated for kernel requests. + */ + if (WARN_ON_ONCE(!window->mm || !window->user_win)) + return; + + csb_addr = (void __user *)be64_to_cpu(crb->csb_addr); + + memset(&csb, 0, sizeof(csb)); + csb.cc = CSB_CC_TRANSLATION; + csb.ce = CSB_CE_TERMINATION; + csb.cs = 0; + csb.count = 0; + + /* + * NX operates and returns in BE format as defined CRB struct. + * So saves fault_storage_addr in BE as NX pastes in FIFO and + * expects user space to convert to CPU format. + */ + csb.address = crb->stamp.nx.fault_storage_addr; + csb.flags = 0; + + pid = window->pid; + tsk = get_pid_task(pid, PIDTYPE_PID); + /* + * Process closes send window after all pending NX requests are + * completed. In multi-thread applications, a child thread can + * open a window and can exit without closing it. May be some + * requests are pending or this window can be used by other + * threads later. We should handle faults if NX encounters + * pages faults on these requests. Update CSB with translation + * error and fault address. If csb_addr passed by user space is + * invalid, send SEGV signal to pid saved in window. If the + * child thread is not running, send the signal to tgid. + * Parent thread (tgid) will close this window upon its exit. + * + * pid and mm references are taken when window is opened by + * process (pid). So tgid is used only when child thread opens + * a window and exits without closing it. + */ + if (!tsk) { + pid = window->tgid; + tsk = get_pid_task(pid, PIDTYPE_PID); + /* + * Parent thread (tgid) will be closing window when it + * exits. So should not get here. + */ + if (WARN_ON_ONCE(!tsk)) + return; + } + + /* Return if the task is exiting. */ + if (tsk->flags & PF_EXITING) { + put_task_struct(tsk); + return; + } + + use_mm(window->mm); + rc = copy_to_user(csb_addr, &csb, sizeof(csb)); + /* + * User space polls on csb.flags (first byte). So add barrier + * then copy first byte with csb flags update. + */ + if (!rc) { + csb.flags = CSB_V; + /* Make sure update to csb.flags is visible now */ + smp_mb(); + rc = copy_to_user(csb_addr, &csb, sizeof(u8)); + } + unuse_mm(window->mm); + put_task_struct(tsk); + + /* Success */ + if (!rc) + return; + + pr_debug("Invalid CSB address 0x%p signalling pid(%d)\n", + csb_addr, pid_vnr(pid)); + + clear_siginfo(&info); + info.si_signo = SIGSEGV; + info.si_errno = EFAULT; + info.si_code = SEGV_MAPERR; + info.si_addr = csb_addr; + + /* + * process will be polling on csb.flags after request is sent to + * NX. So generally CSB update should not fail except when an + * application passes invalid csb_addr. So an error message will + * be displayed and leave it to user space whether to ignore or + * handle this signal. + */ + rcu_read_lock(); + rc = kill_pid_info(SIGSEGV, &info, pid); + rcu_read_unlock(); + + pr_devel("%s(): pid %d kill_proc_info() rc %d\n", __func__, + pid_vnr(pid), rc); +} + +static void dump_fifo(struct vas_instance *vinst, void *entry) +{ + unsigned long *end = vinst->fault_fifo + vinst->fault_fifo_size; + unsigned long *fifo = entry; + int i; + + pr_err("Fault fifo size %d, Max crbs %d\n", vinst->fault_fifo_size, + vinst->fault_fifo_size / CRB_SIZE); + + /* Dump 10 CRB entries or until end of FIFO */ + pr_err("Fault FIFO Dump:\n"); + for (i = 0; i < 10*(CRB_SIZE/8) && fifo < end; i += 4, fifo += 4) { + pr_err("[%.3d, %p]: 0x%.16lx 0x%.16lx 0x%.16lx 0x%.16lx\n", + i, fifo, *fifo, *(fifo+1), *(fifo+2), *(fifo+3)); + } +} + +/* + * Process valid CRBs in fault FIFO. + * NX process user space requests, return credit and update the status + * in CRB. If it encounters transalation error when accessing CRB or + * request buffers, raises interrupt on the CPU to handle the fault. + * It takes credit on fault window, updates nx_fault_stamp in CRB with + * the following information and pastes CRB in fault FIFO. + * + * pswid - window ID of the window on which the request is sent. + * fault_storage_addr - fault address + * + * It can raise a single interrupt for multiple faults. Expects OS to + * process all valid faults and return credit for each fault on user + * space and fault windows. This fault FIFO control will be done with + * credit mechanism. NX can continuously paste CRBs until credits are not + * available on fault window. Otherwise, returns with RMA_reject. + * + * Total credits available on fault window: FIFO_SIZE(4MB)/CRBS_SIZE(128) + * + */ +irqreturn_t vas_fault_thread_fn(int irq, void *data) +{ + struct vas_instance *vinst = data; + struct coprocessor_request_block *crb, *entry; + struct coprocessor_request_block buf; + struct vas_window *window; + unsigned long flags; + void *fifo; + + crb = &buf; + + /* + * VAS can interrupt with multiple page faults. So process all + * valid CRBs within fault FIFO until reaches invalid CRB. + * We use CCW[0] and pswid to validate validate CRBs: + * + * CCW[0] Reserved bit. When NX pastes CRB, CCW[0]=0 + * OS sets this bit to 1 after reading CRB. + * pswid NX assigns window ID. Set pswid to -1 after + * reading CRB from fault FIFO. + * + * We exit this function if no valid CRBs are available to process. + * So acquire fault_lock and reset fifo_in_progress to 0 before + * exit. + * In case kernel receives another interrupt with different page + * fault, interrupt handler returns with IRQ_HANDLED if + * fifo_in_progress is set. Means these new faults will be + * handled by the current thread. Otherwise set fifo_in_progress + * and return IRQ_WAKE_THREAD to wake up thread. + */ + while (true) { + spin_lock_irqsave(&vinst->fault_lock, flags); + /* + * Advance the fault fifo pointer to next CRB. + * Use CRB_SIZE rather than sizeof(*crb) since the latter is + * aligned to CRB_ALIGN (256) but the CRB written to by VAS is + * only CRB_SIZE in len. + */ + fifo = vinst->fault_fifo + (vinst->fault_crbs * CRB_SIZE); + entry = fifo; + + if ((entry->stamp.nx.pswid == cpu_to_be32(FIFO_INVALID_ENTRY)) + || (entry->ccw & cpu_to_be32(CCW0_INVALID))) { + vinst->fifo_in_progress = 0; + spin_unlock_irqrestore(&vinst->fault_lock, flags); + return IRQ_HANDLED; + } + + spin_unlock_irqrestore(&vinst->fault_lock, flags); + vinst->fault_crbs++; + if (vinst->fault_crbs == (vinst->fault_fifo_size / CRB_SIZE)) + vinst->fault_crbs = 0; + + memcpy(crb, fifo, CRB_SIZE); + entry->stamp.nx.pswid = cpu_to_be32(FIFO_INVALID_ENTRY); + entry->ccw |= cpu_to_be32(CCW0_INVALID); + /* + * Return credit for the fault window. + */ + vas_return_credit(vinst->fault_win, false); + + pr_devel("VAS[%d] fault_fifo %p, fifo %p, fault_crbs %d\n", + vinst->vas_id, vinst->fault_fifo, fifo, + vinst->fault_crbs); + + dump_crb(crb); + window = vas_pswid_to_window(vinst, + be32_to_cpu(crb->stamp.nx.pswid)); + + if (IS_ERR(window)) { + /* + * We got an interrupt about a specific send + * window but we can't find that window and we can't + * even clean it up (return credit on user space + * window). + * But we should not get here. + * TODO: Disable IRQ. + */ + dump_fifo(vinst, (void *)entry); + pr_err("VAS[%d] fault_fifo %p, fifo %p, pswid 0x%x, fault_crbs %d bad CRB?\n", + vinst->vas_id, vinst->fault_fifo, fifo, + be32_to_cpu(crb->stamp.nx.pswid), + vinst->fault_crbs); + + WARN_ON_ONCE(1); + } else { + update_csb(window, crb); + /* + * Return credit for send window after processing + * fault CRB. + */ + vas_return_credit(window, true); + } + } +} + +irqreturn_t vas_fault_handler(int irq, void *dev_id) +{ + struct vas_instance *vinst = dev_id; + irqreturn_t ret = IRQ_WAKE_THREAD; + unsigned long flags; + + /* + * NX can generate an interrupt for multiple faults. So the + * fault handler thread process all CRBs until finds invalid + * entry. In case if NX sees continuous faults, it is possible + * that the thread function entered with the first interrupt + * can execute and process all valid CRBs. + * So wake up thread only if the fault thread is not in progress. + */ + spin_lock_irqsave(&vinst->fault_lock, flags); + + if (vinst->fifo_in_progress) + ret = IRQ_HANDLED; + else + vinst->fifo_in_progress = 1; + + spin_unlock_irqrestore(&vinst->fault_lock, flags); + + return ret; +} + +/* + * Fault window is opened per VAS instance. NX pastes fault CRB in fault + * FIFO upon page faults. + */ +int vas_setup_fault_window(struct vas_instance *vinst) +{ + struct vas_rx_win_attr attr; + + vinst->fault_fifo_size = VAS_FAULT_WIN_FIFO_SIZE; + vinst->fault_fifo = kzalloc(vinst->fault_fifo_size, GFP_KERNEL); + if (!vinst->fault_fifo) { + pr_err("Unable to alloc %d bytes for fault_fifo\n", + vinst->fault_fifo_size); + return -ENOMEM; + } + + /* + * Invalidate all CRB entries. NX pastes valid entry for each fault. + */ + memset(vinst->fault_fifo, FIFO_INVALID_ENTRY, vinst->fault_fifo_size); + vas_init_rx_win_attr(&attr, VAS_COP_TYPE_FAULT); + + attr.rx_fifo_size = vinst->fault_fifo_size; + attr.rx_fifo = vinst->fault_fifo; + + /* + * Max creds is based on number of CRBs can fit in the FIFO. + * (fault_fifo_size/CRB_SIZE). If 8MB FIFO is used, max creds + * will be 0xffff since the receive creds field is 16bits wide. + */ + attr.wcreds_max = vinst->fault_fifo_size / CRB_SIZE; + attr.lnotify_lpid = 0; + attr.lnotify_pid = mfspr(SPRN_PID); + attr.lnotify_tid = mfspr(SPRN_PID); + + vinst->fault_win = vas_rx_win_open(vinst->vas_id, VAS_COP_TYPE_FAULT, + &attr); + + if (IS_ERR(vinst->fault_win)) { + pr_err("VAS: Error %ld opening FaultWin\n", + PTR_ERR(vinst->fault_win)); + kfree(vinst->fault_fifo); + return PTR_ERR(vinst->fault_win); + } + + pr_devel("VAS: Created FaultWin %d, LPID/PID/TID [%d/%d/%d]\n", + vinst->fault_win->winid, attr.lnotify_lpid, + attr.lnotify_pid, attr.lnotify_tid); + + return 0; +} diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index 0c0d27d17976..d62787f502c9 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include #include "vas.h" @@ -373,7 +375,7 @@ int init_winctx_regs(struct vas_window *window, struct vas_winctx *winctx) init_xlate_regs(window, winctx->user_win); val = 0ULL; - val = SET_FIELD(VAS_FAULT_TX_WIN, val, 0); + val = SET_FIELD(VAS_FAULT_TX_WIN, val, winctx->fault_win_id); write_hvwc_reg(window, VREG(FAULT_TX_WIN), val); /* In PowerNV, interrupts go to HV. */ @@ -748,6 +750,8 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin, winctx->min_scope = VAS_SCOPE_LOCAL; winctx->max_scope = VAS_SCOPE_VECTORED_GROUP; + if (rxwin->vinst->virq) + winctx->irq_port = rxwin->vinst->irq_port; } static bool rx_win_args_valid(enum vas_cop_type cop, @@ -768,7 +772,7 @@ static bool rx_win_args_valid(enum vas_cop_type cop, if (attr->rx_fifo_size > VAS_RX_FIFO_SIZE_MAX) return false; - if (attr->wcreds_max > VAS_RX_WCREDS_MAX) + if (!attr->wcreds_max) return false; if (attr->nx_win) { @@ -827,9 +831,9 @@ void vas_init_rx_win_attr(struct vas_rx_win_attr *rxattr, enum vas_cop_type cop) rxattr->fault_win = true; rxattr->notify_disable = true; rxattr->rx_wcred_mode = true; - rxattr->tx_wcred_mode = true; rxattr->rx_win_ord_mode = true; - rxattr->tx_win_ord_mode = true; + rxattr->rej_no_credit = true; + rxattr->tc_mode = VAS_THRESH_DISABLED; } else if (cop == VAS_COP_TYPE_FTW) { rxattr->user_win = true; rxattr->intr_disable = true; @@ -873,9 +877,7 @@ struct vas_window *vas_rx_win_open(int vasid, enum vas_cop_type cop, rxwin->nx_win = rxattr->nx_win; rxwin->user_win = rxattr->user_win; rxwin->cop = cop; - rxwin->wcreds_max = rxattr->wcreds_max ?: VAS_WCREDS_DEFAULT; - if (rxattr->user_win) - rxwin->pid = task_pid_vnr(current); + rxwin->wcreds_max = rxattr->wcreds_max; init_winctx_for_rxwin(rxwin, rxattr, &winctx); init_winctx_regs(rxwin, &winctx); @@ -944,13 +946,22 @@ static void init_winctx_for_txwin(struct vas_window *txwin, winctx->lpid = txattr->lpid; winctx->pidr = txattr->pidr; winctx->rx_win_id = txwin->rxwin->winid; + /* + * IRQ and fault window setup is successful. Set fault window + * for the send window so that ready to handle faults. + */ + if (txwin->vinst->virq) + winctx->fault_win_id = txwin->vinst->fault_win->winid; winctx->dma_type = VAS_DMA_TYPE_INJECT; winctx->tc_mode = txattr->tc_mode; winctx->min_scope = VAS_SCOPE_LOCAL; winctx->max_scope = VAS_SCOPE_VECTORED_GROUP; + if (txwin->vinst->virq) + winctx->irq_port = txwin->vinst->irq_port; - winctx->pswid = 0; + winctx->pswid = txattr->pswid ? txattr->pswid : + encode_pswid(txwin->vinst->vas_id, txwin->winid); } static bool tx_win_args_valid(enum vas_cop_type cop, @@ -1016,7 +1027,6 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop, txwin->tx_win = 1; txwin->rxwin = rxwin; txwin->nx_win = txwin->rxwin->nx_win; - txwin->pid = attr->pid; txwin->user_win = attr->user_win; txwin->wcreds_max = attr->wcreds_max ?: VAS_WCREDS_DEFAULT; @@ -1040,12 +1050,59 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop, } } else { /* - * A user mapping must ensure that context switch issues - * CP_ABORT for this thread. + * Interrupt hanlder or fault window setup failed. Means + * NX can not generate fault for page fault. So not + * opening for user space tx window. */ - rc = set_thread_uses_vas(); - if (rc) + if (!vinst->virq) { + rc = -ENODEV; goto free_window; + } + + /* + * Window opened by a child thread may not be closed when + * it exits. So take reference to its pid and release it + * when the window is free by parent thread. + * Acquire a reference to the task's pid to make sure + * pid will not be re-used - needed only for multithread + * applications. + */ + txwin->pid = get_task_pid(current, PIDTYPE_PID); + /* + * Acquire a reference to the task's mm. + */ + txwin->mm = get_task_mm(current); + + if (!txwin->mm) { + put_pid(txwin->pid); + pr_err("VAS: pid(%d): mm_struct is not found\n", + current->pid); + rc = -EPERM; + goto free_window; + } + + mmgrab(txwin->mm); + mmput(txwin->mm); + mm_context_add_vas_window(txwin->mm); + /* + * Process closes window during exit. In the case of + * multithread application, the child thread can open + * window and can exit without closing it. Expects parent + * thread to use and close the window. So do not need + * to take pid reference for parent thread. + */ + txwin->tgid = find_get_pid(task_tgid_vnr(current)); + /* + * Even a process that has no foreign real address mapping can + * use an unpaired COPY instruction (to no real effect). Issue + * CP_ABORT to clear any pending COPY and prevent a covert + * channel. + * + * __switch_to() will issue CP_ABORT on future context switches + * if process / thread has any open VAS window (Use + * current->mm->context.vas_windows). + */ + asm volatile(PPC_CP_ABORT); } set_vinst_win(vinst, txwin); @@ -1128,6 +1185,7 @@ static void poll_window_credits(struct vas_window *window) { u64 val; int creds, mode; + int count = 0; val = read_hvwc_reg(window, VREG(WINCTL)); if (window->tx_win) @@ -1146,10 +1204,27 @@ retry: creds = GET_FIELD(VAS_LRX_WCRED, val); } + /* + * Takes around few milliseconds to complete all pending requests + * and return credits. + * TODO: Scan fault FIFO and invalidate CRBs points to this window + * and issue CRB Kill to stop all pending requests. Need only + * if there is a bug in NX or fault handling in kernel. + */ if (creds < window->wcreds_max) { val = 0; set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(msecs_to_jiffies(10)); + count++; + /* + * Process can not close send window until all credits are + * returned. + */ + if (!(count % 1000)) + pr_warn_ratelimited("VAS: pid %d stuck. Waiting for credits returned for Window(%d). creds %d, Retries %d\n", + vas_window_pid(window), window->winid, + creds, count); + goto retry; } } @@ -1163,6 +1238,7 @@ static void poll_window_busy_state(struct vas_window *window) { int busy; u64 val; + int count = 0; retry: val = read_hvwc_reg(window, VREG(WIN_STATUS)); @@ -1170,7 +1246,16 @@ retry: if (busy) { val = 0; set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(msecs_to_jiffies(5)); + schedule_timeout(msecs_to_jiffies(10)); + count++; + /* + * Takes around few milliseconds to process all pending + * requests. + */ + if (!(count % 1000)) + pr_warn_ratelimited("VAS: pid %d stuck. Window (ID=%d) is in busy state. Retries %d\n", + vas_window_pid(window), window->winid, count); + goto retry; } } @@ -1235,22 +1320,118 @@ int vas_win_close(struct vas_window *window) unmap_paste_region(window); - clear_vinst_win(window); - poll_window_busy_state(window); unpin_close_window(window); poll_window_credits(window); + clear_vinst_win(window); + poll_window_castout(window); /* if send window, drop reference to matching receive window */ - if (window->tx_win) + if (window->tx_win) { + if (window->user_win) { + /* Drop references to pid and mm */ + put_pid(window->pid); + if (window->mm) { + mm_context_remove_vas_window(window->mm); + mmdrop(window->mm); + } + } put_rx_win(window->rxwin); + } vas_window_free(window); return 0; } EXPORT_SYMBOL_GPL(vas_win_close); + +/* + * Return credit for the given window. + * Send windows and fault window uses credit mechanism as follows: + * + * Send windows: + * - The default number of credits available for each send window is + * 1024. It means 1024 requests can be issued asynchronously at the + * same time. If the credit is not available, that request will be + * returned with RMA_Busy. + * - One credit is taken when NX request is issued. + * - This credit is returned after NX processed that request. + * - If NX encounters translation error, kernel will return the + * credit on the specific send window after processing the fault CRB. + * + * Fault window: + * - The total number credits available is FIFO_SIZE/CRB_SIZE. + * Means 4MB/128 in the current implementation. If credit is not + * available, RMA_Reject is returned. + * - A credit is taken when NX pastes CRB in fault FIFO. + * - The kernel with return credit on fault window after reading entry + * from fault FIFO. + */ +void vas_return_credit(struct vas_window *window, bool tx) +{ + uint64_t val; + + val = 0ULL; + if (tx) { /* send window */ + val = SET_FIELD(VAS_TX_WCRED, val, 1); + write_hvwc_reg(window, VREG(TX_WCRED_ADDER), val); + } else { + val = SET_FIELD(VAS_LRX_WCRED, val, 1); + write_hvwc_reg(window, VREG(LRX_WCRED_ADDER), val); + } +} + +struct vas_window *vas_pswid_to_window(struct vas_instance *vinst, + uint32_t pswid) +{ + struct vas_window *window; + int winid; + + if (!pswid) { + pr_devel("%s: called for pswid 0!\n", __func__); + return ERR_PTR(-ESRCH); + } + + decode_pswid(pswid, NULL, &winid); + + if (winid >= VAS_WINDOWS_PER_CHIP) + return ERR_PTR(-ESRCH); + + /* + * If application closes the window before the hardware + * returns the fault CRB, we should wait in vas_win_close() + * for the pending requests. so the window must be active + * and the process alive. + * + * If its a kernel process, we should not get any faults and + * should not get here. + */ + window = vinst->windows[winid]; + + if (!window) { + pr_err("PSWID decode: Could not find window for winid %d pswid %d vinst 0x%p\n", + winid, pswid, vinst); + return NULL; + } + + /* + * Do some sanity checks on the decoded window. Window should be + * NX GZIP user send window. FTW windows should not incur faults + * since their CRBs are ignored (not queued on FIFO or processed + * by NX). + */ + if (!window->tx_win || !window->user_win || !window->nx_win || + window->cop == VAS_COP_TYPE_FAULT || + window->cop == VAS_COP_TYPE_FTW) { + pr_err("PSWID decode: id %d, tx %d, user %d, nx %d, cop %d\n", + winid, window->tx_win, window->user_win, + window->nx_win, window->cop); + WARN_ON(1); + } + + return window; +} diff --git a/arch/powerpc/platforms/powernv/vas.c b/arch/powerpc/platforms/powernv/vas.c index ed9cc6df329a..598e4cd563fb 100644 --- a/arch/powerpc/platforms/powernv/vas.c +++ b/arch/powerpc/platforms/powernv/vas.c @@ -14,7 +14,10 @@ #include #include #include +#include +#include #include +#include #include "vas.h" @@ -23,12 +26,37 @@ static LIST_HEAD(vas_instances); static DEFINE_PER_CPU(int, cpu_vas_id); +static int vas_irq_fault_window_setup(struct vas_instance *vinst) +{ + char devname[64]; + int rc = 0; + + snprintf(devname, sizeof(devname), "vas-%d", vinst->vas_id); + rc = request_threaded_irq(vinst->virq, vas_fault_handler, + vas_fault_thread_fn, 0, devname, vinst); + + if (rc) { + pr_err("VAS[%d]: Request IRQ(%d) failed with %d\n", + vinst->vas_id, vinst->virq, rc); + goto out; + } + + rc = vas_setup_fault_window(vinst); + if (rc) + free_irq(vinst->virq, vinst); + +out: + return rc; +} + static int init_vas_instance(struct platform_device *pdev) { - int rc, cpu, vasid; - struct resource *res; - struct vas_instance *vinst; struct device_node *dn = pdev->dev.of_node; + struct vas_instance *vinst; + struct xive_irq_data *xd; + uint32_t chipid, hwirq; + struct resource *res; + int rc, cpu, vasid; rc = of_property_read_u32(dn, "ibm,vas-id", &vasid); if (rc) { @@ -36,6 +64,12 @@ static int init_vas_instance(struct platform_device *pdev) return -ENODEV; } + rc = of_property_read_u32(dn, "ibm,chip-id", &chipid); + if (rc) { + pr_err("No ibm,chip-id property for %s?\n", pdev->name); + return -ENODEV; + } + if (pdev->num_resources != 4) { pr_err("Unexpected DT configuration for [%s, %d]\n", pdev->name, vasid); @@ -69,9 +103,32 @@ static int init_vas_instance(struct platform_device *pdev) vinst->paste_win_id_shift = 63 - res->end; - pr_devel("Initialized instance [%s, %d], paste_base 0x%llx, " - "paste_win_id_shift 0x%llx\n", pdev->name, vasid, - vinst->paste_base_addr, vinst->paste_win_id_shift); + hwirq = xive_native_alloc_irq_on_chip(chipid); + if (!hwirq) { + pr_err("Inst%d: Unable to allocate global irq for chip %d\n", + vinst->vas_id, chipid); + return -ENOENT; + } + + vinst->virq = irq_create_mapping(NULL, hwirq); + if (!vinst->virq) { + pr_err("Inst%d: Unable to map global irq %d\n", + vinst->vas_id, hwirq); + return -EINVAL; + } + + xd = irq_get_handler_data(vinst->virq); + if (!xd) { + pr_err("Inst%d: Invalid virq %d\n", + vinst->vas_id, vinst->virq); + return -EINVAL; + } + + vinst->irq_port = xd->trig_page; + pr_devel("Initialized instance [%s, %d] paste_base 0x%llx paste_win_id_shift 0x%llx IRQ %d Port 0x%llx\n", + pdev->name, vasid, vinst->paste_base_addr, + vinst->paste_win_id_shift, vinst->virq, + vinst->irq_port); for_each_possible_cpu(cpu) { if (cpu_to_chip_id(cpu) == of_get_ibm_chip_id(dn)) @@ -82,6 +139,22 @@ static int init_vas_instance(struct platform_device *pdev) list_add(&vinst->node, &vas_instances); mutex_unlock(&vas_mutex); + spin_lock_init(&vinst->fault_lock); + /* + * IRQ and fault handling setup is needed only for user space + * send windows. + */ + if (vinst->virq) { + rc = vas_irq_fault_window_setup(vinst); + /* + * Fault window is used only for user space send windows. + * So if vinst->virq is NULL, tx_win_open returns -ENODEV + * for user space. + */ + if (rc) + vinst->virq = 0; + } + vas_instance_init_dbgdir(vinst); dev_set_drvdata(&pdev->dev, vinst); diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h index 5574aec9ee88..a7143b16232f 100644 --- a/arch/powerpc/platforms/powernv/vas.h +++ b/arch/powerpc/platforms/powernv/vas.h @@ -101,11 +101,9 @@ /* * Initial per-process credits. * Max send window credits: 4K-1 (12-bits in VAS_TX_WCRED) - * Max receive window credits: 64K-1 (16 bits in VAS_LRX_WCRED) * * TODO: Needs tuning for per-process credits */ -#define VAS_RX_WCREDS_MAX ((64 << 10) - 1) #define VAS_TX_WCREDS_MAX ((4 << 10) - 1) #define VAS_WCREDS_DEFAULT (1 << 10) @@ -295,6 +293,22 @@ enum vas_notify_after_count { VAS_NOTIFY_AFTER_2 }; +/* + * NX can generate an interrupt for multiple faults and expects kernel + * to process all of them. So read all valid CRB entries until find the + * invalid one. So use pswid which is pasted by NX and ccw[0] (reserved + * bit in BE) to check valid CRB. CCW[0] will not be touched by user + * space. Application gets CRB formt error if it updates this bit. + * + * Invalidate FIFO during allocation and process all entries from last + * successful read until finds invalid pswid and ccw[0] values. + * After reading each CRB entry from fault FIFO, the kernel invalidate + * it by updating pswid with FIFO_INVALID_ENTRY and CCW[0] with + * CCW0_INVALID. + */ +#define FIFO_INVALID_ENTRY 0xffffffff +#define CCW0_INVALID 1 + /* * One per instance of VAS. Each instance will have a separate set of * receive windows, one per coprocessor type. @@ -313,6 +327,15 @@ struct vas_instance { u64 paste_base_addr; u64 paste_win_id_shift; + u64 irq_port; + int virq; + int fault_crbs; + int fault_fifo_size; + int fifo_in_progress; /* To wake up thread or return IRQ_HANDLED */ + spinlock_t fault_lock; /* Protects fifo_in_progress update */ + void *fault_fifo; + struct vas_window *fault_win; /* Fault window */ + struct mutex mutex; struct vas_window *rxwin[VAS_COP_TYPE_MAX]; struct vas_window *windows[VAS_WINDOWS_PER_CHIP]; @@ -333,7 +356,9 @@ struct vas_window { bool user_win; /* True if user space window */ void *hvwc_map; /* HV window context */ void *uwc_map; /* OS/User window context */ - pid_t pid; /* Linux process id of owner */ + struct pid *pid; /* Linux process id of owner */ + struct pid *tgid; /* Thread group ID of owner */ + struct mm_struct *mm; /* Linux process mm_struct */ int wcreds_max; /* Window credits */ char *dbgname; @@ -406,6 +431,17 @@ extern void vas_init_dbgdir(void); extern void vas_instance_init_dbgdir(struct vas_instance *vinst); extern void vas_window_init_dbgdir(struct vas_window *win); extern void vas_window_free_dbgdir(struct vas_window *win); +extern int vas_setup_fault_window(struct vas_instance *vinst); +extern irqreturn_t vas_fault_thread_fn(int irq, void *data); +extern irqreturn_t vas_fault_handler(int irq, void *dev_id); +extern void vas_return_credit(struct vas_window *window, bool tx); +extern struct vas_window *vas_pswid_to_window(struct vas_instance *vinst, + uint32_t pswid); + +static inline int vas_window_pid(struct vas_window *window) +{ + return pid_vnr(window->pid); +} static inline void vas_log_write(struct vas_window *win, char *name, void *regptr, u64 val) @@ -444,6 +480,21 @@ static inline u64 read_hvwc_reg(struct vas_window *win, return in_be64(win->hvwc_map+reg); } +/* + * Encode/decode the Partition Send Window ID (PSWID) for a window in + * a way that we can uniquely identify any window in the system. i.e. + * we should be able to locate the 'struct vas_window' given the PSWID. + * + * Bits Usage + * 0:7 VAS id (8 bits) + * 8:15 Unused, 0 (3 bits) + * 16:31 Window id (16 bits) + */ +static inline u32 encode_pswid(int vasid, int winid) +{ + return ((u32)winid | (vasid << (31 - 7))); +} + static inline void decode_pswid(u32 pswid, int *vasid, int *winid) { if (vasid) diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index 5218fdc4b29a..71b881e554fc 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -280,12 +280,12 @@ static int xive_native_get_ipi(unsigned int cpu, struct xive_cpu *xc) } #endif /* CONFIG_SMP */ -u32 xive_native_alloc_irq(void) +u32 xive_native_alloc_irq_on_chip(u32 chip_id) { s64 rc; for (;;) { - rc = opal_xive_allocate_irq(OPAL_XIVE_ANY_CHIP); + rc = opal_xive_allocate_irq(chip_id); if (rc != OPAL_BUSY) break; msleep(OPAL_BUSY_DELAY_MS); @@ -294,7 +294,7 @@ u32 xive_native_alloc_irq(void) return 0; return rc; } -EXPORT_SYMBOL_GPL(xive_native_alloc_irq); +EXPORT_SYMBOL_GPL(xive_native_alloc_irq_on_chip); void xive_native_free_irq(u32 irq) {