linux/drivers/infiniband/hw/hfi1/sdma.c

3382 lines
88 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
/*
* Copyright(c) 2015 - 2018 Intel Corporation.
*/
#include <linux/spinlock.h>
#include <linux/seqlock.h>
#include <linux/netdevice.h>
#include <linux/moduleparam.h>
#include <linux/bitops.h>
#include <linux/timer.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include "hfi.h"
#include "common.h"
#include "qp.h"
#include "sdma.h"
#include "iowait.h"
#include "trace.h"
/* must be a power of 2 >= 64 <= 32768 */
#define SDMA_DESCQ_CNT 2048
/* default value of the desct_intr module parameter declared below */
#define SDMA_DESC_INTR 64
#define INVALID_TAIL 0xffff
/* the larger of MAX_16B_PADDING and sizeof(u32), compared as size_t */
#define SDMA_PAD max_t(size_t, MAX_16B_PADDING, sizeof(u32))
/*
 * Requested descriptor queue size (read-only parameter).  The value is
 * range/power-of-2 checked in sdma_get_descq_cnt() before use.
 */
static uint sdma_descq_cnt = SDMA_DESCQ_CNT;
module_param(sdma_descq_cnt, uint, S_IRUGO);
MODULE_PARM_DESC(sdma_descq_cnt, "Number of SDMA descq entries");
/* read-only parameter; units and default are given in the description */
static uint sdma_idle_cnt = 250;
module_param(sdma_idle_cnt, uint, S_IRUGO);
MODULE_PARM_DESC(sdma_idle_cnt, "sdma interrupt idle delay (ns,default 250)");
/* not static: exposed to userspace under the name "num_sdma" */
uint mod_num_sdma;
module_param_named(num_sdma, mod_num_sdma, uint, S_IRUGO);
MODULE_PARM_DESC(num_sdma, "Set max number SDMA engines to use");
/* exposed as "desct_intr"; the only parameter here that is owner-writable */
static uint sdma_desct_intr = SDMA_DESC_INTR;
module_param_named(desct_intr, sdma_desct_intr, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(desct_intr, "Number of SDMA descriptor before interrupt");
#define SDMA_WAIT_BATCH_SIZE 20
/* max wait time for a SDMA engine to indicate it has halted */
#define SDMA_ERR_HALT_TIMEOUT 10 /* ms */
/* all SDMA engine errors that cause a halt */
/* SD(name) pastes the SEND_DMA_ prefix onto a register or field name */
#define SD(name) SEND_DMA_##name
#define ALL_SDMA_ENG_HALT_ERRS \
(SD(ENG_ERR_STATUS_SDMA_WRONG_DW_ERR_SMASK) \
| SD(ENG_ERR_STATUS_SDMA_GEN_MISMATCH_ERR_SMASK) \
| SD(ENG_ERR_STATUS_SDMA_TOO_LONG_ERR_SMASK) \
| SD(ENG_ERR_STATUS_SDMA_TAIL_OUT_OF_BOUNDS_ERR_SMASK) \
| SD(ENG_ERR_STATUS_SDMA_FIRST_DESC_ERR_SMASK) \
| SD(ENG_ERR_STATUS_SDMA_MEM_READ_ERR_SMASK) \
| SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK) \
| SD(ENG_ERR_STATUS_SDMA_LENGTH_MISMATCH_ERR_SMASK) \
| SD(ENG_ERR_STATUS_SDMA_PACKET_DESC_OVERFLOW_ERR_SMASK) \
| SD(ENG_ERR_STATUS_SDMA_HEADER_SELECT_ERR_SMASK) \
| SD(ENG_ERR_STATUS_SDMA_HEADER_ADDRESS_ERR_SMASK) \
| SD(ENG_ERR_STATUS_SDMA_HEADER_LENGTH_ERR_SMASK) \
| SD(ENG_ERR_STATUS_SDMA_TIMEOUT_ERR_SMASK) \
| SD(ENG_ERR_STATUS_SDMA_DESC_TABLE_UNC_ERR_SMASK) \
| SD(ENG_ERR_STATUS_SDMA_ASSEMBLY_UNC_ERR_SMASK) \
| SD(ENG_ERR_STATUS_SDMA_PACKET_TRACKING_UNC_ERR_SMASK) \
| SD(ENG_ERR_STATUS_SDMA_HEADER_STORAGE_UNC_ERR_SMASK) \
| SD(ENG_ERR_STATUS_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SMASK))
/*
 * sdma_sendctrl operations: one bit per operation, OR-ed together by
 * sdma_set_state() and handed to sdma_sendctrl().
 */
#define SDMA_SENDCTRL_OP_ENABLE BIT(0)
#define SDMA_SENDCTRL_OP_INTENABLE BIT(1)
#define SDMA_SENDCTRL_OP_HALT BIT(2)
#define SDMA_SENDCTRL_OP_CLEANUP BIT(3)
/* handle long defines: short aliases for the egress occupancy field */
#define SDMA_EGRESS_PACKET_OCCUPANCY_SMASK \
SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SMASK
#define SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT \
SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT
/*
 * Printable name for each enum sdma_states value, indexed by the enum.
 * Read by sdma_state_name() and by the state-change trace in
 * sdma_set_state().
 */
static const char * const sdma_state_names[] = {
[sdma_state_s00_hw_down] = "s00_HwDown",
[sdma_state_s10_hw_start_up_halt_wait] = "s10_HwStartUpHaltWait",
[sdma_state_s15_hw_start_up_clean_wait] = "s15_HwStartUpCleanWait",
[sdma_state_s20_idle] = "s20_Idle",
[sdma_state_s30_sw_clean_up_wait] = "s30_SwCleanUpWait",
[sdma_state_s40_hw_clean_up_wait] = "s40_HwCleanUpWait",
[sdma_state_s50_hw_halt_wait] = "s50_HwHaltWait",
[sdma_state_s60_idle_halt_wait] = "s60_IdleHaltWait",
[sdma_state_s80_hw_freeze] = "s80_HwFreeze",
[sdma_state_s82_freeze_sw_clean] = "s82_FreezeSwClean",
[sdma_state_s99_running] = "s99_Running",
};
#ifdef CONFIG_SDMA_VERBOSITY
/*
 * Printable name for each enum sdma_events value, indexed by the enum.
 * Only compiled in when CONFIG_SDMA_VERBOSITY is defined.
 */
static const char * const sdma_event_names[] = {
[sdma_event_e00_go_hw_down] = "e00_GoHwDown",
[sdma_event_e10_go_hw_start] = "e10_GoHwStart",
[sdma_event_e15_hw_halt_done] = "e15_HwHaltDone",
[sdma_event_e25_hw_clean_up_done] = "e25_HwCleanUpDone",
[sdma_event_e30_go_running] = "e30_GoRunning",
[sdma_event_e40_sw_cleaned] = "e40_SwCleaned",
[sdma_event_e50_hw_cleaned] = "e50_HwCleaned",
[sdma_event_e60_hw_halted] = "e60_HwHalted",
[sdma_event_e70_go_idle] = "e70_GoIdle",
[sdma_event_e80_hw_freeze] = "e80_HwFreeze",
[sdma_event_e81_hw_frozen] = "e81_HwFrozen",
[sdma_event_e82_hw_unfreeze] = "e82_HwUnfreeze",
[sdma_event_e85_link_down] = "e85_LinkDown",
[sdma_event_e90_sw_halted] = "e90_SwHalted",
};
#endif
/*
 * Per-state actions consumed by sdma_set_state(), indexed by the state
 * being entered:
 *  - op_enable/op_intenable/op_halt/op_cleanup select which
 *    SDMA_SENDCTRL_OP_* bits are passed to sdma_sendctrl();
 *  - go_s99_running_tofalse / go_s99_running_totrue make sdma_set_state()
 *    clear / set ss->go_s99_running on entry.
 */
static const struct sdma_set_state_action sdma_action_table[] = {
[sdma_state_s00_hw_down] = {
.go_s99_running_tofalse = 1,
.op_enable = 0,
.op_intenable = 0,
.op_halt = 0,
.op_cleanup = 0,
},
[sdma_state_s10_hw_start_up_halt_wait] = {
.op_enable = 0,
.op_intenable = 0,
.op_halt = 1,
.op_cleanup = 0,
},
[sdma_state_s15_hw_start_up_clean_wait] = {
.op_enable = 0,
.op_intenable = 1,
.op_halt = 0,
.op_cleanup = 1,
},
[sdma_state_s20_idle] = {
.op_enable = 0,
.op_intenable = 1,
.op_halt = 0,
.op_cleanup = 0,
},
[sdma_state_s30_sw_clean_up_wait] = {
.op_enable = 0,
.op_intenable = 0,
.op_halt = 0,
.op_cleanup = 0,
},
[sdma_state_s40_hw_clean_up_wait] = {
.op_enable = 0,
.op_intenable = 0,
.op_halt = 0,
.op_cleanup = 1,
},
[sdma_state_s50_hw_halt_wait] = {
.op_enable = 0,
.op_intenable = 0,
.op_halt = 0,
.op_cleanup = 0,
},
[sdma_state_s60_idle_halt_wait] = {
.go_s99_running_tofalse = 1,
.op_enable = 0,
.op_intenable = 0,
.op_halt = 1,
.op_cleanup = 0,
},
[sdma_state_s80_hw_freeze] = {
.op_enable = 0,
.op_intenable = 0,
.op_halt = 0,
.op_cleanup = 0,
},
[sdma_state_s82_freeze_sw_clean] = {
.op_enable = 0,
.op_intenable = 0,
.op_halt = 0,
.op_cleanup = 0,
},
/* the only state that enables the engine and sets go_s99_running */
[sdma_state_s99_running] = {
.op_enable = 1,
.op_intenable = 1,
.op_halt = 0,
.op_cleanup = 0,
.go_s99_running_totrue = 1,
},
};
#define SDMA_TAIL_UPDATE_THRESH 0x1F
/* declare all statics here rather than keep sorting */
/* reference counting / completion of struct sdma_state */
static void sdma_complete(struct kref *);
static void sdma_finalput(struct sdma_state *);
static void sdma_get(struct sdma_state *);
/* tasklet bodies and state-machine helpers */
static void sdma_hw_clean_up_task(struct tasklet_struct *);
static void sdma_put(struct sdma_state *);
static void sdma_set_state(struct sdma_engine *, enum sdma_states);
static void sdma_start_hw_clean_up(struct sdma_engine *);
static void sdma_sw_clean_up_task(struct tasklet_struct *);
static void sdma_sendctrl(struct sdma_engine *, unsigned);
static void init_sdma_regs(struct sdma_engine *, u32, uint);
/*
 * Two event entry points: within this file the __ variant is the one
 * invoked while head_lock is already write-held by the caller.
 */
static void sdma_process_event(
struct sdma_engine *sde,
enum sdma_events event);
static void __sdma_process_event(
struct sdma_engine *sde,
enum sdma_events event);
static void dump_sdma_state(struct sdma_engine *sde);
static void sdma_make_progress(struct sdma_engine *sde, u64 status);
static void sdma_desc_avail(struct sdma_engine *sde, uint avail);
static void sdma_flush_descq(struct sdma_engine *sde);
/**
 * sdma_state_name() - look up the printable name of an SDMA state
 * @state: state value used as the index into sdma_state_names[]
 *
 * Return: the string stored in sdma_state_names[] for @state.
 */
static const char *sdma_state_name(enum sdma_states state)
{
	const char *name = sdma_state_names[state];

	return name;
}
/* Take one more reference on the state machine's kref. */
static void sdma_get(struct sdma_state *ss)
{
	struct kref *ref = &ss->kref;

	kref_get(ref);
}
/*
 * kref release callback (passed to kref_put() by sdma_put()): signals
 * the completion embedded in the sdma_state that owns @kref.
 */
static void sdma_complete(struct kref *kref)
{
	struct sdma_state *owner = container_of(kref, struct sdma_state, kref);

	complete(&owner->comp);
}
/* Drop one reference; sdma_complete() runs when the count reaches zero. */
static void sdma_put(struct sdma_state *ss)
{
	struct kref *ref = &ss->kref;

	kref_put(ref, sdma_complete);
}
static void sdma_finalput(struct sdma_state *ss)
{
sdma_put(ss);
wait_for_completion(&ss->comp);
}
/*
 * Write @value to the CSR at @offset0 for this engine; the engine index
 * (sde->this_idx) is passed to write_kctxt_csr() as the context.
 */
static inline void write_sde_csr(struct sdma_engine *sde, u32 offset0,
				 u64 value)
{
	struct hfi1_devdata *dd = sde->dd;

	write_kctxt_csr(dd, sde->this_idx, offset0, value);
}
/*
 * Read the CSR at @offset0 for this engine; the engine index
 * (sde->this_idx) is passed to read_kctxt_csr() as the context.
 */
static inline u64 read_sde_csr(struct sdma_engine *sde, u32 offset0)
{
	struct hfi1_devdata *dd = sde->dd;

	return read_kctxt_csr(dd, sde->this_idx, offset0);
}
/*
* sdma_wait_for_packet_egress() - wait for the VL FIFO occupancy for
* sdma engine 'sde' to drop to 0.
*/
static void sdma_wait_for_packet_egress(struct sdma_engine *sde,
int pause)
{
u64 off = 8 * sde->this_idx;
struct hfi1_devdata *dd = sde->dd;
int lcnt = 0;
u64 reg_prev;
u64 reg = 0;
while (1) {
reg_prev = reg;
reg = read_csr(dd, off + SEND_EGRESS_SEND_DMA_STATUS);
reg &= SDMA_EGRESS_PACKET_OCCUPANCY_SMASK;
reg >>= SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT;
if (reg == 0)
break;
/* counter is reest if accupancy count changes */
if (reg != reg_prev)
lcnt = 0;
if (lcnt++ > 500) {
/* timed out - bounce the link */
dd_dev_err(dd, "%s: engine %u timeout waiting for packets to egress, remaining count %u, bouncing link\n",
__func__, sde->this_idx, (u32)reg);
queue_work(dd->pport->link_wq,
&dd->pport->link_bounce_work);
break;
}
udelay(1);
}
}
/*
 * sdma_wait() - wait for packet egress to complete on every SDMA engine
 * of the device, by calling sdma_wait_for_packet_egress() on each entry
 * of dd->per_sdma[] in index order.
 */
void sdma_wait(struct hfi1_devdata *dd)
{
	int idx = 0;

	while (idx < dd->num_sdma) {
		sdma_wait_for_packet_egress(&dd->per_sdma[idx], 0);
		idx++;
	}
}
/*
 * Program the engine's DESC_CNT register with @cnt (masked and shifted
 * into the CNT field).  Does nothing unless the device advertises
 * HFI1_HAS_SDMA_TIMEOUT.
 */
static inline void sdma_set_desc_cnt(struct sdma_engine *sde, unsigned cnt)
{
	if (sde->dd->flags & HFI1_HAS_SDMA_TIMEOUT) {
		u64 field = cnt;

		field &= SD(DESC_CNT_CNT_MASK);
		field <<= SD(DESC_CNT_CNT_SHIFT);
		write_sde_csr(sde, SD(DESC_CNT), field);
	}
}
static inline void complete_tx(struct sdma_engine *sde,
struct sdma_txreq *tx,
int res)
{
/* protect against complete modifying */
struct iowait *wait = tx->wait;
callback_t complete = tx->complete;
#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
trace_hfi1_sdma_out_sn(sde, tx->sn);
if (WARN_ON_ONCE(sde->head_sn != tx->sn))
dd_dev_err(sde->dd, "expected %llu got %llu\n",
sde->head_sn, tx->sn);
sde->head_sn++;
#endif
__sdma_txclean(sde->dd, tx);
if (complete)
(*complete)(tx, res);
if (iowait_sdma_dec(wait))
iowait_drain_wakeup(wait);
}
/*
 * Complete all the sdma requests with a SDMA_TXREQ_S_ABORTED status
 *
 * Depending on timing there can be txreqs in two places:
 * - in the descq ring
 * - in the flush list
 *
 * To avoid ordering issues the descq ring needs to be flushed
 * first followed by the flush list.
 *
 * This routine is called from two places
 * - From a work queue item
 * - Directly from the state machine just before setting the
 * state to running
 *
 * After both flushes, every waiter on sde->dmawait that has a wakeup
 * callback is woken with SDMA_AVAIL_REASON and unlinked.
 *
 * Must be called with head_lock held
 *
 */
static void sdma_flush(struct sdma_engine *sde)
{
struct sdma_txreq *txp, *txp_next;
LIST_HEAD(flushlist);
unsigned long flags;
uint seq;
/* flush from head to tail */
sdma_flush_descq(sde);
spin_lock_irqsave(&sde->flushlist_lock, flags);
/* copy flush list */
/*
 * Splice onto a private list so the completions below run without
 * flushlist_lock held.
 */
list_splice_init(&sde->flushlist, &flushlist);
spin_unlock_irqrestore(&sde->flushlist_lock, flags);
/* flush from flush list */
list_for_each_entry_safe(txp, txp_next, &flushlist, list)
complete_tx(sde, txp, SDMA_TXREQ_S_ABORTED);
/* wakeup QPs orphaned on the dmawait list */
/*
 * The emptiness test is done as a waitlock reader; only when entries
 * are present is the write side taken to walk the list.  Taking the
 * write side changes the sequence, so read_seqretry() then sends the
 * loop around for another pass.
 */
do {
struct iowait *w, *nw;
seq = read_seqbegin(&sde->waitlock);
if (!list_empty(&sde->dmawait)) {
write_seqlock(&sde->waitlock);
list_for_each_entry_safe(w, nw, &sde->dmawait, list) {
/* entries without a wakeup callback are left on the list */
if (w->wakeup) {
w->wakeup(w, SDMA_AVAIL_REASON);
list_del_init(&w->list);
}
}
write_sequnlock(&sde->waitlock);
}
} while (read_seqretry(&sde->waitlock, seq));
}
/*
* Fields a work request for flushing the descq ring
* and the flush list
*
* If the engine has been brought to running during
* the scheduling delay, the flush is ignored, assuming
* that the process of bringing the engine to running
* would have done this flush prior to going to running.
*
*/
static void sdma_field_flush(struct work_struct *work)
{
unsigned long flags;
struct sdma_engine *sde =
container_of(work, struct sdma_engine, flush_worker);
write_seqlock_irqsave(&sde->head_lock, flags);
if (!__sdma_running(sde))
sdma_flush(sde);
write_sequnlock_irqrestore(&sde->head_lock, flags);
}
/*
 * Work item: poll the engine STATUS register until ENG_HALTED is set or
 * SDMA_ERR_HALT_TIMEOUT ms have elapsed, sleeping 80-120us between
 * polls, then deliver e15_hw_halt_done to the state machine either way.
 */
static void sdma_err_halt_wait(struct work_struct *work)
{
	struct sdma_engine *sde =
		container_of(work, struct sdma_engine, err_halt_worker);
	unsigned long deadline =
		jiffies + msecs_to_jiffies(SDMA_ERR_HALT_TIMEOUT);

	for (;;) {
		u64 halted = read_sde_csr(sde, SD(STATUS));

		halted &= SD(STATUS_ENG_HALTED_SMASK);
		if (halted)
			break;
		if (time_after(jiffies, deadline)) {
			dd_dev_err(sde->dd,
				   "SDMA engine %d - timeout waiting for engine to halt\n",
				   sde->this_idx);
			/*
			 * Carry on regardless.  An uncorrectable error in
			 * the wrong spot could lead here.
			 */
			break;
		}
		usleep_range(80, 120);
	}
	sdma_process_event(sde, sdma_event_e15_hw_halt_done);
}
/*
 * On non-Bx hardware with the SDMA_AHG capability set, record the
 * current descq_head of every other engine in its progress_check_head
 * and arm this engine's err_progress_check_timer 10 jiffies out.
 * Otherwise do nothing.
 */
static void sdma_err_progress_check_schedule(struct sdma_engine *sde)
{
	struct hfi1_devdata *dd;
	unsigned n;

	if (is_bx(sde->dd) || !HFI1_CAP_IS_KSET(SDMA_AHG))
		return;

	dd = sde->dd;
	for (n = 0; n < dd->num_sdma; n++) {
		struct sdma_engine *other = &dd->per_sdma[n];

		if (other == sde)
			continue;
		other->progress_check_head = other->descq_head;
	}
	dd_dev_err(sde->dd,
		   "SDMA engine %d - check scheduled\n",
		   sde->this_idx);
	mod_timer(&sde->err_progress_check_timer, jiffies + 10);
}
static void sdma_err_progress_check(struct timer_list *t)
{
unsigned index;
struct sdma_engine *sde = from_timer(sde, t, err_progress_check_timer);
dd_dev_err(sde->dd, "SDE progress check event\n");
for (index = 0; index < sde->dd->num_sdma; index++) {
struct sdma_engine *curr_sde = &sde->dd->per_sdma[index];
unsigned long flags;
/* check progress on each engine except the current one */
if (curr_sde == sde)
continue;
/*
* We must lock interrupts when acquiring sde->lock,
* to avoid a deadlock if interrupt triggers and spins on
* the same lock on same CPU
*/
spin_lock_irqsave(&curr_sde->tail_lock, flags);
write_seqlock(&curr_sde->head_lock);
/* skip non-running queues */
if (curr_sde->state.current_state != sdma_state_s99_running) {
write_sequnlock(&curr_sde->head_lock);
spin_unlock_irqrestore(&curr_sde->tail_lock, flags);
continue;
}
if ((curr_sde->descq_head != curr_sde->descq_tail) &&
(curr_sde->descq_head ==
curr_sde->progress_check_head))
__sdma_process_event(curr_sde,
sdma_event_e90_sw_halted);
write_sequnlock(&curr_sde->head_lock);
spin_unlock_irqrestore(&curr_sde->tail_lock, flags);
}
schedule_work(&sde->err_halt_worker);
}
/*
 * Tasklet body: busy-poll the engine STATUS register (10us between
 * reads) until ENG_CLEANED_UP is set, then deliver e25_hw_clean_up_done.
 * There is no timeout in this loop.
 */
static void sdma_hw_clean_up_task(struct tasklet_struct *t)
{
	struct sdma_engine *sde = from_tasklet(sde, t,
					       sdma_hw_clean_up_task);

	for (;;) {
		u64 cleaned;

#ifdef CONFIG_SDMA_VERBOSITY
		dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
			   sde->this_idx, slashstrip(__FILE__), __LINE__,
			   __func__);
#endif
		cleaned = read_sde_csr(sde, SD(STATUS));
		cleaned &= SD(STATUS_ENG_CLEANED_UP_SMASK);
		if (cleaned)
			break;
		udelay(10);
	}
	sdma_process_event(sde, sdma_event_e25_hw_clean_up_done);
}
/* Return the tx_ring entry at the current tx_head (masked by sdma_mask). */
static inline struct sdma_txreq *get_txhead(struct sdma_engine *sde)
{
	struct sdma_txreq *head_tx = sde->tx_ring[sde->tx_head & sde->sdma_mask];

	return head_tx;
}
/*
* flush ring for recovery
*/
static void sdma_flush_descq(struct sdma_engine *sde)
{
u16 head, tail;
int progress = 0;
struct sdma_txreq *txp = get_txhead(sde);
/* The reason for some of the complexity of this code is that
* not all descriptors have corresponding txps. So, we have to
* be able to skip over descs until we wander into the range of
* the next txp on the list.
*/
head = sde->descq_head & sde->sdma_mask;
tail = sde->descq_tail & sde->sdma_mask;
while (head != tail) {
/* advance head, wrap if needed */
head = ++sde->descq_head & sde->sdma_mask;
/* if now past this txp's descs, do the callback */
if (txp && txp->next_descq_idx == head) {
/* remove from list */
sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
complete_tx(sde, txp, SDMA_TXREQ_S_ABORTED);
trace_hfi1_sdma_progress(sde, head, tail, txp);
txp = get_txhead(sde);
}
progress++;
}
if (progress)
sdma_desc_avail(sde, sdma_descq_freecnt(sde));
}
/*
 * Tasklet body for the software clean-up step.
 *
 * With tail_lock and head_lock held: retire whatever the hardware has
 * completed, abort everything that is still queued, reset the software
 * head/tail bookkeeping, and finally raise e40_sw_cleaned.
 */
static void sdma_sw_clean_up_task(struct tasklet_struct *t)
{
struct sdma_engine *sde = from_tasklet(sde, t, sdma_sw_clean_up_task);
unsigned long flags;
/* tail_lock is taken first (with interrupts off), then head_lock */
spin_lock_irqsave(&sde->tail_lock, flags);
write_seqlock(&sde->head_lock);
/*
 * At this point, the following should always be true:
 * - We are halted, so no more descriptors are getting retired.
 * - We are not running, so no one is submitting new work.
 * - Only we can send the e40_sw_cleaned, so we can't start
 * running again until we say so. So, the active list and
 * descq are ours to play with.
 */
/*
 * In the error clean up sequence, software clean must be called
 * before the hardware clean so we can use the hardware head in
 * the progress routine. A hardware clean or SPC unfreeze will
 * reset the hardware head.
 *
 * Process all retired requests. The progress routine will use the
 * latest physical hardware head - we are not running so speed does
 * not matter.
 */
sdma_make_progress(sde, 0);
sdma_flush(sde);
/*
 * Reset our notion of head and tail.
 * Note that the HW registers have been reset via an earlier
 * clean up.
 */
sde->descq_tail = 0;
sde->descq_head = 0;
/* recomputed only after head and tail have both been zeroed */
sde->desc_avail = sdma_descq_freecnt(sde);
*sde->head_dma = 0;
/* __ variant: head_lock is already write-held here */
__sdma_process_event(sde, sdma_event_e40_sw_cleaned);
write_sequnlock(&sde->head_lock);
spin_unlock_irqrestore(&sde->tail_lock, flags);
}
static void sdma_sw_tear_down(struct sdma_engine *sde)
{
struct sdma_state *ss = &sde->state;
/* Releasing this reference means the state machine has stopped. */
sdma_put(ss);
/* stop waiting for all unfreeze events to complete */
atomic_set(&sde->dd->sdma_unfreeze_count, -1);
wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
}
/* Schedule the engine's hardware clean-up tasklet at high priority. */
static void sdma_start_hw_clean_up(struct sdma_engine *sde)
{
	struct tasklet_struct *task = &sde->sdma_hw_clean_up_task;

	tasklet_hi_schedule(task);
}
static void sdma_set_state(struct sdma_engine *sde,
enum sdma_states next_state)
{
struct sdma_state *ss = &sde->state;
const struct sdma_set_state_action *action = sdma_action_table;
unsigned op = 0;
trace_hfi1_sdma_state(
sde,
sdma_state_names[ss->current_state],
sdma_state_names[next_state]);
/* debugging bookkeeping */
ss->previous_state = ss->current_state;
ss->previous_op = ss->current_op;
ss->current_state = next_state;
if (ss->previous_state != sdma_state_s99_running &&
next_state == sdma_state_s99_running)
sdma_flush(sde);
if (action[next_state].op_enable)
op |= SDMA_SENDCTRL_OP_ENABLE;
if (action[next_state].op_intenable)
op |= SDMA_SENDCTRL_OP_INTENABLE;
if (action[next_state].op_halt)
op |= SDMA_SENDCTRL_OP_HALT;
if (action[next_state].op_cleanup)
op |= SDMA_SENDCTRL_OP_CLEANUP;
if (action[next_state].go_s99_running_tofalse)
ss->go_s99_running = 0;
if (action[next_state].go_s99_running_totrue)
ss->go_s99_running = 1;
ss->current_op = op;
sdma_sendctrl(sde, ss->current_op);
}
/**
* sdma_get_descq_cnt() - called when device probed
*
* Return a validated descq count.
*
* This is currently only used in the verbs initialization to build the tx
* list.
*
* This will probably be deleted in favor of a more scalable approach to
* alloc tx's.
*
*/
u16 sdma_get_descq_cnt(void)
{
u16 count = sdma_descq_cnt;
if (!count)
return SDMA_DESCQ_CNT;
/* count must be a power of 2 greater than 64 and less than
* 32768. Otherwise return default.
*/
if (!is_power_of_2(count))
return SDMA_DESCQ_CNT;
if (count < 64 || count > 32768)
return SDMA_DESCQ_CNT;
return count;
}
/**
 * sdma_engine_get_vl() - return vl for a given sdma engine
 * @sde: sdma engine
 *
 * Looks up the engine's index in the engine_to_vl[] back-map of the
 * current dd->sdma_map.  The map pointer is read under rcu_read_lock().
 *
 * NOTE(review): the table entry passes through a u8 before being
 * returned, and sdma_map_init() seeds unmapped entries with -1, so an
 * unmapped engine presumably comes back as 255 rather than a negative
 * value - confirm against the engine_to_vl[] element type.
 *
 * Return: the table entry (as a u8 value), or -EINVAL when the engine
 * index is out of range or no map is installed.
 */
int sdma_engine_get_vl(struct sdma_engine *sde)
{
	struct sdma_vl_map *vlmap;
	int ret = -EINVAL;

	if (sde->this_idx >= TXE_NUM_SDMA_ENGINES)
		return ret;

	rcu_read_lock();
	vlmap = rcu_dereference(sde->dd->sdma_map);
	if (likely(vlmap)) {
		u8 vl = vlmap->engine_to_vl[sde->this_idx];

		ret = vl;
	}
	rcu_read_unlock();
	return ret;
}
/**
 * sdma_select_engine_vl() - select sdma engine
 * @dd: devdata
 * @selector: a spreading factor
 * @vl: this vl
 *
 * Picks an engine from the current dd->sdma_map using @vl to choose the
 * per-VL element and @selector to choose a slot within it.  The map is
 * read under rcu_read_lock().
 *
 * Return: the selected engine; &dd->per_sdma[0] when @vl is not below
 * num_vls, when no map is installed, or when the selected slot is NULL.
 */
struct sdma_engine *sdma_select_engine_vl(
	struct hfi1_devdata *dd,
	u32 selector,
	u8 vl)
{
	struct sdma_engine *chosen = NULL;

	/*
	 * An out-of-range vl should only be seen if SC->VL changed after
	 * the initial checks on the QP/AH; engine 0 is used in that case.
	 */
	if (vl < num_vls) {
		struct sdma_vl_map *vlmap;
		struct sdma_map_elem *elem;

		rcu_read_lock();
		vlmap = rcu_dereference(dd->sdma_map);
		if (unlikely(!vlmap)) {
			rcu_read_unlock();
			/* this exit does not emit the selection trace */
			return &dd->per_sdma[0];
		}
		elem = vlmap->map[vl & vlmap->mask];
		chosen = elem->sde[selector & elem->mask];
		rcu_read_unlock();
	}

	if (!chosen)
		chosen = &dd->per_sdma[0];
	trace_hfi1_sdma_engine_select(dd, selector, vl, chosen->this_idx);
	return chosen;
}
/**
 * sdma_select_engine_sc() - select sdma engine
 * @dd: devdata
 * @selector: a spreading factor
 * @sc5: the 5 bit sc
 *
 * Translates @sc5 to a vl with sc_to_vlt() and defers to
 * sdma_select_engine_vl().
 *
 * Return: the engine chosen by sdma_select_engine_vl().
 */
struct sdma_engine *sdma_select_engine_sc(
	struct hfi1_devdata *dd,
	u32 selector,
	u8 sc5)
{
	return sdma_select_engine_vl(dd, selector, sc_to_vlt(dd, sc5));
}
/*
 * Per-(cpu, vl) engine table used by the cpu-to-sde affinity code.
 * Users index sde[] with (selector & mask); ctr counts the engines that
 * were explicitly added, and the slots from ctr up to mask are filled
 * by sdma_populate_sde_map().
 */
struct sdma_rht_map_elem {
u32 mask;
u8 ctr;
struct sdma_engine *sde[];
};
/* Hash table node: one per CPU, keyed by cpu_id, one map slot per VL. */
struct sdma_rht_node {
unsigned long cpu_id;
struct sdma_rht_map_elem *map[HFI1_MAX_VLS_SUPPORTED];
struct rhash_head node;
};
/* size hint for the table below */
#define NR_CPUS_HINT 192
/* rhashtable layout: key is sdma_rht_node.cpu_id, link is .node */
static const struct rhashtable_params sdma_rht_params = {
.nelem_hint = NR_CPUS_HINT,
.head_offset = offsetof(struct sdma_rht_node, node),
.key_offset = offsetof(struct sdma_rht_node, cpu_id),
.key_len = sizeof_field(struct sdma_rht_node, cpu_id),
.max_size = NR_CPUS,
.min_size = 8,
.automatic_shrinking = true,
};
/**
 * sdma_select_user_engine() - select sdma engine based on user setup
 * @dd: devdata
 * @selector: a spreading factor
 * @vl: this vl
 *
 * This function returns an sdma engine for a user sdma request.
 * User defined sdma engine affinity setting is honored when applicable,
 * otherwise system default sdma engine mapping is used. To ensure correct
 * ordering, the mapping from <selector, vl> to sde must remain unchanged.
 *
 * Return: the engine from the per-CPU affinity table when the calling
 * task is pinned to a single CPU and that CPU has a non-NULL entry for
 * @vl; otherwise whatever sdma_select_engine_vl() returns.
 */
struct sdma_engine *sdma_select_user_engine(struct hfi1_devdata *dd,
					    u32 selector, u8 vl)
{
	struct sdma_rht_node *rht_node;
	struct sdma_engine *sde = NULL;
	unsigned long cpu_id;

	/*
	 * To ensure that always the same sdma engine(s) will be
	 * selected make sure the process is pinned to this CPU only.
	 */
	if (current->nr_cpus_allowed != 1)
		goto out;

	rcu_read_lock();
	/* the CPU id is sampled inside the RCU read-side section */
	cpu_id = smp_processor_id();
	rht_node = rhashtable_lookup(dd->sdma_rht, &cpu_id,
				     sdma_rht_params);
	/*
	 * @vl is caller supplied; never index past the per-node map[]
	 * array.  Out-of-range values fall through to the default
	 * selection below.
	 */
	if (rht_node && vl < ARRAY_SIZE(rht_node->map) && rht_node->map[vl]) {
		struct sdma_rht_map_elem *map = rht_node->map[vl];

		sde = map->sde[selector & map->mask];
	}
	rcu_read_unlock();

	if (sde)
		return sde;
out:
	return sdma_select_engine_vl(dd, selector, vl);
}
/*
 * Fill the slots between ctr and the next power of two by repeating the
 * entries from the start of sde[], so that every index reachable via
 * the power-of-two mask holds an engine pointer.
 */
static void sdma_populate_sde_map(struct sdma_rht_map_elem *map)
{
	unsigned long pad = roundup_pow_of_two(map->ctr ? : 1) - map->ctr;
	unsigned long n;

	for (n = 0; n < pad; n++)
		map->sde[map->ctr + n] = map->sde[n];
}
/*
 * Remove the first occurrence of @sde from the first ctr entries of
 * @map, close the gap, and recompute mask and the padded tail.  Does
 * nothing when @sde is not among those entries.
 */
static void sdma_cleanup_sde_map(struct sdma_rht_map_elem *map,
				 struct sdma_engine *sde)
{
	unsigned int pos;

	/* only the first ctr entries can hold a real match */
	for (pos = 0; pos < map->ctr; pos++)
		if (map->sde[pos] == sde)
			break;
	if (pos == map->ctr)
		return;

	memmove(&map->sde[pos], &map->sde[pos + 1],
		(map->ctr - pos - 1) * sizeof(map->sde[0]));
	map->ctr--;
	map->mask = roundup_pow_of_two(map->ctr ? : 1) - 1;
	sdma_populate_sde_map(map);
}
/*
 * Prevents concurrent reads and writes of the sdma engine cpu_mask
 */
static DEFINE_MUTEX(process_to_sde_mutex);
/*
 * sdma_set_cpu_to_sde_map() - replace the set of CPUs mapped to @sde
 * @sde: engine whose CPU affinity is being set
 * @buf: CPU list string, parsed with cpulist_parse()
 * @count: not referenced in this body
 *
 * For every CPU in the parsed list that is not already in sde->cpu_mask,
 * @sde is added to that CPU's hash node under the engine's VL (creating
 * the node and/or the per-VL table as needed).  Then, for every online
 * CPU that is NOT in the parsed list, @sde is removed from that CPU's
 * node and the node is freed if it has become empty.  On success
 * sde->cpu_mask is replaced with the new mask.
 *
 * NOTE(review): the error exits taken from inside the first loop jump to
 * "out" past the cpumask_copy(), so hash entries added by earlier
 * iterations stay in place while sde->cpu_mask is left unchanged -
 * confirm this is intended.
 *
 * Return: a non-zero ret if one was set (e.g. -EINVAL, -ENOMEM, or the
 * parse/insert/remove result), otherwise strnlen(buf, PAGE_SIZE).
 */
ssize_t sdma_set_cpu_to_sde_map(struct sdma_engine *sde, const char *buf,
size_t count)
{
struct hfi1_devdata *dd = sde->dd;
cpumask_var_t mask, new_mask;
unsigned long cpu;
int ret, vl, sz;
struct sdma_rht_node *rht_node;
vl = sdma_engine_get_vl(sde);
/* rht_node is only used for its type here; it is not dereferenced */
if (unlikely(vl < 0 || vl >= ARRAY_SIZE(rht_node->map)))
return -EINVAL;
/* zalloc_cpumask_var() returns false on failure, hence the !ret tests */
ret = zalloc_cpumask_var(&mask, GFP_KERNEL);
if (!ret)
return -ENOMEM;
ret = zalloc_cpumask_var(&new_mask, GFP_KERNEL);
if (!ret) {
free_cpumask_var(mask);
return -ENOMEM;
}
ret = cpulist_parse(buf, mask);
if (ret)
goto out_free;
if (!cpumask_subset(mask, cpu_online_mask)) {
dd_dev_warn(sde->dd, "Invalid CPU mask\n");
ret = -EINVAL;
goto out_free;
}
/* every per-VL table is sized for TXE_NUM_SDMA_ENGINES engine slots */
sz = sizeof(struct sdma_rht_map_elem) +
(TXE_NUM_SDMA_ENGINES * sizeof(struct sdma_engine *));
mutex_lock(&process_to_sde_mutex);
for_each_cpu(cpu, mask) {
/* Check if we have this already mapped */
if (cpumask_test_cpu(cpu, &sde->cpu_mask)) {
cpumask_set_cpu(cpu, new_mask);
continue;
}
rht_node = rhashtable_lookup_fast(dd->sdma_rht, &cpu,
sdma_rht_params);
if (!rht_node) {
/* first mapping for this CPU: build a node holding just @sde */
rht_node = kzalloc(sizeof(*rht_node), GFP_KERNEL);
if (!rht_node) {
ret = -ENOMEM;
goto out;
}
rht_node->map[vl] = kzalloc(sz, GFP_KERNEL);
if (!rht_node->map[vl]) {
kfree(rht_node);
ret = -ENOMEM;
goto out;
}
rht_node->cpu_id = cpu;
rht_node->map[vl]->mask = 0;
rht_node->map[vl]->ctr = 1;
rht_node->map[vl]->sde[0] = sde;
ret = rhashtable_insert_fast(dd->sdma_rht,
&rht_node->node,
sdma_rht_params);
if (ret) {
kfree(rht_node->map[vl]);
kfree(rht_node);
dd_dev_err(sde->dd, "Failed to set process to sde affinity for cpu %lu\n",
cpu);
goto out;
}
} else {
int ctr, pow;
/* Add new user mappings */
if (!rht_node->map[vl])
rht_node->map[vl] = kzalloc(sz, GFP_KERNEL);
if (!rht_node->map[vl]) {
ret = -ENOMEM;
goto out;
}
/* append @sde, then widen mask to the next power of two */
rht_node->map[vl]->ctr++;
ctr = rht_node->map[vl]->ctr;
rht_node->map[vl]->sde[ctr - 1] = sde;
pow = roundup_pow_of_two(ctr);
rht_node->map[vl]->mask = pow - 1;
/* Populate the sde map table */
sdma_populate_sde_map(rht_node->map[vl]);
}
cpumask_set_cpu(cpu, new_mask);
}
/* Clean up old mappings */
for_each_cpu(cpu, cpu_online_mask) {
/* this declaration shadows the function-scope rht_node */
struct sdma_rht_node *rht_node;
/* Don't cleanup sdes that are set in the new mask */
if (cpumask_test_cpu(cpu, mask))
continue;
rht_node = rhashtable_lookup_fast(dd->sdma_rht, &cpu,
sdma_rht_params);
if (rht_node) {
bool empty = true;
int i;
/* Remove mappings for old sde */
for (i = 0; i < HFI1_MAX_VLS_SUPPORTED; i++)
if (rht_node->map[i])
sdma_cleanup_sde_map(rht_node->map[i],
sde);
/* Free empty hash table entries */
for (i = 0; i < HFI1_MAX_VLS_SUPPORTED; i++) {
if (!rht_node->map[i])
continue;
if (rht_node->map[i]->ctr) {
empty = false;
break;
}
}
if (empty) {
ret = rhashtable_remove_fast(dd->sdma_rht,
&rht_node->node,
sdma_rht_params);
WARN_ON(ret);
for (i = 0; i < HFI1_MAX_VLS_SUPPORTED; i++)
kfree(rht_node->map[i]);
kfree(rht_node);
}
}
}
cpumask_copy(&sde->cpu_mask, new_mask);
out:
mutex_unlock(&process_to_sde_mutex);
out_free:
free_cpumask_var(mask);
free_cpumask_var(new_mask);
/* non-zero ret wins; otherwise report the length of the input string */
return ret ? : strnlen(buf, PAGE_SIZE);
}
/*
 * sdma_get_cpu_to_sde_map() - format the CPUs mapped to @sde into @buf
 *
 * Writes "empty\n" when sde->cpu_mask has no bits set, otherwise the
 * mask as produced by cpumap_print_to_pagebuf() in list form.  The mask
 * is read under process_to_sde_mutex.
 *
 * Return: strnlen(buf, PAGE_SIZE) of the formatted result.
 */
ssize_t sdma_get_cpu_to_sde_map(struct sdma_engine *sde, char *buf)
{
	const struct cpumask *cpus = &sde->cpu_mask;

	mutex_lock(&process_to_sde_mutex);
	if (!cpumask_empty(cpus))
		cpumap_print_to_pagebuf(true, buf, cpus);
	else
		snprintf(buf, PAGE_SIZE, "%s\n", "empty");
	mutex_unlock(&process_to_sde_mutex);

	return strnlen(buf, PAGE_SIZE);
}
/*
 * Free one sdma_rht_node: every per-VL map it holds, then the node.
 * @arg is not used.
 */
static void sdma_rht_free(void *ptr, void *arg)
{
	struct sdma_rht_node *node = ptr;
	int vl = 0;

	while (vl < HFI1_MAX_VLS_SUPPORTED) {
		kfree(node->map[vl]);
		vl++;
	}
	kfree(node);
}
/**
* sdma_seqfile_dump_cpu_list() - debugfs dump the cpu to sdma mappings
* @s: seq file
* @dd: hfi1_devdata
* @cpuid: cpu id
*
* This routine dumps the process to sde mappings per cpu
*/
void sdma_seqfile_dump_cpu_list(struct seq_file *s,
struct hfi1_devdata *dd,
unsigned long cpuid)
{
struct sdma_rht_node *rht_node;
int i, j;
rht_node = rhashtable_lookup_fast(dd->sdma_rht, &cpuid,
sdma_rht_params);
if (!rht_node)
return;
seq_printf(s, "cpu%3lu: ", cpuid);
for (i = 0; i < HFI1_MAX_VLS_SUPPORTED; i++) {
if (!rht_node->map[i] || !rht_node->map[i]->ctr)
continue;
seq_printf(s, " vl%d: [", i);
for (j = 0; j < rht_node->map[i]->ctr; j++) {
if (!rht_node->map[i]->sde[j])
continue;
if (j > 0)
seq_puts(s, ",");
seq_printf(s, " sdma%2d",
rht_node->map[i]->sde[j]->this_idx);
}
seq_puts(s, " ]");
}
seq_puts(s, "\n");
}
/*
 * Free a vl map: the first actual_vls per-VL elements, then the map
 * itself.  A NULL map is accepted and ignored.
 */
static void sdma_map_free(struct sdma_vl_map *m)
{
	int vl;

	if (m) {
		for (vl = 0; vl < m->actual_vls; vl++)
			kfree(m->map[vl]);
	}
	kfree(m);
}
/*
 * RCU callback handed to call_rcu(): recover the map from its embedded
 * rcu_head and free it.
 */
static void sdma_map_rcu_callback(struct rcu_head *list)
{
	sdma_map_free(container_of(list, struct sdma_vl_map, list));
}
/**
 * sdma_map_init - called when # vls change
 * @dd: hfi1_devdata
 * @port: port number
 * @num_vls: number of vls
 * @vl_engines: per vl engine mapping (optional)
 *
 * This routine changes the mapping based on the number of vls.
 *
 * vl_engines is used to specify a non-uniform vl/engine loading. NULL
 * implies auto computing the loading and giving each VLs a uniform
 * distribution of engines per VL.
 *
 * The auto algorithm computes the sde_per_vl and the number of extra
 * engines. Any extra engines are added from the last VL on down.
 *
 * rcu locking is used here to control access to the mapping fields.
 *
 * If either the num_vls or num_sdma are non-power of 2, the array sizes
 * in the struct sdma_vl_map and the struct sdma_map_elem are rounded
 * up to the next highest power of 2 and the first entry is reused
 * in a round robin fashion.
 *
 * If an error occurs the map change is not done and the mapping is
 * not changed.
 *
 * Return: 0 on success or when the device has no send DMA, -EINVAL when
 * @num_vls is 0, -ENOMEM when an allocation fails.
 */
int sdma_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_engines)
{
	int i, j;
	int extra, sde_per_vl;
	int engine = 0;
	u8 lvl_engines[OPA_MAX_VLS];
	struct sdma_vl_map *oldmap, *newmap;

	if (!(dd->flags & HFI1_HAS_SEND_DMA))
		return 0;

	/*
	 * With no VLs there is nothing to map, and the divisions and
	 * roundup_pow_of_two() below are not defined for 0.
	 */
	if (!num_vls)
		return -EINVAL;

	if (!vl_engines) {
		/* truncate divide */
		sde_per_vl = dd->num_sdma / num_vls;
		/* extras */
		extra = dd->num_sdma % num_vls;
		vl_engines = lvl_engines;
		/* add extras from last vl down */
		for (i = num_vls - 1; i >= 0; i--, extra--)
			vl_engines[i] = sde_per_vl + (extra > 0 ? 1 : 0);
	}

	/* build new map */
	newmap = kzalloc(
		sizeof(struct sdma_vl_map) +
			roundup_pow_of_two(num_vls) *
			sizeof(struct sdma_map_elem *),
		GFP_KERNEL);
	if (!newmap)
		goto bail;
	newmap->actual_vls = num_vls;
	newmap->vls = roundup_pow_of_two(num_vls);
	newmap->mask = (1 << ilog2(newmap->vls)) - 1;

	/* initialize back-map */
	for (i = 0; i < TXE_NUM_SDMA_ENGINES; i++)
		newmap->engine_to_vl[i] = -1;

	for (i = 0; i < newmap->vls; i++) {
		/* save for wrap around */
		int first_engine = engine;

		if (i < newmap->actual_vls) {
			int sz = roundup_pow_of_two(vl_engines[i]);

			/* only allocate once */
			newmap->map[i] = kzalloc(
				sizeof(struct sdma_map_elem) +
					sz * sizeof(struct sdma_engine *),
				GFP_KERNEL);
			if (!newmap->map[i])
				goto bail;
			newmap->map[i]->mask = (1 << ilog2(sz)) - 1;
			/* assign engines */
			for (j = 0; j < sz; j++) {
				newmap->map[i]->sde[j] =
					&dd->per_sdma[engine];
				if (++engine >= first_engine + vl_engines[i])
					/* wrap back to first engine */
					engine = first_engine;
			}
			/* assign back-map */
			for (j = 0; j < vl_engines[i]; j++)
				newmap->engine_to_vl[first_engine + j] = i;
			/*
			 * Step past this VL's engines.  This is done only
			 * for real VLs: vl_engines[] has num_vls valid
			 * entries, so it must not be read for the padding
			 * slots handled below.
			 */
			engine = first_engine + vl_engines[i];
		} else {
			/* just re-use entry without allocating */
			newmap->map[i] = newmap->map[i % num_vls];
		}
	}

	/* newmap in hand, save old map */
	spin_lock_irq(&dd->sde_map_lock);
	oldmap = rcu_dereference_protected(dd->sdma_map,
					   lockdep_is_held(&dd->sde_map_lock));

	/* publish newmap */
	rcu_assign_pointer(dd->sdma_map, newmap);

	spin_unlock_irq(&dd->sde_map_lock);
	/* success, free any old map after grace period */
	if (oldmap)
		call_rcu(&oldmap->list, sdma_map_rcu_callback);
	return 0;
bail:
	/* free any partial allocation */
	sdma_map_free(newmap);
	return -ENOMEM;
}
IB/hfi1: Fix for early release of sdma context With IRQF_SHARED flag set and CONFIG_DEBUG_SHIRQ enabled module removal may result in panic in sdma_interrupt() routine if associated sdma context was released before pci_free_irq(); [ 9198.939885] BUG: unable to handle kernel NULL pointer dereference at (null) [ 9198.940514] IP: sdma_make_progress+0xa5/0x450 [hfi1] [ 9198.941114] PGD 170bdc0067 P4D 170bdc0067 PUD 172063e067 PMD 0 [ 9198.941783] Oops: 0000 [#1] SMP ..... [ 9198.958877] CPU: 132 PID: 64173 Comm: rmmod Tainted: G OE 4.14.0-rc4+ #1 [ 9198.961032] Hardware name: Intel Corporation S7200AP/S7200AP, BIOS S72C610.86B.01.02.0118.080620171935 08/06/2017 [ 9198.963323] task: ffff9681397f0000 task.stack: ffffae1647c40000 [ 9198.965695] RIP: 0010:sdma_make_progress+0xa5/0x450 [hfi1] [ 9198.968082] RSP: 0018:ffffae1647c43be8 EFLAGS: 00010046 [ 9198.970503] RAX: 0000000000000000 RBX: ffff9680ce8b5ca8 RCX: 0000000000000000 [ 9198.973006] RDX: 0000000000000000 RSI: 0000000001a00d28 RDI: ffff9680ce8b5ca0 [ 9198.975546] RBP: ffffae1647c43c40 R08: ffff96814325ec00 R09: 00000000ffffffff [ 9198.978142] R10: 000000004325e501 R11: ffff96814325ec00 R12: ffff9680ce8b5c44 [ 9198.980779] R13: ffff9680ce8b5ca0 R14: 0000000000000000 R15: ffff9680ce8b5b00 [ 9198.983462] FS: 00007f31196ba740(0000) GS:ffff96819df00000(0000) knlGS:0000000000000000 [ 9198.986231] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 9198.989036] CR2: 0000000000000000 CR3: 000000170833f000 CR4: 00000000001406e0 [ 9198.991911] Call Trace: [ 9198.994847] sdma_engine_interrupt+0x82/0x100 [hfi1] [ 9198.997852] sdma_interrupt+0x61/0xc0 [hfi1] [ 9199.000852] __free_irq+0x1b3/0x2d0 [ 9199.003873] free_irq+0x35/0x70 [ 9199.006909] pci_free_irq+0x1c/0x30 [ 9199.009999] clean_up_interrupts+0x53/0xf0 [hfi1] [ 9199.013137] hfi1_start_cleanup+0x117/0x190 [hfi1] [ 9199.016315] postinit_cleanup+0x1d/0x270 [hfi1] [ 9199.019529] remove_one+0x1f3/0x210 [hfi1] [ 9199.022738] pci_device_remove+0x39/0xc0 [ 9199.025974] 
device_release_driver_internal+0x141/0x210 [ 9199.029268] driver_detach+0x3f/0x80 [ 9199.032580] bus_remove_driver+0x55/0xd0 [ 9199.035931] driver_unregister+0x2c/0x50 [ 9199.039321] pci_unregister_driver+0x2a/0xa0 [ 9199.042755] hfi1_mod_cleanup+0x10/0xb50 [hfi1] [ 9199.046196] SyS_delete_module+0x171/0x250 ... Fix by exporting sdma_clean() and removing from sdma_exit(). sdma_exit() now just manipulates the engine state, leaving the memory free to sdma_clean() which is now called just before the dd is freed. Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Reviewed-by: Michael J Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Alex Estrin <alex.estrin@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-02-02 02:43:50 +08:00
/**
* sdma_clean - Clean up allocated memory
IB/hfi1: Fix for early release of sdma context With IRQF_SHARED flag set and CONFIG_DEBUG_SHIRQ enabled module removal may result in panic in sdma_interrupt() routine if associated sdma context was released before pci_free_irq(); [ 9198.939885] BUG: unable to handle kernel NULL pointer dereference at (null) [ 9198.940514] IP: sdma_make_progress+0xa5/0x450 [hfi1] [ 9198.941114] PGD 170bdc0067 P4D 170bdc0067 PUD 172063e067 PMD 0 [ 9198.941783] Oops: 0000 [#1] SMP ..... [ 9198.958877] CPU: 132 PID: 64173 Comm: rmmod Tainted: G OE 4.14.0-rc4+ #1 [ 9198.961032] Hardware name: Intel Corporation S7200AP/S7200AP, BIOS S72C610.86B.01.02.0118.080620171935 08/06/2017 [ 9198.963323] task: ffff9681397f0000 task.stack: ffffae1647c40000 [ 9198.965695] RIP: 0010:sdma_make_progress+0xa5/0x450 [hfi1] [ 9198.968082] RSP: 0018:ffffae1647c43be8 EFLAGS: 00010046 [ 9198.970503] RAX: 0000000000000000 RBX: ffff9680ce8b5ca8 RCX: 0000000000000000 [ 9198.973006] RDX: 0000000000000000 RSI: 0000000001a00d28 RDI: ffff9680ce8b5ca0 [ 9198.975546] RBP: ffffae1647c43c40 R08: ffff96814325ec00 R09: 00000000ffffffff [ 9198.978142] R10: 000000004325e501 R11: ffff96814325ec00 R12: ffff9680ce8b5c44 [ 9198.980779] R13: ffff9680ce8b5ca0 R14: 0000000000000000 R15: ffff9680ce8b5b00 [ 9198.983462] FS: 00007f31196ba740(0000) GS:ffff96819df00000(0000) knlGS:0000000000000000 [ 9198.986231] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 9198.989036] CR2: 0000000000000000 CR3: 000000170833f000 CR4: 00000000001406e0 [ 9198.991911] Call Trace: [ 9198.994847] sdma_engine_interrupt+0x82/0x100 [hfi1] [ 9198.997852] sdma_interrupt+0x61/0xc0 [hfi1] [ 9199.000852] __free_irq+0x1b3/0x2d0 [ 9199.003873] free_irq+0x35/0x70 [ 9199.006909] pci_free_irq+0x1c/0x30 [ 9199.009999] clean_up_interrupts+0x53/0xf0 [hfi1] [ 9199.013137] hfi1_start_cleanup+0x117/0x190 [hfi1] [ 9199.016315] postinit_cleanup+0x1d/0x270 [hfi1] [ 9199.019529] remove_one+0x1f3/0x210 [hfi1] [ 9199.022738] pci_device_remove+0x39/0xc0 [ 9199.025974] 
device_release_driver_internal+0x141/0x210 [ 9199.029268] driver_detach+0x3f/0x80 [ 9199.032580] bus_remove_driver+0x55/0xd0 [ 9199.035931] driver_unregister+0x2c/0x50 [ 9199.039321] pci_unregister_driver+0x2a/0xa0 [ 9199.042755] hfi1_mod_cleanup+0x10/0xb50 [hfi1] [ 9199.046196] SyS_delete_module+0x171/0x250 ... Fix by exporting sdma_clean() and removing from sdma_exit(). sdma_exit() now just manipulates the engine state, leaving the memory free to sdma_clean() which is now called just before the dd is freed. Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Reviewed-by: Michael J Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Alex Estrin <alex.estrin@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-02-02 02:43:50 +08:00
* @dd: struct hfi1_devdata
* @num_engines: num sdma engines
*
IB/hfi1: Fix for early release of sdma context With IRQF_SHARED flag set and CONFIG_DEBUG_SHIRQ enabled module removal may result in panic in sdma_interrupt() routine if associated sdma context was released before pci_free_irq(); [ 9198.939885] BUG: unable to handle kernel NULL pointer dereference at (null) [ 9198.940514] IP: sdma_make_progress+0xa5/0x450 [hfi1] [ 9198.941114] PGD 170bdc0067 P4D 170bdc0067 PUD 172063e067 PMD 0 [ 9198.941783] Oops: 0000 [#1] SMP ..... [ 9198.958877] CPU: 132 PID: 64173 Comm: rmmod Tainted: G OE 4.14.0-rc4+ #1 [ 9198.961032] Hardware name: Intel Corporation S7200AP/S7200AP, BIOS S72C610.86B.01.02.0118.080620171935 08/06/2017 [ 9198.963323] task: ffff9681397f0000 task.stack: ffffae1647c40000 [ 9198.965695] RIP: 0010:sdma_make_progress+0xa5/0x450 [hfi1] [ 9198.968082] RSP: 0018:ffffae1647c43be8 EFLAGS: 00010046 [ 9198.970503] RAX: 0000000000000000 RBX: ffff9680ce8b5ca8 RCX: 0000000000000000 [ 9198.973006] RDX: 0000000000000000 RSI: 0000000001a00d28 RDI: ffff9680ce8b5ca0 [ 9198.975546] RBP: ffffae1647c43c40 R08: ffff96814325ec00 R09: 00000000ffffffff [ 9198.978142] R10: 000000004325e501 R11: ffff96814325ec00 R12: ffff9680ce8b5c44 [ 9198.980779] R13: ffff9680ce8b5ca0 R14: 0000000000000000 R15: ffff9680ce8b5b00 [ 9198.983462] FS: 00007f31196ba740(0000) GS:ffff96819df00000(0000) knlGS:0000000000000000 [ 9198.986231] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 9198.989036] CR2: 0000000000000000 CR3: 000000170833f000 CR4: 00000000001406e0 [ 9198.991911] Call Trace: [ 9198.994847] sdma_engine_interrupt+0x82/0x100 [hfi1] [ 9198.997852] sdma_interrupt+0x61/0xc0 [hfi1] [ 9199.000852] __free_irq+0x1b3/0x2d0 [ 9199.003873] free_irq+0x35/0x70 [ 9199.006909] pci_free_irq+0x1c/0x30 [ 9199.009999] clean_up_interrupts+0x53/0xf0 [hfi1] [ 9199.013137] hfi1_start_cleanup+0x117/0x190 [hfi1] [ 9199.016315] postinit_cleanup+0x1d/0x270 [hfi1] [ 9199.019529] remove_one+0x1f3/0x210 [hfi1] [ 9199.022738] pci_device_remove+0x39/0xc0 [ 9199.025974] 
device_release_driver_internal+0x141/0x210 [ 9199.029268] driver_detach+0x3f/0x80 [ 9199.032580] bus_remove_driver+0x55/0xd0 [ 9199.035931] driver_unregister+0x2c/0x50 [ 9199.039321] pci_unregister_driver+0x2a/0xa0 [ 9199.042755] hfi1_mod_cleanup+0x10/0xb50 [hfi1] [ 9199.046196] SyS_delete_module+0x171/0x250 ... Fix by exporting sdma_clean() and removing from sdma_exit(). sdma_exit() now just manipulates the engine state, leaving the memory free to sdma_clean() which is now called just before the dd is freed. Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Reviewed-by: Michael J Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Alex Estrin <alex.estrin@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-02-02 02:43:50 +08:00
* This routine can be called regardless of the success of
* sdma_init()
*/
IB/hfi1: Fix for early release of sdma context With IRQF_SHARED flag set and CONFIG_DEBUG_SHIRQ enabled module removal may result in panic in sdma_interrupt() routine if associated sdma context was released before pci_free_irq(); [ 9198.939885] BUG: unable to handle kernel NULL pointer dereference at (null) [ 9198.940514] IP: sdma_make_progress+0xa5/0x450 [hfi1] [ 9198.941114] PGD 170bdc0067 P4D 170bdc0067 PUD 172063e067 PMD 0 [ 9198.941783] Oops: 0000 [#1] SMP ..... [ 9198.958877] CPU: 132 PID: 64173 Comm: rmmod Tainted: G OE 4.14.0-rc4+ #1 [ 9198.961032] Hardware name: Intel Corporation S7200AP/S7200AP, BIOS S72C610.86B.01.02.0118.080620171935 08/06/2017 [ 9198.963323] task: ffff9681397f0000 task.stack: ffffae1647c40000 [ 9198.965695] RIP: 0010:sdma_make_progress+0xa5/0x450 [hfi1] [ 9198.968082] RSP: 0018:ffffae1647c43be8 EFLAGS: 00010046 [ 9198.970503] RAX: 0000000000000000 RBX: ffff9680ce8b5ca8 RCX: 0000000000000000 [ 9198.973006] RDX: 0000000000000000 RSI: 0000000001a00d28 RDI: ffff9680ce8b5ca0 [ 9198.975546] RBP: ffffae1647c43c40 R08: ffff96814325ec00 R09: 00000000ffffffff [ 9198.978142] R10: 000000004325e501 R11: ffff96814325ec00 R12: ffff9680ce8b5c44 [ 9198.980779] R13: ffff9680ce8b5ca0 R14: 0000000000000000 R15: ffff9680ce8b5b00 [ 9198.983462] FS: 00007f31196ba740(0000) GS:ffff96819df00000(0000) knlGS:0000000000000000 [ 9198.986231] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 9198.989036] CR2: 0000000000000000 CR3: 000000170833f000 CR4: 00000000001406e0 [ 9198.991911] Call Trace: [ 9198.994847] sdma_engine_interrupt+0x82/0x100 [hfi1] [ 9198.997852] sdma_interrupt+0x61/0xc0 [hfi1] [ 9199.000852] __free_irq+0x1b3/0x2d0 [ 9199.003873] free_irq+0x35/0x70 [ 9199.006909] pci_free_irq+0x1c/0x30 [ 9199.009999] clean_up_interrupts+0x53/0xf0 [hfi1] [ 9199.013137] hfi1_start_cleanup+0x117/0x190 [hfi1] [ 9199.016315] postinit_cleanup+0x1d/0x270 [hfi1] [ 9199.019529] remove_one+0x1f3/0x210 [hfi1] [ 9199.022738] pci_device_remove+0x39/0xc0 [ 9199.025974] 
device_release_driver_internal+0x141/0x210 [ 9199.029268] driver_detach+0x3f/0x80 [ 9199.032580] bus_remove_driver+0x55/0xd0 [ 9199.035931] driver_unregister+0x2c/0x50 [ 9199.039321] pci_unregister_driver+0x2a/0xa0 [ 9199.042755] hfi1_mod_cleanup+0x10/0xb50 [hfi1] [ 9199.046196] SyS_delete_module+0x171/0x250 ... Fix by exporting sdma_clean() and removing from sdma_exit(). sdma_exit() now just manipulates the engine state, leaving the memory free to sdma_clean() which is now called just before the dd is freed. Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Reviewed-by: Michael J Ruhl <michael.j.ruhl@intel.com> Signed-off-by: Alex Estrin <alex.estrin@intel.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-02-02 02:43:50 +08:00
void sdma_clean(struct hfi1_devdata *dd, size_t num_engines)
{
	size_t engine;

	/* shared pad buffer */
	if (dd->sdma_pad_dma) {
		dma_free_coherent(&dd->pcidev->dev, SDMA_PAD,
				  (void *)dd->sdma_pad_dma, dd->sdma_pad_phys);
		dd->sdma_pad_dma = NULL;
		dd->sdma_pad_phys = 0;
	}

	/* buffer the engine heads are DMA'ed into */
	if (dd->sdma_heads_dma) {
		dma_free_coherent(&dd->pcidev->dev, dd->sdma_heads_size,
				  (void *)dd->sdma_heads_dma,
				  dd->sdma_heads_phys);
		dd->sdma_heads_dma = NULL;
		dd->sdma_heads_phys = 0;
	}

	/* per-engine descriptor queue and tx ring; skipped without an array */
	if (dd->per_sdma) {
		for (engine = 0; engine < num_engines; engine++) {
			struct sdma_engine *sde = dd->per_sdma + engine;

			/* head_dma pointed into the heads buffer freed above */
			sde->head_dma = NULL;
			sde->head_phys = 0;

			if (sde->descq) {
				dma_free_coherent(&dd->pcidev->dev,
						  sde->descq_cnt *
							sizeof(u64[2]),
						  sde->descq, sde->descq_phys);
				sde->descq = NULL;
				sde->descq_phys = 0;
			}

			kvfree(sde->tx_ring);
			sde->tx_ring = NULL;
		}
	}

	/*
	 * Only touch sde_map_lock when a map exists: the map check comes
	 * first so the lock is never taken when no map was ever installed.
	 */
	if (rcu_access_pointer(dd->sdma_map)) {
		spin_lock_irq(&dd->sde_map_lock);
		sdma_map_free(rcu_access_pointer(dd->sdma_map));
		RCU_INIT_POINTER(dd->sdma_map, NULL);
		spin_unlock_irq(&dd->sde_map_lock);
		synchronize_rcu();
	}

	kfree(dd->per_sdma);
	dd->per_sdma = NULL;

	/* hash table: destroy the contents, then free the table itself */
	if (dd->sdma_rht) {
		rhashtable_free_and_destroy(dd->sdma_rht, sdma_rht_free, NULL);
		kfree(dd->sdma_rht);
		dd->sdma_rht = NULL;
	}
}
/**
 * sdma_init() - set up the SDMA engines when the device is probed
 * @dd: hfi1_devdata
 * @port: port number (currently only zero)
 *
 * Allocates the per-engine array, each engine's descriptor queue and tx
 * ring, the head and pad DMA buffers, then initializes every engine's
 * registers, the engine map and the hash table.
 * Interrupts are not required to be enabled.
 * Any failure after the per-engine array has been allocated is unwound
 * through sdma_clean().
 *
 * Returns:
 * 0 - success (including the case where the SDMA capability is not set),
 * -errno on failure
 */
int sdma_init(struct hfi1_devdata *dd, u8 port)
{
	struct hfi1_pportdata *ppd = &dd->pport[port];
	size_t num_engines = chip_sdma_engines(dd);
	uint idle_cnt = sdma_idle_cnt;
	struct rhashtable *rht;
	struct sdma_engine *sde;
	void *head_slot;
	u32 per_sdma_credits;
	u16 descq_cnt;
	unsigned int idx;
	int ret = -ENOMEM;

	if (!HFI1_CAP_IS_KSET(SDMA)) {
		HFI1_CAP_CLEAR(SDMA_AHG);
		return 0;
	}

	if (mod_num_sdma) {
		/*
		 * Honour the module parameter only when it does not exceed
		 * chip support and the count is >= vls.
		 */
		if (mod_num_sdma <= chip_sdma_engines(dd) &&
		    mod_num_sdma >= num_vls)
			num_engines = mod_num_sdma;
	}

	dd_dev_info(dd, "SDMA mod_num_sdma: %u\n", mod_num_sdma);
	dd_dev_info(dd, "SDMA chip_sdma_engines: %u\n", chip_sdma_engines(dd));
	dd_dev_info(dd, "SDMA chip_sdma_mem_size: %u\n",
		    chip_sdma_mem_size(dd));

	per_sdma_credits =
		chip_sdma_mem_size(dd) / (num_engines * SDMA_BLOCK_SIZE);

	/* set up freeze waitqueue */
	init_waitqueue_head(&dd->sdma_unfreeze_wq);
	atomic_set(&dd->sdma_unfreeze_count, 0);

	descq_cnt = sdma_get_descq_cnt();
	dd_dev_info(dd, "SDMA engines %zu descq_cnt %u\n",
		    num_engines, descq_cnt);

	/* alloc memory for array of send engines */
	dd->per_sdma = kcalloc_node(num_engines, sizeof(*dd->per_sdma),
				    GFP_KERNEL, dd->node);
	if (!dd->per_sdma)
		return -ENOMEM;

	idle_cnt = ns_to_cclock(dd, idle_cnt);
	dd->default_desc1 = idle_cnt ? SDMA_DESC1_HEAD_TO_HOST_FLAG :
				       SDMA_DESC1_INT_REQ_FLAG;

	/* a zero module parameter falls back to the built-in default */
	if (!sdma_desct_intr)
		sdma_desct_intr = SDMA_DESC_INTR;

	/* Allocate memory for SendDMA descriptor FIFOs */
	for (idx = 0; idx < num_engines; idx++) {
		sde = dd->per_sdma + idx;

		sde->dd = dd;
		sde->ppd = ppd;
		sde->this_idx = idx;
		sde->descq_cnt = descq_cnt;
		/* must follow the descq_cnt assignment above */
		sde->desc_avail = sdma_descq_freecnt(sde);
		sde->sdma_shift = ilog2(descq_cnt);
		sde->sdma_mask = (1 << sde->sdma_shift) - 1;

		/* Create a mask specifically for each interrupt source */
		sde->int_mask = (u64)1 << idx;
		sde->progress_mask = (u64)1 << (TXE_NUM_SDMA_ENGINES + idx);
		sde->idle_mask = (u64)1 << (2 * TXE_NUM_SDMA_ENGINES + idx);
		/* Create a combined mask to cover all 3 interrupt sources */
		sde->imask = sde->int_mask | sde->progress_mask |
			     sde->idle_mask;

		spin_lock_init(&sde->tail_lock);
		seqlock_init(&sde->head_lock);
		spin_lock_init(&sde->senddmactrl_lock);
		spin_lock_init(&sde->flushlist_lock);
		seqlock_init(&sde->waitlock);

		/* ensure there is always a zero bit */
		sde->ahg_bits = 0xfffffffe00000000ULL;

		sdma_set_state(sde, sdma_state_s00_hw_down);

		/* set up reference counting */
		kref_init(&sde->state.kref);
		init_completion(&sde->state.comp);

		INIT_LIST_HEAD(&sde->flushlist);
		INIT_LIST_HEAD(&sde->dmawait);

		sde->tail_csr = get_kctxt_csr_addr(dd, idx, SD(TAIL));

		tasklet_setup(&sde->sdma_hw_clean_up_task,
			      sdma_hw_clean_up_task);
		tasklet_setup(&sde->sdma_sw_clean_up_task,
			      sdma_sw_clean_up_task);
		INIT_WORK(&sde->err_halt_worker, sdma_err_halt_wait);
		INIT_WORK(&sde->flush_worker, sdma_field_flush);

		sde->progress_check_head = 0;
		timer_setup(&sde->err_progress_check_timer,
			    sdma_err_progress_check, 0);

		/* descriptor queue: descq_cnt entries of two u64 each */
		sde->descq = dma_alloc_coherent(&dd->pcidev->dev,
						descq_cnt * sizeof(u64[2]),
						&sde->descq_phys, GFP_KERNEL);
		if (!sde->descq)
			goto bail;

		/* one tx request pointer slot per descriptor queue entry */
		sde->tx_ring =
			kvzalloc_node(array_size(descq_cnt,
						 sizeof(struct sdma_txreq *)),
				      GFP_KERNEL, dd->node);
		if (!sde->tx_ring)
			goto bail;
	}

	/* Allocate memory for DMA of head registers to memory */
	dd->sdma_heads_size = L1_CACHE_BYTES * num_engines;
	dd->sdma_heads_dma = dma_alloc_coherent(&dd->pcidev->dev,
						dd->sdma_heads_size,
						&dd->sdma_heads_phys,
						GFP_KERNEL);
	if (!dd->sdma_heads_dma) {
		dd_dev_err(dd, "failed to allocate SendDMA head memory\n");
		goto bail;
	}

	/* Allocate memory for pad */
	dd->sdma_pad_dma = dma_alloc_coherent(&dd->pcidev->dev, SDMA_PAD,
					      &dd->sdma_pad_phys, GFP_KERNEL);
	if (!dd->sdma_pad_dma) {
		dd_dev_err(dd, "failed to allocate SendDMA pad memory\n");
		goto bail;
	}

	/* assign each engine to different cacheline and init registers */
	head_slot = (void *)dd->sdma_heads_dma;
	for (idx = 0; idx < num_engines; idx++) {
		unsigned long phys_offset;

		sde = dd->per_sdma + idx;

		sde->head_dma = head_slot;
		head_slot += L1_CACHE_BYTES;
		/* same byte offset into the bus address as into the buffer */
		phys_offset = (unsigned long)sde->head_dma -
			      (unsigned long)dd->sdma_heads_dma;
		sde->head_phys = dd->sdma_heads_phys + phys_offset;
		init_sdma_regs(sde, per_sdma_credits, idle_cnt);
	}

	dd->flags |= HFI1_HAS_SEND_DMA;
	if (idle_cnt)
		dd->flags |= HFI1_HAS_SDMA_TIMEOUT;
	dd->num_sdma = num_engines;

	ret = sdma_map_init(dd, port, ppd->vls_operational, NULL);
	if (ret < 0)
		goto bail;

	/* publish the hash table in dd only after it initialized cleanly */
	rht = kzalloc(sizeof(*rht), GFP_KERNEL);
	if (!rht) {
		ret = -ENOMEM;
		goto bail;
	}

	ret = rhashtable_init(rht, &sdma_rht_params);
	if (ret < 0) {
		kfree(rht);
		goto bail;
	}
	dd->sdma_rht = rht;

	dd_dev_info(dd, "SDMA num_sdma: %u\n", dd->num_sdma);
	return 0;

bail:
	sdma_clean(dd, num_engines);
	return ret;
}
/**
 * sdma_all_running() - called when the link goes up
 * @dd: hfi1_devdata
 *
 * Delivers the go_running event to each of the device's dd->num_sdma
 * engines, in index order.
 */
void sdma_all_running(struct hfi1_devdata *dd)
{
	unsigned int idx;

	/* move all engines to running */
	for (idx = 0; idx < dd->num_sdma; idx++)
		sdma_process_event(dd->per_sdma + idx,
				   sdma_event_e30_go_running);
}
/**
 * sdma_all_idle() - called when the link goes down
 * @dd: hfi1_devdata
 *
 * Delivers the go_idle event to each of the device's dd->num_sdma
 * engines, in index order.
 */
void sdma_all_idle(struct hfi1_devdata *dd)
{
	unsigned int idx;

	/* idle all engines */
	for (idx = 0; idx < dd->num_sdma; idx++)
		sdma_process_event(dd->per_sdma + idx,
				   sdma_event_e70_go_idle);
}
/**
 * sdma_start() - called to kick off state processing for all engines
 * @dd: hfi1_devdata
 *
 * Delivers the go_hw_start event to each of the device's dd->num_sdma
 * engines, in index order.  Interrupts need to be working at this point.
 */
void sdma_start(struct hfi1_devdata *dd)
{
	unsigned int idx;

	/* kick off the engines state processing */
	for (idx = 0; idx < dd->num_sdma; idx++)
		sdma_process_event(dd->per_sdma + idx,
				   sdma_event_e10_go_hw_start);
}
/**
 * sdma_exit() - used when module is removed
 * @dd: hfi1_devdata
 *
 * Takes every engine down: sends the go_hw_down event, stops the error
 * progress timer and drops the final state reference.  Nothing is freed
 * here.  Does nothing when the per-engine array is not allocated.
 */
void sdma_exit(struct hfi1_devdata *dd)
{
	unsigned int idx;

	if (!dd->per_sdma)
		return;

	for (idx = 0; idx < dd->num_sdma; idx++) {
		struct sdma_engine *sde = dd->per_sdma + idx;

		/* waiters still queued here are reported, not removed */
		if (!list_empty(&sde->dmawait))
			dd_dev_err(dd, "sde %u: dmawait list not empty!\n",
				   sde->this_idx);

		sdma_process_event(sde, sdma_event_e00_go_hw_down);

		del_timer_sync(&sde->err_progress_check_timer);

		/*
		 * This waits for the state machine to exit so it is not
		 * necessary to kill the sdma_sw_clean_up_task to make sure
		 * it is not running.
		 */
		sdma_finalput(&sde->state);
	}
}
/*
* unmap the indicated descriptor
*/
static inline void sdma_unmap_desc(
struct hfi1_devdata *dd,
struct sdma_desc *descp)
{
IB/hfi1: Fix wrong mmu_node used for user SDMA packet after invalidate The hfi1 user SDMA pinned-page cache will leave a stale cache entry when the cache-entry's virtual address range is invalidated but that cache entry is in-use by an outstanding SDMA request. Subsequent user SDMA requests with buffers in or spanning the virtual address range of the stale cache entry will result in packets constructed from the wrong memory, the physical pages pointed to by the stale cache entry. To fix this, remove mmu_rb_node cache entries from the mmu_rb_handler cache independent of the cache entry's refcount. Add 'struct kref refcount' to struct mmu_rb_node and manage mmu_rb_node lifetime with kref_get() and kref_put(). mmu_rb_node.refcount makes sdma_mmu_node.refcount redundant. Remove 'atomic_t refcount' from struct sdma_mmu_node and change sdma_mmu_node code to use mmu_rb_node.refcount. Move the mmu_rb_handler destructor call after a wait-for-SDMA-request-completion call so mmu_rb_nodes that need mmu_rb_handler's workqueue to queue themselves up for destruction from an interrupt context may do so. Fixes: f48ad614c100 ("IB/hfi1: Move driver out of staging") Fixes: 00cbce5cbf88 ("IB/hfi1: Fix bugs with non-PAGE_SIZE-end multi-iovec user SDMA requests") Link: https://lore.kernel.org/r/168451393605.3700681.13493776139032178861.stgit@awfm-02.cornelisnetworks.com Reviewed-by: Dean Luick <dean.luick@cornelisnetworks.com> Signed-off-by: Brendan Cunningham <bcunningham@cornelisnetworks.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@cornelisnetworks.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2023-05-20 00:32:16 +08:00
switch (sdma_mapping_type(descp)) {
case SDMA_MAP_SINGLE:
dma_unmap_single(&dd->pcidev->dev, sdma_mapping_addr(descp),
sdma_mapping_len(descp), DMA_TO_DEVICE);
break;
case SDMA_MAP_PAGE:
dma_unmap_page(&dd->pcidev->dev, sdma_mapping_addr(descp),
sdma_mapping_len(descp), DMA_TO_DEVICE);
break;
}
if (descp->pinning_ctx && descp->ctx_put)
descp->ctx_put(descp->pinning_ctx);
descp->pinning_ctx = NULL;
}
/*
 * Extract the header mode field from the second quadword of the
 * first descriptor in the tx.
 */
static inline u8 ahg_mode(struct sdma_txreq *tx)
{
	return (tx->descp->qw[1] & SDMA_DESC1_HEADER_MODE_SMASK) >>
		SDMA_DESC1_HEADER_MODE_SHIFT;
}
/**
 * __sdma_txclean() - clean tx of mappings, descp *kmalloc's
 * @dd: hfi1_devdata for unmapping
 * @tx: tx request to clean
 *
 * This is used in the progress routine to clean the tx or
 * by the ULP to toss an in-process tx build.
 *
 * Unmaps the descriptors of @tx, frees its coalesce buffer and, when the
 * descriptor limit exceeds the embedded array, frees the descriptor array
 * and resets the limit.
 *
 * The code can be called multiple times without issue.
 */
void __sdma_txclean(struct hfi1_devdata *dd, struct sdma_txreq *tx)
{
	if (tx->num_desc) {
		u8 mode = ahg_mode(tx);
		u16 idx = 1;

		/* unmap first */
		sdma_unmap_desc(dd, tx->descp);

		/* step over the AHG descriptors, which are not unmapped */
		if (mode > SDMA_AHG_APPLY_UPDATE1)
			idx += mode >> 1;

		while (idx < tx->num_desc) {
			sdma_unmap_desc(dd, &tx->descp[idx]);
			idx++;
		}
		tx->num_desc = 0;
	}

	kfree(tx->coalesce_buf);
	tx->coalesce_buf = NULL;

	/* common case: descp was never kmalloc'ed, nothing more to do */
	if (likely(tx->desc_limit <= ARRAY_SIZE(tx->descs)))
		return;

	/* kmalloc'ed descp; limit is reset first so a repeat call skips this */
	tx->desc_limit = ARRAY_SIZE(tx->descs);
	kfree(tx->descp);
}
static inline u16 sdma_gethead(struct sdma_engine *sde)
{
struct hfi1_devdata *dd = sde->dd;
int use_dmahead;
u16 hwhead;
#ifdef CONFIG_SDMA_VERBOSITY
dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
#endif
retry:
use_dmahead = HFI1_CAP_IS_KSET(USE_SDMA_HEAD) && __sdma_running(sde) &&
(dd->flags & HFI1_HAS_SDMA_TIMEOUT);
hwhead = use_dmahead ?
(u16)le64_to_cpu(*sde->head_dma) :
(u16)read_sde_csr(sde, SD(HEAD));
if (unlikely(HFI1_CAP_IS_KSET(SDMA_HEAD_CHECK))) {
u16 cnt;
u16 swtail;
u16 swhead;
int sane;
swhead = sde->descq_head & sde->sdma_mask;
/* this code is really bad for cache line trading */
locking/atomics: COCCINELLE/treewide: Convert trivial ACCESS_ONCE() patterns to READ_ONCE()/WRITE_ONCE() Please do not apply this to mainline directly, instead please re-run the coccinelle script shown below and apply its output. For several reasons, it is desirable to use {READ,WRITE}_ONCE() in preference to ACCESS_ONCE(), and new code is expected to use one of the former. So far, there's been no reason to change most existing uses of ACCESS_ONCE(), as these aren't harmful, and changing them results in churn. However, for some features, the read/write distinction is critical to correct operation. To distinguish these cases, separate read/write accessors must be used. This patch migrates (most) remaining ACCESS_ONCE() instances to {READ,WRITE}_ONCE(), using the following coccinelle script: ---- // Convert trivial ACCESS_ONCE() uses to equivalent READ_ONCE() and // WRITE_ONCE() // $ make coccicheck COCCI=/home/mark/once.cocci SPFLAGS="--include-headers" MODE=patch virtual patch @ depends on patch @ expression E1, E2; @@ - ACCESS_ONCE(E1) = E2 + WRITE_ONCE(E1, E2) @ depends on patch @ expression E; @@ - ACCESS_ONCE(E) + READ_ONCE(E) ---- Signed-off-by: Mark Rutland <mark.rutland@arm.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: davem@davemloft.net Cc: linux-arch@vger.kernel.org Cc: mpe@ellerman.id.au Cc: shuah@kernel.org Cc: snitzer@redhat.com Cc: thor.thayer@linux.intel.com Cc: tj@kernel.org Cc: viro@zeniv.linux.org.uk Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1508792849-3115-19-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-10-24 05:07:29 +08:00
swtail = READ_ONCE(sde->descq_tail) & sde->sdma_mask;
cnt = sde->descq_cnt;
if (swhead < swtail)
/* not wrapped */
sane = (hwhead >= swhead) & (hwhead <= swtail);
else if (swhead > swtail)
/* wrapped around */
sane = ((hwhead >= swhead) && (hwhead < cnt)) ||
(hwhead <= swtail);
else
/* empty */
sane = (hwhead == swhead);
if (unlikely(!sane)) {
dd_dev_err(dd, "SDMA(%u) bad head (%s) hwhd=%u swhd=%u swtl=%u cnt=%u\n",
sde->this_idx,
use_dmahead ? "dma" : "kreg",
hwhead, swhead, swtail, cnt);
if (use_dmahead) {
/* try one more time, using csr */
use_dmahead = 0;
goto retry;
}
/* proceed as if no progress */
hwhead = swhead;
}
}
return hwhead;
}
/*
 * This is called when there are send DMA descriptors that might be
 * available.
 *
 * @sde: engine whose dmawait list is examined
 * @avail: number of descriptors that may be handed out
 *
 * Removes up to SDMA_WAIT_BATCH_SIZE waiters from sde->dmawait, in list
 * order, for as long as @avail covers the descriptor count reported for
 * each one, and then calls their wakeup callbacks with SDMA_AVAIL_REASON.
 * The waiter selected as top priority is woken first.
 *
 * This is called with head_lock held.
 */
static void sdma_desc_avail(struct sdma_engine *sde, uint avail)
{
struct iowait *wait, *nw, *twait;
/* waiters harvested from the list; n counts them, tidx is the top one */
struct iowait *waits[SDMA_WAIT_BATCH_SIZE];
uint i, n = 0, seq, tidx = 0;
#ifdef CONFIG_SDMA_VERBOSITY
dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
slashstrip(__FILE__), __LINE__, __func__);
dd_dev_err(sde->dd, "avail: %u\n", avail);
#endif
/*
 * The emptiness check runs under the read side of waitlock and is
 * repeated while read_seqretry() reports a retry.  Once the list is
 * seen non-empty, the write side is taken, the list is harvested once,
 * and the loop is left through the break below.
 */
do {
seq = read_seqbegin(&sde->waitlock);
if (!list_empty(&sde->dmawait)) {
/* at least one item */
write_seqlock(&sde->waitlock);
/* Harvest waiters wanting DMA descriptors */
list_for_each_entry_safe(
wait,
nw,
&sde->dmawait,
list) {
u32 num_desc;
/* waiters without a wakeup callback stay on the list */
if (!wait->wakeup)
continue;
/* batch array is full: stop harvesting */
if (n == ARRAY_SIZE(waits))
break;
iowait_init_priority(wait);
num_desc = iowait_get_all_desc(wait);
/* not enough left for this waiter: stop, do not skip ahead */
if (num_desc > avail)
break;
avail -= num_desc;
/* Find the top-priority wait member */
if (n) {
twait = waits[tidx];
tidx =
iowait_priority_update_top(wait,
twait,
n,
tidx);
}
list_del_init(&wait->list);
waits[n++] = wait;
}
write_sequnlock(&sde->waitlock);
/* harvested once; leave the retry loop */
break;
}
} while (read_seqretry(&sde->waitlock, seq));
/* Schedule the top-priority entry first */
if (n)
waits[tidx]->wakeup(waits[tidx], SDMA_AVAIL_REASON);
/* then the remaining harvested waiters, in harvest order */
for (i = 0; i < n; i++)
if (i != tidx)
waits[i]->wakeup(waits[i], SDMA_AVAIL_REASON);
}
/*
 * sdma_make_progress() - retire descriptors up to the hardware head
 * @sde: sdma engine
 * @status: interrupt status bits that led to this call
 *
 * Advances the software head (sde->descq_head) until it matches the head
 * reported by sdma_gethead(), completing each txreq whose descriptors have
 * all been passed.  If any descriptor was retired, waiters are notified via
 * sdma_desc_avail().
 *
 * head_lock must be held
 */
static void sdma_make_progress(struct sdma_engine *sde, u64 status)
{
	struct sdma_txreq *txp = NULL;
	int progress = 0;
	u16 hwhead, swhead;
	int idle_check_done = 0;

	hwhead = sdma_gethead(sde);

	/* The reason for some of the complexity of this code is that
	 * not all descriptors have corresponding txps.  So, we have to
	 * be able to skip over descs until we wander into the range of
	 * the next txp on the list.
	 */

retry:
	txp = get_txhead(sde);
	swhead = sde->descq_head & sde->sdma_mask;
	trace_hfi1_sdma_progress(sde, hwhead, swhead, txp);
	while (swhead != hwhead) {
		/* advance head, wrap if needed */
		swhead = ++sde->descq_head & sde->sdma_mask;

		/* if now past this txp's descs, do the callback */
		if (txp && txp->next_descq_idx == swhead) {
			/* remove from list */
			sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL;
			complete_tx(sde, txp, SDMA_TXREQ_S_OK);
			/* see if there is another txp */
			txp = get_txhead(sde);
		}
		trace_hfi1_sdma_progress(sde, hwhead, swhead, txp);
		progress++;
	}

	/*
	 * The SDMA idle interrupt is not guaranteed to be ordered with respect
	 * to updates to the dma_head location in host memory. The head
	 * value read might not be fully up to date. If there are pending
	 * descriptors and the SDMA idle interrupt fired then read from the
	 * CSR SDMA head instead to get the latest value from the hardware.
	 * The hardware SDMA head should be read at most once in this invocation
	 * of sdma_make_progress(..) which is ensured by idle_check_done flag
	 */
	if ((status & sde->idle_mask) && !idle_check_done) {
		u16 swtail;

		swtail = READ_ONCE(sde->descq_tail) & sde->sdma_mask;
		if (swtail != hwhead) {
			hwhead = (u16)read_sde_csr(sde, SD(HEAD));
			idle_check_done = 1;
			goto retry;
		}
	}

	sde->last_status = status;
	if (progress)
		sdma_desc_avail(sde, sdma_descq_freecnt(sde));
}
/*
 * sdma_engine_interrupt() - interrupt handler for engine
 * @sde: sdma engine
 * @status: sdma interrupt reason
 *
 * @status is a mask of the 3 possible interrupts for this engine.  It
 * contains bits _only_ for this SDMA engine; at least one bit is set and
 * more than one may be.
 *
 * The whole handler runs under the write side of head_lock.
 */
void sdma_engine_interrupt(struct sdma_engine *sde, u64 status)
{
	trace_hfi1_sdma_engine_interrupt(sde, status);

	write_seqlock(&sde->head_lock);
	sdma_set_desc_cnt(sde, sdma_desct_intr);

	/*
	 * Exactly one statistics counter is bumped per call, checked in the
	 * order idle, progress, then the plain sdma interrupt.
	 */
	if (status & sde->idle_mask) {
		sde->idle_int_cnt++;
	} else if (status & sde->progress_mask) {
		sde->progress_int_cnt++;
	} else if (status & sde->int_mask) {
		sde->sdma_int_cnt++;
	}

	sdma_make_progress(sde, status);
	write_sequnlock(&sde->head_lock);
}
/**
 * sdma_engine_error() - error handler for engine
 * @sde: sdma engine
 * @status: sdma interrupt reason
 *
 * With tail_lock and head_lock held, feeds a hw_halted event into the
 * state machine when @status carries any of ALL_SDMA_ENG_HALT_ERRS, and
 * logs the error plus a descriptor queue dump when @status has any bit
 * other than the plain halt error set.
 */
void sdma_engine_error(struct sdma_engine *sde, u64 status)
{
	const u64 halt_bits = status & ALL_SDMA_ENG_HALT_ERRS;
	const u64 reportable_bits =
		status & ~SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK);
	unsigned long irq_flags;

#ifdef CONFIG_SDMA_VERBOSITY
	dd_dev_err(sde->dd, "CONFIG SDMA(%u) error status 0x%llx state %s\n",
		   sde->this_idx,
		   (unsigned long long)status,
		   sdma_state_names[sde->state.current_state]);
#endif
	spin_lock_irqsave(&sde->tail_lock, irq_flags);
	write_seqlock(&sde->head_lock);

	if (halt_bits)
		__sdma_process_event(sde, sdma_event_e60_hw_halted);

	/* the state printed here is the one after the event was handled */
	if (reportable_bits) {
		dd_dev_err(sde->dd,
			   "SDMA (%u) engine error: 0x%llx state %s\n",
			   sde->this_idx,
			   (unsigned long long)status,
			   sdma_state_names[sde->state.current_state]);
		dump_sdma_state(sde);
	}

	write_sequnlock(&sde->head_lock);
	spin_unlock_irqrestore(&sde->tail_lock, irq_flags);
}
/*
 * sdma_sendctrl() - update the shadowed control value and write it to the
 * engine's CTRL CSR
 * @sde: sdma engine
 * @op: bitwise OR of SDMA_SENDCTRL_OP_* flags
 *
 * For each of ENABLE, INTENABLE and HALT the matching control bit is set in
 * the shadow (sde->p_senddmactrl) when the flag is present in @op and
 * cleared when it is absent.  The CLEANUP bit is OR-ed into the value
 * written to the CSR but is not stored in the shadow.
 */
static void sdma_sendctrl(struct sdma_engine *sde, unsigned op)
{
	u64 bits_on = 0;
	u64 bits_off = 0;
	u64 ctrl;
	unsigned long irq_flags;

#ifdef CONFIG_SDMA_VERBOSITY
	dd_dev_err(sde->dd, "CONFIG SDMA(%u) senddmactrl E=%d I=%d H=%d C=%d\n",
		   sde->this_idx,
		   (op & SDMA_SENDCTRL_OP_ENABLE) ? 1 : 0,
		   (op & SDMA_SENDCTRL_OP_INTENABLE) ? 1 : 0,
		   (op & SDMA_SENDCTRL_OP_HALT) ? 1 : 0,
		   (op & SDMA_SENDCTRL_OP_CLEANUP) ? 1 : 0);
#endif

	/* sort every persistent control bit into the set or the clear mask */
	if (op & SDMA_SENDCTRL_OP_ENABLE)
		bits_on |= SD(CTRL_SDMA_ENABLE_SMASK);
	else
		bits_off |= SD(CTRL_SDMA_ENABLE_SMASK);

	if (op & SDMA_SENDCTRL_OP_INTENABLE)
		bits_on |= SD(CTRL_SDMA_INT_ENABLE_SMASK);
	else
		bits_off |= SD(CTRL_SDMA_INT_ENABLE_SMASK);

	if (op & SDMA_SENDCTRL_OP_HALT)
		bits_on |= SD(CTRL_SDMA_HALT_SMASK);
	else
		bits_off |= SD(CTRL_SDMA_HALT_SMASK);

	spin_lock_irqsave(&sde->senddmactrl_lock, irq_flags);

	sde->p_senddmactrl = (sde->p_senddmactrl | bits_on) & ~bits_off;

	/* CLEANUP goes to the hardware only, never into the shadow */
	ctrl = sde->p_senddmactrl;
	if (op & SDMA_SENDCTRL_OP_CLEANUP)
		ctrl |= SD(CTRL_SDMA_CLEANUP_SMASK);
	write_sde_csr(sde, SD(CTRL), ctrl);

	spin_unlock_irqrestore(&sde->senddmactrl_lock, irq_flags);

#ifdef CONFIG_SDMA_VERBOSITY
	sdma_dumpstate(sde);
#endif
}
static void sdma_setlengen(struct sdma_engine *sde)
{
#ifdef CONFIG_SDMA_VERBOSITY
dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
#endif
/*
* Set SendDmaLenGen and clear-then-set the MSB of the generation
* count to enable generation checking and load the internal
* generation counter.
*/
write_sde_csr(sde, SD(LEN_GEN),
(sde->descq_cnt / 64) << SD(LEN_GEN_LENGTH_SHIFT));
write_sde_csr(sde, SD(LEN_GEN),
((sde->descq_cnt / 64) << SD(LEN_GEN_LENGTH_SHIFT)) |
(4ULL << SD(LEN_GEN_GENERATION_SHIFT)));
}
/*
 * sdma_update_tail() - hand a new tail index to the hardware
 * @sde: sdma engine
 * @tail: value written to the engine's tail CSR
 *
 * Issues a write memory barrier and then writes @tail to sde->tail_csr,
 * so the barrier always precedes the CSR write.
 */
static inline void sdma_update_tail(struct sdma_engine *sde, u16 tail)
{
/* Commit writes to memory and advance the tail on the chip */
smp_wmb(); /* see get_txhead() */
writeq(tail, sde->tail_csr);
}
/*
 * sdma_hw_start_up() - reset the ring registers before the engine is used
 * @sde: sdma engine
 *
 * Reprograms LEN_GEN, zeroes the hardware tail and the host copy of the
 * head, and writes the header-request-FIFO uncorrectable error bit to the
 * engine's ENG_ERR_CLEAR CSR.
 *
 * In the state machine visible in this file it is invoked when the
 * e25_hw_clean_up_done event arrives in the s15_hw_start_up_clean_wait and
 * s40_hw_clean_up_wait states.
 */
static void sdma_hw_start_up(struct sdma_engine *sde)
{
	const u64 hdr_req_fifo_unc_clr =
		SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_MASK) <<
		SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SHIFT);

#ifdef CONFIG_SDMA_VERBOSITY
	dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n",
		   sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
#endif

	sdma_setlengen(sde);
	sdma_update_tail(sde, 0); /* Set SendDmaTail */
	*sde->head_dma = 0;

	write_sde_csr(sde, SD(ENG_ERR_CLEAR), hdr_req_fifo_unc_clr);
}
/*
 * set_sdma_integrity() - program SEND_DMA_CHECK_ENABLE for engine @sde
 * @sde: sdma engine
 *
 * The value written is whatever hfi1_pkt_base_sdma_integrity() returns for
 * the device that owns the engine.
 */
static void set_sdma_integrity(struct sdma_engine *sde)
{
	write_sde_csr(sde, SD(CHECK_ENABLE),
		      hfi1_pkt_base_sdma_integrity(sde->dd));
}
/*
 * init_sdma_regs() - program the per-engine CSRs for a descriptor ring
 * @sde: sdma engine
 * @credits: value placed in the MEMORY CSR count field; multiplied by the
 *           engine index it also forms the MEMORY CSR index field
 * @idle_cnt: value written to the RELOAD_CNT CSR
 *
 * Writes the ring base address, length/generation, tail, reload count,
 * descriptor count, head write-back address, memory partition, error mask,
 * integrity checks and opcode check registers, in that order.
 */
static void init_sdma_regs(
struct sdma_engine *sde,
u32 credits,
uint idle_cnt)
{
u8 opval, opmask;
#ifdef CONFIG_SDMA_VERBOSITY
struct hfi1_devdata *dd = sde->dd;
dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n",
sde->this_idx, slashstrip(__FILE__), __LINE__, __func__);
#endif
write_sde_csr(sde, SD(BASE_ADDR), sde->descq_phys);
sdma_setlengen(sde);
sdma_update_tail(sde, 0); /* Set SendDmaTail */
write_sde_csr(sde, SD(RELOAD_CNT), idle_cnt);
write_sde_csr(sde, SD(DESC_CNT), 0);
write_sde_csr(sde, SD(HEAD_ADDR), sde->head_phys);
/* count = credits, index = credits * engine index */
write_sde_csr(sde, SD(MEMORY),
((u64)credits << SD(MEMORY_SDMA_MEMORY_CNT_SHIFT)) |
((u64)(credits * sde->this_idx) <<
SD(MEMORY_SDMA_MEMORY_INDEX_SHIFT)));
/* all bits set in the engine error mask */
write_sde_csr(sde, SD(ENG_ERR_MASK), ~0ull);
set_sdma_integrity(sde);
opmask = OPCODE_CHECK_MASK_DISABLED;
opval = OPCODE_CHECK_VAL_DISABLED;
/*
 * NOTE(review): the shifts used here are the SEND_CTXT_CHECK_OPCODE_*
 * ones although the register is the SEND_DMA one - presumably the field
 * layouts match; confirm against the register definitions.
 */
write_sde_csr(sde, SD(CHECK_OPCODE),
(opmask << SEND_CTXT_CHECK_OPCODE_MASK_SHIFT) |
(opval << SEND_CTXT_CHECK_OPCODE_VALUE_SHIFT));
}
#ifdef CONFIG_SDMA_VERBOSITY
/*
 * Helpers for sdma_dumpstate().  Each one reads a CSR into the caller's
 * local 'csr' and logs it with the stringified register name; all of them
 * rely on locals named 'sde' and 'csr' (and 'i' for helper2) being in
 * scope at the point of use.
 */
/* device-wide CSR, read with read_csr() */
#define sdma_dumpstate_helper0(reg) do { \
csr = read_csr(sde->dd, reg); \
dd_dev_err(sde->dd, "%36s 0x%016llx\n", #reg, csr); \
} while (0)
/* per-engine CSR, read with read_sde_csr() and tagged with the engine index */
#define sdma_dumpstate_helper(reg) do { \
csr = read_sde_csr(sde, reg); \
dd_dev_err(sde->dd, "%36s[%02u] 0x%016llx\n", \
#reg, sde->this_idx, csr); \
} while (0)
/* i-th element of a CSR array with an 8 byte stride */
#define sdma_dumpstate_helper2(reg) do { \
csr = read_csr(sde->dd, reg + (8 * i)); \
dd_dev_err(sde->dd, "%33s_%02u 0x%016llx\n", \
#reg, i, csr); \
} while (0)
/*
 * sdma_dumpstate() - log the SDMA related CSRs of an engine
 * @sde: sdma engine
 *
 * Only built when CONFIG_SDMA_VERBOSITY is defined.  Every value is
 * emitted through dd_dev_err().
 */
void sdma_dumpstate(struct sdma_engine *sde)
{
u64 csr;
unsigned i;
sdma_dumpstate_helper(SD(CTRL));
sdma_dumpstate_helper(SD(STATUS));
sdma_dumpstate_helper0(SD(ERR_STATUS));
sdma_dumpstate_helper0(SD(ERR_MASK));
sdma_dumpstate_helper(SD(ENG_ERR_STATUS));
sdma_dumpstate_helper(SD(ENG_ERR_MASK));
/* interrupt status/mask/blocked for every CCE interrupt CSR */
for (i = 0; i < CCE_NUM_INT_CSRS; ++i) {
sdma_dumpstate_helper2(CCE_INT_STATUS);
sdma_dumpstate_helper2(CCE_INT_MASK);
sdma_dumpstate_helper2(CCE_INT_BLOCKED);
}
sdma_dumpstate_helper(SD(TAIL));
sdma_dumpstate_helper(SD(HEAD));
sdma_dumpstate_helper(SD(PRIORITY_THLD));
sdma_dumpstate_helper(SD(IDLE_CNT));
sdma_dumpstate_helper(SD(RELOAD_CNT));
sdma_dumpstate_helper(SD(DESC_CNT));
sdma_dumpstate_helper(SD(DESC_FETCHED_CNT));
sdma_dumpstate_helper(SD(MEMORY));
sdma_dumpstate_helper0(SD(ENGINES));
sdma_dumpstate_helper0(SD(MEM_SIZE));
/* sdma_dumpstate_helper(SEND_EGRESS_SEND_DMA_STATUS); */
sdma_dumpstate_helper(SD(BASE_ADDR));
sdma_dumpstate_helper(SD(LEN_GEN));
sdma_dumpstate_helper(SD(HEAD_ADDR));
sdma_dumpstate_helper(SD(CHECK_ENABLE));
sdma_dumpstate_helper(SD(CHECK_VL));
sdma_dumpstate_helper(SD(CHECK_JOB_KEY));
sdma_dumpstate_helper(SD(CHECK_PARTITION_KEY));
sdma_dumpstate_helper(SD(CHECK_SLID));
sdma_dumpstate_helper(SD(CHECK_OPCODE));
}
#endif
static void dump_sdma_state(struct sdma_engine *sde)
{
struct hw_sdma_desc *descqp;
u64 desc[2];
u64 addr;
u8 gen;
u16 len;
u16 head, tail, cnt;
head = sde->descq_head & sde->sdma_mask;
tail = sde->descq_tail & sde->sdma_mask;
cnt = sdma_descq_freecnt(sde);
dd_dev_err(sde->dd,
"SDMA (%u) descq_head: %u descq_tail: %u freecnt: %u FLE %d\n",
sde->this_idx, head, tail, cnt,
!list_empty(&sde->flushlist));
/* print info for each entry in the descriptor queue */
while (head != tail) {
char flags[6] = { 'x', 'x', 'x', 'x', 0 };
descqp = &sde->descq[head];
desc[0] = le64_to_cpu(descqp->qw[0]);
desc[1] = le64_to_cpu(descqp->qw[1]);
flags[0] = (desc[1] & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
flags[1] = (desc[1] & SDMA_DESC1_HEAD_TO_HOST_FLAG) ?
'H' : '-';
flags[2] = (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
flags[3] = (desc[0] & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
addr = (desc[0] >> SDMA_DESC0_PHY_ADDR_SHIFT)
& SDMA_DESC0_PHY_ADDR_MASK;
gen = (desc[1] >> SDMA_DESC1_GENERATION_SHIFT)
& SDMA_DESC1_GENERATION_MASK;
len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT)
& SDMA_DESC0_BYTE_COUNT_MASK;
dd_dev_err(sde->dd,
"SDMA sdmadesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n",
head, flags, addr, gen, len);
dd_dev_err(sde->dd,
"\tdesc0:0x%016llx desc1 0x%016llx\n",
desc[0], desc[1]);
if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG)
dd_dev_err(sde->dd,
"\taidx: %u amode: %u alen: %u\n",
(u8)((desc[1] &
SDMA_DESC1_HEADER_INDEX_SMASK) >>
SDMA_DESC1_HEADER_INDEX_SHIFT),
(u8)((desc[1] &
SDMA_DESC1_HEADER_MODE_SMASK) >>
SDMA_DESC1_HEADER_MODE_SHIFT),
(u8)((desc[1] &
SDMA_DESC1_HEADER_DWS_SMASK) >>
SDMA_DESC1_HEADER_DWS_SHIFT));
head++;
head &= sde->sdma_mask;
}
}
#define SDE_FMT \
"SDE %u CPU %d STE %s C 0x%llx S 0x%016llx E 0x%llx T(HW) 0x%llx T(SW) 0x%x H(HW) 0x%llx H(SW) 0x%x H(D) 0x%llx DM 0x%llx GL 0x%llx R 0x%llx LIS 0x%llx AHGI 0x%llx TXT %u TXH %u DT %u DH %u FLNE %d DQF %u SLC 0x%llx\n"
/**
* sdma_seqfile_dump_sde() - debugfs dump of sde
* @s: seq file
* @sde: send dma engine to dump
*
* This routine dumps the sde to the indicated seq file.
*/
void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *sde)
{
u16 head, tail;
struct hw_sdma_desc *descqp;
u64 desc[2];
u64 addr;
u8 gen;
u16 len;
head = sde->descq_head & sde->sdma_mask;
locking/atomics: COCCINELLE/treewide: Convert trivial ACCESS_ONCE() patterns to READ_ONCE()/WRITE_ONCE() Please do not apply this to mainline directly, instead please re-run the coccinelle script shown below and apply its output. For several reasons, it is desirable to use {READ,WRITE}_ONCE() in preference to ACCESS_ONCE(), and new code is expected to use one of the former. So far, there's been no reason to change most existing uses of ACCESS_ONCE(), as these aren't harmful, and changing them results in churn. However, for some features, the read/write distinction is critical to correct operation. To distinguish these cases, separate read/write accessors must be used. This patch migrates (most) remaining ACCESS_ONCE() instances to {READ,WRITE}_ONCE(), using the following coccinelle script: ---- // Convert trivial ACCESS_ONCE() uses to equivalent READ_ONCE() and // WRITE_ONCE() // $ make coccicheck COCCI=/home/mark/once.cocci SPFLAGS="--include-headers" MODE=patch virtual patch @ depends on patch @ expression E1, E2; @@ - ACCESS_ONCE(E1) = E2 + WRITE_ONCE(E1, E2) @ depends on patch @ expression E; @@ - ACCESS_ONCE(E) + READ_ONCE(E) ---- Signed-off-by: Mark Rutland <mark.rutland@arm.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: davem@davemloft.net Cc: linux-arch@vger.kernel.org Cc: mpe@ellerman.id.au Cc: shuah@kernel.org Cc: snitzer@redhat.com Cc: thor.thayer@linux.intel.com Cc: tj@kernel.org Cc: viro@zeniv.linux.org.uk Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1508792849-3115-19-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-10-24 05:07:29 +08:00
tail = READ_ONCE(sde->descq_tail) & sde->sdma_mask;
seq_printf(s, SDE_FMT, sde->this_idx,
sde->cpu,
sdma_state_name(sde->state.current_state),
(unsigned long long)read_sde_csr(sde, SD(CTRL)),
(unsigned long long)read_sde_csr(sde, SD(STATUS)),
(unsigned long long)read_sde_csr(sde, SD(ENG_ERR_STATUS)),
(unsigned long long)read_sde_csr(sde, SD(TAIL)), tail,
(unsigned long long)read_sde_csr(sde, SD(HEAD)), head,
(unsigned long long)le64_to_cpu(*sde->head_dma),
(unsigned long long)read_sde_csr(sde, SD(MEMORY)),
(unsigned long long)read_sde_csr(sde, SD(LEN_GEN)),
(unsigned long long)read_sde_csr(sde, SD(RELOAD_CNT)),
(unsigned long long)sde->last_status,
(unsigned long long)sde->ahg_bits,
sde->tx_tail,
sde->tx_head,
sde->descq_tail,
sde->descq_head,
!list_empty(&sde->flushlist),
sde->descq_full_count,
(unsigned long long)read_sde_csr(sde, SEND_DMA_CHECK_SLID));
/* print info for each entry in the descriptor queue */
while (head != tail) {
char flags[6] = { 'x', 'x', 'x', 'x', 0 };
descqp = &sde->descq[head];
desc[0] = le64_to_cpu(descqp->qw[0]);
desc[1] = le64_to_cpu(descqp->qw[1]);
flags[0] = (desc[1] & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-';
flags[1] = (desc[1] & SDMA_DESC1_HEAD_TO_HOST_FLAG) ?
'H' : '-';
flags[2] = (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-';
flags[3] = (desc[0] & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-';
addr = (desc[0] >> SDMA_DESC0_PHY_ADDR_SHIFT)
& SDMA_DESC0_PHY_ADDR_MASK;
gen = (desc[1] >> SDMA_DESC1_GENERATION_SHIFT)
& SDMA_DESC1_GENERATION_MASK;
len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT)
& SDMA_DESC0_BYTE_COUNT_MASK;
seq_printf(s,
"\tdesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n",
head, flags, addr, gen, len);
if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG)
seq_printf(s, "\t\tahgidx: %u ahgmode: %u\n",
(u8)((desc[1] &
SDMA_DESC1_HEADER_INDEX_SMASK) >>
SDMA_DESC1_HEADER_INDEX_SHIFT),
(u8)((desc[1] &
SDMA_DESC1_HEADER_MODE_SMASK) >>
SDMA_DESC1_HEADER_MODE_SHIFT));
head = (head + 1) & sde->sdma_mask;
}
}
/*
 * add_gen() - stamp the current generation into a descriptor's qw1
 * @sde: sdma engine; the generation is the two bits of descq_tail found at
 *       bit position sde->sdma_shift
 * @qw1: second quad-word of the descriptor
 *
 * Returns @qw1 with its generation field replaced.
 */
static inline u64 add_gen(struct sdma_engine *sde, u64 qw1)
{
	const u64 gen = (sde->descq_tail >> sde->sdma_shift) & 3;

	return (qw1 & ~SDMA_DESC1_GENERATION_SMASK) |
	       ((gen & SDMA_DESC1_GENERATION_MASK)
		<< SDMA_DESC1_GENERATION_SHIFT);
}
/*
 * submit_tx() - copy a txreq's descriptors into the hardware ring
 * @sde: sdma engine
 * @tx: txreq whose tx->num_desc descriptors (tx->descp) are copied
 *
 * This routine submits the indicated tx
 *
 * Space has already been guaranteed and
 * tail side of ring is locked.
 *
 * The hardware tail update is done
 * in the caller and that is facilitated
 * by returning the new tail.
 *
 * There is special case logic for ahg
 * to not add the generation number for
 * up to 2 descriptors that follow the
 * first descriptor.
 *
 * Besides the ring entries this also advances sde->descq_tail, records the
 * tx in sde->tx_ring, stores the post-submit tail in tx->next_descq_idx
 * and subtracts tx->num_desc from sde->desc_avail.
 *
 * Return: the new masked software tail.
 */
static inline u16 submit_tx(struct sdma_engine *sde, struct sdma_txreq *tx)
{
int i;
u16 tail;
struct sdma_desc *descp = tx->descp;
u8 skip = 0, mode = ahg_mode(tx);
/* the first descriptor is always written and always gets a generation */
tail = sde->descq_tail & sde->sdma_mask;
sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]);
sde->descq[tail].qw[1] = cpu_to_le64(add_gen(sde, descp->qw[1]));
trace_hfi1_sdma_descriptor(sde, descp->qw[0], descp->qw[1],
tail, &sde->descq[tail]);
tail = ++sde->descq_tail & sde->sdma_mask;
descp++;
/* number of following descriptors whose qw[1] is copied unmodified */
if (mode > SDMA_AHG_APPLY_UPDATE1)
skip = mode >> 1;
for (i = 1; i < tx->num_desc; i++, descp++) {
u64 qw1;
sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]);
if (skip) {
/* edits don't have generation */
qw1 = descp->qw[1];
skip--;
} else {
/* replace generation with real one for non-edits */
qw1 = add_gen(sde, descp->qw[1]);
}
sde->descq[tail].qw[1] = cpu_to_le64(qw1);
trace_hfi1_sdma_descriptor(sde, descp->qw[0], qw1,
tail, &sde->descq[tail]);
tail = ++sde->descq_tail & sde->sdma_mask;
}
/* sdma_make_progress() completes the tx once the head reaches this index */
tx->next_descq_idx = tail;
#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
tx->sn = sde->tail_sn++;
trace_hfi1_sdma_in_sn(sde, tx->sn);
WARN_ON_ONCE(sde->tx_ring[sde->tx_tail & sde->sdma_mask]);
#endif
sde->tx_ring[sde->tx_tail++ & sde->sdma_mask] = tx;
sde->desc_avail -= tx->num_desc;
return tail;
}
/*
 * sdma_check_progress() - decide what to do when the ring looked full
 * @sde: sdma engine
 * @wait: iowait_work of the submitter (may be NULL)
 * @tx: txreq that did not fit
 * @pkts_sent: forwarded unchanged to the sleep callback
 *
 * Refreshes sde->desc_avail from the ring.  Returns -EAGAIN when @tx fits
 * after the refresh, -EBUSY when there is no sleep callback to defer to,
 * and otherwise whatever the sleep callback returns (refreshing
 * sde->desc_avail once more if that is -EAGAIN).
 */
static int sdma_check_progress(
	struct sdma_engine *sde,
	struct iowait_work *wait,
	struct sdma_txreq *tx,
	bool pkts_sent)
{
	unsigned seq;
	int ret;

	sde->desc_avail = sdma_descq_freecnt(sde);
	if (tx->num_desc <= sde->desc_avail)
		return -EAGAIN;

	/* nobody to hand the request to */
	if (!wait || !iowait_ioww_to_iow(wait)->sleep)
		return -EBUSY;

	/* pulse the head_lock */
	seq = raw_seqcount_begin(
		(const seqcount_t *)&sde->head_lock.seqcount);
	ret = wait->iow->sleep(sde, wait, tx, seq, pkts_sent);
	if (ret == -EAGAIN)
		sde->desc_avail = sdma_descq_freecnt(sde);

	return ret;
}
/**
 * sdma_send_txreq() - submit a tx req to ring
 * @sde: sdma engine to use
 * @wait: SE wait structure to use when full (may be NULL)
 * @tx: sdma_txreq to submit
 * @pkts_sent: has any packet been sent yet?
 *
 * The call submits the tx into the ring.  If an iowait structure is non-NULL
 * the packet will be queued to the list in wait.
 *
 * If the engine is not running, the tx is appended to the engine's flush
 * list, the flush worker is queued and -ECOMM is returned.
 *
 * Return:
 * 0 - Success, -EINVAL - sdma_txreq incomplete, -EBUSY - no space in
 * ring (wait == NULL)
 * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state
 */
int sdma_send_txreq(struct sdma_engine *sde,
struct iowait_work *wait,
struct sdma_txreq *tx,
bool pkts_sent)
{
int ret = 0;
u16 tail;
unsigned long flags;
/* user should have supplied entire packet */
if (unlikely(tx->tlen))
return -EINVAL;
tx->wait = iowait_ioww_to_iow(wait);
spin_lock_irqsave(&sde->tail_lock, flags);
retry:
if (unlikely(!__sdma_running(sde)))
goto unlock_noconn;
if (unlikely(tx->num_desc > sde->desc_avail))
goto nodesc;
tail = submit_tx(sde, tx);
if (wait)
iowait_sdma_inc(iowait_ioww_to_iow(wait));
sdma_update_tail(sde, tail);
unlock:
spin_unlock_irqrestore(&sde->tail_lock, flags);
return ret;
unlock_noconn:
/* engine not running: park the tx on the flush list instead */
if (wait)
iowait_sdma_inc(iowait_ioww_to_iow(wait));
tx->next_descq_idx = 0;
#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
tx->sn = sde->tail_sn++;
trace_hfi1_sdma_in_sn(sde, tx->sn);
#endif
spin_lock(&sde->flushlist_lock);
list_add_tail(&tx->list, &sde->flushlist);
spin_unlock(&sde->flushlist_lock);
iowait_inc_wait_count(wait, tx->num_desc);
queue_work_on(sde->cpu, system_highpri_wq, &sde->flush_worker);
ret = -ECOMM;
goto unlock;
nodesc:
/* -EAGAIN means space showed up after re-reading the ring; try again */
ret = sdma_check_progress(sde, wait, tx, pkts_sent);
if (ret == -EAGAIN) {
ret = 0;
goto retry;
}
sde->descq_full_count++;
goto unlock;
}
/**
 * sdma_send_txlist() - submit a list of tx req to ring
 * @sde: sdma engine to use
 * @wait: SE wait structure to use when full (may be NULL)
 * @tx_list: list of sdma_txreqs to submit
 * @count_out: pointer to a u16 which, after return will contain the total number of
 *             sdma_txreqs removed from the tx_list. This will include sdma_txreqs
 *             whose SDMA descriptors are submitted to the ring and the sdma_txreqs
 *             which are added to SDMA engine flush list if the SDMA engine state is
 *             not running.
 *
 * The call submits the list into the ring.
 *
 * If the iowait structure is non-NULL and not equal to the iowait list
 * the unprocessed part of the list will be appended to the list in wait.
 *
 * In all cases, the tx_list will be updated so the head of the tx_list is
 * the list of descriptors that have yet to be transmitted.
 *
 * The intent of this call is to provide a more efficient
 * way of submitting multiple packets to SDMA while holding the tail
 * side locking.
 *
 * Return:
 * 0 - Success,
 * -EINVAL - sdma_txreq incomplete, -EBUSY - no space in ring (wait == NULL)
 * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state
 */
int sdma_send_txlist(struct sdma_engine *sde, struct iowait_work *wait,
struct list_head *tx_list, u16 *count_out)
{
struct sdma_txreq *tx, *tx_next;
int ret = 0;
unsigned long flags;
u16 tail = INVALID_TAIL;
u32 submit_count = 0, flush_count = 0, total_count;
spin_lock_irqsave(&sde->tail_lock, flags);
retry:
list_for_each_entry_safe(tx, tx_next, tx_list, list) {
tx->wait = iowait_ioww_to_iow(wait);
if (unlikely(!__sdma_running(sde)))
goto unlock_noconn;
if (unlikely(tx->num_desc > sde->desc_avail))
goto nodesc;
if (unlikely(tx->tlen)) {
ret = -EINVAL;
goto update_tail;
}
list_del_init(&tx->list);
tail = submit_tx(sde, tx);
submit_count++;
/* push the tail to hardware periodically rather than per tx */
if (tail != INVALID_TAIL &&
(submit_count & SDMA_TAIL_UPDATE_THRESH) == 0) {
sdma_update_tail(sde, tail);
tail = INVALID_TAIL;
}
}
update_tail:
total_count = submit_count + flush_count;
if (wait) {
iowait_sdma_add(iowait_ioww_to_iow(wait), total_count);
iowait_starve_clear(submit_count > 0,
iowait_ioww_to_iow(wait));
}
/* flush any tail that has not been written to hardware yet */
if (tail != INVALID_TAIL)
sdma_update_tail(sde, tail);
spin_unlock_irqrestore(&sde->tail_lock, flags);
/* NOTE(review): the u32 total is narrowed to the caller's u16 here */
*count_out = total_count;
return ret;
unlock_noconn:
/* engine not running: move everything still on tx_list to the flush list */
spin_lock(&sde->flushlist_lock);
list_for_each_entry_safe(tx, tx_next, tx_list, list) {
tx->wait = iowait_ioww_to_iow(wait);
list_del_init(&tx->list);
tx->next_descq_idx = 0;
#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
tx->sn = sde->tail_sn++;
trace_hfi1_sdma_in_sn(sde, tx->sn);
#endif
list_add_tail(&tx->list, &sde->flushlist);
flush_count++;
iowait_inc_wait_count(wait, tx->num_desc);
}
spin_unlock(&sde->flushlist_lock);
queue_work_on(sde->cpu, system_highpri_wq, &sde->flush_worker);
ret = -ECOMM;
goto update_tail;
nodesc:
/* -EAGAIN means space showed up after re-reading the ring; rescan the list */
ret = sdma_check_progress(sde, wait, tx, submit_count > 0);
if (ret == -EAGAIN) {
ret = 0;
goto retry;
}
sde->descq_full_count++;
goto update_tail;
}
static void sdma_process_event(struct sdma_engine *sde, enum sdma_events event)
{
unsigned long flags;
spin_lock_irqsave(&sde->tail_lock, flags);
write_seqlock(&sde->head_lock);
__sdma_process_event(sde, event);
if (sde->state.current_state == sdma_state_s99_running)
sdma_desc_avail(sde, sdma_descq_freecnt(sde));
write_sequnlock(&sde->head_lock);
spin_unlock_irqrestore(&sde->tail_lock, flags);
}
static void __sdma_process_event(struct sdma_engine *sde,
enum sdma_events event)
{
struct sdma_state *ss = &sde->state;
int need_progress = 0;
/* CONFIG SDMA temporary */
#ifdef CONFIG_SDMA_VERBOSITY
dd_dev_err(sde->dd, "CONFIG SDMA(%u) [%s] %s\n", sde->this_idx,
sdma_state_names[ss->current_state],
sdma_event_names[event]);
#endif
switch (ss->current_state) {
case sdma_state_s00_hw_down:
switch (event) {
case sdma_event_e00_go_hw_down:
break;
case sdma_event_e30_go_running:
/*
* If down, but running requested (usually result
* of link up, then we need to start up.
* This can happen when hw down is requested while
* bringing the link up with traffic active on
* 7220, e.g.
*/
ss->go_s99_running = 1;
fallthrough; /* and start dma engine */
case sdma_event_e10_go_hw_start:
/* This reference means the state machine is started */
sdma_get(&sde->state);
sdma_set_state(sde,
sdma_state_s10_hw_start_up_halt_wait);
break;
case sdma_event_e15_hw_halt_done:
break;
case sdma_event_e25_hw_clean_up_done:
break;
case sdma_event_e40_sw_cleaned:
sdma_sw_tear_down(sde);
break;
case sdma_event_e50_hw_cleaned:
break;
case sdma_event_e60_hw_halted:
break;
case sdma_event_e70_go_idle:
break;
case sdma_event_e80_hw_freeze:
break;
case sdma_event_e81_hw_frozen:
break;
case sdma_event_e82_hw_unfreeze:
break;
case sdma_event_e85_link_down:
break;
case sdma_event_e90_sw_halted:
break;
}
break;
case sdma_state_s10_hw_start_up_halt_wait:
switch (event) {
case sdma_event_e00_go_hw_down:
sdma_set_state(sde, sdma_state_s00_hw_down);
sdma_sw_tear_down(sde);
break;
case sdma_event_e10_go_hw_start:
break;
case sdma_event_e15_hw_halt_done:
sdma_set_state(sde,
sdma_state_s15_hw_start_up_clean_wait);
sdma_start_hw_clean_up(sde);
break;
case sdma_event_e25_hw_clean_up_done:
break;
case sdma_event_e30_go_running:
ss->go_s99_running = 1;
break;
case sdma_event_e40_sw_cleaned:
break;
case sdma_event_e50_hw_cleaned:
break;
case sdma_event_e60_hw_halted:
schedule_work(&sde->err_halt_worker);
break;
case sdma_event_e70_go_idle:
ss->go_s99_running = 0;
break;
case sdma_event_e80_hw_freeze:
break;
case sdma_event_e81_hw_frozen:
break;
case sdma_event_e82_hw_unfreeze:
break;
case sdma_event_e85_link_down:
break;
case sdma_event_e90_sw_halted:
break;
}
break;
case sdma_state_s15_hw_start_up_clean_wait:
switch (event) {
case sdma_event_e00_go_hw_down:
sdma_set_state(sde, sdma_state_s00_hw_down);
sdma_sw_tear_down(sde);
break;
case sdma_event_e10_go_hw_start:
break;
case sdma_event_e15_hw_halt_done:
break;
case sdma_event_e25_hw_clean_up_done:
sdma_hw_start_up(sde);
sdma_set_state(sde, ss->go_s99_running ?
sdma_state_s99_running :
sdma_state_s20_idle);
break;
case sdma_event_e30_go_running:
ss->go_s99_running = 1;
break;
case sdma_event_e40_sw_cleaned:
break;
case sdma_event_e50_hw_cleaned:
break;
case sdma_event_e60_hw_halted:
break;
case sdma_event_e70_go_idle:
ss->go_s99_running = 0;
break;
case sdma_event_e80_hw_freeze:
break;
case sdma_event_e81_hw_frozen:
break;
case sdma_event_e82_hw_unfreeze:
break;
case sdma_event_e85_link_down:
break;
case sdma_event_e90_sw_halted:
break;
}
break;
case sdma_state_s20_idle:
switch (event) {
case sdma_event_e00_go_hw_down:
sdma_set_state(sde, sdma_state_s00_hw_down);
sdma_sw_tear_down(sde);
break;
case sdma_event_e10_go_hw_start:
break;
case sdma_event_e15_hw_halt_done:
break;
case sdma_event_e25_hw_clean_up_done:
break;
case sdma_event_e30_go_running:
sdma_set_state(sde, sdma_state_s99_running);
ss->go_s99_running = 1;
break;
case sdma_event_e40_sw_cleaned:
break;
case sdma_event_e50_hw_cleaned:
break;
case sdma_event_e60_hw_halted:
sdma_set_state(sde, sdma_state_s50_hw_halt_wait);
schedule_work(&sde->err_halt_worker);
break;
case sdma_event_e70_go_idle:
break;
case sdma_event_e85_link_down:
case sdma_event_e80_hw_freeze:
sdma_set_state(sde, sdma_state_s80_hw_freeze);
atomic_dec(&sde->dd->sdma_unfreeze_count);
wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
break;
case sdma_event_e81_hw_frozen:
break;
case sdma_event_e82_hw_unfreeze:
break;
case sdma_event_e90_sw_halted:
break;
}
break;
case sdma_state_s30_sw_clean_up_wait:
switch (event) {
case sdma_event_e00_go_hw_down:
sdma_set_state(sde, sdma_state_s00_hw_down);
break;
case sdma_event_e10_go_hw_start:
break;
case sdma_event_e15_hw_halt_done:
break;
case sdma_event_e25_hw_clean_up_done:
break;
case sdma_event_e30_go_running:
ss->go_s99_running = 1;
break;
case sdma_event_e40_sw_cleaned:
sdma_set_state(sde, sdma_state_s40_hw_clean_up_wait);
sdma_start_hw_clean_up(sde);
break;
case sdma_event_e50_hw_cleaned:
break;
case sdma_event_e60_hw_halted:
break;
case sdma_event_e70_go_idle:
ss->go_s99_running = 0;
break;
case sdma_event_e80_hw_freeze:
break;
case sdma_event_e81_hw_frozen:
break;
case sdma_event_e82_hw_unfreeze:
break;
case sdma_event_e85_link_down:
ss->go_s99_running = 0;
break;
case sdma_event_e90_sw_halted:
break;
}
break;
case sdma_state_s40_hw_clean_up_wait:
switch (event) {
case sdma_event_e00_go_hw_down:
sdma_set_state(sde, sdma_state_s00_hw_down);
tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
break;
case sdma_event_e10_go_hw_start:
break;
case sdma_event_e15_hw_halt_done:
break;
case sdma_event_e25_hw_clean_up_done:
sdma_hw_start_up(sde);
sdma_set_state(sde, ss->go_s99_running ?
sdma_state_s99_running :
sdma_state_s20_idle);
break;
case sdma_event_e30_go_running:
ss->go_s99_running = 1;
break;
case sdma_event_e40_sw_cleaned:
break;
case sdma_event_e50_hw_cleaned:
break;
case sdma_event_e60_hw_halted:
break;
case sdma_event_e70_go_idle:
ss->go_s99_running = 0;
break;
case sdma_event_e80_hw_freeze:
break;
case sdma_event_e81_hw_frozen:
break;
case sdma_event_e82_hw_unfreeze:
break;
case sdma_event_e85_link_down:
ss->go_s99_running = 0;
break;
case sdma_event_e90_sw_halted:
break;
}
break;
case sdma_state_s50_hw_halt_wait:
switch (event) {
case sdma_event_e00_go_hw_down:
sdma_set_state(sde, sdma_state_s00_hw_down);
tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
break;
case sdma_event_e10_go_hw_start:
break;
case sdma_event_e15_hw_halt_done:
sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait);
tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
break;
case sdma_event_e25_hw_clean_up_done:
break;
case sdma_event_e30_go_running:
ss->go_s99_running = 1;
break;
case sdma_event_e40_sw_cleaned:
break;
case sdma_event_e50_hw_cleaned:
break;
case sdma_event_e60_hw_halted:
schedule_work(&sde->err_halt_worker);
break;
case sdma_event_e70_go_idle:
ss->go_s99_running = 0;
break;
case sdma_event_e80_hw_freeze:
break;
case sdma_event_e81_hw_frozen:
break;
case sdma_event_e82_hw_unfreeze:
break;
case sdma_event_e85_link_down:
ss->go_s99_running = 0;
break;
case sdma_event_e90_sw_halted:
break;
}
break;
case sdma_state_s60_idle_halt_wait:
switch (event) {
case sdma_event_e00_go_hw_down:
sdma_set_state(sde, sdma_state_s00_hw_down);
tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
break;
case sdma_event_e10_go_hw_start:
break;
case sdma_event_e15_hw_halt_done:
sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait);
tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
break;
case sdma_event_e25_hw_clean_up_done:
break;
case sdma_event_e30_go_running:
ss->go_s99_running = 1;
break;
case sdma_event_e40_sw_cleaned:
break;
case sdma_event_e50_hw_cleaned:
break;
case sdma_event_e60_hw_halted:
schedule_work(&sde->err_halt_worker);
break;
case sdma_event_e70_go_idle:
ss->go_s99_running = 0;
break;
case sdma_event_e80_hw_freeze:
break;
case sdma_event_e81_hw_frozen:
break;
case sdma_event_e82_hw_unfreeze:
break;
case sdma_event_e85_link_down:
break;
case sdma_event_e90_sw_halted:
break;
}
break;
case sdma_state_s80_hw_freeze:
switch (event) {
case sdma_event_e00_go_hw_down:
sdma_set_state(sde, sdma_state_s00_hw_down);
tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
break;
case sdma_event_e10_go_hw_start:
break;
case sdma_event_e15_hw_halt_done:
break;
case sdma_event_e25_hw_clean_up_done:
break;
case sdma_event_e30_go_running:
ss->go_s99_running = 1;
break;
case sdma_event_e40_sw_cleaned:
break;
case sdma_event_e50_hw_cleaned:
break;
case sdma_event_e60_hw_halted:
break;
case sdma_event_e70_go_idle:
ss->go_s99_running = 0;
break;
case sdma_event_e80_hw_freeze:
break;
case sdma_event_e81_hw_frozen:
sdma_set_state(sde, sdma_state_s82_freeze_sw_clean);
tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
break;
case sdma_event_e82_hw_unfreeze:
break;
case sdma_event_e85_link_down:
break;
case sdma_event_e90_sw_halted:
break;
}
break;
case sdma_state_s82_freeze_sw_clean:
switch (event) {
case sdma_event_e00_go_hw_down:
sdma_set_state(sde, sdma_state_s00_hw_down);
tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
break;
case sdma_event_e10_go_hw_start:
break;
case sdma_event_e15_hw_halt_done:
break;
case sdma_event_e25_hw_clean_up_done:
break;
case sdma_event_e30_go_running:
ss->go_s99_running = 1;
break;
case sdma_event_e40_sw_cleaned:
/* notify caller this engine is done cleaning */
atomic_dec(&sde->dd->sdma_unfreeze_count);
wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
break;
case sdma_event_e50_hw_cleaned:
break;
case sdma_event_e60_hw_halted:
break;
case sdma_event_e70_go_idle:
ss->go_s99_running = 0;
break;
case sdma_event_e80_hw_freeze:
break;
case sdma_event_e81_hw_frozen:
break;
case sdma_event_e82_hw_unfreeze:
sdma_hw_start_up(sde);
sdma_set_state(sde, ss->go_s99_running ?
sdma_state_s99_running :
sdma_state_s20_idle);
break;
case sdma_event_e85_link_down:
break;
case sdma_event_e90_sw_halted:
break;
}
break;
case sdma_state_s99_running:
switch (event) {
case sdma_event_e00_go_hw_down:
sdma_set_state(sde, sdma_state_s00_hw_down);
tasklet_hi_schedule(&sde->sdma_sw_clean_up_task);
break;
case sdma_event_e10_go_hw_start:
break;
case sdma_event_e15_hw_halt_done:
break;
case sdma_event_e25_hw_clean_up_done:
break;
case sdma_event_e30_go_running:
break;
case sdma_event_e40_sw_cleaned:
break;
case sdma_event_e50_hw_cleaned:
break;
case sdma_event_e60_hw_halted:
need_progress = 1;
sdma_err_progress_check_schedule(sde);
fallthrough;
case sdma_event_e90_sw_halted:
/*
* SW initiated halt does not perform engines
* progress check
*/
sdma_set_state(sde, sdma_state_s50_hw_halt_wait);
schedule_work(&sde->err_halt_worker);
break;
case sdma_event_e70_go_idle:
sdma_set_state(sde, sdma_state_s60_idle_halt_wait);
break;
case sdma_event_e85_link_down:
ss->go_s99_running = 0;
fallthrough;
case sdma_event_e80_hw_freeze:
sdma_set_state(sde, sdma_state_s80_hw_freeze);
atomic_dec(&sde->dd->sdma_unfreeze_count);
wake_up_interruptible(&sde->dd->sdma_unfreeze_wq);
break;
case sdma_event_e81_hw_frozen:
break;
case sdma_event_e82_hw_unfreeze:
break;
}
break;
}
ss->last_event = event;
if (need_progress)
sdma_make_progress(sde, 0);
}
/*
* _extend_sdma_tx_descs() - helper to extend txreq
*
* This is called once the initial nominal allocation
* of descriptors in the sdma_txreq is exhausted.
*
* The code will bump the allocation up to the max
* of MAX_DESC (64) descriptors. There doesn't seem
* much point in an interim step. The last descriptor
* is reserved for coalesce buffer in order to support
* cases where input packet has >MAX_DESC iovecs.
*
*/
static int _extend_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
{
int i;
struct sdma_desc *descp;
/* Handle last descriptor */
if (unlikely((tx->num_desc == (MAX_DESC - 1)))) {
/* if tlen is 0, it is for padding, release last descriptor */
if (!tx->tlen) {
tx->desc_limit = MAX_DESC;
} else if (!tx->coalesce_buf) {
/* allocate coalesce buffer with space for padding */
tx->coalesce_buf = kmalloc(tx->tlen + sizeof(u32),
GFP_ATOMIC);
if (!tx->coalesce_buf)
goto enomem;
tx->coalesce_idx = 0;
}
return 0;
}
if (unlikely(tx->num_desc == MAX_DESC))
goto enomem;
descp = kmalloc_array(MAX_DESC, sizeof(struct sdma_desc), GFP_ATOMIC);
if (!descp)
goto enomem;
tx->descp = descp;
/* reserve last descriptor for coalescing */
tx->desc_limit = MAX_DESC - 1;
/* copy ones already built */
for (i = 0; i < tx->num_desc; i++)
tx->descp[i] = tx->descs[i];
return 0;
enomem:
__sdma_txclean(dd, tx);
return -ENOMEM;
}
/*
 * ext_coal_sdma_tx_descs() - extend or coalesce sdma tx descriptors
 * @dd: device data (used for DMA mapping and for cleaning @tx on error)
 * @tx: the txreq being built
 * @type: SDMA_MAP_* type of the fragment being added
 * @kvaddr: kernel virtual address of the fragment (non-page types)
 * @page: page holding the fragment when @type is SDMA_MAP_PAGE
 * @offset: byte offset of the fragment within @page
 * @len: length of the fragment in bytes
 *
 * This is called once the initial nominal allocation of descriptors
 * in the sdma_txreq is exhausted.
 *
 * This function calls _extend_sdma_tx_descs() to extend the descriptor
 * array or to allocate a coalesce buffer.  If there is an allocated
 * coalesce buffer, the input fragment is copied into it instead of being
 * given a descriptor of its own.  Once the whole packet has been copied
 * (tx->coalesce_idx reaches tx->tlen) the buffer is padded to a dword
 * boundary, DMA mapped, and added as a single descriptor.
 *
 * On every error path @tx has been cleaned with __sdma_txclean().
 *
 * Return:
 * <0 - error (-EINVAL for an unusable fragment, -ENOSPC if the coalesce
 *      buffer cannot be DMA mapped, or the error from
 *      _extend_sdma_tx_descs())
 * 0 - coalescing, don't populate descriptor
 * 1 - continue with populating descriptor
 * When the final coalesce descriptor is added, the result of
 * _sdma_txadd_daddr() is returned as is.
 */
int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx,
			   int type, void *kvaddr, struct page *page,
			   unsigned long offset, u16 len)
{
	int pad_len, rval;
	dma_addr_t addr;

	rval = _extend_sdma_tx_descs(dd, tx);
	if (rval) {
		__sdma_txclean(dd, tx);
		return rval;
	}

	/* If coalesce buffer is allocated, copy data into it */
	if (tx->coalesce_buf) {
		if (type == SDMA_MAP_NONE) {
			__sdma_txclean(dd, tx);
			return -EINVAL;
		}

		if (type == SDMA_MAP_PAGE) {
			kvaddr = kmap_local_page(page);
			kvaddr += offset;
		} else if (WARN_ON(!kvaddr)) {
			__sdma_txclean(dd, tx);
			return -EINVAL;
		}

		memcpy(tx->coalesce_buf + tx->coalesce_idx, kvaddr, len);
		tx->coalesce_idx += len;
		if (type == SDMA_MAP_PAGE)
			kunmap_local(kvaddr);

		/* If there is more data, return */
		if (tx->tlen - tx->coalesce_idx)
			return 0;

		/* Whole packet is received; add any padding */
		pad_len = tx->packet_len & (sizeof(u32) - 1);
		if (pad_len) {
			pad_len = sizeof(u32) - pad_len;
			memset(tx->coalesce_buf + tx->coalesce_idx, 0, pad_len);
			/* padding is taken care of for coalescing case */
			tx->packet_len += pad_len;
			tx->tlen += pad_len;
		}

		/* dma map the coalesce buffer */
		addr = dma_map_single(&dd->pcidev->dev,
				      tx->coalesce_buf,
				      tx->tlen,
				      DMA_TO_DEVICE);

		if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) {
			__sdma_txclean(dd, tx);
			return -ENOSPC;
		}

		/* Add descriptor for coalesce buffer */
		tx->desc_limit = MAX_DESC;
		return _sdma_txadd_daddr(dd, SDMA_MAP_SINGLE, tx,
					 addr, tx->tlen, NULL, NULL, NULL);
	}

	return 1;
}
/*
 * Update sdes when the lmc changes
 *
 * Builds one SLID check value from @mask and @lid and writes it to the
 * CHECK_SLID CSR of every SDMA engine of @dd.
 */
void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid)
{
	u64 mask_field = (mask & SD(CHECK_SLID_MASK_MASK)) <<
			 SD(CHECK_SLID_MASK_SHIFT);
	u64 value_field = ((lid & mask) & SD(CHECK_SLID_VALUE_MASK)) <<
			  SD(CHECK_SLID_VALUE_SHIFT);
	u64 sreg = mask_field | value_field;
	int i;

	for (i = 0; i < dd->num_sdma; i++) {
		hfi1_cdbg(LINKVERB, "SendDmaEngine[%d].SLID_CHECK = 0x%x",
			  i, (u32)sreg);
		write_sde_csr(&dd->per_sdma[i], SD(CHECK_SLID), sreg);
	}
}
/* tx not dword sized - pad */
int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
{
int rval = 0;
IB/hfi1: Fix sdma.h tx->num_descs off-by-one errors Fix three sources of error involving struct sdma_txreq.num_descs. When _extend_sdma_tx_descs() extends the descriptor array, it uses the value of tx->num_descs to determine how many existing entries from the tx's original, internal descriptor array to copy to the newly allocated one. As this value was incremented before the call, the copy loop will access one entry past the internal descriptor array, copying its contents into the corresponding slot in the new array. If the call to _extend_sdma_tx_descs() fails, _pad_smda_tx_descs() then invokes __sdma_tx_clean() which uses the value of tx->num_desc to drive a loop that unmaps all descriptor entries in use. As this value was incremented before the call, the unmap loop will invoke sdma_unmap_desc() on a descriptor entry whose contents consist of whatever random data was copied into it during (1), leading to cascading further calls into the kernel and driver using arbitrary data. _sdma_close_tx() was using tx->num_descs instead of tx->num_descs - 1. Fix all of the above by: - Only increment .num_descs after .descp is extended. - Use .num_descs - 1 instead of .num_descs for last .descp entry. Fixes: f4d26d81ad7f ("staging/rdma/hfi1: Add coalescing support for SDMA TX descriptors") Link: https://lore.kernel.org/r/167656658879.2223096.10026561343022570690.stgit@awfm-02.cornelisnetworks.com Signed-off-by: Brendan Cunningham <bcunningham@cornelisnetworks.com> Signed-off-by: Patrick Kelsey <pat.kelsey@cornelisnetworks.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@cornelisnetworks.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2023-02-17 00:56:28 +08:00
if ((unlikely(tx->num_desc + 1 == tx->desc_limit))) {
rval = _extend_sdma_tx_descs(dd, tx);
if (rval) {
__sdma_txclean(dd, tx);
return rval;
}
}
IB/hfi1: Fix bugs with non-PAGE_SIZE-end multi-iovec user SDMA requests hfi1 user SDMA request processing has two bugs that can cause data corruption for user SDMA requests that have multiple payload iovecs where an iovec other than the tail iovec does not run up to the page boundary for the buffer pointed to by that iovec.a Here are the specific bugs: 1. user_sdma_txadd() does not use struct user_sdma_iovec->iov.iov_len. Rather, user_sdma_txadd() will add up to PAGE_SIZE bytes from iovec to the packet, even if some of those bytes are past iovec->iov.iov_len and are thus not intended to be in the packet. 2. user_sdma_txadd() and user_sdma_send_pkts() fail to advance to the next iovec in user_sdma_request->iovs when the current iovec is not PAGE_SIZE and does not contain enough data to complete the packet. The transmitted packet will contain the wrong data from the iovec pages. This has not been an issue with SDMA packets from hfi1 Verbs or PSM2 because they only produce iovecs that end short of PAGE_SIZE as the tail iovec of an SDMA request. Fixing these bugs exposes other bugs with the SDMA pin cache (struct mmu_rb_handler) that get in way of supporting user SDMA requests with multiple payload iovecs whose buffers do not end at PAGE_SIZE. So this commit fixes those issues as well. Here are the mmu_rb_handler bugs that non-PAGE_SIZE-end multi-iovec payload user SDMA requests can hit: 1. Overlapping memory ranges in mmu_rb_handler will result in duplicate pinnings. 2. When extending an existing mmu_rb_handler entry (struct mmu_rb_node), the mmu_rb code (1) removes the existing entry under a lock, (2) releases that lock, pins the new pages, (3) then reacquires the lock to insert the extended mmu_rb_node. If someone else comes in and inserts an overlapping entry between (2) and (3), insert in (3) will fail. The failure path code in this case unpins _all_ pages in either the original mmu_rb_node or the new mmu_rb_node that was inserted between (2) and (3). 3. 
In hfi1_mmu_rb_remove_unless_exact(), mmu_rb_node->refcount is incremented outside of mmu_rb_handler->lock. As a result, mmu_rb_node could be evicted by another thread that gets mmu_rb_handler->lock and checks mmu_rb_node->refcount before mmu_rb_node->refcount is incremented. 4. Related to #2 above, SDMA request submission failure path does not check mmu_rb_node->refcount before freeing mmu_rb_node object. If there are other SDMA requests in progress whose iovecs have pointers to the now-freed mmu_rb_node(s), those pointers to the now-freed mmu_rb nodes will be dereferenced when those SDMA requests complete. Fixes: 7be85676f1d1 ("IB/hfi1: Don't remove RB entry when not needed.") Fixes: 7724105686e7 ("IB/hfi1: add driver files") Signed-off-by: Brendan Cunningham <bcunningham@cornelisnetworks.com> Signed-off-by: Patrick Kelsey <pat.kelsey@cornelisnetworks.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@cornelisnetworks.com> Link: https://lore.kernel.org/r/168088636445.3027109.10054635277810177889.stgit@252.162.96.66.static.eigbox.net Signed-off-by: Leon Romanovsky <leon@kernel.org>
2023-04-08 00:52:44 +08:00
/* finish the one just added */
make_tx_sdma_desc(
tx,
SDMA_MAP_NONE,
dd->sdma_pad_phys,
IB/hfi1: Fix wrong mmu_node used for user SDMA packet after invalidate The hfi1 user SDMA pinned-page cache will leave a stale cache entry when the cache-entry's virtual address range is invalidated but that cache entry is in-use by an outstanding SDMA request. Subsequent user SDMA requests with buffers in or spanning the virtual address range of the stale cache entry will result in packets constructed from the wrong memory, the physical pages pointed to by the stale cache entry. To fix this, remove mmu_rb_node cache entries from the mmu_rb_handler cache independent of the cache entry's refcount. Add 'struct kref refcount' to struct mmu_rb_node and manage mmu_rb_node lifetime with kref_get() and kref_put(). mmu_rb_node.refcount makes sdma_mmu_node.refcount redundant. Remove 'atomic_t refcount' from struct sdma_mmu_node and change sdma_mmu_node code to use mmu_rb_node.refcount. Move the mmu_rb_handler destructor call after a wait-for-SDMA-request-completion call so mmu_rb_nodes that need mmu_rb_handler's workqueue to queue themselves up for destruction from an interrupt context may do so. Fixes: f48ad614c100 ("IB/hfi1: Move driver out of staging") Fixes: 00cbce5cbf88 ("IB/hfi1: Fix bugs with non-PAGE_SIZE-end multi-iovec user SDMA requests") Link: https://lore.kernel.org/r/168451393605.3700681.13493776139032178861.stgit@awfm-02.cornelisnetworks.com Reviewed-by: Dean Luick <dean.luick@cornelisnetworks.com> Signed-off-by: Brendan Cunningham <bcunningham@cornelisnetworks.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@cornelisnetworks.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2023-05-20 00:32:16 +08:00
sizeof(u32) - (tx->packet_len & (sizeof(u32) - 1)),
NULL, NULL, NULL);
IB/hfi1: Fix sdma.h tx->num_descs off-by-one errors Fix three sources of error involving struct sdma_txreq.num_descs. When _extend_sdma_tx_descs() extends the descriptor array, it uses the value of tx->num_descs to determine how many existing entries from the tx's original, internal descriptor array to copy to the newly allocated one. As this value was incremented before the call, the copy loop will access one entry past the internal descriptor array, copying its contents into the corresponding slot in the new array. If the call to _extend_sdma_tx_descs() fails, _pad_smda_tx_descs() then invokes __sdma_tx_clean() which uses the value of tx->num_desc to drive a loop that unmaps all descriptor entries in use. As this value was incremented before the call, the unmap loop will invoke sdma_unmap_desc() on a descriptor entry whose contents consist of whatever random data was copied into it during (1), leading to cascading further calls into the kernel and driver using arbitrary data. _sdma_close_tx() was using tx->num_descs instead of tx->num_descs - 1. Fix all of the above by: - Only increment .num_descs after .descp is extended. - Use .num_descs - 1 instead of .num_descs for last .descp entry. Fixes: f4d26d81ad7f ("staging/rdma/hfi1: Add coalescing support for SDMA TX descriptors") Link: https://lore.kernel.org/r/167656658879.2223096.10026561343022570690.stgit@awfm-02.cornelisnetworks.com Signed-off-by: Brendan Cunningham <bcunningham@cornelisnetworks.com> Signed-off-by: Patrick Kelsey <pat.kelsey@cornelisnetworks.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@cornelisnetworks.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
2023-02-17 00:56:28 +08:00
tx->num_desc++;
_sdma_close_tx(dd, tx);
return rval;
}
/*
 * Add ahg to the sdma_txreq
 *
 * Up to 3 descriptors at the start of the sdma_txreq are consumed:
 * descs[0] carries the AHG header fields plus the first update word,
 * and the remaining update words are packed, two 32-bit words per
 * quad-word, into descs[1] and descs[2].
 */
void _sdma_txreq_ahgadd(
	struct sdma_txreq *tx,
	u8 num_ahg,
	u8 ahg_entry,
	u32 *ahg,
	u8 ahg_hlen)
{
	u8 mode;
	u32 i;

	WARN_ON_ONCE(num_ahg > 9 || (ahg_hlen & 3) || ahg_hlen == 4);

	/* the number of update words selects the apply mode */
	if (num_ahg == 1)
		mode = SDMA_AHG_APPLY_UPDATE1;
	else if (num_ahg <= 5)
		mode = SDMA_AHG_APPLY_UPDATE2;
	else
		mode = SDMA_AHG_APPLY_UPDATE3;

	/* descs[0] is always consumed */
	tx->num_desc++;

	/* claim and zero the extra descriptors the mode requires */
	if (mode == SDMA_AHG_APPLY_UPDATE3) {
		tx->num_desc++;
		tx->descs[2].qw[0] = 0;
		tx->descs[2].qw[1] = 0;
	}
	if (mode == SDMA_AHG_APPLY_UPDATE3 || mode == SDMA_AHG_APPLY_UPDATE2) {
		tx->num_desc++;
		tx->descs[1].qw[0] = 0;
		tx->descs[1].qw[1] = 0;
	}

	/* header length is stored in dwords */
	ahg_hlen >>= 2;

	tx->descs[0].qw[1] |=
		(((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK)
			<< SDMA_DESC1_HEADER_INDEX_SHIFT) |
		(((u64)ahg_hlen & SDMA_DESC1_HEADER_DWS_MASK)
			<< SDMA_DESC1_HEADER_DWS_SHIFT) |
		(((u64)mode & SDMA_DESC1_HEADER_MODE_MASK)
			<< SDMA_DESC1_HEADER_MODE_SHIFT) |
		(((u64)ahg[0] & SDMA_DESC1_HEADER_UPDATE1_MASK)
			<< SDMA_DESC1_HEADER_UPDATE1_SHIFT);

	/*
	 * Remaining words: every group of four goes to the next descriptor
	 * (starting at descs[1]), each pair of words shares one quad-word,
	 * and the odd word of a pair sits in the upper 32 bits.
	 */
	for (i = 0; i < (num_ahg - 1); i++) {
		u32 desc = 1 + (i >> 2);
		u32 qw_idx = (i >> 1) & 1;
		u32 shift = (i & 1) << 5;

		tx->descs[desc].qw[qw_idx] |= ((u64)ahg[i + 1]) << shift;
	}
}
/**
* sdma_ahg_alloc - allocate an AHG entry
* @sde: engine to allocate from
*
* Return:
* 0-31 when successful, -EOPNOTSUPP if AHG is not enabled,
* -ENOSPC if an entry is not available
*/
int sdma_ahg_alloc(struct sdma_engine *sde)
{
int nr;
int oldbit;
if (!sde) {
trace_hfi1_ahg_allocate(sde, -EINVAL);
return -EINVAL;
}
while (1) {
locking/atomics: COCCINELLE/treewide: Convert trivial ACCESS_ONCE() patterns to READ_ONCE()/WRITE_ONCE() Please do not apply this to mainline directly, instead please re-run the coccinelle script shown below and apply its output. For several reasons, it is desirable to use {READ,WRITE}_ONCE() in preference to ACCESS_ONCE(), and new code is expected to use one of the former. So far, there's been no reason to change most existing uses of ACCESS_ONCE(), as these aren't harmful, and changing them results in churn. However, for some features, the read/write distinction is critical to correct operation. To distinguish these cases, separate read/write accessors must be used. This patch migrates (most) remaining ACCESS_ONCE() instances to {READ,WRITE}_ONCE(), using the following coccinelle script: ---- // Convert trivial ACCESS_ONCE() uses to equivalent READ_ONCE() and // WRITE_ONCE() // $ make coccicheck COCCI=/home/mark/once.cocci SPFLAGS="--include-headers" MODE=patch virtual patch @ depends on patch @ expression E1, E2; @@ - ACCESS_ONCE(E1) = E2 + WRITE_ONCE(E1, E2) @ depends on patch @ expression E; @@ - ACCESS_ONCE(E) + READ_ONCE(E) ---- Signed-off-by: Mark Rutland <mark.rutland@arm.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: davem@davemloft.net Cc: linux-arch@vger.kernel.org Cc: mpe@ellerman.id.au Cc: shuah@kernel.org Cc: snitzer@redhat.com Cc: thor.thayer@linux.intel.com Cc: tj@kernel.org Cc: viro@zeniv.linux.org.uk Cc: will.deacon@arm.com Link: http://lkml.kernel.org/r/1508792849-3115-19-git-send-email-paulmck@linux.vnet.ibm.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
2017-10-24 05:07:29 +08:00
nr = ffz(READ_ONCE(sde->ahg_bits));
if (nr > 31) {
trace_hfi1_ahg_allocate(sde, -ENOSPC);
return -ENOSPC;
}
oldbit = test_and_set_bit(nr, &sde->ahg_bits);
if (!oldbit)
break;
cpu_relax();
}
trace_hfi1_ahg_allocate(sde, nr);
return nr;
}
/**
 * sdma_ahg_free - free an AHG entry
 * @sde: engine to return AHG entry
 * @ahg_index: index to free
 *
 * This routine frees the indicated AHG entry.  A NULL @sde is ignored.
 * The deallocation is traced before @ahg_index is range checked, and an
 * index outside 0-31 leaves the allocation bitmap untouched.
 */
void sdma_ahg_free(struct sdma_engine *sde, int ahg_index)
{
	if (!sde)
		return;

	trace_hfi1_ahg_deallocate(sde, ahg_index);

	if (ahg_index >= 0 && ahg_index <= 31)
		clear_bit(ahg_index, &sde->ahg_bits);
}
/*
 * SPC freeze handling for SDMA engines.  Called when the driver knows
 * the SPC is going into a freeze but before the freeze is fully
 * settled.  Generally an error interrupt.
 *
 * Each engine is sent either the link-down or the hw-freeze event
 * (selected by @link_down), which pulls it out of running so no more
 * entries can be added to the engine's queue.
 */
void sdma_freeze_notify(struct hfi1_devdata *dd, int link_down)
{
	enum sdma_events event;
	int i;

	if (link_down)
		event = sdma_event_e85_link_down;
	else
		event = sdma_event_e80_hw_freeze;

	/* arm the counter now; the actual wait happens in sdma_freeze() */
	atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma);

	/* ask every engine to stop running */
	for (i = 0; i < dd->num_sdma; i++)
		sdma_process_event(&dd->per_sdma[i], event);
}
/*
 * SPC freeze handling for SDMA engines.  Called when the driver knows
 * the SPC is fully frozen.
 *
 * Waits for all engines to leave the running state, tells each one the
 * hardware is frozen, then waits for their software clean to finish.
 */
void sdma_freeze(struct hfi1_devdata *dd)
{
	int i;

	/*
	 * Every engine must have moved out of the running state before
	 * going any further.  An interrupted wait means we simply give up.
	 */
	if (wait_event_interruptible(dd->sdma_unfreeze_wq,
				     atomic_read(&dd->sdma_unfreeze_count) <=
				     0))
		return;

	/* a negative count means the driver is unloading - just exit */
	if (atomic_read(&dd->sdma_unfreeze_count) < 0)
		return;

	/* re-arm the count for the second wait below */
	atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma);

	/* the SPC is frozen: engines may begin their clean up */
	for (i = 0; i < dd->num_sdma; i++)
		sdma_process_event(&dd->per_sdma[i], sdma_event_e81_hw_frozen);

	/*
	 * The software clean reads engine CSRs, so it has to be complete
	 * before the following step clears them.  The result of the wait
	 * is deliberately ignored - we are done either way.
	 */
	(void)wait_event_interruptible(dd->sdma_unfreeze_wq,
				atomic_read(&dd->sdma_unfreeze_count) <= 0);
}
/*
 * SPC freeze handling for the SDMA engines.  Called after the SPC is unfrozen.
 *
 * An SPC freeze behaves like an SDMA halt combined with a hardware clean,
 * leaving only a software clean to do.  Doing that once the SPC is fully
 * frozen would need an extra state to wait for the unfreeze, so the
 * software clean is deferred to this unfreeze step instead.
 */
void sdma_unfreeze(struct hfi1_devdata *dd)
{
	int i = 0;

	/* deliver the unfreeze event to each engine in turn */
	while (i < dd->num_sdma) {
		sdma_process_event(&dd->per_sdma[i],
				   sdma_event_e82_hw_unfreeze);
		i++;
	}
}
/**
* _sdma_engine_progress_schedule() - schedule progress on engine
* @sde: sdma_engine to schedule progress
*
*/
void _sdma_engine_progress_schedule(
struct sdma_engine *sde)
{
trace_hfi1_sdma_engine_progress(sde, sde->progress_mask);
/* assume we have selected a good cpu */
write_csr(sde->dd,
CCE_INT_FORCE + (8 * (IS_SDMA_START / 64)),
sde->progress_mask);
}