2010-04-22 06:30:06 +08:00
|
|
|
/*
 * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/moduleparam.h>
|
|
|
|
#include <linux/device.h>
|
|
|
|
#include <linux/netdevice.h>
|
|
|
|
#include <linux/etherdevice.h>
|
|
|
|
#include <linux/delay.h>
|
|
|
|
#include <linux/errno.h>
|
|
|
|
#include <linux/list.h>
|
|
|
|
#include <linux/spinlock.h>
|
|
|
|
#include <linux/ethtool.h>
|
|
|
|
#include <linux/rtnetlink.h>
|
|
|
|
#include <linux/inetdevice.h>
|
|
|
|
#include <linux/io.h>
|
|
|
|
|
|
|
|
#include <asm/irq.h>
|
|
|
|
#include <asm/byteorder.h>
|
|
|
|
|
|
|
|
#include <rdma/iw_cm.h>
|
|
|
|
#include <rdma/ib_verbs.h>
|
|
|
|
#include <rdma/ib_smi.h>
|
|
|
|
#include <rdma/ib_umem.h>
|
|
|
|
#include <rdma/ib_user_verbs.h>
|
|
|
|
|
|
|
|
#include "iw_cxgb4.h"
|
|
|
|
|
2010-09-18 04:40:15 +08:00
|
|
|
static int fastreg_support = 1;
|
2010-04-22 06:30:06 +08:00
|
|
|
module_param(fastreg_support, int, 0644);
|
2010-09-18 04:40:15 +08:00
|
|
|
MODULE_PARM_DESC(fastreg_support, "Advertise fastreg support (default=1)");
|
2010-04-22 06:30:06 +08:00
|
|
|
|
2019-02-13 02:39:16 +08:00
|
|
|
static void c4iw_dealloc_ucontext(struct ib_ucontext *context)
|
2010-04-22 06:30:06 +08:00
|
|
|
{
|
2019-02-13 02:39:15 +08:00
|
|
|
struct c4iw_ucontext *ucontext = to_c4iw_ucontext(context);
|
2016-12-22 23:40:36 +08:00
|
|
|
struct c4iw_dev *rhp;
|
2010-04-22 06:30:06 +08:00
|
|
|
struct c4iw_mm_entry *mm, *tmp;
|
|
|
|
|
2019-02-13 02:39:15 +08:00
|
|
|
pr_debug("context %p\n", context);
|
2016-12-22 23:40:36 +08:00
|
|
|
rhp = to_c4iw_dev(ucontext->ibucontext.device);
|
|
|
|
|
2010-04-22 06:30:06 +08:00
|
|
|
list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry)
|
|
|
|
kfree(mm);
|
|
|
|
c4iw_release_dev_ucontext(&rhp->rdev, &ucontext->uctx);
|
|
|
|
}
|
|
|
|
|
2019-02-13 02:39:16 +08:00
|
|
|
static int c4iw_alloc_ucontext(struct ib_ucontext *ucontext,
|
|
|
|
struct ib_udata *udata)
|
2010-04-22 06:30:06 +08:00
|
|
|
{
|
2019-02-13 02:39:16 +08:00
|
|
|
struct ib_device *ibdev = ucontext->device;
|
|
|
|
struct c4iw_ucontext *context = to_c4iw_ucontext(ucontext);
|
2010-04-22 06:30:06 +08:00
|
|
|
struct c4iw_dev *rhp = to_c4iw_dev(ibdev);
|
cxgb4/iw_cxgb4: Doorbell Drop Avoidance Bug Fixes
The current logic suffers from a slow response time to disable user DB
usage, and also fails to avoid DB FIFO drops under heavy load. This commit
fixes these deficiencies and makes the avoidance logic more optimal.
This is done by more efficiently notifying the ULDs of potential DB
problems, and implements a smoother flow control algorithm in iw_cxgb4,
which is the ULD that puts the most load on the DB fifo.
Design:
cxgb4:
Direct ULD callback from the DB FULL/DROP interrupt handler. This allows
the ULD to stop doing user DB writes as quickly as possible.
While user DB usage is disabled, the LLD will accumulate DB write events
for its queues. Then once DB usage is reenabled, a single DB write is
done for each queue with its accumulated write count. This reduces the
load put on the DB fifo when reenabling.
iw_cxgb4:
Instead of marking each qp to indicate DB writes are disabled, we create
a device-global status page that each user process maps. This allows
iw_cxgb4 to only set this single bit to disable all DB writes for all
user QPs vs traversing the idr of all the active QPs. If the libcxgb4
doesn't support this, then we fall back to the old approach of marking
each QP. Thus we allow the new driver to work with an older libcxgb4.
When the LLD upcalls iw_cxgb4 indicating DB FULL, we disable all DB writes
via the status page and transition the DB state to STOPPED. As user
processes see that DB writes are disabled, they call into iw_cxgb4
to submit their DB write events. Since the DB state is in STOPPED,
the QP trying to write gets enqueued on a new DB "flow control" list.
As subsequent DB writes are submitted for this flow controlled QP, the
amount of writes are accumulated for each QP on the flow control list.
So all the user QPs that are actively ringing the DB get put on this
list and the number of writes they request are accumulated.
When the LLD upcalls iw_cxgb4 indicating DB EMPTY, which is in a workq
context, we change the DB state to FLOW_CONTROL, and begin resuming all
the QPs that are on the flow control list. This logic runs on until
the flow control list is empty or we exit FLOW_CONTROL mode (due to
a DB DROP upcall, for example). QPs are removed from this list, and
their accumulated DB write counts written to the DB FIFO. Sets of QPs,
called chunks in the code, are removed at one time. The chunk size is 64.
So 64 QPs are resumed at a time, and before the next chunk is resumed, the
logic waits (blocks) for the DB FIFO to drain. This prevents resuming to
quickly and overflowing the FIFO. Once the flow control list is empty,
the db state transitions back to NORMAL and user QPs are again allowed
to write directly to the user DB register.
The algorithm is designed such that if the DB write load is high enough,
then all the DB writes get submitted by the kernel using this flow
controlled approach to avoid DB drops. As the load lightens though, we
resume to normal DB writes directly by user applications.
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-15 00:22:08 +08:00
|
|
|
struct c4iw_alloc_ucontext_resp uresp;
|
|
|
|
int ret = 0;
|
|
|
|
struct c4iw_mm_entry *mm = NULL;
|
2010-04-22 06:30:06 +08:00
|
|
|
|
2017-09-27 15:35:49 +08:00
|
|
|
pr_debug("ibdev %p\n", ibdev);
|
2010-04-22 06:30:06 +08:00
|
|
|
c4iw_init_dev_ucontext(&rhp->rdev, &context->uctx);
|
|
|
|
INIT_LIST_HEAD(&context->mmaps);
|
|
|
|
spin_lock_init(&context->mmap_lock);
|
cxgb4/iw_cxgb4: Doorbell Drop Avoidance Bug Fixes
The current logic suffers from a slow response time to disable user DB
usage, and also fails to avoid DB FIFO drops under heavy load. This commit
fixes these deficiencies and makes the avoidance logic more optimal.
This is done by more efficiently notifying the ULDs of potential DB
problems, and implements a smoother flow control algorithm in iw_cxgb4,
which is the ULD that puts the most load on the DB fifo.
Design:
cxgb4:
Direct ULD callback from the DB FULL/DROP interrupt handler. This allows
the ULD to stop doing user DB writes as quickly as possible.
While user DB usage is disabled, the LLD will accumulate DB write events
for its queues. Then once DB usage is reenabled, a single DB write is
done for each queue with its accumulated write count. This reduces the
load put on the DB fifo when reenabling.
iw_cxgb4:
Instead of marking each qp to indicate DB writes are disabled, we create
a device-global status page that each user process maps. This allows
iw_cxgb4 to only set this single bit to disable all DB writes for all
user QPs vs traversing the idr of all the active QPs. If the libcxgb4
doesn't support this, then we fall back to the old approach of marking
each QP. Thus we allow the new driver to work with an older libcxgb4.
When the LLD upcalls iw_cxgb4 indicating DB FULL, we disable all DB writes
via the status page and transition the DB state to STOPPED. As user
processes see that DB writes are disabled, they call into iw_cxgb4
to submit their DB write events. Since the DB state is in STOPPED,
the QP trying to write gets enqueued on a new DB "flow control" list.
As subsequent DB writes are submitted for this flow controlled QP, the
amount of writes are accumulated for each QP on the flow control list.
So all the user QPs that are actively ringing the DB get put on this
list and the number of writes they request are accumulated.
When the LLD upcalls iw_cxgb4 indicating DB EMPTY, which is in a workq
context, we change the DB state to FLOW_CONTROL, and begin resuming all
the QPs that are on the flow control list. This logic runs on until
the flow control list is empty or we exit FLOW_CONTROL mode (due to
a DB DROP upcall, for example). QPs are removed from this list, and
their accumulated DB write counts written to the DB FIFO. Sets of QPs,
called chunks in the code, are removed at one time. The chunk size is 64.
So 64 QPs are resumed at a time, and before the next chunk is resumed, the
logic waits (blocks) for the DB FIFO to drain. This prevents resuming to
quickly and overflowing the FIFO. Once the flow control list is empty,
the db state transitions back to NORMAL and user QPs are again allowed
to write directly to the user DB register.
The algorithm is designed such that if the DB write load is high enough,
then all the DB writes get submitted by the kernel using this flow
controlled approach to avoid DB drops. As the load lightens though, we
resume to normal DB writes directly by user applications.
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-15 00:22:08 +08:00
|
|
|
|
RDMA/cxgb4: add missing padding at end of struct c4iw_alloc_ucontext_resp
The i386 ABI disagrees with most other ABIs regarding alignment of
data types larger than 4 bytes: on most ABIs a padding must be added
at end of the structures, while it is not required on i386.
So for most ABI struct c4iw_alloc_ucontext_resp gets implicitly padded
to be aligned on a 8 bytes multiple, while for i386, such padding is
not added.
The tool pahole can be used to find such implicit padding:
$ pahole --anon_include \
--nested_anon_include \
--recursive \
--class_name c4iw_alloc_ucontext_resp \
drivers/infiniband/hw/cxgb4/iw_cxgb4.o
Then, structure layout can be compared between i386 and x86_64:
+++ obj-i386/drivers/infiniband/hw/cxgb4/iw_cxgb4.o.pahole.txt 2014-03-28 11:43:05.547432195 +0100
--- obj-x86_64/drivers/infiniband/hw/cxgb4/iw_cxgb4.o.pahole.txt 2014-03-28 10:55:10.990133017 +0100
@@ -2,9 +2,8 @@ struct c4iw_alloc_ucontext_resp {
__u64 status_page_key; /* 0 8 */
__u32 status_page_size; /* 8 4 */
- /* size: 12, cachelines: 1, members: 2 */
- /* last cacheline: 12 bytes */
+ /* size: 16, cachelines: 1, members: 2 */
+ /* padding: 4 */
+ /* last cacheline: 16 bytes */
};
This ABI disagreement will make an x86_64 kernel try to write past the
buffer provided by an i386 binary.
When boundary check will be implemented, the x86_64 kernel will refuse
to write past the i386 userspace provided buffer and the uverbs will
fail.
If the structure is on a page boundary and the next page is not
mapped, ib_copy_to_udata() will fail and the uverb will fail.
Additionally, as reported by Dan Carpenter, without the implicit
padding being properly cleared, an information leak would take place
in most architectures.
This patch adds an explicit padding to struct c4iw_alloc_ucontext_resp,
and, like 92b0ca7cb149 ("IB/mlx5: Fix stack info leak in
mlx5_ib_alloc_ucontext()"), makes function c4iw_alloc_ucontext()
not writting this padding field to userspace. This way, x86_64 kernel
will be able to write struct c4iw_alloc_ucontext_resp as expected by
unpatched and patched i386 libcxgb4.
Link: http://marc.info/?i=cover.1399309513.git.ydroneaud@opteya.com
Link: http://marc.info/?i=1395848977.3297.15.camel@localhost.localdomain
Link: http://marc.info/?i=20140328082428.GH25192@mwanda
Cc: <stable@vger.kernel.org>
Fixes: 05eb23893c2c ("cxgb4/iw_cxgb4: Doorbell Drop Avoidance Bug Fixes")
Reported-by: Yann Droneaud <ydroneaud@opteya.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Yann Droneaud <ydroneaud@opteya.com>
Acked-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
2014-05-06 01:35:26 +08:00
|
|
|
if (udata->outlen < sizeof(uresp) - sizeof(uresp.reserved)) {
|
2017-02-10 06:23:50 +08:00
|
|
|
pr_err_once("Warning - downlevel libcxgb4 (non-fatal), device status page disabled\n");
|
cxgb4/iw_cxgb4: Doorbell Drop Avoidance Bug Fixes
The current logic suffers from a slow response time to disable user DB
usage, and also fails to avoid DB FIFO drops under heavy load. This commit
fixes these deficiencies and makes the avoidance logic more optimal.
This is done by more efficiently notifying the ULDs of potential DB
problems, and implements a smoother flow control algorithm in iw_cxgb4,
which is the ULD that puts the most load on the DB fifo.
Design:
cxgb4:
Direct ULD callback from the DB FULL/DROP interrupt handler. This allows
the ULD to stop doing user DB writes as quickly as possible.
While user DB usage is disabled, the LLD will accumulate DB write events
for its queues. Then once DB usage is reenabled, a single DB write is
done for each queue with its accumulated write count. This reduces the
load put on the DB fifo when reenabling.
iw_cxgb4:
Instead of marking each qp to indicate DB writes are disabled, we create
a device-global status page that each user process maps. This allows
iw_cxgb4 to only set this single bit to disable all DB writes for all
user QPs vs traversing the idr of all the active QPs. If the libcxgb4
doesn't support this, then we fall back to the old approach of marking
each QP. Thus we allow the new driver to work with an older libcxgb4.
When the LLD upcalls iw_cxgb4 indicating DB FULL, we disable all DB writes
via the status page and transition the DB state to STOPPED. As user
processes see that DB writes are disabled, they call into iw_cxgb4
to submit their DB write events. Since the DB state is in STOPPED,
the QP trying to write gets enqueued on a new DB "flow control" list.
As subsequent DB writes are submitted for this flow controlled QP, the
amount of writes are accumulated for each QP on the flow control list.
So all the user QPs that are actively ringing the DB get put on this
list and the number of writes they request are accumulated.
When the LLD upcalls iw_cxgb4 indicating DB EMPTY, which is in a workq
context, we change the DB state to FLOW_CONTROL, and begin resuming all
the QPs that are on the flow control list. This logic runs on until
the flow control list is empty or we exit FLOW_CONTROL mode (due to
a DB DROP upcall, for example). QPs are removed from this list, and
their accumulated DB write counts written to the DB FIFO. Sets of QPs,
called chunks in the code, are removed at one time. The chunk size is 64.
So 64 QPs are resumed at a time, and before the next chunk is resumed, the
logic waits (blocks) for the DB FIFO to drain. This prevents resuming to
quickly and overflowing the FIFO. Once the flow control list is empty,
the db state transitions back to NORMAL and user QPs are again allowed
to write directly to the user DB register.
The algorithm is designed such that if the DB write load is high enough,
then all the DB writes get submitted by the kernel using this flow
controlled approach to avoid DB drops. As the load lightens though, we
resume to normal DB writes directly by user applications.
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-15 00:22:08 +08:00
|
|
|
rhp->rdev.flags |= T4_STATUS_PAGE_DISABLED;
|
|
|
|
} else {
|
|
|
|
mm = kmalloc(sizeof(*mm), GFP_KERNEL);
|
2014-03-29 02:55:21 +08:00
|
|
|
if (!mm) {
|
|
|
|
ret = -ENOMEM;
|
2019-02-13 02:39:16 +08:00
|
|
|
goto err;
|
2014-03-29 02:55:21 +08:00
|
|
|
}
|
cxgb4/iw_cxgb4: Doorbell Drop Avoidance Bug Fixes
The current logic suffers from a slow response time to disable user DB
usage, and also fails to avoid DB FIFO drops under heavy load. This commit
fixes these deficiencies and makes the avoidance logic more optimal.
This is done by more efficiently notifying the ULDs of potential DB
problems, and implements a smoother flow control algorithm in iw_cxgb4,
which is the ULD that puts the most load on the DB fifo.
Design:
cxgb4:
Direct ULD callback from the DB FULL/DROP interrupt handler. This allows
the ULD to stop doing user DB writes as quickly as possible.
While user DB usage is disabled, the LLD will accumulate DB write events
for its queues. Then once DB usage is reenabled, a single DB write is
done for each queue with its accumulated write count. This reduces the
load put on the DB fifo when reenabling.
iw_cxgb4:
Instead of marking each qp to indicate DB writes are disabled, we create
a device-global status page that each user process maps. This allows
iw_cxgb4 to only set this single bit to disable all DB writes for all
user QPs vs traversing the idr of all the active QPs. If the libcxgb4
doesn't support this, then we fall back to the old approach of marking
each QP. Thus we allow the new driver to work with an older libcxgb4.
When the LLD upcalls iw_cxgb4 indicating DB FULL, we disable all DB writes
via the status page and transition the DB state to STOPPED. As user
processes see that DB writes are disabled, they call into iw_cxgb4
to submit their DB write events. Since the DB state is in STOPPED,
the QP trying to write gets enqueued on a new DB "flow control" list.
As subsequent DB writes are submitted for this flow controlled QP, the
amount of writes are accumulated for each QP on the flow control list.
So all the user QPs that are actively ringing the DB get put on this
list and the number of writes they request are accumulated.
When the LLD upcalls iw_cxgb4 indicating DB EMPTY, which is in a workq
context, we change the DB state to FLOW_CONTROL, and begin resuming all
the QPs that are on the flow control list. This logic runs on until
the flow control list is empty or we exit FLOW_CONTROL mode (due to
a DB DROP upcall, for example). QPs are removed from this list, and
their accumulated DB write counts written to the DB FIFO. Sets of QPs,
called chunks in the code, are removed at one time. The chunk size is 64.
So 64 QPs are resumed at a time, and before the next chunk is resumed, the
logic waits (blocks) for the DB FIFO to drain. This prevents resuming to
quickly and overflowing the FIFO. Once the flow control list is empty,
the db state transitions back to NORMAL and user QPs are again allowed
to write directly to the user DB register.
The algorithm is designed such that if the DB write load is high enough,
then all the DB writes get submitted by the kernel using this flow
controlled approach to avoid DB drops. As the load lightens though, we
resume to normal DB writes directly by user applications.
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-15 00:22:08 +08:00
|
|
|
|
|
|
|
uresp.status_page_size = PAGE_SIZE;
|
|
|
|
|
|
|
|
spin_lock(&context->mmap_lock);
|
|
|
|
uresp.status_page_key = context->key;
|
|
|
|
context->key += PAGE_SIZE;
|
|
|
|
spin_unlock(&context->mmap_lock);
|
|
|
|
|
RDMA/cxgb4: add missing padding at end of struct c4iw_alloc_ucontext_resp
The i386 ABI disagrees with most other ABIs regarding alignment of
data types larger than 4 bytes: on most ABIs a padding must be added
at end of the structures, while it is not required on i386.
So for most ABI struct c4iw_alloc_ucontext_resp gets implicitly padded
to be aligned on a 8 bytes multiple, while for i386, such padding is
not added.
The tool pahole can be used to find such implicit padding:
$ pahole --anon_include \
--nested_anon_include \
--recursive \
--class_name c4iw_alloc_ucontext_resp \
drivers/infiniband/hw/cxgb4/iw_cxgb4.o
Then, structure layout can be compared between i386 and x86_64:
+++ obj-i386/drivers/infiniband/hw/cxgb4/iw_cxgb4.o.pahole.txt 2014-03-28 11:43:05.547432195 +0100
--- obj-x86_64/drivers/infiniband/hw/cxgb4/iw_cxgb4.o.pahole.txt 2014-03-28 10:55:10.990133017 +0100
@@ -2,9 +2,8 @@ struct c4iw_alloc_ucontext_resp {
__u64 status_page_key; /* 0 8 */
__u32 status_page_size; /* 8 4 */
- /* size: 12, cachelines: 1, members: 2 */
- /* last cacheline: 12 bytes */
+ /* size: 16, cachelines: 1, members: 2 */
+ /* padding: 4 */
+ /* last cacheline: 16 bytes */
};
This ABI disagreement will make an x86_64 kernel try to write past the
buffer provided by an i386 binary.
When boundary check will be implemented, the x86_64 kernel will refuse
to write past the i386 userspace provided buffer and the uverbs will
fail.
If the structure is on a page boundary and the next page is not
mapped, ib_copy_to_udata() will fail and the uverb will fail.
Additionally, as reported by Dan Carpenter, without the implicit
padding being properly cleared, an information leak would take place
in most architectures.
This patch adds an explicit padding to struct c4iw_alloc_ucontext_resp,
and, like 92b0ca7cb149 ("IB/mlx5: Fix stack info leak in
mlx5_ib_alloc_ucontext()"), makes function c4iw_alloc_ucontext()
not writting this padding field to userspace. This way, x86_64 kernel
will be able to write struct c4iw_alloc_ucontext_resp as expected by
unpatched and patched i386 libcxgb4.
Link: http://marc.info/?i=cover.1399309513.git.ydroneaud@opteya.com
Link: http://marc.info/?i=1395848977.3297.15.camel@localhost.localdomain
Link: http://marc.info/?i=20140328082428.GH25192@mwanda
Cc: <stable@vger.kernel.org>
Fixes: 05eb23893c2c ("cxgb4/iw_cxgb4: Doorbell Drop Avoidance Bug Fixes")
Reported-by: Yann Droneaud <ydroneaud@opteya.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Yann Droneaud <ydroneaud@opteya.com>
Acked-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
2014-05-06 01:35:26 +08:00
|
|
|
ret = ib_copy_to_udata(udata, &uresp,
|
|
|
|
sizeof(uresp) - sizeof(uresp.reserved));
|
cxgb4/iw_cxgb4: Doorbell Drop Avoidance Bug Fixes
The current logic suffers from a slow response time to disable user DB
usage, and also fails to avoid DB FIFO drops under heavy load. This commit
fixes these deficiencies and makes the avoidance logic more optimal.
This is done by more efficiently notifying the ULDs of potential DB
problems, and implements a smoother flow control algorithm in iw_cxgb4,
which is the ULD that puts the most load on the DB fifo.
Design:
cxgb4:
Direct ULD callback from the DB FULL/DROP interrupt handler. This allows
the ULD to stop doing user DB writes as quickly as possible.
While user DB usage is disabled, the LLD will accumulate DB write events
for its queues. Then once DB usage is reenabled, a single DB write is
done for each queue with its accumulated write count. This reduces the
load put on the DB fifo when reenabling.
iw_cxgb4:
Instead of marking each qp to indicate DB writes are disabled, we create
a device-global status page that each user process maps. This allows
iw_cxgb4 to only set this single bit to disable all DB writes for all
user QPs vs traversing the idr of all the active QPs. If the libcxgb4
doesn't support this, then we fall back to the old approach of marking
each QP. Thus we allow the new driver to work with an older libcxgb4.
When the LLD upcalls iw_cxgb4 indicating DB FULL, we disable all DB writes
via the status page and transition the DB state to STOPPED. As user
processes see that DB writes are disabled, they call into iw_cxgb4
to submit their DB write events. Since the DB state is in STOPPED,
the QP trying to write gets enqueued on a new DB "flow control" list.
As subsequent DB writes are submitted for this flow controlled QP, the
amount of writes are accumulated for each QP on the flow control list.
So all the user QPs that are actively ringing the DB get put on this
list and the number of writes they request are accumulated.
When the LLD upcalls iw_cxgb4 indicating DB EMPTY, which is in a workq
context, we change the DB state to FLOW_CONTROL, and begin resuming all
the QPs that are on the flow control list. This logic runs on until
the flow control list is empty or we exit FLOW_CONTROL mode (due to
a DB DROP upcall, for example). QPs are removed from this list, and
their accumulated DB write counts written to the DB FIFO. Sets of QPs,
called chunks in the code, are removed at one time. The chunk size is 64.
So 64 QPs are resumed at a time, and before the next chunk is resumed, the
logic waits (blocks) for the DB FIFO to drain. This prevents resuming to
quickly and overflowing the FIFO. Once the flow control list is empty,
the db state transitions back to NORMAL and user QPs are again allowed
to write directly to the user DB register.
The algorithm is designed such that if the DB write load is high enough,
then all the DB writes get submitted by the kernel using this flow
controlled approach to avoid DB drops. As the load lightens though, we
resume to normal DB writes directly by user applications.
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-15 00:22:08 +08:00
|
|
|
if (ret)
|
|
|
|
goto err_mm;
|
|
|
|
|
|
|
|
mm->key = uresp.status_page_key;
|
|
|
|
mm->addr = virt_to_phys(rhp->rdev.status_page);
|
|
|
|
mm->len = PAGE_SIZE;
|
|
|
|
insert_mmap(context, mm);
|
|
|
|
}
|
2019-02-13 02:39:16 +08:00
|
|
|
return 0;
|
cxgb4/iw_cxgb4: Doorbell Drop Avoidance Bug Fixes
The current logic suffers from a slow response time to disable user DB
usage, and also fails to avoid DB FIFO drops under heavy load. This commit
fixes these deficiencies and makes the avoidance logic more optimal.
This is done by more efficiently notifying the ULDs of potential DB
problems, and implements a smoother flow control algorithm in iw_cxgb4,
which is the ULD that puts the most load on the DB fifo.
Design:
cxgb4:
Direct ULD callback from the DB FULL/DROP interrupt handler. This allows
the ULD to stop doing user DB writes as quickly as possible.
While user DB usage is disabled, the LLD will accumulate DB write events
for its queues. Then once DB usage is reenabled, a single DB write is
done for each queue with its accumulated write count. This reduces the
load put on the DB fifo when reenabling.
iw_cxgb4:
Instead of marking each qp to indicate DB writes are disabled, we create
a device-global status page that each user process maps. This allows
iw_cxgb4 to only set this single bit to disable all DB writes for all
user QPs vs traversing the idr of all the active QPs. If the libcxgb4
doesn't support this, then we fall back to the old approach of marking
each QP. Thus we allow the new driver to work with an older libcxgb4.
When the LLD upcalls iw_cxgb4 indicating DB FULL, we disable all DB writes
via the status page and transition the DB state to STOPPED. As user
processes see that DB writes are disabled, they call into iw_cxgb4
to submit their DB write events. Since the DB state is in STOPPED,
the QP trying to write gets enqueued on a new DB "flow control" list.
As subsequent DB writes are submitted for this flow controlled QP, the
amount of writes are accumulated for each QP on the flow control list.
So all the user QPs that are actively ringing the DB get put on this
list and the number of writes they request are accumulated.
When the LLD upcalls iw_cxgb4 indicating DB EMPTY, which is in a workq
context, we change the DB state to FLOW_CONTROL, and begin resuming all
the QPs that are on the flow control list. This logic runs on until
the flow control list is empty or we exit FLOW_CONTROL mode (due to
a DB DROP upcall, for example). QPs are removed from this list, and
their accumulated DB write counts written to the DB FIFO. Sets of QPs,
called chunks in the code, are removed at one time. The chunk size is 64.
So 64 QPs are resumed at a time, and before the next chunk is resumed, the
logic waits (blocks) for the DB FIFO to drain. This prevents resuming to
quickly and overflowing the FIFO. Once the flow control list is empty,
the db state transitions back to NORMAL and user QPs are again allowed
to write directly to the user DB register.
The algorithm is designed such that if the DB write load is high enough,
then all the DB writes get submitted by the kernel using this flow
controlled approach to avoid DB drops. As the load lightens though, we
resume to normal DB writes directly by user applications.
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-15 00:22:08 +08:00
|
|
|
err_mm:
|
|
|
|
kfree(mm);
|
|
|
|
err:
|
2019-02-13 02:39:16 +08:00
|
|
|
return ret;
|
2010-04-22 06:30:06 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
|
|
|
|
{
|
|
|
|
int len = vma->vm_end - vma->vm_start;
|
|
|
|
u32 key = vma->vm_pgoff << PAGE_SHIFT;
|
|
|
|
struct c4iw_rdev *rdev;
|
|
|
|
int ret = 0;
|
|
|
|
struct c4iw_mm_entry *mm;
|
|
|
|
struct c4iw_ucontext *ucontext;
|
|
|
|
u64 addr;
|
|
|
|
|
2017-09-27 15:35:49 +08:00
|
|
|
pr_debug("pgoff 0x%lx key 0x%x len %d\n", vma->vm_pgoff,
|
2017-02-10 06:23:51 +08:00
|
|
|
key, len);
|
2010-04-22 06:30:06 +08:00
|
|
|
|
|
|
|
if (vma->vm_start & (PAGE_SIZE-1))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
rdev = &(to_c4iw_dev(context->device)->rdev);
|
|
|
|
ucontext = to_c4iw_ucontext(context);
|
|
|
|
|
|
|
|
mm = remove_mmap(ucontext, key, len);
|
|
|
|
if (!mm)
|
|
|
|
return -EINVAL;
|
|
|
|
addr = mm->addr;
|
|
|
|
kfree(mm);
|
|
|
|
|
2010-09-14 00:23:57 +08:00
|
|
|
if ((addr >= pci_resource_start(rdev->lldi.pdev, 0)) &&
|
|
|
|
(addr < (pci_resource_start(rdev->lldi.pdev, 0) +
|
|
|
|
pci_resource_len(rdev->lldi.pdev, 0)))) {
|
2010-04-22 06:30:06 +08:00
|
|
|
|
|
|
|
/*
|
2010-09-14 00:23:57 +08:00
|
|
|
* MA_SYNC register...
|
2010-04-22 06:30:06 +08:00
|
|
|
*/
|
|
|
|
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
|
2010-09-14 00:23:57 +08:00
|
|
|
ret = io_remap_pfn_range(vma, vma->vm_start,
|
|
|
|
addr >> PAGE_SHIFT,
|
|
|
|
len, vma->vm_page_prot);
|
|
|
|
} else if ((addr >= pci_resource_start(rdev->lldi.pdev, 2)) &&
|
|
|
|
(addr < (pci_resource_start(rdev->lldi.pdev, 2) +
|
|
|
|
pci_resource_len(rdev->lldi.pdev, 2)))) {
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Map user DB or OCQP memory...
|
|
|
|
*/
|
|
|
|
if (addr >= rdev->oc_mw_pa)
|
|
|
|
vma->vm_page_prot = t4_pgprot_wc(vma->vm_page_prot);
|
2013-03-14 13:08:58 +08:00
|
|
|
else {
|
2015-09-23 19:49:27 +08:00
|
|
|
if (!is_t4(rdev->lldi.adapter_type))
|
2013-03-14 13:08:58 +08:00
|
|
|
vma->vm_page_prot =
|
|
|
|
t4_pgprot_wc(vma->vm_page_prot);
|
|
|
|
else
|
|
|
|
vma->vm_page_prot =
|
|
|
|
pgprot_noncached(vma->vm_page_prot);
|
|
|
|
}
|
2010-04-22 06:30:06 +08:00
|
|
|
ret = io_remap_pfn_range(vma, vma->vm_start,
|
|
|
|
addr >> PAGE_SHIFT,
|
|
|
|
len, vma->vm_page_prot);
|
|
|
|
} else {
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Map WQ or CQ contig dma memory...
|
|
|
|
*/
|
|
|
|
ret = remap_pfn_range(vma, vma->vm_start,
|
|
|
|
addr >> PAGE_SHIFT,
|
|
|
|
len, vma->vm_page_prot);
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2019-04-01 00:10:05 +08:00
|
|
|
static void c4iw_deallocate_pd(struct ib_pd *pd, struct ib_udata *udata)
|
2010-04-22 06:30:06 +08:00
|
|
|
{
|
|
|
|
struct c4iw_dev *rhp;
|
|
|
|
struct c4iw_pd *php;
|
|
|
|
|
|
|
|
php = to_c4iw_pd(pd);
|
|
|
|
rhp = php->rhp;
|
2017-09-27 15:35:49 +08:00
|
|
|
pr_debug("ibpd %p pdid 0x%x\n", pd, php->pdid);
|
2012-05-18 17:59:32 +08:00
|
|
|
c4iw_put_resource(&rhp->rdev.resource.pdid_table, php->pdid);
|
2012-05-18 17:59:27 +08:00
|
|
|
mutex_lock(&rhp->rdev.stats.lock);
|
|
|
|
rhp->rdev.stats.pd.cur--;
|
|
|
|
mutex_unlock(&rhp->rdev.stats.lock);
|
2010-04-22 06:30:06 +08:00
|
|
|
}
|
|
|
|
|
2019-04-01 00:10:07 +08:00
|
|
|
static int c4iw_allocate_pd(struct ib_pd *pd, struct ib_udata *udata)
|
2010-04-22 06:30:06 +08:00
|
|
|
{
|
2019-02-03 20:55:51 +08:00
|
|
|
struct c4iw_pd *php = to_c4iw_pd(pd);
|
|
|
|
struct ib_device *ibdev = pd->device;
|
2010-04-22 06:30:06 +08:00
|
|
|
u32 pdid;
|
|
|
|
struct c4iw_dev *rhp;
|
|
|
|
|
2017-09-27 15:35:49 +08:00
|
|
|
pr_debug("ibdev %p\n", ibdev);
|
2010-04-22 06:30:06 +08:00
|
|
|
rhp = (struct c4iw_dev *) ibdev;
|
2012-05-18 17:59:32 +08:00
|
|
|
pdid = c4iw_get_resource(&rhp->rdev.resource.pdid_table);
|
2010-04-22 06:30:06 +08:00
|
|
|
if (!pdid)
|
2019-02-03 20:55:51 +08:00
|
|
|
return -EINVAL;
|
|
|
|
|
2010-04-22 06:30:06 +08:00
|
|
|
php->pdid = pdid;
|
|
|
|
php->rhp = rhp;
|
2019-04-01 00:10:07 +08:00
|
|
|
if (udata) {
|
2018-03-15 06:01:50 +08:00
|
|
|
struct c4iw_alloc_pd_resp uresp = {.pdid = php->pdid};
|
|
|
|
|
|
|
|
if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) {
|
2019-04-01 00:10:05 +08:00
|
|
|
c4iw_deallocate_pd(&php->ibpd, udata);
|
2019-02-03 20:55:51 +08:00
|
|
|
return -EFAULT;
|
2010-04-22 06:30:06 +08:00
|
|
|
}
|
|
|
|
}
|
2012-05-18 17:59:27 +08:00
|
|
|
mutex_lock(&rhp->rdev.stats.lock);
|
|
|
|
rhp->rdev.stats.pd.cur++;
|
|
|
|
if (rhp->rdev.stats.pd.cur > rhp->rdev.stats.pd.max)
|
|
|
|
rhp->rdev.stats.pd.max = rhp->rdev.stats.pd.cur;
|
|
|
|
mutex_unlock(&rhp->rdev.stats.lock);
|
2017-09-27 15:35:49 +08:00
|
|
|
pr_debug("pdid 0x%0x ptr 0x%p\n", pdid, php);
|
2019-02-03 20:55:51 +08:00
|
|
|
return 0;
|
2010-04-22 06:30:06 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * P_Key query handler.
 *
 * Ignores @port and @index: always stores 0 in @pkey and returns 0.
 */
static int c4iw_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
			   u16 *pkey)
{
	pr_debug("ibdev %p\n", ibdev);

	*pkey = 0;

	return 0;
}
|
|
|
|
|
|
|
|
/*
 * GID query handler.
 *
 * Builds the GID for 1-based @port from that port's net_device address:
 * the first 6 bytes of the raw GID hold dev_addr, the rest is zeroed.
 * @index is only logged.
 *
 * Returns 0 on success or -EINVAL when @port is 0.
 */
static int c4iw_query_gid(struct ib_device *ibdev, u8 port, int index,
			  union ib_gid *gid)
{
	struct net_device *netdev;

	pr_debug("ibdev %p, port %d, index %d, gid %p\n",
		 ibdev, port, index, gid);

	if (port == 0)
		return -EINVAL;

	/* Port numbers are 1-based, the lldi port table is 0-based. */
	netdev = to_c4iw_dev(ibdev)->rdev.lldi.ports[port - 1];

	memset(gid->raw, 0, sizeof(gid->raw));
	memcpy(gid->raw, netdev->dev_addr, 6);

	return 0;
}
|
|
|
|
|
2015-06-11 21:35:25 +08:00
|
|
|
static int c4iw_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
|
|
|
|
struct ib_udata *uhw)
|
2010-04-22 06:30:06 +08:00
|
|
|
{
|
|
|
|
|
|
|
|
struct c4iw_dev *dev;
|
2015-06-11 21:35:25 +08:00
|
|
|
|
2017-09-27 15:35:49 +08:00
|
|
|
pr_debug("ibdev %p\n", ibdev);
|
2010-04-22 06:30:06 +08:00
|
|
|
|
2015-06-11 21:35:25 +08:00
|
|
|
if (uhw->inlen || uhw->outlen)
|
|
|
|
return -EINVAL;
|
|
|
|
|
2010-04-22 06:30:06 +08:00
|
|
|
dev = to_c4iw_dev(ibdev);
|
|
|
|
memcpy(&props->sys_image_guid, dev->rdev.lldi.ports[0]->dev_addr, 6);
|
2013-03-14 13:08:58 +08:00
|
|
|
props->hw_ver = CHELSIO_CHIP_RELEASE(dev->rdev.lldi.adapter_type);
|
2010-04-22 06:30:06 +08:00
|
|
|
props->fw_ver = dev->rdev.lldi.fw_vers;
|
|
|
|
props->device_cap_flags = dev->device_cap_flags;
|
|
|
|
props->page_size_cap = T4_PAGESIZE_MASK;
|
|
|
|
props->vendor_id = (u32)dev->rdev.lldi.pdev->vendor;
|
|
|
|
props->vendor_part_id = (u32)dev->rdev.lldi.pdev->device;
|
|
|
|
props->max_mr_size = T4_MAX_MR_SIZE;
|
2014-07-21 23:25:15 +08:00
|
|
|
props->max_qp = dev->rdev.lldi.vr->qp.size / 2;
|
2018-07-25 23:52:14 +08:00
|
|
|
props->max_srq = dev->rdev.lldi.vr->srq.size;
|
2014-07-15 00:04:51 +08:00
|
|
|
props->max_qp_wr = dev->rdev.hw_queue.t4_max_qp_depth;
|
2018-07-25 23:52:14 +08:00
|
|
|
props->max_srq_wr = dev->rdev.hw_queue.t4_max_qp_depth;
|
2018-06-18 23:05:26 +08:00
|
|
|
props->max_send_sge = min(T4_MAX_SEND_SGE, T4_MAX_WRITE_SGE);
|
|
|
|
props->max_recv_sge = T4_MAX_RECV_SGE;
|
2018-07-25 23:52:14 +08:00
|
|
|
props->max_srq_sge = T4_MAX_RECV_SGE;
|
2010-04-22 06:30:06 +08:00
|
|
|
props->max_sge_rd = 1;
|
2014-07-15 00:04:52 +08:00
|
|
|
props->max_res_rd_atom = dev->rdev.lldi.max_ird_adapter;
|
|
|
|
props->max_qp_rd_atom = min(dev->rdev.lldi.max_ordird_qp,
|
|
|
|
c4iw_max_read_depth);
|
|
|
|
props->max_qp_init_rd_atom = props->max_qp_rd_atom;
|
2014-07-21 23:25:15 +08:00
|
|
|
props->max_cq = dev->rdev.lldi.vr->qp.size;
|
2014-07-15 00:04:51 +08:00
|
|
|
props->max_cqe = dev->rdev.hw_queue.t4_max_cq_depth;
|
2010-04-22 06:30:06 +08:00
|
|
|
props->max_mr = c4iw_num_stags(&dev->rdev);
|
|
|
|
props->max_pd = T4_MAX_NUM_PD;
|
|
|
|
props->local_ca_ack_delay = 0;
|
2016-02-12 18:40:35 +08:00
|
|
|
props->max_fast_reg_page_list_len =
|
|
|
|
t4_max_fr_depth(dev->rdev.lldi.ulptx_memwrite_dsgl && use_dsgl);
|
2010-04-22 06:30:06 +08:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Port attribute query handler.
 *
 * Obtains the active speed and width through ib_get_eth_speed() and then
 * fills in the fixed capability flags and table lengths.  The fixed fields
 * are written even if the speed lookup fails.
 *
 * Returns the result of ib_get_eth_speed().
 */
static int c4iw_query_port(struct ib_device *ibdev, u8 port,
			   struct ib_port_attr *props)
{
	int rc;

	pr_debug("ibdev %p\n", ibdev);

	rc = ib_get_eth_speed(ibdev, port, &props->active_speed,
			      &props->active_width);

	props->port_cap_flags = IB_PORT_CM_SUP |
				IB_PORT_SNMP_TUNNEL_SUP |
				IB_PORT_REINIT_SUP |
				IB_PORT_DEVICE_MGMT_SUP |
				IB_PORT_VENDOR_CLASS_SUP |
				IB_PORT_BOOT_MGMT_SUP;
	props->gid_tbl_len = 1;
	props->pkey_tbl_len = 1;
	props->max_msg_sz = -1;

	return rc;
}
|
|
|
|
|
2018-10-12 03:31:54 +08:00
|
|
|
static ssize_t hw_rev_show(struct device *dev,
|
|
|
|
struct device_attribute *attr, char *buf)
|
2010-04-22 06:30:06 +08:00
|
|
|
{
|
2018-12-18 20:15:56 +08:00
|
|
|
struct c4iw_dev *c4iw_dev =
|
|
|
|
rdma_device_to_drv_device(dev, struct c4iw_dev, ibdev);
|
|
|
|
|
2017-09-27 15:35:49 +08:00
|
|
|
pr_debug("dev 0x%p\n", dev);
|
2013-03-14 13:08:58 +08:00
|
|
|
return sprintf(buf, "%d\n",
|
|
|
|
CHELSIO_CHIP_RELEASE(c4iw_dev->rdev.lldi.adapter_type));
|
2010-04-22 06:30:06 +08:00
|
|
|
}
|
2018-10-12 03:31:54 +08:00
|
|
|
static DEVICE_ATTR_RO(hw_rev);
|
2010-04-22 06:30:06 +08:00
|
|
|
|
2018-10-12 03:31:54 +08:00
|
|
|
static ssize_t hca_type_show(struct device *dev,
|
|
|
|
struct device_attribute *attr, char *buf)
|
2010-04-22 06:30:06 +08:00
|
|
|
{
|
2018-12-18 20:15:56 +08:00
|
|
|
struct c4iw_dev *c4iw_dev =
|
|
|
|
rdma_device_to_drv_device(dev, struct c4iw_dev, ibdev);
|
2010-04-22 06:30:06 +08:00
|
|
|
struct ethtool_drvinfo info;
|
|
|
|
struct net_device *lldev = c4iw_dev->rdev.lldi.ports[0];
|
|
|
|
|
2017-09-27 15:35:49 +08:00
|
|
|
pr_debug("dev 0x%p\n", dev);
|
2010-04-22 06:30:06 +08:00
|
|
|
lldev->ethtool_ops->get_drvinfo(lldev, &info);
|
|
|
|
return sprintf(buf, "%s\n", info.driver);
|
|
|
|
}
|
2018-10-12 03:31:54 +08:00
|
|
|
static DEVICE_ATTR_RO(hca_type);
|
2010-04-22 06:30:06 +08:00
|
|
|
|
2018-10-12 03:31:54 +08:00
|
|
|
static ssize_t board_id_show(struct device *dev, struct device_attribute *attr,
|
|
|
|
char *buf)
|
2010-04-22 06:30:06 +08:00
|
|
|
{
|
2018-12-18 20:15:56 +08:00
|
|
|
struct c4iw_dev *c4iw_dev =
|
|
|
|
rdma_device_to_drv_device(dev, struct c4iw_dev, ibdev);
|
|
|
|
|
2017-09-27 15:35:49 +08:00
|
|
|
pr_debug("dev 0x%p\n", dev);
|
2010-04-22 06:30:06 +08:00
|
|
|
return sprintf(buf, "%x.%x\n", c4iw_dev->rdev.lldi.pdev->vendor,
|
|
|
|
c4iw_dev->rdev.lldi.pdev->device);
|
|
|
|
}
|
2018-10-12 03:31:54 +08:00
|
|
|
static DEVICE_ATTR_RO(board_id);
|
2010-04-22 06:30:06 +08:00
|
|
|
|
IB/core: Make device counter infrastructure dynamic
In practice, each RDMA device has a unique set of counters that the
hardware implements. Having a central set of counters that they must
all adhere to is limiting and causes many useful counters to not be
available.
Therefore we create a dynamic counter registration infrastructure.
The driver must implement a stats structure allocation routine, in
which the driver must place the directory name it wants, a list of
names for all of the counters, an array of u64 counters themselves,
plus a few generic configuration options.
We then implement a core routine to create a sysfs file for each
of the named stats elements, and a core routine to retrieve the
stats when any of the sysfs attribute files are read.
To avoid excessive beating on the stats generation routine in the
drivers, the core code also caches the stats for a short period of
time so that someone attempting to read all of the stats in a
given device's directory will not result in a stats generation
call per file read.
Future work will attempt to standardize just the shared stats
elements, and possibly add a method to get the stats via netlink
in addition to sysfs.
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Mark Bloch <markb@mellanox.com>
Reviewed-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
[ Add caching, make structure names more informative, add i40iw support,
other significant rewrites from the original patch ]
2016-05-17 01:49:33 +08:00
|
|
|
/*
 * Indices of the hardware counters exported by this driver.  The values
 * index both names[] and rdma_hw_stats::value[]; NR_COUNTERS is the number
 * of counters and must stay last.
 */
enum counters {
	/* IPv4 TCP counters */
	IP4INSEGS = 0,
	IP4OUTSEGS,
	IP4RETRANSSEGS,
	IP4OUTRSTS,

	/* IPv6 TCP counters */
	IP6INSEGS,
	IP6OUTSEGS,
	IP6RETRANSSEGS,
	IP6OUTRSTS,

	NR_COUNTERS
};
|
|
|
|
|
|
|
|
static const char * const names[] = {
|
|
|
|
[IP4INSEGS] = "ip4InSegs",
|
|
|
|
[IP4OUTSEGS] = "ip4OutSegs",
|
|
|
|
[IP4RETRANSSEGS] = "ip4RetransSegs",
|
|
|
|
[IP4OUTRSTS] = "ip4OutRsts",
|
|
|
|
[IP6INSEGS] = "ip6InSegs",
|
|
|
|
[IP6OUTSEGS] = "ip6OutSegs",
|
|
|
|
[IP6RETRANSSEGS] = "ip6RetransSegs",
|
|
|
|
[IP6OUTRSTS] = "ip6OutRsts"
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct rdma_hw_stats *c4iw_alloc_stats(struct ib_device *ibdev,
|
|
|
|
u8 port_num)
|
|
|
|
{
|
|
|
|
BUILD_BUG_ON(ARRAY_SIZE(names) != NR_COUNTERS);
|
|
|
|
|
|
|
|
if (port_num != 0)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
return rdma_alloc_hw_stats_struct(names, NR_COUNTERS,
|
|
|
|
RDMA_HW_STATS_DEFAULT_LIFESPAN);
|
|
|
|
}
|
|
|
|
|
2010-04-22 06:30:06 +08:00
|
|
|
static int c4iw_get_mib(struct ib_device *ibdev,
|
IB/core: Make device counter infrastructure dynamic
In practice, each RDMA device has a unique set of counters that the
hardware implements. Having a central set of counters that they must
all adhere to is limiting and causes many useful counters to not be
available.
Therefore we create a dynamic counter registration infrastructure.
The driver must implement a stats structure allocation routine, in
which the driver must place the directory name it wants, a list of
names for all of the counters, an array of u64 counters themselves,
plus a few generic configuration options.
We then implement a core routine to create a sysfs file for each
of the named stats elements, and a core routine to retrieve the
stats when any of the sysfs attribute files are read.
To avoid excessive beating on the stats generation routine in the
drivers, the core code also caches the stats for a short period of
time so that someone attempting to read all of the stats in a
given device's directory will not result in a stats generation
call per file read.
Future work will attempt to standardize just the shared stats
elements, and possibly add a method to get the stats via netlink
in addition to sysfs.
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Mark Bloch <markb@mellanox.com>
Reviewed-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
[ Add caching, make structure names more informative, add i40iw support,
other significant rewrites from the original patch ]
2016-05-17 01:49:33 +08:00
|
|
|
struct rdma_hw_stats *stats,
|
|
|
|
u8 port, int index)
|
2010-04-22 06:30:06 +08:00
|
|
|
{
|
2010-10-18 23:16:40 +08:00
|
|
|
struct tp_tcp_stats v4, v6;
|
|
|
|
struct c4iw_dev *c4iw_dev = to_c4iw_dev(ibdev);
|
|
|
|
|
|
|
|
cxgb4_get_tcp_stats(c4iw_dev->rdev.lldi.pdev, &v4, &v6);
|
IB/core: Make device counter infrastructure dynamic
In practice, each RDMA device has a unique set of counters that the
hardware implements. Having a central set of counters that they must
all adhere to is limiting and causes many useful counters to not be
available.
Therefore we create a dynamic counter registration infrastructure.
The driver must implement a stats structure allocation routine, in
which the driver must place the directory name it wants, a list of
names for all of the counters, an array of u64 counters themselves,
plus a few generic configuration options.
We then implement a core routine to create a sysfs file for each
of the named stats elements, and a core routine to retrieve the
stats when any of the sysfs attribute files are read.
To avoid excessive beating on the stats generation routine in the
drivers, the core code also caches the stats for a short period of
time so that someone attempting to read all of the stats in a
given device's directory will not result in a stats generation
call per file read.
Future work will attempt to standardize just the shared stats
elements, and possibly add a method to get the stats via netlink
in addition to sysfs.
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Mark Bloch <markb@mellanox.com>
Reviewed-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
[ Add caching, make structure names more informative, add i40iw support,
other significant rewrites from the original patch ]
2016-05-17 01:49:33 +08:00
|
|
|
stats->value[IP4INSEGS] = v4.tcp_in_segs;
|
|
|
|
stats->value[IP4OUTSEGS] = v4.tcp_out_segs;
|
|
|
|
stats->value[IP4RETRANSSEGS] = v4.tcp_retrans_segs;
|
|
|
|
stats->value[IP4OUTRSTS] = v4.tcp_out_rsts;
|
|
|
|
stats->value[IP6INSEGS] = v6.tcp_in_segs;
|
|
|
|
stats->value[IP6OUTSEGS] = v6.tcp_out_segs;
|
|
|
|
stats->value[IP6RETRANSSEGS] = v6.tcp_retrans_segs;
|
|
|
|
stats->value[IP6OUTRSTS] = v6.tcp_out_rsts;
|
|
|
|
|
|
|
|
return stats->num_counters;
|
2010-04-22 06:30:06 +08:00
|
|
|
}
|
|
|
|
|
2018-10-12 03:31:54 +08:00
|
|
|
static struct attribute *c4iw_class_attributes[] = {
|
|
|
|
&dev_attr_hw_rev.attr,
|
|
|
|
&dev_attr_hca_type.attr,
|
|
|
|
&dev_attr_board_id.attr,
|
|
|
|
NULL
|
|
|
|
};
|
2010-04-22 06:30:06 +08:00
|
|
|
|
2018-10-12 03:31:54 +08:00
|
|
|
static const struct attribute_group c4iw_attr_group = {
|
|
|
|
.attrs = c4iw_class_attributes,
|
2010-04-22 06:30:06 +08:00
|
|
|
};
|
|
|
|
|
2015-05-14 08:02:58 +08:00
|
|
|
/*
 * Report the immutable port properties.
 *
 * Marks the port as iWARP, then copies the P_Key and GID table lengths
 * from the attributes returned by ib_query_port().
 *
 * Returns 0 on success or the error from ib_query_port(); in the error
 * case only core_cap_flags has been written.
 */
static int c4iw_port_immutable(struct ib_device *ibdev, u8 port_num,
			       struct ib_port_immutable *immutable)
{
	struct ib_port_attr port_attr;
	int rc;

	immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;

	rc = ib_query_port(ibdev, port_num, &port_attr);
	if (!rc) {
		immutable->pkey_tbl_len = port_attr.pkey_tbl_len;
		immutable->gid_tbl_len = port_attr.gid_tbl_len;
	}

	return rc;
}
|
|
|
|
|
2017-06-27 21:49:53 +08:00
|
|
|
static void get_dev_fw_str(struct ib_device *dev, char *str)
|
2016-06-15 14:21:58 +08:00
|
|
|
{
|
|
|
|
struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
|
|
|
|
ibdev);
|
2017-09-27 15:35:49 +08:00
|
|
|
pr_debug("dev 0x%p\n", dev);
|
2016-06-15 14:21:58 +08:00
|
|
|
|
2017-06-27 21:49:53 +08:00
|
|
|
snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u.%u.%u",
|
2016-06-15 14:21:58 +08:00
|
|
|
FW_HDR_FW_VER_MAJOR_G(c4iw_dev->rdev.lldi.fw_vers),
|
|
|
|
FW_HDR_FW_VER_MINOR_G(c4iw_dev->rdev.lldi.fw_vers),
|
|
|
|
FW_HDR_FW_VER_MICRO_G(c4iw_dev->rdev.lldi.fw_vers),
|
|
|
|
FW_HDR_FW_VER_BUILD_G(c4iw_dev->rdev.lldi.fw_vers));
|
|
|
|
}
|
|
|
|
|
2018-05-03 23:41:49 +08:00
|
|
|
static int fill_res_entry(struct sk_buff *msg, struct rdma_restrack_entry *res)
|
|
|
|
{
|
|
|
|
return (res->type < ARRAY_SIZE(c4iw_restrack_funcs) &&
|
|
|
|
c4iw_restrack_funcs[res->type]) ?
|
|
|
|
c4iw_restrack_funcs[res->type](msg, res) : 0;
|
|
|
|
}
|
|
|
|
|
2018-12-11 03:09:33 +08:00
|
|
|
static const struct ib_device_ops c4iw_dev_ops = {
|
2019-06-06 01:39:26 +08:00
|
|
|
.owner = THIS_MODULE,
|
2019-06-06 01:39:24 +08:00
|
|
|
.driver_id = RDMA_DRIVER_CXGB4,
|
2019-06-06 01:39:25 +08:00
|
|
|
.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION,
|
2019-06-06 01:39:24 +08:00
|
|
|
|
2018-12-11 03:09:33 +08:00
|
|
|
.alloc_hw_stats = c4iw_alloc_stats,
|
|
|
|
.alloc_mr = c4iw_alloc_mr,
|
|
|
|
.alloc_mw = c4iw_alloc_mw,
|
|
|
|
.alloc_pd = c4iw_allocate_pd,
|
|
|
|
.alloc_ucontext = c4iw_alloc_ucontext,
|
|
|
|
.create_cq = c4iw_create_cq,
|
|
|
|
.create_qp = c4iw_create_qp,
|
|
|
|
.create_srq = c4iw_create_srq,
|
|
|
|
.dealloc_mw = c4iw_dealloc_mw,
|
|
|
|
.dealloc_pd = c4iw_deallocate_pd,
|
|
|
|
.dealloc_ucontext = c4iw_dealloc_ucontext,
|
|
|
|
.dereg_mr = c4iw_dereg_mr,
|
|
|
|
.destroy_cq = c4iw_destroy_cq,
|
|
|
|
.destroy_qp = c4iw_destroy_qp,
|
|
|
|
.destroy_srq = c4iw_destroy_srq,
|
2019-01-30 18:49:02 +08:00
|
|
|
.fill_res_entry = fill_res_entry,
|
2018-12-11 03:09:33 +08:00
|
|
|
.get_dev_fw_str = get_dev_fw_str,
|
|
|
|
.get_dma_mr = c4iw_get_dma_mr,
|
|
|
|
.get_hw_stats = c4iw_get_mib,
|
|
|
|
.get_port_immutable = c4iw_port_immutable,
|
2019-04-29 19:59:06 +08:00
|
|
|
.iw_accept = c4iw_accept_cr,
|
|
|
|
.iw_add_ref = c4iw_qp_add_ref,
|
|
|
|
.iw_connect = c4iw_connect,
|
|
|
|
.iw_create_listen = c4iw_create_listen,
|
|
|
|
.iw_destroy_listen = c4iw_destroy_listen,
|
|
|
|
.iw_get_qp = c4iw_get_qp,
|
|
|
|
.iw_reject = c4iw_reject_cr,
|
|
|
|
.iw_rem_ref = c4iw_qp_rem_ref,
|
2018-12-11 03:09:33 +08:00
|
|
|
.map_mr_sg = c4iw_map_mr_sg,
|
|
|
|
.mmap = c4iw_mmap,
|
|
|
|
.modify_qp = c4iw_ib_modify_qp,
|
|
|
|
.modify_srq = c4iw_modify_srq,
|
|
|
|
.poll_cq = c4iw_poll_cq,
|
|
|
|
.post_recv = c4iw_post_receive,
|
|
|
|
.post_send = c4iw_post_send,
|
|
|
|
.post_srq_recv = c4iw_post_srq_recv,
|
|
|
|
.query_device = c4iw_query_device,
|
|
|
|
.query_gid = c4iw_query_gid,
|
|
|
|
.query_pkey = c4iw_query_pkey,
|
|
|
|
.query_port = c4iw_query_port,
|
|
|
|
.query_qp = c4iw_ib_query_qp,
|
|
|
|
.reg_user_mr = c4iw_reg_user_mr,
|
|
|
|
.req_notify_cq = c4iw_arm_cq,
|
2019-02-03 20:55:51 +08:00
|
|
|
INIT_RDMA_OBJ_SIZE(ib_pd, c4iw_pd, ibpd),
|
2019-05-28 19:37:29 +08:00
|
|
|
INIT_RDMA_OBJ_SIZE(ib_cq, c4iw_cq, ibcq),
|
2019-04-03 21:42:43 +08:00
|
|
|
INIT_RDMA_OBJ_SIZE(ib_srq, c4iw_srq, ibsrq),
|
2019-02-13 02:39:16 +08:00
|
|
|
INIT_RDMA_OBJ_SIZE(ib_ucontext, c4iw_ucontext, ibucontext),
|
2018-12-11 03:09:33 +08:00
|
|
|
};
|
|
|
|
|
2019-04-05 03:56:58 +08:00
|
|
|
static int set_netdevs(struct ib_device *ib_dev, struct c4iw_rdev *rdev)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < rdev->lldi.nports; i++) {
|
|
|
|
ret = ib_device_set_netdev(ib_dev, rdev->lldi.ports[i],
|
|
|
|
i + 1);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-11-09 20:17:33 +08:00
|
|
|
void c4iw_register_device(struct work_struct *work)
|
2010-04-22 06:30:06 +08:00
|
|
|
{
|
|
|
|
int ret;
|
2017-11-09 20:17:33 +08:00
|
|
|
struct uld_ctx *ctx = container_of(work, struct uld_ctx, reg_work);
|
|
|
|
struct c4iw_dev *dev = ctx->dev;
|
2010-04-22 06:30:06 +08:00
|
|
|
|
2017-09-27 15:35:49 +08:00
|
|
|
pr_debug("c4iw_dev %p\n", dev);
|
2010-04-22 06:30:06 +08:00
|
|
|
memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid));
|
|
|
|
memcpy(&dev->ibdev.node_guid, dev->rdev.lldi.ports[0]->dev_addr, 6);
|
|
|
|
dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_WINDOW;
|
|
|
|
if (fastreg_support)
|
|
|
|
dev->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
|
|
|
|
dev->ibdev.local_dma_lkey = 0;
|
|
|
|
dev->ibdev.uverbs_cmd_mask =
|
|
|
|
(1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
|
|
|
|
(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
|
|
|
|
(1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
|
|
|
|
(1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
|
|
|
|
(1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
|
|
|
|
(1ull << IB_USER_VERBS_CMD_REG_MR) |
|
|
|
|
(1ull << IB_USER_VERBS_CMD_DEREG_MR) |
|
|
|
|
(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
|
|
|
|
(1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
|
|
|
|
(1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
|
|
|
|
(1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
|
|
|
|
(1ull << IB_USER_VERBS_CMD_CREATE_QP) |
|
|
|
|
(1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
|
2012-05-18 17:59:33 +08:00
|
|
|
(1ull << IB_USER_VERBS_CMD_QUERY_QP) |
|
2010-04-22 06:30:06 +08:00
|
|
|
(1ull << IB_USER_VERBS_CMD_POLL_CQ) |
|
|
|
|
(1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
|
|
|
|
(1ull << IB_USER_VERBS_CMD_POST_SEND) |
|
2018-07-25 23:52:14 +08:00
|
|
|
(1ull << IB_USER_VERBS_CMD_POST_RECV) |
|
|
|
|
(1ull << IB_USER_VERBS_CMD_CREATE_SRQ) |
|
|
|
|
(1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) |
|
|
|
|
(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ);
|
2010-04-22 06:30:06 +08:00
|
|
|
dev->ibdev.node_type = RDMA_NODE_RNIC;
|
2016-08-26 01:57:07 +08:00
|
|
|
BUILD_BUG_ON(sizeof(C4IW_NODE_DESC) > IB_DEVICE_NODE_DESC_MAX);
|
2010-04-22 06:30:06 +08:00
|
|
|
memcpy(dev->ibdev.node_desc, C4IW_NODE_DESC, sizeof(C4IW_NODE_DESC));
|
|
|
|
dev->ibdev.phys_port_cnt = dev->rdev.lldi.nports;
|
2014-06-07 00:10:42 +08:00
|
|
|
dev->ibdev.num_comp_vectors = dev->rdev.lldi.nciq;
|
2017-01-21 05:04:16 +08:00
|
|
|
dev->ibdev.dev.parent = &dev->rdev.lldi.pdev->dev;
|
2010-04-22 06:30:06 +08:00
|
|
|
|
2019-04-29 19:59:06 +08:00
|
|
|
memcpy(dev->ibdev.iw_ifname, dev->rdev.lldi.ports[0]->name,
|
|
|
|
sizeof(dev->ibdev.iw_ifname));
|
2010-04-22 06:30:06 +08:00
|
|
|
|
2018-10-12 03:31:54 +08:00
|
|
|
rdma_set_device_sysfs_group(&dev->ibdev, &c4iw_attr_group);
|
2018-12-11 03:09:33 +08:00
|
|
|
ib_set_device_ops(&dev->ibdev, &c4iw_dev_ops);
|
2019-04-05 03:56:58 +08:00
|
|
|
ret = set_netdevs(&dev->ibdev, &dev->rdev);
|
|
|
|
if (ret)
|
2019-04-29 19:59:06 +08:00
|
|
|
goto err_dealloc_ctx;
|
2018-12-18 20:28:30 +08:00
|
|
|
ret = ib_register_device(&dev->ibdev, "cxgb4_%d");
|
2010-04-22 06:30:06 +08:00
|
|
|
if (ret)
|
2019-04-29 19:59:06 +08:00
|
|
|
goto err_dealloc_ctx;
|
2017-11-09 20:17:33 +08:00
|
|
|
return;
|
2018-10-12 03:31:54 +08:00
|
|
|
|
2017-11-09 20:17:33 +08:00
|
|
|
err_dealloc_ctx:
|
|
|
|
pr_err("%s - Failed registering iwarp device: %d\n",
|
|
|
|
pci_name(ctx->lldi.pdev), ret);
|
|
|
|
c4iw_dealloc(ctx);
|
|
|
|
return;
|
2010-04-22 06:30:06 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
void c4iw_unregister_device(struct c4iw_dev *dev)
|
|
|
|
{
|
2017-09-27 15:35:49 +08:00
|
|
|
pr_debug("c4iw_dev %p\n", dev);
|
2010-04-22 06:30:06 +08:00
|
|
|
ib_unregister_device(&dev->ibdev);
|
|
|
|
return;
|
|
|
|
}
|