2019-05-29 00:57:20 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
2006-01-18 17:30:29 +08:00
|
|
|
/******************************************************************************
|
|
|
|
*******************************************************************************
|
|
|
|
**
|
|
|
|
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
|
2010-02-25 01:08:18 +08:00
|
|
|
** Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved.
|
2006-01-18 17:30:29 +08:00
|
|
|
**
|
|
|
|
**
|
|
|
|
*******************************************************************************
|
|
|
|
******************************************************************************/
|
|
|
|
|
2021-11-03 03:17:15 +08:00
|
|
|
#include <trace/events/dlm.h>
|
|
|
|
|
2006-01-18 17:30:29 +08:00
|
|
|
#include "dlm_internal.h"
|
2024-03-28 23:48:33 +08:00
|
|
|
#include "lvb_table.h"
|
2022-10-28 04:45:21 +08:00
|
|
|
#include "memory.h"
|
2006-01-18 17:30:29 +08:00
|
|
|
#include "lock.h"
|
2006-07-13 05:44:04 +08:00
|
|
|
#include "user.h"
|
2014-02-09 20:49:17 +08:00
|
|
|
#include "ast.h"
|
2006-01-18 17:30:29 +08:00
|
|
|
|
dlm: fix race between final callback and remove
This patch fixes the following issue:
node 1 is dir
node 2 is master
node 3 is other
1->2: unlock
2: put final lkb, rsb moved to toss
2->1: unlock_reply
1: queue lkb callback with EUNLOCK
2->1: remove
1: receive_remove ignored (rsb on keep because of queued lkb callback)
1: complete lkb callback, put_lkb, move rsb to toss
3->1: lookup
1->3: lookup_reply master=2
3->2: request
2->3: request_reply EBADR
In summary:
An unexpected lkb reference causes the rsb to remain on the wrong list.
The rsb being on the wrong list causes receive_remove to be ignored.
An ignored receive_remove causes inconsistent dir and master state.
This sequence requires an unusually long delay in delivering the unlock
callback, because the remove message from 2->1 usually happens after
some seconds. So, it's not known exactly how frequently this sequence
occurs in practice. It's possible that the same end result could also
have another unknown cause.
The solution for this issue is to further separate callback state
from the lkb, so that an lkb reference (and from that, an rsb ref)
are not held while a callback remains queued. Then, within the
unlock_reply, the lkb will be freed and the rsb moved to the toss
list. So, the receive_remove will not be ignored.
Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
2024-03-28 23:48:41 +08:00
|
|
|
static void dlm_callback_work(struct work_struct *work)
|
2011-02-22 04:58:21 +08:00
|
|
|
{
|
dlm: fix race between final callback and remove
This patch fixes the following issue:
node 1 is dir
node 2 is master
node 3 is other
1->2: unlock
2: put final lkb, rsb moved to toss
2->1: unlock_reply
1: queue lkb callback with EUNLOCK
2->1: remove
1: receive_remove ignored (rsb on keep because of queued lkb callback)
1: complete lkb callback, put_lkb, move rsb to toss
3->1: lookup
1->3: lookup_reply master=2
3->2: request
2->3: request_reply EBADR
In summary:
An unexpected lkb reference causes the rsb to remain on the wrong list.
The rsb being on the wrong list causes receive_remove to be ignored.
An ignored receive_remove causes inconsistent dir and master state.
This sequence requires an unusually long delay in delivering the unlock
callback, because the remove message from 2->1 usually happens after
some seconds. So, it's not known exactly how frequently this sequence
occurs in pratice. It's possible that the same end result could also
have another unknown cause.
The solution for this issue is to further separate callback state
from the lkb, so that an lkb reference (and from that, an rsb ref)
are not held while a callback remains queued. Then, within the
unlock_reply, the lkb will be freed and the rsb moved to the toss
list. So, the receive_remove will not be ignored.
Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
2024-03-28 23:48:41 +08:00
|
|
|
struct dlm_callback *cb = container_of(work, struct dlm_callback, work);
|
|
|
|
|
|
|
|
if (cb->flags & DLM_CB_BAST) {
|
|
|
|
trace_dlm_bast(cb->ls_id, cb->lkb_id, cb->mode, cb->res_name,
|
|
|
|
cb->res_length);
|
|
|
|
cb->bastfn(cb->astparam, cb->mode);
|
|
|
|
} else if (cb->flags & DLM_CB_CAST) {
|
|
|
|
trace_dlm_ast(cb->ls_id, cb->lkb_id, cb->sb_status,
|
|
|
|
cb->sb_flags, cb->res_name, cb->res_length);
|
|
|
|
cb->lkb_lksb->sb_status = cb->sb_status;
|
|
|
|
cb->lkb_lksb->sb_flags = cb->sb_flags;
|
|
|
|
cb->astfn(cb->astparam);
|
|
|
|
}
|
|
|
|
|
2024-03-28 23:48:42 +08:00
|
|
|
dlm_free_cb(cb);
|
dlm: fix race between final callback and remove
This patch fixes the following issue:
node 1 is dir
node 2 is master
node 3 is other
1->2: unlock
2: put final lkb, rsb moved to toss
2->1: unlock_reply
1: queue lkb callback with EUNLOCK
2->1: remove
1: receive_remove ignored (rsb on keep because of queued lkb callback)
1: complete lkb callback, put_lkb, move rsb to toss
3->1: lookup
1->3: lookup_reply master=2
3->2: request
2->3: request_reply EBADR
In summary:
An unexpected lkb reference causes the rsb to remain on the wrong list.
The rsb being on the wrong list causes receive_remove to be ignored.
An ignored receive_remove causes inconsistent dir and master state.
This sequence requires an unusually long delay in delivering the unlock
callback, because the remove message from 2->1 usually happens after
some seconds. So, it's not known exactly how frequently this sequence
occurs in pratice. It's possible that the same end result could also
have another unknown cause.
The solution for this issue is to further separate callback state
from the lkb, so that an lkb reference (and from that, an rsb ref)
are not held while a callback remains queued. Then, within the
unlock_reply, the lkb will be freed and the rsb moved to the toss
list. So, the receive_remove will not be ignored.
Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
2024-03-28 23:48:41 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
int dlm_queue_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
|
|
|
|
int status, uint32_t sbflags,
|
|
|
|
struct dlm_callback **cb)
|
|
|
|
{
|
|
|
|
struct dlm_rsb *rsb = lkb->lkb_resource;
|
2022-10-28 04:45:21 +08:00
|
|
|
int rv = DLM_ENQUEUE_CALLBACK_SUCCESS;
|
dlm: fix race between final callback and remove
This patch fixes the following issue:
node 1 is dir
node 2 is master
node 3 is other
1->2: unlock
2: put final lkb, rsb moved to toss
2->1: unlock_reply
1: queue lkb callback with EUNLOCK
2->1: remove
1: receive_remove ignored (rsb on keep because of queued lkb callback)
1: complete lkb callback, put_lkb, move rsb to toss
3->1: lookup
1->3: lookup_reply master=2
3->2: request
2->3: request_reply EBADR
In summary:
An unexpected lkb reference causes the rsb to remain on the wrong list.
The rsb being on the wrong list causes receive_remove to be ignored.
An ignored receive_remove causes inconsistent dir and master state.
This sequence requires an unusually long delay in delivering the unlock
callback, because the remove message from 2->1 usually happens after
some seconds. So, it's not known exactly how frequently this sequence
occurs in pratice. It's possible that the same end result could also
have another unknown cause.
The solution for this issue is to further separate callback state
from the lkb, so that an lkb reference (and from that, an rsb ref)
are not held while a callback remains queued. Then, within the
unlock_reply, the lkb will be freed and the rsb moved to the toss
list. So, the receive_remove will not be ignored.
Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
2024-03-28 23:48:41 +08:00
|
|
|
struct dlm_ls *ls = rsb->res_ls;
|
2024-03-28 23:48:33 +08:00
|
|
|
int copy_lvb = 0;
|
2011-02-22 04:58:21 +08:00
|
|
|
int prev_mode;
|
|
|
|
|
2022-10-28 04:45:21 +08:00
|
|
|
if (flags & DLM_CB_BAST) {
|
|
|
|
/* if cb is a bast, it should be skipped if the blocking mode is
|
|
|
|
* compatible with the last granted mode
|
|
|
|
*/
|
2024-03-28 23:48:42 +08:00
|
|
|
if (lkb->lkb_last_cast_cb_mode != -1) {
|
|
|
|
if (dlm_modes_compat(mode, lkb->lkb_last_cast_cb_mode)) {
|
2022-10-28 04:45:21 +08:00
|
|
|
log_debug(ls, "skip %x bast mode %d for cast mode %d",
|
|
|
|
lkb->lkb_id, mode,
|
2024-03-28 23:48:42 +08:00
|
|
|
lkb->lkb_last_cast_cb_mode);
|
2022-10-28 04:45:21 +08:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
2011-02-22 04:58:21 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Suppress some redundant basts here, do more on removal.
|
|
|
|
* Don't even add a bast if the callback just before it
|
|
|
|
* is a bast for the same mode or a more restrictive mode.
|
|
|
|
* (the addional > PR check is needed for PR/CW inversion)
|
|
|
|
*/
|
2024-03-28 23:48:42 +08:00
|
|
|
if (lkb->lkb_last_cb_mode != -1 &&
|
|
|
|
lkb->lkb_last_cb_flags & DLM_CB_BAST) {
|
|
|
|
prev_mode = lkb->lkb_last_cb_mode;
|
2011-02-22 04:58:21 +08:00
|
|
|
|
|
|
|
if ((prev_mode == mode) ||
|
|
|
|
(prev_mode > mode && prev_mode > DLM_LOCK_PR)) {
|
2022-10-28 04:45:21 +08:00
|
|
|
log_debug(ls, "skip %x add bast mode %d for bast mode %d",
|
|
|
|
lkb->lkb_id, mode, prev_mode);
|
2011-04-06 02:16:24 +08:00
|
|
|
goto out;
|
2011-02-22 04:58:21 +08:00
|
|
|
}
|
|
|
|
}
|
2024-03-28 23:48:42 +08:00
|
|
|
|
|
|
|
lkb->lkb_last_bast_time = ktime_get();
|
|
|
|
lkb->lkb_last_bast_cb_mode = mode;
|
2024-03-28 23:48:33 +08:00
|
|
|
} else if (flags & DLM_CB_CAST) {
|
|
|
|
if (test_bit(DLM_DFL_USER_BIT, &lkb->lkb_dflags)) {
|
2024-03-28 23:48:42 +08:00
|
|
|
prev_mode = lkb->lkb_last_cast_cb_mode;
|
2024-03-28 23:48:33 +08:00
|
|
|
|
|
|
|
if (!status && lkb->lkb_lksb->sb_lvbptr &&
|
|
|
|
dlm_lvb_operations[prev_mode + 1][mode + 1])
|
|
|
|
copy_lvb = 1;
|
|
|
|
}
|
2024-03-28 23:48:42 +08:00
|
|
|
|
|
|
|
lkb->lkb_last_cast_cb_mode = mode;
|
|
|
|
lkb->lkb_last_cast_time = ktime_get();
|
2011-02-22 04:58:21 +08:00
|
|
|
}
|
|
|
|
|
2024-03-28 23:48:42 +08:00
|
|
|
lkb->lkb_last_cb_mode = mode;
|
|
|
|
lkb->lkb_last_cb_flags = flags;
|
|
|
|
|
dlm: fix race between final callback and remove
This patch fixes the following issue:
node 1 is dir
node 2 is master
node 3 is other
1->2: unlock
2: put final lkb, rsb moved to toss
2->1: unlock_reply
1: queue lkb callback with EUNLOCK
2->1: remove
1: receive_remove ignored (rsb on keep because of queued lkb callback)
1: complete lkb callback, put_lkb, move rsb to toss
3->1: lookup
1->3: lookup_reply master=2
3->2: request
2->3: request_reply EBADR
In summary:
An unexpected lkb reference causes the rsb to remain on the wrong list.
The rsb being on the wrong list causes receive_remove to be ignored.
An ignored receive_remove causes inconsistent dir and master state.
This sequence requires an unusually long delay in delivering the unlock
callback, because the remove message from 2->1 usually happens after
some seconds. So, it's not known exactly how frequently this sequence
occurs in pratice. It's possible that the same end result could also
have another unknown cause.
The solution for this issue is to further separate callback state
from the lkb, so that an lkb reference (and from that, an rsb ref)
are not held while a callback remains queued. Then, within the
unlock_reply, the lkb will be freed and the rsb moved to the toss
list. So, the receive_remove will not be ignored.
Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
2024-03-28 23:48:41 +08:00
|
|
|
*cb = dlm_allocate_cb();
|
|
|
|
if (!*cb) {
|
2022-10-28 04:45:21 +08:00
|
|
|
rv = DLM_ENQUEUE_CALLBACK_FAILURE;
|
2011-04-06 02:16:24 +08:00
|
|
|
goto out;
|
2011-02-22 04:58:21 +08:00
|
|
|
}
|
|
|
|
|
dlm: fix race between final callback and remove
This patch fixes the following issue:
node 1 is dir
node 2 is master
node 3 is other
1->2: unlock
2: put final lkb, rsb moved to toss
2->1: unlock_reply
1: queue lkb callback with EUNLOCK
2->1: remove
1: receive_remove ignored (rsb on keep because of queued lkb callback)
1: complete lkb callback, put_lkb, move rsb to toss
3->1: lookup
1->3: lookup_reply master=2
3->2: request
2->3: request_reply EBADR
In summary:
An unexpected lkb reference causes the rsb to remain on the wrong list.
The rsb being on the wrong list causes receive_remove to be ignored.
An ignored receive_remove causes inconsistent dir and master state.
This sequence requires an unusually long delay in delivering the unlock
callback, because the remove message from 2->1 usually happens after
some seconds. So, it's not known exactly how frequently this sequence
occurs in pratice. It's possible that the same end result could also
have another unknown cause.
The solution for this issue is to further separate callback state
from the lkb, so that an lkb reference (and from that, an rsb ref)
are not held while a callback remains queued. Then, within the
unlock_reply, the lkb will be freed and the rsb moved to the toss
list. So, the receive_remove will not be ignored.
Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
2024-03-28 23:48:41 +08:00
|
|
|
/* for tracing */
|
|
|
|
(*cb)->lkb_id = lkb->lkb_id;
|
|
|
|
(*cb)->ls_id = ls->ls_global_id;
|
|
|
|
memcpy((*cb)->res_name, rsb->res_name, rsb->res_length);
|
|
|
|
(*cb)->res_length = rsb->res_length;
|
2023-03-07 04:48:08 +08:00
|
|
|
|
dlm: fix race between final callback and remove
This patch fixes the following issue:
node 1 is dir
node 2 is master
node 3 is other
1->2: unlock
2: put final lkb, rsb moved to toss
2->1: unlock_reply
1: queue lkb callback with EUNLOCK
2->1: remove
1: receive_remove ignored (rsb on keep because of queued lkb callback)
1: complete lkb callback, put_lkb, move rsb to toss
3->1: lookup
1->3: lookup_reply master=2
3->2: request
2->3: request_reply EBADR
In summary:
An unexpected lkb reference causes the rsb to remain on the wrong list.
The rsb being on the wrong list causes receive_remove to be ignored.
An ignored receive_remove causes inconsistent dir and master state.
This sequence requires an unusually long delay in delivering the unlock
callback, because the remove message from 2->1 usually happens after
some seconds. So, it's not known exactly how frequently this sequence
occurs in pratice. It's possible that the same end result could also
have another unknown cause.
The solution for this issue is to further separate callback state
from the lkb, so that an lkb reference (and from that, an rsb ref)
are not held while a callback remains queued. Then, within the
unlock_reply, the lkb will be freed and the rsb moved to the toss
list. So, the receive_remove will not be ignored.
Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
2024-03-28 23:48:41 +08:00
|
|
|
(*cb)->flags = flags;
|
|
|
|
(*cb)->mode = mode;
|
|
|
|
(*cb)->sb_status = status;
|
|
|
|
(*cb)->sb_flags = (sbflags & 0x000000FF);
|
|
|
|
(*cb)->copy_lvb = copy_lvb;
|
|
|
|
(*cb)->lkb_lksb = lkb->lkb_lksb;
|
2011-02-22 04:58:21 +08:00
|
|
|
|
dlm: fix race between final callback and remove
This patch fixes the following issue:
node 1 is dir
node 2 is master
node 3 is other
1->2: unlock
2: put final lkb, rsb moved to toss
2->1: unlock_reply
1: queue lkb callback with EUNLOCK
2->1: remove
1: receive_remove ignored (rsb on keep because of queued lkb callback)
1: complete lkb callback, put_lkb, move rsb to toss
3->1: lookup
1->3: lookup_reply master=2
3->2: request
2->3: request_reply EBADR
In summary:
An unexpected lkb reference causes the rsb to remain on the wrong list.
The rsb being on the wrong list causes receive_remove to be ignored.
An ignored receive_remove causes inconsistent dir and master state.
This sequence requires an unusually long delay in delivering the unlock
callback, because the remove message from 2->1 usually happens after
some seconds. So, it's not known exactly how frequently this sequence
occurs in pratice. It's possible that the same end result could also
have another unknown cause.
The solution for this issue is to further separate callback state
from the lkb, so that an lkb reference (and from that, an rsb ref)
are not held while a callback remains queued. Then, within the
unlock_reply, the lkb will be freed and the rsb moved to the toss
list. So, the receive_remove will not be ignored.
Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
2024-03-28 23:48:41 +08:00
|
|
|
rv = DLM_ENQUEUE_CALLBACK_NEED_SCHED;
|
2011-02-22 04:58:21 +08:00
|
|
|
|
dlm: fix race between final callback and remove
This patch fixes the following issue:
node 1 is dir
node 2 is master
node 3 is other
1->2: unlock
2: put final lkb, rsb moved to toss
2->1: unlock_reply
1: queue lkb callback with EUNLOCK
2->1: remove
1: receive_remove ignored (rsb on keep because of queued lkb callback)
1: complete lkb callback, put_lkb, move rsb to toss
3->1: lookup
1->3: lookup_reply master=2
3->2: request
2->3: request_reply EBADR
In summary:
An unexpected lkb reference causes the rsb to remain on the wrong list.
The rsb being on the wrong list causes receive_remove to be ignored.
An ignored receive_remove causes inconsistent dir and master state.
This sequence requires an unusually long delay in delivering the unlock
callback, because the remove message from 2->1 usually happens after
some seconds. So, it's not known exactly how frequently this sequence
occurs in pratice. It's possible that the same end result could also
have another unknown cause.
The solution for this issue is to further separate callback state
from the lkb, so that an lkb reference (and from that, an rsb ref)
are not held while a callback remains queued. Then, within the
unlock_reply, the lkb will be freed and the rsb moved to the toss
list. So, the receive_remove will not be ignored.
Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
2024-03-28 23:48:41 +08:00
|
|
|
out:
|
2011-04-06 02:16:24 +08:00
|
|
|
return rv;
|
2011-02-22 04:58:21 +08:00
|
|
|
}
|
|
|
|
|
2011-04-06 02:16:24 +08:00
|
|
|
void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status,
|
dlm: fix race between final callback and remove
This patch fixes the following issue:
node 1 is dir
node 2 is master
node 3 is other
1->2: unlock
2: put final lkb, rsb moved to toss
2->1: unlock_reply
1: queue lkb callback with EUNLOCK
2->1: remove
1: receive_remove ignored (rsb on keep because of queued lkb callback)
1: complete lkb callback, put_lkb, move rsb to toss
3->1: lookup
1->3: lookup_reply master=2
3->2: request
2->3: request_reply EBADR
In summary:
An unexpected lkb reference causes the rsb to remain on the wrong list.
The rsb being on the wrong list causes receive_remove to be ignored.
An ignored receive_remove causes inconsistent dir and master state.
This sequence requires an unusually long delay in delivering the unlock
callback, because the remove message from 2->1 usually happens after
some seconds. So, it's not known exactly how frequently this sequence
occurs in pratice. It's possible that the same end result could also
have another unknown cause.
The solution for this issue is to further separate callback state
from the lkb, so that an lkb reference (and from that, an rsb ref)
are not held while a callback remains queued. Then, within the
unlock_reply, the lkb will be freed and the rsb moved to the toss
list. So, the receive_remove will not be ignored.
Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
2024-03-28 23:48:41 +08:00
|
|
|
uint32_t sbflags)
|
2011-02-22 04:58:21 +08:00
|
|
|
{
|
2011-04-06 02:16:24 +08:00
|
|
|
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
|
dlm: fix race between final callback and remove
This patch fixes the following issue:
node 1 is dir
node 2 is master
node 3 is other
1->2: unlock
2: put final lkb, rsb moved to toss
2->1: unlock_reply
1: queue lkb callback with EUNLOCK
2->1: remove
1: receive_remove ignored (rsb on keep because of queued lkb callback)
1: complete lkb callback, put_lkb, move rsb to toss
3->1: lookup
1->3: lookup_reply master=2
3->2: request
2->3: request_reply EBADR
In summary:
An unexpected lkb reference causes the rsb to remain on the wrong list.
The rsb being on the wrong list causes receive_remove to be ignored.
An ignored receive_remove causes inconsistent dir and master state.
This sequence requires an unusually long delay in delivering the unlock
callback, because the remove message from 2->1 usually happens after
some seconds. So, it's not known exactly how frequently this sequence
occurs in pratice. It's possible that the same end result could also
have another unknown cause.
The solution for this issue is to further separate callback state
from the lkb, so that an lkb reference (and from that, an rsb ref)
are not held while a callback remains queued. Then, within the
unlock_reply, the lkb will be freed and the rsb moved to the toss
list. So, the receive_remove will not be ignored.
Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
2024-03-28 23:48:41 +08:00
|
|
|
struct dlm_callback *cb;
|
2011-02-22 04:58:21 +08:00
|
|
|
int rv;
|
|
|
|
|
2023-03-07 04:48:15 +08:00
|
|
|
if (test_bit(DLM_DFL_USER_BIT, &lkb->lkb_dflags)) {
|
2022-10-28 04:45:21 +08:00
|
|
|
dlm_user_add_ast(lkb, flags, mode, status, sbflags);
|
2006-07-13 05:44:04 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
dlm: fix race between final callback and remove
This patch fixes the following issue:
node 1 is dir
node 2 is master
node 3 is other
1->2: unlock
2: put final lkb, rsb moved to toss
2->1: unlock_reply
1: queue lkb callback with EUNLOCK
2->1: remove
1: receive_remove ignored (rsb on keep because of queued lkb callback)
1: complete lkb callback, put_lkb, move rsb to toss
3->1: lookup
1->3: lookup_reply master=2
3->2: request
2->3: request_reply EBADR
In summary:
An unexpected lkb reference causes the rsb to remain on the wrong list.
The rsb being on the wrong list causes receive_remove to be ignored.
An ignored receive_remove causes inconsistent dir and master state.
This sequence requires an unusually long delay in delivering the unlock
callback, because the remove message from 2->1 usually happens after
some seconds. So, it's not known exactly how frequently this sequence
occurs in pratice. It's possible that the same end result could also
have another unknown cause.
The solution for this issue is to further separate callback state
from the lkb, so that an lkb reference (and from that, an rsb ref)
are not held while a callback remains queued. Then, within the
unlock_reply, the lkb will be freed and the rsb moved to the toss
list. So, the receive_remove will not be ignored.
Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
2024-03-28 23:48:41 +08:00
|
|
|
rv = dlm_queue_lkb_callback(lkb, flags, mode, status, sbflags,
|
|
|
|
&cb);
|
2022-10-28 04:45:21 +08:00
|
|
|
switch (rv) {
|
|
|
|
case DLM_ENQUEUE_CALLBACK_NEED_SCHED:
|
dlm: fix race between final callback and remove
This patch fixes the following issue:
node 1 is dir
node 2 is master
node 3 is other
1->2: unlock
2: put final lkb, rsb moved to toss
2->1: unlock_reply
1: queue lkb callback with EUNLOCK
2->1: remove
1: receive_remove ignored (rsb on keep because of queued lkb callback)
1: complete lkb callback, put_lkb, move rsb to toss
3->1: lookup
1->3: lookup_reply master=2
3->2: request
2->3: request_reply EBADR
In summary:
An unexpected lkb reference causes the rsb to remain on the wrong list.
The rsb being on the wrong list causes receive_remove to be ignored.
An ignored receive_remove causes inconsistent dir and master state.
This sequence requires an unusually long delay in delivering the unlock
callback, because the remove message from 2->1 usually happens after
some seconds. So, it's not known exactly how frequently this sequence
occurs in pratice. It's possible that the same end result could also
have another unknown cause.
The solution for this issue is to further separate callback state
from the lkb, so that an lkb reference (and from that, an rsb ref)
are not held while a callback remains queued. Then, within the
unlock_reply, the lkb will be freed and the rsb moved to the toss
list. So, the receive_remove will not be ignored.
Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
2024-03-28 23:48:41 +08:00
|
|
|
cb->astfn = lkb->lkb_astfn;
|
|
|
|
cb->bastfn = lkb->lkb_bastfn;
|
|
|
|
cb->astparam = lkb->lkb_astparam;
|
|
|
|
INIT_WORK(&cb->work, dlm_callback_work);
|
2006-01-18 17:30:29 +08:00
|
|
|
|
2024-04-03 03:18:09 +08:00
|
|
|
spin_lock_bh(&ls->ls_cb_lock);
|
dlm: fix race between final callback and remove
This patch fixes the following issue:
node 1 is dir
node 2 is master
node 3 is other
1->2: unlock
2: put final lkb, rsb moved to toss
2->1: unlock_reply
1: queue lkb callback with EUNLOCK
2->1: remove
1: receive_remove ignored (rsb on keep because of queued lkb callback)
1: complete lkb callback, put_lkb, move rsb to toss
3->1: lookup
1->3: lookup_reply master=2
3->2: request
2->3: request_reply EBADR
In summary:
An unexpected lkb reference causes the rsb to remain on the wrong list.
The rsb being on the wrong list causes receive_remove to be ignored.
An ignored receive_remove causes inconsistent dir and master state.
This sequence requires an unusually long delay in delivering the unlock
callback, because the remove message from 2->1 usually happens after
some seconds. So, it's not known exactly how frequently this sequence
occurs in pratice. It's possible that the same end result could also
have another unknown cause.
The solution for this issue is to further separate callback state
from the lkb, so that an lkb reference (and from that, an rsb ref)
are not held while a callback remains queued. Then, within the
unlock_reply, the lkb will be freed and the rsb moved to the toss
list. So, the receive_remove will not be ignored.
Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
2024-03-28 23:48:41 +08:00
|
|
|
if (test_bit(LSFL_CB_DELAY, &ls->ls_flags))
|
|
|
|
list_add(&cb->list, &ls->ls_cb_delay);
|
|
|
|
else
|
|
|
|
queue_work(ls->ls_callback_wq, &cb->work);
|
2024-04-03 03:18:09 +08:00
|
|
|
spin_unlock_bh(&ls->ls_cb_lock);
|
2022-10-28 04:45:21 +08:00
|
|
|
break;
|
|
|
|
case DLM_ENQUEUE_CALLBACK_SUCCESS:
|
|
|
|
break;
|
2024-03-28 23:48:40 +08:00
|
|
|
case DLM_ENQUEUE_CALLBACK_FAILURE:
|
|
|
|
fallthrough;
|
2022-10-28 04:45:21 +08:00
|
|
|
default:
|
2022-11-18 06:11:42 +08:00
|
|
|
WARN_ON_ONCE(1);
|
2022-10-28 04:45:21 +08:00
|
|
|
break;
|
2011-04-06 02:16:24 +08:00
|
|
|
}
|
2006-01-18 17:30:29 +08:00
|
|
|
}
|
|
|
|
|
2011-04-06 02:16:24 +08:00
|
|
|
int dlm_callback_start(struct dlm_ls *ls)
|
2006-01-18 17:30:29 +08:00
|
|
|
{
|
dlm: fix race between final callback and remove
This patch fixes the following issue:
node 1 is dir
node 2 is master
node 3 is other
1->2: unlock
2: put final lkb, rsb moved to toss
2->1: unlock_reply
1: queue lkb callback with EUNLOCK
2->1: remove
1: receive_remove ignored (rsb on keep because of queued lkb callback)
1: complete lkb callback, put_lkb, move rsb to toss
3->1: lookup
1->3: lookup_reply master=2
3->2: request
2->3: request_reply EBADR
In summary:
An unexpected lkb reference causes the rsb to remain on the wrong list.
The rsb being on the wrong list causes receive_remove to be ignored.
An ignored receive_remove causes inconsistent dir and master state.
This sequence requires an unusually long delay in delivering the unlock
callback, because the remove message from 2->1 usually happens after
some seconds. So, it's not known exactly how frequently this sequence
occurs in pratice. It's possible that the same end result could also
have another unknown cause.
The solution for this issue is to further separate callback state
from the lkb, so that an lkb reference (and from that, an rsb ref)
are not held while a callback remains queued. Then, within the
unlock_reply, the lkb will be freed and the rsb moved to the toss
list. So, the receive_remove will not be ignored.
Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
2024-03-28 23:48:41 +08:00
|
|
|
ls->ls_callback_wq = alloc_ordered_workqueue("dlm_callback",
|
|
|
|
WQ_HIGHPRI | WQ_MEM_RECLAIM);
|
2011-04-06 02:16:24 +08:00
|
|
|
if (!ls->ls_callback_wq) {
|
|
|
|
log_print("can't start dlm_callback workqueue");
|
|
|
|
return -ENOMEM;
|
2006-01-18 17:30:29 +08:00
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2011-04-06 02:16:24 +08:00
|
|
|
void dlm_callback_stop(struct dlm_ls *ls)
|
2006-01-18 17:30:29 +08:00
|
|
|
{
|
2011-04-06 02:16:24 +08:00
|
|
|
if (ls->ls_callback_wq)
|
|
|
|
destroy_workqueue(ls->ls_callback_wq);
|
2006-01-18 17:30:29 +08:00
|
|
|
}
|
|
|
|
|
2011-04-06 02:16:24 +08:00
|
|
|
void dlm_callback_suspend(struct dlm_ls *ls)
|
2006-01-18 17:30:29 +08:00
|
|
|
{
|
2022-08-16 03:43:26 +08:00
|
|
|
if (ls->ls_callback_wq) {
|
2024-04-03 03:18:09 +08:00
|
|
|
spin_lock_bh(&ls->ls_cb_lock);
|
2022-08-16 03:43:26 +08:00
|
|
|
set_bit(LSFL_CB_DELAY, &ls->ls_flags);
|
2024-04-03 03:18:09 +08:00
|
|
|
spin_unlock_bh(&ls->ls_cb_lock);
|
2006-01-18 17:30:29 +08:00
|
|
|
|
2011-04-06 02:16:24 +08:00
|
|
|
flush_workqueue(ls->ls_callback_wq);
|
2022-08-16 03:43:26 +08:00
|
|
|
}
|
2006-01-18 17:30:29 +08:00
|
|
|
}
|
|
|
|
|
2018-11-09 03:04:50 +08:00
|
|
|
/*
 * Upper bound on the number of delayed callbacks dlm_callback_resume()
 * re-queues in one pass while holding ls_cb_lock.
 */
#define MAX_CB_QUEUE 25
|
|
|
|
|
2011-04-06 02:16:24 +08:00
|
|
|
void dlm_callback_resume(struct dlm_ls *ls)
|
2006-01-18 17:30:29 +08:00
|
|
|
{
|
dlm: fix race between final callback and remove
This patch fixes the following issue:
node 1 is dir
node 2 is master
node 3 is other
1->2: unlock
2: put final lkb, rsb moved to toss
2->1: unlock_reply
1: queue lkb callback with EUNLOCK
2->1: remove
1: receive_remove ignored (rsb on keep because of queued lkb callback)
1: complete lkb callback, put_lkb, move rsb to toss
3->1: lookup
1->3: lookup_reply master=2
3->2: request
2->3: request_reply EBADR
In summary:
An unexpected lkb reference causes the rsb to remain on the wrong list.
The rsb being on the wrong list causes receive_remove to be ignored.
An ignored receive_remove causes inconsistent dir and master state.
This sequence requires an unusually long delay in delivering the unlock
callback, because the remove message from 2->1 usually happens after
some seconds. So, it's not known exactly how frequently this sequence
occurs in pratice. It's possible that the same end result could also
have another unknown cause.
The solution for this issue is to further separate callback state
from the lkb, so that an lkb reference (and from that, an rsb ref)
are not held while a callback remains queued. Then, within the
unlock_reply, the lkb will be freed and the rsb moved to the toss
list. So, the receive_remove will not be ignored.
Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
2024-03-28 23:48:41 +08:00
|
|
|
struct dlm_callback *cb, *safe;
|
2021-11-03 03:17:14 +08:00
|
|
|
int count = 0, sum = 0;
|
2021-12-01 03:47:14 +08:00
|
|
|
bool empty;
|
2006-01-18 17:30:29 +08:00
|
|
|
|
2011-04-06 02:16:24 +08:00
|
|
|
if (!ls->ls_callback_wq)
|
|
|
|
return;
|
|
|
|
|
2018-11-09 03:04:50 +08:00
|
|
|
more:
|
2024-04-03 03:18:09 +08:00
|
|
|
spin_lock_bh(&ls->ls_cb_lock);
|
dlm: fix race between final callback and remove
This patch fixes the following issue:
node 1 is dir
node 2 is master
node 3 is other
1->2: unlock
2: put final lkb, rsb moved to toss
2->1: unlock_reply
1: queue lkb callback with EUNLOCK
2->1: remove
1: receive_remove ignored (rsb on keep because of queued lkb callback)
1: complete lkb callback, put_lkb, move rsb to toss
3->1: lookup
1->3: lookup_reply master=2
3->2: request
2->3: request_reply EBADR
In summary:
An unexpected lkb reference causes the rsb to remain on the wrong list.
The rsb being on the wrong list causes receive_remove to be ignored.
An ignored receive_remove causes inconsistent dir and master state.
This sequence requires an unusually long delay in delivering the unlock
callback, because the remove message from 2->1 usually happens after
some seconds. So, it's not known exactly how frequently this sequence
occurs in pratice. It's possible that the same end result could also
have another unknown cause.
The solution for this issue is to further separate callback state
from the lkb, so that an lkb reference (and from that, an rsb ref)
are not held while a callback remains queued. Then, within the
unlock_reply, the lkb will be freed and the rsb moved to the toss
list. So, the receive_remove will not be ignored.
Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
2024-03-28 23:48:41 +08:00
|
|
|
list_for_each_entry_safe(cb, safe, &ls->ls_cb_delay, list) {
|
|
|
|
list_del(&cb->list);
|
|
|
|
queue_work(ls->ls_callback_wq, &cb->work);
|
2011-04-06 02:16:24 +08:00
|
|
|
count++;
|
2018-11-09 03:04:50 +08:00
|
|
|
if (count == MAX_CB_QUEUE)
|
|
|
|
break;
|
2011-04-06 02:16:24 +08:00
|
|
|
}
|
2021-12-01 03:47:14 +08:00
|
|
|
empty = list_empty(&ls->ls_cb_delay);
|
2022-10-28 04:45:16 +08:00
|
|
|
if (empty)
|
|
|
|
clear_bit(LSFL_CB_DELAY, &ls->ls_flags);
|
2024-04-03 03:18:09 +08:00
|
|
|
spin_unlock_bh(&ls->ls_cb_lock);
|
2011-04-06 02:16:24 +08:00
|
|
|
|
2021-11-03 03:17:14 +08:00
|
|
|
sum += count;
|
2021-12-01 03:47:14 +08:00
|
|
|
if (!empty) {
|
2018-11-09 03:04:50 +08:00
|
|
|
count = 0;
|
|
|
|
cond_resched();
|
|
|
|
goto more;
|
|
|
|
}
|
2021-11-03 03:17:14 +08:00
|
|
|
|
|
|
|
if (sum)
|
|
|
|
log_rinfo(ls, "%s %d", __func__, sum);
|
2006-01-18 17:30:29 +08:00
|
|
|
}
|
|
|
|
|