From feb704bd17786c8ff52a49d7759b8ee4f3a5aaac Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 16 Jul 2021 16:22:34 -0400 Subject: [PATCH 01/15] fs: dlm: use sk->sk_socket instead of con->sock Instead of dereference "con->sock" we can get the socket structure over "sk->sk_socket" as well. This patch will switch to this behaviour. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 0ea9ae35da0b..bf309c12a2c0 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -583,8 +583,7 @@ static void lowcomms_error_report(struct sock *sk) goto out; orig_report = listen_sock.sk_error_report; - if (con->sock == NULL || - kernel_getpeername(con->sock, (struct sockaddr *)&saddr) < 0) { + if (kernel_getpeername(sk->sk_socket, (struct sockaddr *)&saddr) < 0) { printk_ratelimited(KERN_ERR "dlm: node %d: socket error " "sending to node %d, port %d, " "sk_err=%d/%d\n", dlm_our_nodeid(), From d921a23f3e1a703875db436cc7cdb4404cf1565f Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 16 Jul 2021 16:22:35 -0400 Subject: [PATCH 02/15] fs: dlm: use READ_ONCE for config var This patch will use READ_ONCE to signal the compiler to read this variable only one time. If we don't do that it could be that the compiler read this value more than one time, because some optimizations, from the configure data which might can be changed during this time. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lockspace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index d71aba8c3e64..23c2d7308050 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -498,7 +498,7 @@ static int new_lockspace(const char *name, const char *cluster, ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS | DLM_LSFL_NEWEXCL)); - size = dlm_config.ci_rsbtbl_size; + size = READ_ONCE(dlm_config.ci_rsbtbl_size); ls->ls_rsbtbl_size = size; ls->ls_rsbtbl = vmalloc(array_size(size, sizeof(struct dlm_rsbtable))); From b892e4792c992a0d5a272c3bdd6155bf772acfa7 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 16 Jul 2021 16:22:36 -0400 Subject: [PATCH 03/15] fs: dlm: fix typo in tlv prefix This patch fixes a small typo in a unused struct field. It should named be t_pad instead of o_pad. Came over this as I updated wireshark dissector. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/dlm_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 91d1ca3a121a..5f57538b5d45 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -468,7 +468,7 @@ struct dlm_rcom { struct dlm_opt_header { uint16_t t_type; uint16_t t_length; - uint32_t o_pad; + uint32_t t_pad; /* need to be 8 byte aligned */ char t_value[]; }; From 052849beeab02306d1420d03da959e7db883b375 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 16 Jul 2021 16:22:37 -0400 Subject: [PATCH 04/15] fs: dlm: clear CF_APP_LIMITED on close If send_to_sock() sets CF_APP_LIMITED limited bit and it has not been cleared by a waiting lowcomms_write_space() yet and a close_connection() apprears we should clear the CF_APP_LIMITED bit again because the connection starts from a new state again at reconnect. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index bf309c12a2c0..1bdd24bce709 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -800,6 +800,7 @@ static void close_connection(struct connection *con, bool and_other, con->rx_leftover = 0; con->retries = 0; + clear_bit(CF_APP_LIMITED, &con->flags); clear_bit(CF_CONNECTED, &con->flags); clear_bit(CF_DELAY_CONNECT, &con->flags); clear_bit(CF_RECONNECT, &con->flags); From 88aa023a2556aae92664f135eae794be9449e4f3 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 16 Jul 2021 16:22:38 -0400 Subject: [PATCH 05/15] fs: dlm: cleanup and remove _send_rcom The _send_rcom() can be removed and we call directly dlm_rcom_out(). As we doing that we removing the struct dlm_ls parameter which isn't used. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/rcom.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index 5651933f54a4..6cba86470278 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -89,22 +89,15 @@ static int create_rcom_stateless(struct dlm_ls *ls, int to_nodeid, int type, return 0; } -static void _send_rcom(struct dlm_ls *ls, struct dlm_rcom *rc) +static void send_rcom(struct dlm_mhandle *mh, struct dlm_rcom *rc) { dlm_rcom_out(rc); -} - -static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh, - struct dlm_rcom *rc) -{ - _send_rcom(ls, rc); dlm_midcomms_commit_mhandle(mh); } -static void send_rcom_stateless(struct dlm_ls *ls, struct dlm_msg *msg, - struct dlm_rcom *rc) +static void send_rcom_stateless(struct dlm_msg *msg, struct dlm_rcom *rc) { - _send_rcom(ls, rc); + dlm_rcom_out(rc); dlm_lowcomms_commit_msg(msg); dlm_lowcomms_put_msg(msg); } @@ -204,7 +197,7 @@ retry: allow_sync_reply(ls, &rc->rc_id); memset(ls->ls_recover_buf, 0, DLM_MAX_SOCKET_BUFSIZE); - send_rcom_stateless(ls, msg, rc); + send_rcom_stateless(msg, rc); error = dlm_wait_function(ls, &rcom_response); disallow_sync_reply(ls); @@ -287,7 +280,7 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) spin_unlock(&ls->ls_recover_lock); do_send: - send_rcom_stateless(ls, msg, rc); + send_rcom_stateless(msg, rc); } static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) @@ -327,7 +320,7 @@ retry: allow_sync_reply(ls, &rc->rc_id); memset(ls->ls_recover_buf, 0, DLM_MAX_SOCKET_BUFSIZE); - send_rcom_stateless(ls, msg, rc); + send_rcom_stateless(msg, rc); error = dlm_wait_function(ls, &rcom_response); disallow_sync_reply(ls); @@ -356,7 +349,7 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in) dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen, nodeid); - send_rcom_stateless(ls, msg, rc); + send_rcom_stateless(msg, rc); } int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) @@ -373,7 +366,7 @@ int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) memcpy(rc->rc_buf, r->res_name, r->res_length); rc->rc_id = (unsigned long) r->res_id; - send_rcom(ls, mh, rc); + send_rcom(mh, rc); out: return error; } @@ -404,7 +397,7 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) rc->rc_id = rc_in->rc_id; rc->rc_seq_reply = rc_in->rc_seq; - send_rcom(ls, mh, rc); + send_rcom(mh, rc); } static void receive_rcom_lookup_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) @@ -461,7 +454,7 @@ int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) pack_rcom_lock(r, lkb, rl); rc->rc_id = (unsigned long) r; - send_rcom(ls, mh, rc); + send_rcom(mh, rc); out: return error; } @@ -487,7 +480,7 @@ static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in) rc->rc_id = rc_in->rc_id; rc->rc_seq_reply = rc_in->rc_seq; - send_rcom(ls, mh, rc); + send_rcom(mh, rc); } /* If the lockspace doesn't exist then still send a status message From 66d5955a098d504b5a34c35d56a967c536264673 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 16 Jul 2021 16:22:39 -0400 Subject: [PATCH 06/15] fs: dlm: introduce con_next_wq helper This patch introduce a function to determine if something is ready to being send in the writequeue. It's not just that the writequeue is not empty additional the first entry need to have a valid length field. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 57 +++++++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 1bdd24bce709..9d21a8b9e9fb 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -175,6 +175,22 @@ static void sctp_connect_to_sock(struct connection *con); static void tcp_connect_to_sock(struct connection *con); static void dlm_tcp_shutdown(struct connection *con); +/* need to held writequeue_lock */ +static struct writequeue_entry *con_next_wq(struct connection *con) +{ + struct writequeue_entry *e; + + if (list_empty(&con->writequeue)) + return NULL; + + e = list_first_entry(&con->writequeue, struct writequeue_entry, + list); + if (e->len == 0) + return NULL; + + return e; +} + static struct connection *__find_con(int nodeid, int r) { struct connection *con; @@ -1646,10 +1662,9 @@ int dlm_lowcomms_resend_msg(struct dlm_msg *msg) /* Send a message */ static void send_to_sock(struct connection *con) { - int ret = 0; const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; struct writequeue_entry *e; - int len, offset; + int len, offset, ret; int count = 0; mutex_lock(&con->sock_mutex); @@ -1658,7 +1673,8 @@ static void send_to_sock(struct connection *con) spin_lock(&con->writequeue_lock); for (;;) { - if (list_empty(&con->writequeue)) + e = con_next_wq(con); + if (!e) break; e = list_first_entry(&con->writequeue, struct writequeue_entry, list); @@ -1667,25 +1683,22 @@ static void send_to_sock(struct connection *con) BUG_ON(len == 0 && e->users == 0); spin_unlock(&con->writequeue_lock); - ret = 0; - if (len) { - ret = kernel_sendpage(con->sock, e->page, offset, len, - msg_flags); - if (ret == -EAGAIN || ret == 0) { - if (ret == -EAGAIN && - test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) && - !test_and_set_bit(CF_APP_LIMITED, &con->flags)) { - /* Notify TCP that we're limited by the - * application window size. - */ - set_bit(SOCK_NOSPACE, &con->sock->flags); - con->sock->sk->sk_write_pending++; - } - cond_resched(); - goto out; - } else if (ret < 0) - goto out; - } + ret = kernel_sendpage(con->sock, e->page, offset, len, + msg_flags); + if (ret == -EAGAIN || ret == 0) { + if (ret == -EAGAIN && + test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) && + !test_and_set_bit(CF_APP_LIMITED, &con->flags)) { + /* Notify TCP that we're limited by the + * application window size. + */ + set_bit(SOCK_NOSPACE, &con->sock->flags); + con->sock->sk->sk_write_pending++; + } + cond_resched(); + goto out; + } else if (ret < 0) + goto out; /* Don't starve people filling buffers */ if (++count >= MAX_SEND_MSG_COUNT) { From a66c008cd16a122e340d48f01ddf8cd768368717 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 16 Jul 2021 16:22:40 -0400 Subject: [PATCH 07/15] fs: dlm: move to static proto ops This patch moves the per transport socket callbacks to a static const array. We can support only one transport socket for the init namespace which will be determinted by reading the dlm config at lowcomms_start(). Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 52 +++++++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 9d21a8b9e9fb..9a4e7421567e 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -84,9 +84,6 @@ struct connection { struct list_head writequeue; /* List of outgoing writequeue_entries */ spinlock_t writequeue_lock; atomic_t writequeue_cnt; - void (*connect_action) (struct connection *); /* What to do to connect */ - void (*shutdown_action)(struct connection *con); /* What to do to shutdown */ - bool (*eof_condition)(struct connection *con); /* What to do to eof check */ int retries; #define MAX_CONNECT_RETRIES 3 struct hlist_node list; @@ -145,6 +142,15 @@ struct dlm_node_addr { struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; }; +struct dlm_proto_ops { + /* What to do to connect */ + void (*connect_action)(struct connection *con); + /* What to do to shutdown */ + void (*shutdown_action)(struct connection *con); + /* What to do to eof check */ + bool (*eof_condition)(struct connection *con); +}; + static struct listen_sock_callbacks { void (*sk_error_report)(struct sock *); void (*sk_data_ready)(struct sock *); @@ -168,12 +174,13 @@ static struct hlist_head connection_hash[CONN_HASH_SIZE]; static DEFINE_SPINLOCK(connections_lock); DEFINE_STATIC_SRCU(connections_srcu); +static const struct dlm_proto_ops *dlm_proto_ops; + static void process_recv_sockets(struct work_struct *work); static void process_send_sockets(struct work_struct *work); static void sctp_connect_to_sock(struct connection *con); static void tcp_connect_to_sock(struct connection *con); -static void dlm_tcp_shutdown(struct connection *con); /* need to held writequeue_lock */ static struct writequeue_entry *con_next_wq(struct connection *con) @@ -224,20 +231,6 @@ static int dlm_con_init(struct connection *con, int nodeid) INIT_WORK(&con->rwork, process_recv_sockets); init_waitqueue_head(&con->shutdown_wait); - switch (dlm_config.ci_protocol) { - case DLM_PROTO_TCP: - con->connect_action = tcp_connect_to_sock; - con->shutdown_action = dlm_tcp_shutdown; - con->eof_condition = tcp_eof_condition; - break; - case DLM_PROTO_SCTP: - con->connect_action = sctp_connect_to_sock; - break; - default: - kfree(con->rx_buf); - return -EINVAL; - } - return 0; } @@ -962,7 +955,8 @@ out_close: log_print("connection %p got EOF from %d", con, con->nodeid); - if (con->eof_condition && con->eof_condition(con)) { + if (dlm_proto_ops->eof_condition && + dlm_proto_ops->eof_condition(con)) { set_bit(CF_EOF, &con->flags); mutex_unlock(&con->sock_mutex); } else { @@ -1813,7 +1807,7 @@ static void process_send_sockets(struct work_struct *work) if (con->sock == NULL) { /* not mutex protected so check it inside too */ if (test_and_clear_bit(CF_DELAY_CONNECT, &con->flags)) msleep(1000); - con->connect_action(con); + dlm_proto_ops->connect_action(con); } if (!list_empty(&con->writequeue)) send_to_sock(con); @@ -1853,8 +1847,8 @@ static int work_start(void) static void shutdown_conn(struct connection *con) { - if (con->shutdown_action) - con->shutdown_action(con); + if (dlm_proto_ops->shutdown_action) + dlm_proto_ops->shutdown_action(con); } void dlm_lowcomms_shutdown(void) @@ -1961,8 +1955,20 @@ void dlm_lowcomms_stop(void) srcu_read_unlock(&connections_srcu, idx); work_stop(); deinit_local(); + + dlm_proto_ops = NULL; } +static const struct dlm_proto_ops dlm_tcp_ops = { + .connect_action = tcp_connect_to_sock, + .shutdown_action = dlm_tcp_shutdown, + .eof_condition = tcp_eof_condition, +}; + +static const struct dlm_proto_ops dlm_sctp_ops = { + .connect_action = sctp_connect_to_sock, +}; + int dlm_lowcomms_start(void) { int error = -EINVAL; @@ -1989,9 +1995,11 @@ int dlm_lowcomms_start(void) /* Start listening */ switch (dlm_config.ci_protocol) { case DLM_PROTO_TCP: + dlm_proto_ops = &dlm_tcp_ops; error = tcp_listen_for_all(); break; case DLM_PROTO_SCTP: + dlm_proto_ops = &dlm_sctp_ops; error = sctp_listen_for_all(&listen_con); break; default: From 2dc6b1158c28c3a5e86d162628810312f98d5e97 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 16 Jul 2021 16:22:41 -0400 Subject: [PATCH 08/15] fs: dlm: introduce generic listen This patch combines each transport layer listen functionality into one listen function. Per transport layer differences are provided by additional callbacks in dlm_proto_ops. This patch drops silently sock_set_keepalive() for listen tcp sockets only. This socket option is not set at connecting sockets, I also don't see the sense of set keepalive for sockets which are created by accept() only. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 228 +++++++++++++++++++++++----------------------- 1 file changed, 113 insertions(+), 115 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 9a4e7421567e..a042ea413f74 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -143,6 +143,13 @@ struct dlm_node_addr { }; struct dlm_proto_ops { + const char *name; + int proto; + + int (*listen_validate)(void); + void (*listen_sockopts)(struct socket *sock); + int (*listen_bind)(struct socket *sock); + /* What to do to connect */ void (*connect_action)(struct connection *con); /* What to do to shutdown */ @@ -1327,59 +1334,6 @@ out: return; } -/* On error caller must run dlm_close_sock() for the - * listen connection socket. - */ -static int tcp_create_listen_sock(struct listen_connection *con, - struct sockaddr_storage *saddr) -{ - struct socket *sock = NULL; - int result = 0; - int addr_len; - - if (dlm_local_addr[0]->ss_family == AF_INET) - addr_len = sizeof(struct sockaddr_in); - else - addr_len = sizeof(struct sockaddr_in6); - - /* Create a socket to communicate with */ - result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, - SOCK_STREAM, IPPROTO_TCP, &sock); - if (result < 0) { - log_print("Can't create listening comms socket"); - goto create_out; - } - - sock_set_mark(sock->sk, dlm_config.ci_mark); - - /* Turn off Nagle's algorithm */ - tcp_sock_set_nodelay(sock->sk); - - sock_set_reuseaddr(sock->sk); - - add_listen_sock(sock, con); - - /* Bind to our port */ - make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len); - result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len); - if (result < 0) { - log_print("Can't bind to port %d", dlm_config.ci_tcp_port); - goto create_out; - } - sock_set_keepalive(sock->sk); - - result = sock->ops->listen(sock, 5); - if (result < 0) { - log_print("Can't listen on port %d", dlm_config.ci_tcp_port); - goto create_out; - } - - return 0; - -create_out: - return result; -} - /* Get local addresses */ static void init_local(void) { @@ -1406,63 +1360,6 @@ static void deinit_local(void) kfree(dlm_local_addr[i]); } -/* Initialise SCTP socket and bind to all interfaces - * On error caller must run dlm_close_sock() for the - * listen connection socket. - */ -static int sctp_listen_for_all(struct listen_connection *con) -{ - struct socket *sock = NULL; - int result = -EINVAL; - - log_print("Using SCTP for communications"); - - result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, - SOCK_STREAM, IPPROTO_SCTP, &sock); - if (result < 0) { - log_print("Can't create comms socket, check SCTP is loaded"); - goto out; - } - - sock_set_rcvbuf(sock->sk, NEEDED_RMEM); - sock_set_mark(sock->sk, dlm_config.ci_mark); - sctp_sock_set_nodelay(sock->sk); - - add_listen_sock(sock, con); - - /* Bind to all addresses. */ - result = sctp_bind_addrs(con->sock, dlm_config.ci_tcp_port); - if (result < 0) - goto out; - - result = sock->ops->listen(sock, 5); - if (result < 0) { - log_print("Can't set socket listening"); - goto out; - } - - return 0; - -out: - return result; -} - -static int tcp_listen_for_all(void) -{ - /* We don't support multi-homed hosts */ - if (dlm_local_count > 1) { - log_print("TCP protocol can't handle multi-homed hosts, " - "try SCTP"); - return -EINVAL; - } - - log_print("Using TCP for communications"); - - return tcp_create_listen_sock(&listen_con, dlm_local_addr[0]); -} - - - static struct writequeue_entry *new_writequeue_entry(struct connection *con, gfp_t allocation) { @@ -1959,13 +1856,112 @@ void dlm_lowcomms_stop(void) dlm_proto_ops = NULL; } +static int dlm_listen_for_all(void) +{ + struct socket *sock; + int result; + + log_print("Using %s for communications", + dlm_proto_ops->name); + + if (dlm_proto_ops->listen_validate) { + result = dlm_proto_ops->listen_validate(); + if (result < 0) + return result; + } + + result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + SOCK_STREAM, dlm_proto_ops->proto, &sock); + if (result < 0) { + log_print("Can't create comms socket, check SCTP is loaded"); + goto out; + } + + sock_set_mark(sock->sk, dlm_config.ci_mark); + dlm_proto_ops->listen_sockopts(sock); + + result = dlm_proto_ops->listen_bind(sock); + if (result < 0) + goto out; + + save_listen_callbacks(sock); + add_listen_sock(sock, &listen_con); + + INIT_WORK(&listen_con.rwork, process_listen_recv_socket); + result = sock->ops->listen(sock, 5); + if (result < 0) { + dlm_close_sock(&listen_con.sock); + goto out; + } + + return 0; + +out: + sock_release(sock); + return result; +} + +static int dlm_tcp_listen_validate(void) +{ + /* We don't support multi-homed hosts */ + if (dlm_local_count > 1) { + log_print("TCP protocol can't handle multi-homed hosts, try SCTP"); + return -EINVAL; + } + + return 0; +} + +static void dlm_tcp_sockopts(struct socket *sock) +{ + /* Turn off Nagle's algorithm */ + tcp_sock_set_nodelay(sock->sk); +} + +static void dlm_tcp_listen_sockopts(struct socket *sock) +{ + dlm_tcp_sockopts(sock); + sock_set_reuseaddr(sock->sk); +} + +static int dlm_tcp_listen_bind(struct socket *sock) +{ + int addr_len; + + /* Bind to our port */ + make_sockaddr(dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len); + return sock->ops->bind(sock, (struct sockaddr *)dlm_local_addr[0], + addr_len); +} + static const struct dlm_proto_ops dlm_tcp_ops = { + .name = "TCP", + .proto = IPPROTO_TCP, + .listen_validate = dlm_tcp_listen_validate, + .listen_sockopts = dlm_tcp_listen_sockopts, + .listen_bind = dlm_tcp_listen_bind, .connect_action = tcp_connect_to_sock, .shutdown_action = dlm_tcp_shutdown, .eof_condition = tcp_eof_condition, }; +static int dlm_sctp_bind_listen(struct socket *sock) +{ + return sctp_bind_addrs(sock, dlm_config.ci_tcp_port); +} + +static void dlm_sctp_sockopts(struct socket *sock) +{ + /* Turn off Nagle's algorithm */ + sctp_sock_set_nodelay(sock->sk); + sock_set_rcvbuf(sock->sk, NEEDED_RMEM); +} + static const struct dlm_proto_ops dlm_sctp_ops = { + .name = "SCTP", + .proto = IPPROTO_SCTP, + .listen_sockopts = dlm_sctp_sockopts, + .listen_bind = dlm_sctp_bind_listen, .connect_action = sctp_connect_to_sock, }; @@ -1996,24 +1992,26 @@ int dlm_lowcomms_start(void) switch (dlm_config.ci_protocol) { case DLM_PROTO_TCP: dlm_proto_ops = &dlm_tcp_ops; - error = tcp_listen_for_all(); break; case DLM_PROTO_SCTP: dlm_proto_ops = &dlm_sctp_ops; - error = sctp_listen_for_all(&listen_con); break; default: log_print("Invalid protocol identifier %d set", dlm_config.ci_protocol); error = -EINVAL; - break; + goto fail_proto_ops; } + + error = dlm_listen_for_all(); if (error) - goto fail_unlisten; + goto fail_listen; return 0; -fail_unlisten: +fail_listen: + dlm_proto_ops = NULL; +fail_proto_ops: dlm_allow_conn = 0; dlm_close_sock(&listen_con.sock); work_stop(); From 90d21fc0479dc0b5d338d664ddb55e5017b44f3e Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 16 Jul 2021 16:22:42 -0400 Subject: [PATCH 09/15] fs: dlm: auto load sctp module This patch adds a "for now" better handling of missing SCTP support in the kernel and try to load the sctp module if SCTP is set. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index a042ea413f74..30d10d19ecac 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1864,11 +1864,9 @@ static int dlm_listen_for_all(void) log_print("Using %s for communications", dlm_proto_ops->name); - if (dlm_proto_ops->listen_validate) { - result = dlm_proto_ops->listen_validate(); - if (result < 0) - return result; - } + result = dlm_proto_ops->listen_validate(); + if (result < 0) + return result; result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, SOCK_STREAM, dlm_proto_ops->proto, &sock); @@ -1945,6 +1943,17 @@ static const struct dlm_proto_ops dlm_tcp_ops = { .eof_condition = tcp_eof_condition, }; +static int dlm_sctp_listen_validate(void) +{ + if (!IS_ENABLED(CONFIG_IP_SCTP)) { + log_print("SCTP is not enabled by this kernel"); + return -EOPNOTSUPP; + } + + request_module("sctp"); + return 0; +} + static int dlm_sctp_bind_listen(struct socket *sock) { return sctp_bind_addrs(sock, dlm_config.ci_tcp_port); @@ -1960,6 +1969,7 @@ static void dlm_sctp_sockopts(struct socket *sock) static const struct dlm_proto_ops dlm_sctp_ops = { .name = "SCTP", .proto = IPPROTO_SCTP, + .listen_validate = dlm_sctp_listen_validate, .listen_sockopts = dlm_sctp_sockopts, .listen_bind = dlm_sctp_bind_listen, .connect_action = sctp_connect_to_sock, From 8728a455d20ddadecd767337475fc1371e031d79 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 16 Jul 2021 16:22:43 -0400 Subject: [PATCH 10/15] fs: dlm: generic connect func This patch adds a generic connect function for TCP and SCTP. If the connect functionality differs from each other additional callbacks in dlm_proto_ops were added. The sockopts callback handling will guarantee that sockets created by connect() will use the same options as sockets created by accept(). Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 343 ++++++++++++++++++++-------------------------- 1 file changed, 150 insertions(+), 193 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 30d10d19ecac..75e702c3678c 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -143,15 +143,17 @@ struct dlm_node_addr { }; struct dlm_proto_ops { + bool try_new_addr; const char *name; int proto; + int (*connect)(struct connection *con, struct socket *sock, + struct sockaddr *addr, int addr_len); + void (*sockopts)(struct socket *sock); + int (*bind)(struct socket *sock); int (*listen_validate)(void); void (*listen_sockopts)(struct socket *sock); int (*listen_bind)(struct socket *sock); - - /* What to do to connect */ - void (*connect_action)(struct connection *con); /* What to do to shutdown */ void (*shutdown_action)(struct connection *con); /* What to do to eof check */ @@ -186,9 +188,6 @@ static const struct dlm_proto_ops *dlm_proto_ops; static void process_recv_sockets(struct work_struct *work); static void process_send_sockets(struct work_struct *work); -static void sctp_connect_to_sock(struct connection *con); -static void tcp_connect_to_sock(struct connection *con); - /* need to held writequeue_lock */ static struct writequeue_entry *con_next_wq(struct connection *con) { @@ -1151,189 +1150,6 @@ static int sctp_bind_addrs(struct socket *sock, uint16_t port) return result; } -/* Initiate an SCTP association. - This is a special case of send_to_sock() in that we don't yet have a - peeled-off socket for this association, so we use the listening socket - and add the primary IP address of the remote node. - */ -static void sctp_connect_to_sock(struct connection *con) -{ - struct sockaddr_storage daddr; - int result; - int addr_len; - struct socket *sock; - unsigned int mark; - - mutex_lock(&con->sock_mutex); - - /* Some odd races can cause double-connects, ignore them */ - if (con->retries++ > MAX_CONNECT_RETRIES) - goto out; - - if (con->sock) { - log_print("node %d already connected.", con->nodeid); - goto out; - } - - memset(&daddr, 0, sizeof(daddr)); - result = nodeid_to_addr(con->nodeid, &daddr, NULL, true, &mark); - if (result < 0) { - log_print("no address for nodeid %d", con->nodeid); - goto out; - } - - /* Create a socket to communicate with */ - result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, - SOCK_STREAM, IPPROTO_SCTP, &sock); - if (result < 0) - goto socket_err; - - sock_set_mark(sock->sk, mark); - - add_sock(sock, con); - - /* Bind to all addresses. */ - if (sctp_bind_addrs(con->sock, 0)) - goto bind_err; - - make_sockaddr(&daddr, dlm_config.ci_tcp_port, &addr_len); - - log_print_ratelimited("connecting to %d", con->nodeid); - - /* Turn off Nagle's algorithm */ - sctp_sock_set_nodelay(sock->sk); - - /* - * Make sock->ops->connect() function return in specified time, - * since O_NONBLOCK argument in connect() function does not work here, - * then, we should restore the default value of this attribute. - */ - sock_set_sndtimeo(sock->sk, 5); - result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len, - 0); - sock_set_sndtimeo(sock->sk, 0); - - if (result == -EINPROGRESS) - result = 0; - if (result == 0) { - if (!test_and_set_bit(CF_CONNECTED, &con->flags)) - log_print("successful connected to node %d", con->nodeid); - goto out; - } - -bind_err: - con->sock = NULL; - sock_release(sock); - -socket_err: - /* - * Some errors are fatal and this list might need adjusting. For other - * errors we try again until the max number of retries is reached. - */ - if (result != -EHOSTUNREACH && - result != -ENETUNREACH && - result != -ENETDOWN && - result != -EINVAL && - result != -EPROTONOSUPPORT) { - log_print("connect %d try %d error %d", con->nodeid, - con->retries, result); - mutex_unlock(&con->sock_mutex); - msleep(1000); - lowcomms_connect_sock(con); - return; - } - -out: - mutex_unlock(&con->sock_mutex); -} - -/* Connect a new socket to its peer */ -static void tcp_connect_to_sock(struct connection *con) -{ - struct sockaddr_storage saddr, src_addr; - unsigned int mark; - int addr_len; - struct socket *sock = NULL; - int result; - - mutex_lock(&con->sock_mutex); - if (con->retries++ > MAX_CONNECT_RETRIES) - goto out; - - /* Some odd races can cause double-connects, ignore them */ - if (con->sock) - goto out; - - /* Create a socket to communicate with */ - result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, - SOCK_STREAM, IPPROTO_TCP, &sock); - if (result < 0) - goto out_err; - - memset(&saddr, 0, sizeof(saddr)); - result = nodeid_to_addr(con->nodeid, &saddr, NULL, false, &mark); - if (result < 0) { - log_print("no address for nodeid %d", con->nodeid); - goto out_err; - } - - sock_set_mark(sock->sk, mark); - - add_sock(sock, con); - - /* Bind to our cluster-known address connecting to avoid - routing problems */ - memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr)); - make_sockaddr(&src_addr, 0, &addr_len); - result = sock->ops->bind(sock, (struct sockaddr *) &src_addr, - addr_len); - if (result < 0) { - log_print("could not bind for connect: %d", result); - /* This *may* not indicate a critical error */ - } - - make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len); - - log_print_ratelimited("connecting to %d", con->nodeid); - - /* Turn off Nagle's algorithm */ - tcp_sock_set_nodelay(sock->sk); - - result = sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len, - O_NONBLOCK); - if (result == -EINPROGRESS) - result = 0; - if (result == 0) - goto out; - -out_err: - if (con->sock) { - sock_release(con->sock); - con->sock = NULL; - } else if (sock) { - sock_release(sock); - } - /* - * Some errors are fatal and this list might need adjusting. For other - * errors we try again until the max number of retries is reached. - */ - if (result != -EHOSTUNREACH && - result != -ENETUNREACH && - result != -ENETDOWN && - result != -EINVAL && - result != -EPROTONOSUPPORT) { - log_print("connect %d try %d error %d", con->nodeid, - con->retries, result); - mutex_unlock(&con->sock_mutex); - msleep(1000); - lowcomms_connect_sock(con); - return; - } -out: - mutex_unlock(&con->sock_mutex); - return; -} - /* Get local addresses */ static void init_local(void) { @@ -1687,6 +1503,74 @@ static void process_listen_recv_socket(struct work_struct *work) accept_from_sock(&listen_con); } +static void dlm_connect(struct connection *con) +{ + struct sockaddr_storage addr; + int result, addr_len; + struct socket *sock; + unsigned int mark; + + /* Some odd races can cause double-connects, ignore them */ + if (con->retries++ > MAX_CONNECT_RETRIES) + return; + + if (con->sock) { + log_print("node %d already connected.", con->nodeid); + return; + } + + memset(&addr, 0, sizeof(addr)); + result = nodeid_to_addr(con->nodeid, &addr, NULL, + dlm_proto_ops->try_new_addr, &mark); + if (result < 0) { + log_print("no address for nodeid %d", con->nodeid); + return; + } + + /* Create a socket to communicate with */ + result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + SOCK_STREAM, dlm_proto_ops->proto, &sock); + if (result < 0) + goto socket_err; + + sock_set_mark(sock->sk, mark); + dlm_proto_ops->sockopts(sock); + + add_sock(sock, con); + + result = dlm_proto_ops->bind(sock); + if (result < 0) + goto add_sock_err; + + log_print_ratelimited("connecting to %d", con->nodeid); + make_sockaddr(&addr, dlm_config.ci_tcp_port, &addr_len); + result = dlm_proto_ops->connect(con, sock, (struct sockaddr *)&addr, + addr_len); + if (result < 0) + goto add_sock_err; + + return; + +add_sock_err: + dlm_close_sock(&con->sock); + +socket_err: + /* + * Some errors are fatal and this list might need adjusting. For other + * errors we try again until the max number of retries is reached. + */ + if (result != -EHOSTUNREACH && + result != -ENETUNREACH && + result != -ENETDOWN && + result != -EINVAL && + result != -EPROTONOSUPPORT) { + log_print("connect %d try %d error %d", con->nodeid, + con->retries, result); + msleep(1000); + lowcomms_connect_sock(con); + } +} + /* Send workqueue function */ static void process_send_sockets(struct work_struct *work) { @@ -1701,11 +1585,15 @@ static void process_send_sockets(struct work_struct *work) dlm_midcomms_unack_msg_resend(con->nodeid); } - if (con->sock == NULL) { /* not mutex protected so check it inside too */ + if (con->sock == NULL) { if (test_and_clear_bit(CF_DELAY_CONNECT, &con->flags)) msleep(1000); - dlm_proto_ops->connect_action(con); + + mutex_lock(&con->sock_mutex); + dlm_connect(con); + mutex_unlock(&con->sock_mutex); } + if (!list_empty(&con->writequeue)) send_to_sock(con); } @@ -1899,6 +1787,43 @@ out: return result; } +static int dlm_tcp_bind(struct socket *sock) +{ + struct sockaddr_storage src_addr; + int result, addr_len; + + /* Bind to our cluster-known address connecting to avoid + * routing problems. + */ + memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr)); + make_sockaddr(&src_addr, 0, &addr_len); + + result = sock->ops->bind(sock, (struct sockaddr *)&src_addr, + addr_len); + if (result < 0) { + /* This *may* not indicate a critical error */ + log_print("could not bind for connect: %d", result); + } + + return 0; +} + +static int dlm_tcp_connect(struct connection *con, struct socket *sock, + struct sockaddr *addr, int addr_len) +{ + int ret; + + ret = sock->ops->connect(sock, addr, addr_len, O_NONBLOCK); + switch (ret) { + case -EINPROGRESS: + fallthrough; + case 0: + return 0; + } + + return ret; +} + static int dlm_tcp_listen_validate(void) { /* We don't support multi-homed hosts */ @@ -1935,14 +1860,43 @@ static int dlm_tcp_listen_bind(struct socket *sock) static const struct dlm_proto_ops dlm_tcp_ops = { .name = "TCP", .proto = IPPROTO_TCP, + .connect = dlm_tcp_connect, + .sockopts = dlm_tcp_sockopts, + .bind = dlm_tcp_bind, .listen_validate = dlm_tcp_listen_validate, .listen_sockopts = dlm_tcp_listen_sockopts, .listen_bind = dlm_tcp_listen_bind, - .connect_action = tcp_connect_to_sock, .shutdown_action = dlm_tcp_shutdown, .eof_condition = tcp_eof_condition, }; +static int dlm_sctp_bind(struct socket *sock) +{ + return sctp_bind_addrs(sock, 0); +} + +static int dlm_sctp_connect(struct connection *con, struct socket *sock, + struct sockaddr *addr, int addr_len) +{ + int ret; + + /* + * Make sock->ops->connect() function return in specified time, + * since O_NONBLOCK argument in connect() function does not work here, + * then, we should restore the default value of this attribute. + */ + sock_set_sndtimeo(sock->sk, 5); + ret = sock->ops->connect(sock, addr, addr_len, 0); + sock_set_sndtimeo(sock->sk, 0); + if (ret < 0) + return ret; + + if (!test_and_set_bit(CF_CONNECTED, &con->flags)) + log_print("successful connected to node %d", con->nodeid); + + return 0; +} + static int dlm_sctp_listen_validate(void) { if (!IS_ENABLED(CONFIG_IP_SCTP)) { @@ -1969,10 +1923,13 @@ static void dlm_sctp_sockopts(struct socket *sock) static const struct dlm_proto_ops dlm_sctp_ops = { .name = "SCTP", .proto = IPPROTO_SCTP, + .try_new_addr = true, + .connect = dlm_sctp_connect, + .sockopts = dlm_sctp_sockopts, + .bind = dlm_sctp_bind, .listen_validate = dlm_sctp_listen_validate, .listen_sockopts = dlm_sctp_sockopts, .listen_bind = dlm_sctp_bind_listen, - .connect_action = sctp_connect_to_sock, }; int dlm_lowcomms_start(void) From c51b0221798b695ac1beac6597d3a0acf039384b Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 16 Jul 2021 16:22:44 -0400 Subject: [PATCH 11/15] fs: dlm: fix multiple empty writequeue alloc This patch will add a mutex that a connection can allocate a writequeue entry buffer only at a sleepable context at one time. If multiple caller waits at the writequeue spinlock and the spinlock gets release it could be that multiple new writequeue page buffers were allocated instead of allocate one writequeue page buffer and other waiters will use remaining buffer of it. It will only be the case for sleepable context which is the common case. In non-sleepable contexts like retransmission we just don't care about such behaviour. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 75e702c3678c..11e061619a9f 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -84,6 +84,7 @@ struct connection { struct list_head writequeue; /* List of outgoing writequeue_entries */ spinlock_t writequeue_lock; atomic_t writequeue_cnt; + struct mutex wq_alloc; int retries; #define MAX_CONNECT_RETRIES 3 struct hlist_node list; @@ -264,6 +265,8 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) return NULL; } + mutex_init(&con->wq_alloc); + spin_lock(&connections_lock); /* Because multiple workqueues/threads calls this function it can * race on multiple cpu's. Instead of locking hot path __find_con() @@ -1251,19 +1254,37 @@ static struct dlm_msg *dlm_lowcomms_new_msg_con(struct connection *con, int len, { struct writequeue_entry *e; struct dlm_msg *msg; + bool sleepable; msg = kzalloc(sizeof(*msg), allocation); if (!msg) return NULL; + /* this mutex is being used as a wait to avoid multiple "fast" + * new writequeue page list entry allocs in new_wq_entry in + * normal operation which is sleepable context. Without it + * we could end in multiple writequeue entries with one + * dlm message because multiple callers were waiting at + * the writequeue_lock in new_wq_entry(). + */ + sleepable = gfpflags_normal_context(allocation); + if (sleepable) + mutex_lock(&con->wq_alloc); + kref_init(&msg->ref); e = new_wq_entry(con, len, allocation, ppc, cb, mh); if (!e) { + if (sleepable) + mutex_unlock(&con->wq_alloc); + kfree(msg); return NULL; } + if (sleepable) + mutex_unlock(&con->wq_alloc); + msg->ppc = *ppc; msg->len = len; msg->entry = e; From 62699b3f0a62435fceb8debf295e90a5ea259e04 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 16 Jul 2021 16:22:45 -0400 Subject: [PATCH 12/15] fs: dlm: move receive loop into receive handler This patch moves the kernel_recvmsg() loop call into the receive_from_sock() function instead of doing the loop outside the function and abort the loop over it's return value. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 64 +++++++++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 35 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 11e061619a9f..cce1d50aa09f 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -895,7 +895,6 @@ static int con_realloc_receive_buf(struct connection *con, int newlen) /* Data received from remote end */ static int receive_from_sock(struct connection *con) { - int call_again_soon = 0; struct msghdr msg; struct kvec iov; int ret, buflen; @@ -915,41 +914,39 @@ static int receive_from_sock(struct connection *con) goto out_resched; } - /* calculate new buffer parameter regarding last receive and - * possible leftover bytes - */ - iov.iov_base = con->rx_buf + con->rx_leftover; - iov.iov_len = con->rx_buflen - con->rx_leftover; + for (;;) { + /* calculate new buffer parameter regarding last receive and + * possible leftover bytes + */ + iov.iov_base = con->rx_buf + con->rx_leftover; + iov.iov_len = con->rx_buflen - con->rx_leftover; - memset(&msg, 0, sizeof(msg)); - msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; - ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len, - msg.msg_flags); - if (ret <= 0) - goto out_close; - else if (ret == iov.iov_len) - call_again_soon = 1; + memset(&msg, 0, sizeof(msg)); + msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; + ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len, + msg.msg_flags); + if (ret == -EAGAIN) + break; + else if (ret <= 0) + goto out_close; - /* new buflen according readed bytes and leftover from last receive */ - buflen = ret + con->rx_leftover; - ret = dlm_process_incoming_buffer(con->nodeid, con->rx_buf, buflen); - if (ret < 0) - goto out_close; + /* new buflen according readed bytes and leftover from last receive */ + buflen = ret + con->rx_leftover; + ret = dlm_process_incoming_buffer(con->nodeid, con->rx_buf, buflen); + if (ret < 0) + goto out_close; - /* calculate leftover bytes from process and put it into begin of - * the receive buffer, so next receive we have the full message - * at the start address of the receive buffer. - */ - con->rx_leftover = buflen - ret; - if (con->rx_leftover) { - memmove(con->rx_buf, con->rx_buf + ret, - con->rx_leftover); - call_again_soon = true; + /* calculate leftover bytes from process and put it into begin of + * the receive buffer, so next receive we have the full message + * at the start address of the receive buffer. + */ + con->rx_leftover = buflen - ret; + if (con->rx_leftover) { + memmove(con->rx_buf, con->rx_buf + ret, + con->rx_leftover); + } } - if (call_again_soon) - goto out_resched; - mutex_unlock(&con->sock_mutex); return 0; @@ -1511,12 +1508,9 @@ int dlm_lowcomms_close(int nodeid) static void process_recv_sockets(struct work_struct *work) { struct connection *con = container_of(work, struct connection, rwork); - int err; clear_bit(CF_READ_PENDING, &con->flags); - do { - err = receive_from_sock(con); - } while (!err); + receive_from_sock(con); } static void process_listen_recv_socket(struct work_struct *work) From b97f85259fca5accc2cd5f7c4f42fa0dd8efda48 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 18 Aug 2021 16:27:13 -0400 Subject: [PATCH 13/15] fs: dlm: implement delayed ack handling This patch changes that we don't ack each message. Lowcomms will take care about to send an ack back after a bulk of messages was processed. Currently it's only when the whole receive buffer was processed, there might better positions to send an ack back but only the lowcomms implementation know when there are more data to receive. This patch has also disadvantages that we might retransmit more on errors, however this is a very rare case. Tested with make_panic on gfs2 with three nodes by running: trace-cmd record -p function -l 'dlm_send_ack' sleep 100 and trace-cmd report | wc -l Before patch: - 20548 - 21376 - 21398 After patch: - 18338 - 20679 - 19949 Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 1 + fs/dlm/lowcomms.h | 1 + fs/dlm/midcomms.c | 56 ++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 50 insertions(+), 8 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index cce1d50aa09f..8f715c620e1f 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -947,6 +947,7 @@ static int receive_from_sock(struct connection *con) } } + dlm_midcomms_receive_done(con->nodeid); mutex_unlock(&con->sock_mutex); return 0; diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h index aaae7115c00d..4ccae07cf005 100644 --- a/fs/dlm/lowcomms.h +++ b/fs/dlm/lowcomms.h @@ -46,6 +46,7 @@ int dlm_lowcomms_resend_msg(struct dlm_msg *msg); int dlm_lowcomms_connect_node(int nodeid); int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark); int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len); +void dlm_midcomms_receive_done(int nodeid); #endif /* __LOWCOMMS_DOT_H__ */ diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index e3de268898ed..7ae39ec8d9b0 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -109,12 +109,6 @@ * compatibility. There exists better ways to make a better handling. * However this should be changed in the next major version bump of dlm. * - * Ack handling: - * - * Currently we send an ack message for every dlm message. However we - * can ack multiple dlm messages with one ack by just delaying the ack - * message. Will reduce some traffic but makes the drop detection slower. - * * Tail Size checking: * * There exists a message tail payload in e.g. DLM_MSG however we don't @@ -169,6 +163,7 @@ struct midcomms_node { #define DLM_NODE_FLAG_CLOSE 1 #define DLM_NODE_FLAG_STOP_TX 2 #define DLM_NODE_FLAG_STOP_RX 3 +#define DLM_NODE_ULP_DELIVERED 4 unsigned long flags; wait_queue_head_t shutdown_wait; @@ -480,11 +475,12 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, { if (seq == node->seq_next) { node->seq_next++; - /* send ack before fin */ - dlm_send_ack(node->nodeid, node->seq_next); switch (p->header.h_cmd) { case DLM_FIN: + /* send ack before fin */ + dlm_send_ack(node->nodeid, node->seq_next); + spin_lock(&node->state_lock); pr_debug("receive fin msg from node %d with state %s\n", node->nodeid, dlm_state_str(node->state)); @@ -534,6 +530,7 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, default: WARN_ON(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); dlm_receive_buffer(p, node->nodeid); + set_bit(DLM_NODE_ULP_DELIVERED, &node->flags); break; } } else { @@ -933,6 +930,49 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) return ret; } +void dlm_midcomms_receive_done(int nodeid) +{ + struct midcomms_node *node; + int idx; + + idx = srcu_read_lock(&nodes_srcu); + node = nodeid2node(nodeid, 0); + if (!node) { + srcu_read_unlock(&nodes_srcu, idx); + return; + } + + /* old protocol, we do nothing */ + switch (node->version) { + case DLM_VERSION_3_2: + break; + default: + srcu_read_unlock(&nodes_srcu, idx); + return; + } + + /* do nothing if we didn't delivered stateful to ulp */ + if (!test_and_clear_bit(DLM_NODE_ULP_DELIVERED, + &node->flags)) { + srcu_read_unlock(&nodes_srcu, idx); + return; + } + + spin_lock(&node->state_lock); + /* we only ack if state is ESTABLISHED */ + switch (node->state) { + case DLM_ESTABLISHED: + spin_unlock(&node->state_lock); + dlm_send_ack(node->nodeid, node->seq_next); + break; + default: + spin_unlock(&node->state_lock); + /* do nothing FIN has it's own ack send */ + break; + }; + srcu_read_unlock(&nodes_srcu, idx); +} + void dlm_midcomms_unack_msg_resend(int nodeid) { struct midcomms_node *node; From aee742c9928ab4f5f4e0b00f41fb2d2cffae179e Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 18 Aug 2021 16:27:14 -0400 Subject: [PATCH 14/15] fs: dlm: fix return -EINTR on recovery stopped This patch will return -EINTR instead of 1 if recovery is stopped. In case of ping_members() the return value will be checked if the error is -EINTR for signaling another recovery was triggered and the whole recovery process will come to a clean end to process the next one. Returning 1 will abort the recovery process and can leave the recovery in a broken state. It was reported with the following kernel log message attached and a gfs2 mount stopped working: "dlm: bobvirt1: dlm_recover_members error 1" whereas 1 was returned because of a conversion of "dlm_recovery_stopped()" to an errno was missing which this patch will introduce. While on it all other possible missing errno conversions at other places were added as they are done as in other places. It might be worth to check the error case at this recovery level, because some of the functionality also returns -ENOBUFS and check why recovery ends in a broken state. However this will fix the issue if another recovery was triggered at some points of recovery handling. Reported-by: Bob Peterson Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/dir.c | 4 +++- fs/dlm/member.c | 4 +++- fs/dlm/recoverd.c | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c index 10c36ae1a8f9..45ebbe602bbf 100644 --- a/fs/dlm/dir.c +++ b/fs/dlm/dir.c @@ -85,8 +85,10 @@ int dlm_recover_directory(struct dlm_ls *ls) for (;;) { int left; error = dlm_recovery_stopped(ls); - if (error) + if (error) { + error = -EINTR; goto out_free; + } error = dlm_rcom_names(ls, memb->nodeid, last_name, last_len); diff --git a/fs/dlm/member.c b/fs/dlm/member.c index d9e1e4170eb1..731d489aa323 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -443,8 +443,10 @@ static int ping_members(struct dlm_ls *ls) list_for_each_entry(memb, &ls->ls_nodes, list) { error = dlm_recovery_stopped(ls); - if (error) + if (error) { + error = -EINTR; break; + } error = dlm_rcom_status(ls, memb->nodeid, 0); if (error) break; diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 85e245392715..97d052cea5a9 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -125,8 +125,10 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_recover_waiters_pre(ls); error = dlm_recovery_stopped(ls); - if (error) + if (error) { + error = -EINTR; goto fail; + } if (neg || dlm_no_directory(ls)) { /* From ecd95673142ef80169a6c003b569b8a86d1e6329 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 26 Aug 2021 10:06:31 -0400 Subject: [PATCH 15/15] fs: dlm: avoid comms shutdown delay in release_lockspace When dlm_release_lockspace does active shutdown on connections to other nodes, the active shutdown will wait for any exisitng passive shutdowns to be resolved. But, the sequence of operations during dlm_release_lockspace can prevent the normal resolution of passive shutdowns (processed normally by way of lockspace recovery.) This disruption of passive shutdown handling can cause the active shutdown to wait for a full timeout period, delaying the completion of dlm_release_lockspace. To fix this, make dlm_release_lockspace resolve existing passive shutdowns (by calling dlm_clear_members earlier), before it does active shutdowns. The active shutdowns will not find any passive shutdowns to wait for, and will not be delayed. Reported-by: Chris Mackowski Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lockspace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 23c2d7308050..10eddfa6c3d7 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -793,6 +793,7 @@ static int release_lockspace(struct dlm_ls *ls, int force) if (ls_count == 1) { dlm_scand_stop(); + dlm_clear_members(ls); dlm_midcomms_shutdown(); }