2006-01-03 02:04:38 +08:00
|
|
|
/*
|
|
|
|
* net/tipc/link.c: TIPC link code
|
2007-02-09 22:25:21 +08:00
|
|
|
*
|
2016-03-04 03:23:21 +08:00
|
|
|
* Copyright (c) 1996-2007, 2012-2016, Ericsson AB
|
2013-06-17 22:54:42 +08:00
|
|
|
* Copyright (c) 2004-2007, 2010-2013, Wind River Systems
|
2006-01-03 02:04:38 +08:00
|
|
|
* All rights reserved.
|
|
|
|
*
|
2006-01-11 20:30:43 +08:00
|
|
|
* Redistribution and use in source and binary forms, with or without
|
2006-01-03 02:04:38 +08:00
|
|
|
* modification, are permitted provided that the following conditions are met:
|
|
|
|
*
|
2006-01-11 20:30:43 +08:00
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
* 3. Neither the names of the copyright holders nor the names of its
|
|
|
|
* contributors may be used to endorse or promote products derived from
|
|
|
|
* this software without specific prior written permission.
|
2006-01-03 02:04:38 +08:00
|
|
|
*
|
2006-01-11 20:30:43 +08:00
|
|
|
* Alternatively, this software may be distributed under the terms of the
|
|
|
|
* GNU General Public License ("GPL") version 2 as published by the Free
|
|
|
|
* Software Foundation.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
2006-01-03 02:04:38 +08:00
|
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "core.h"
|
tipc: clean up handling of message priorities
Messages transferred by TIPC are assigned an "importance priority", -an
integer value indicating how to treat the message when there is link or
destination socket congestion.
There is no separate header field for this value. Instead, the message
user values have been chosen in ascending order according to perceived
importance, so that the message user field can be used for this.
This is not a good solution. First, we have many more users than the
needed priority levels, so we end up with treating more priority
levels than necessary. Second, the user field cannot always
accurately reflect the priority of the message. E.g., a message
fragment packet should really have the priority of the enveloped
user data message, and not the priority of the MSG_FRAGMENTER user.
Until now, we have been working around this problem in different ways,
but it is now time to implement a consistent way of handling such
priorities, although still within the constraint that we cannot
allocate any more bits in the regular data message header for this.
In this commit, we define a new priority level, TIPC_SYSTEM_IMPORTANCE,
that will be the only one used apart from the four (lower) user data
levels. All non-data messages map down to this priority. Furthermore,
we take some free bits from the MSG_FRAGMENTER header and allocate
them to store the priority of the enveloped message. We then adjust
the functions msg_importance()/msg_set_importance() so that they
read/set the correct header fields depending on user type.
This small protocol change is fully compatible, because the code at
the receiving end of a link currently reads the importance level
only from user data messages, where there is no change.
Reviewed-by: Erik Hugne <erik.hugne@ericsson.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-03-14 04:08:11 +08:00
|
|
|
#include "subscr.h"
|
2006-01-03 02:04:38 +08:00
|
|
|
#include "link.h"
|
2014-11-20 17:29:12 +08:00
|
|
|
#include "bcast.h"
|
2014-05-14 17:39:15 +08:00
|
|
|
#include "socket.h"
|
2006-01-03 02:04:38 +08:00
|
|
|
#include "name_distr.h"
|
|
|
|
#include "discover.h"
|
2014-11-20 17:29:07 +08:00
|
|
|
#include "netlink.h"
|
tipc: add neighbor monitoring framework
TIPC based clusters are by default set up with full-mesh link
connectivity between all nodes. Those links are expected to provide
a short failure detection time, by default set to 1500 ms. Because
of this, the background load for neighbor monitoring in an N-node
cluster increases with a factor N on each node, while the overall
monitoring traffic through the network infrastructure increases at
a ~(N * (N - 1)) rate. Experience has shown that such clusters don't
scale well beyond ~100 nodes unless we significantly increase failure
discovery tolerance.
This commit introduces a framework and an algorithm that drastically
reduces this background load, while basically maintaining the original
failure detection times across the whole cluster. Using this algorithm,
background load will now grow at a rate of ~(2 * sqrt(N)) per node, and
at ~(2 * N * sqrt(N)) in traffic overhead. As an example, each node will
now have to actively monitor 38 neighbors in a 400-node cluster, instead
of as before 399.
This "Overlapping Ring Supervision Algorithm" is completely distributed
and employs no centralized or coordinated state. It goes as follows:
- Each node makes up a linearly ascending, circular list of all its N
known neighbors, based on their TIPC node identity. This algorithm
must be the same on all nodes.
- The node then selects the next M = sqrt(N) - 1 nodes downstream from
itself in the list, and chooses to actively monitor those. This is
called its "local monitoring domain".
- It creates a domain record describing the monitoring domain, and
piggy-backs this in the data area of all neighbor monitoring messages
(LINK_PROTOCOL/STATE) leaving that node. This means that all nodes in
the cluster eventually (default within 400 ms) will learn about
its monitoring domain.
- Whenever a node discovers a change in its local domain, e.g., a node
has been added or has gone down, it creates and sends out a new
version of its node record to inform all neighbors about the change.
- A node receiving a domain record from anybody outside its local domain
matches this against its own list (which may not look the same), and
chooses to not actively monitor those members of the received domain
record that are also present in its own list. Instead, it relies on
indications from the direct monitoring nodes if an indirectly
monitored node has gone up or down. If a node is indicated lost, the
receiving node temporarily activates its own direct monitoring towards
that node in order to confirm, or not, that it is actually gone.
- Since each node is actively monitoring sqrt(N) downstream neighbors,
each node is also actively monitored by the same number of upstream
neighbors. This means that all non-direct monitoring nodes normally
will receive sqrt(N) indications that a node is gone.
- A major drawback with ring monitoring is how it handles failures that
cause massive network partitionings. If both a lost node and all its
direct monitoring neighbors are inside the lost partition, the nodes in
the remaining partition will never receive indications about the loss.
To overcome this, each node also chooses to actively monitor some
nodes outside its local domain. Those nodes are called remote domain
"heads", and are selected in such a way that no node in the cluster
will be more than two direct monitoring hops away. Because of this,
each node, apart from monitoring the member of its local domain, will
also typically monitor sqrt(N) remote head nodes.
- As an optimization, local list status, domain status and domain
records are marked with a generation number. This saves senders from
unnecessarily conveying unaltered domain records, and receivers from
performing unneeded re-adaptations of their node monitoring list, such
as re-assigning domain heads.
- As a measure of caution we have added the possibility to disable the
new algorithm through configuration. We do this by keeping a threshold
value for the cluster size; a cluster that grows beyond this value
will switch from full-mesh to ring monitoring, and vice versa when
it shrinks below the value. This means that if the threshold is set to
a value larger than any anticipated cluster size (default size is 32)
the new algorithm is effectively disabled. A patch set for altering the
threshold value and for listing the table contents will follow shortly.
- This change is fully backwards compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-14 08:46:22 +08:00
|
|
|
#include "monitor.h"
|
tipc: enable tracepoints in tipc
As for the sake of debugging/tracing, the commit enables tracepoints in
TIPC along with some general trace_events as shown below. It also
defines some 'tipc_*_dump()' functions that allow to dump TIPC object
data whenever needed, that is, for general debug purposes, ie. not just
for the trace_events.
The following trace_events are now available:
- trace_tipc_skb_dump(): allows to trace and dump TIPC msg & skb data,
e.g. message type, user, droppable, skb truesize, cloned skb, etc.
- trace_tipc_list_dump(): allows to trace and dump any TIPC buffers or
queues, e.g. TIPC link transmq, socket receive queue, etc.
- trace_tipc_sk_dump(): allows to trace and dump TIPC socket data, e.g.
sk state, sk type, connection type, rmem_alloc, socket queues, etc.
- trace_tipc_link_dump(): allows to trace and dump TIPC link data, e.g.
link state, silent_intv_cnt, gap, bc_gap, link queues, etc.
- trace_tipc_node_dump(): allows to trace and dump TIPC node data, e.g.
node state, active links, capabilities, link entries, etc.
How to use:
Put the trace functions at any places where we want to dump TIPC data
or events.
Note:
a) The dump functions will generate raw data only, that is, to offload
the trace event's processing, it can require a tool or script to parse
the data but this should be simple.
b) The trace_tipc_*_dump() should be reserved for a failure cases only
(e.g. the retransmission failure case) or where we do not expect to
happen too often, then we can consider enabling these events by default
since they will almost not take any effects under normal conditions,
but once the rare condition or failure occurs, we get the dumped data
fully for post-analysis.
For other trace purposes, we can reuse these trace classes as template
but different events.
c) A trace_event is only effective when we enable it. To enable the
TIPC trace_events, echo 1 to 'enable' files in the events/tipc/
directory in the 'debugfs' file system. Normally, they are located at:
/sys/kernel/debug/tracing/events/tipc/
For example:
To enable the tipc_link_dump event:
echo 1 > /sys/kernel/debug/tracing/events/tipc/tipc_link_dump/enable
To enable all the TIPC trace_events:
echo 1 > /sys/kernel/debug/tracing/events/tipc/enable
To collect the trace data:
cat trace
or
cat trace_pipe > /trace.out &
To disable all the TIPC trace_events:
echo 0 > /sys/kernel/debug/tracing/events/tipc/enable
To clear the trace buffer:
echo > trace
d) Like the other trace_events, the feature like 'filter' or 'trigger'
is also usable for the tipc trace_events.
For more details, have a look at:
Documentation/trace/ftrace.txt
MAINTAINERS | add two new files 'trace.h' & 'trace.c' in tipc
Acked-by: Ying Xue <ying.xue@windriver.com>
Tested-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-12-19 10:17:56 +08:00
|
|
|
#include "trace.h"
|
2019-11-08 13:05:11 +08:00
|
|
|
#include "crypto.h"
|
2006-01-03 02:04:38 +08:00
|
|
|
|
2013-06-17 22:54:48 +08:00
|
|
|
#include <linux/pkt_sched.h>
|
|
|
|
|
2015-11-20 03:30:46 +08:00
|
|
|
struct tipc_stats {
|
2016-11-25 23:35:02 +08:00
|
|
|
u32 sent_pkts;
|
|
|
|
u32 recv_pkts;
|
2015-11-20 03:30:46 +08:00
|
|
|
u32 sent_states;
|
|
|
|
u32 recv_states;
|
|
|
|
u32 sent_probes;
|
|
|
|
u32 recv_probes;
|
|
|
|
u32 sent_nacks;
|
|
|
|
u32 recv_nacks;
|
|
|
|
u32 sent_acks;
|
|
|
|
u32 sent_bundled;
|
|
|
|
u32 sent_bundles;
|
|
|
|
u32 recv_bundled;
|
|
|
|
u32 recv_bundles;
|
|
|
|
u32 retransmitted;
|
|
|
|
u32 sent_fragmented;
|
|
|
|
u32 sent_fragments;
|
|
|
|
u32 recv_fragmented;
|
|
|
|
u32 recv_fragments;
|
|
|
|
u32 link_congs; /* # port sends blocked by congestion */
|
|
|
|
u32 deferred_recv;
|
|
|
|
u32 duplicates;
|
|
|
|
u32 max_queue_sz; /* send queue size high water mark */
|
|
|
|
u32 accu_queue_sz; /* used for send queue size profiling */
|
|
|
|
u32 queue_sz_counts; /* used for send queue size profiling */
|
|
|
|
u32 msg_length_counts; /* used for message length profiling */
|
|
|
|
u32 msg_lengths_total; /* used for message length profiling */
|
|
|
|
u32 msg_length_profile[7]; /* used for msg. length profiling */
|
|
|
|
};
|
|
|
|
|
|
|
|
/**
|
|
|
|
* struct tipc_link - TIPC link data structure
|
|
|
|
* @addr: network address of link's peer node
|
|
|
|
* @name: link name character string
|
|
|
|
* @media_addr: media address to use when sending messages over link
|
|
|
|
* @timer: link timer
|
|
|
|
* @net: pointer to namespace struct
|
|
|
|
* @refcnt: reference counter for permanent references (owner node & timer)
|
|
|
|
* @peer_session: link session # being used by peer end of link
|
|
|
|
* @peer_bearer_id: bearer id used by link's peer endpoint
|
|
|
|
* @bearer_id: local bearer id used by link
|
|
|
|
* @tolerance: minimum link continuity loss needed to reset link [in ms]
|
|
|
|
* @abort_limit: # of unacknowledged continuity probes needed to reset link
|
|
|
|
* @state: current state of link FSM
|
|
|
|
* @peer_caps: bitmap describing capabilities of peer node
|
|
|
|
* @silent_intv_cnt: # of timer intervals without any reception from peer
|
|
|
|
* @proto_msg: template for control messages generated by link
|
|
|
|
* @pmsg: convenience pointer to "proto_msg" field
|
|
|
|
* @priority: current link priority
|
|
|
|
* @net_plane: current link network plane ('A' through 'H')
|
tipc: add neighbor monitoring framework
TIPC based clusters are by default set up with full-mesh link
connectivity between all nodes. Those links are expected to provide
a short failure detection time, by default set to 1500 ms. Because
of this, the background load for neighbor monitoring in an N-node
cluster increases with a factor N on each node, while the overall
monitoring traffic through the network infrastructure increases at
a ~(N * (N - 1)) rate. Experience has shown that such clusters don't
scale well beyond ~100 nodes unless we significantly increase failure
discovery tolerance.
This commit introduces a framework and an algorithm that drastically
reduces this background load, while basically maintaining the original
failure detection times across the whole cluster. Using this algorithm,
background load will now grow at a rate of ~(2 * sqrt(N)) per node, and
at ~(2 * N * sqrt(N)) in traffic overhead. As an example, each node will
now have to actively monitor 38 neighbors in a 400-node cluster, instead
of as before 399.
This "Overlapping Ring Supervision Algorithm" is completely distributed
and employs no centralized or coordinated state. It goes as follows:
- Each node makes up a linearly ascending, circular list of all its N
known neighbors, based on their TIPC node identity. This algorithm
must be the same on all nodes.
- The node then selects the next M = sqrt(N) - 1 nodes downstream from
itself in the list, and chooses to actively monitor those. This is
called its "local monitoring domain".
- It creates a domain record describing the monitoring domain, and
piggy-backs this in the data area of all neighbor monitoring messages
(LINK_PROTOCOL/STATE) leaving that node. This means that all nodes in
the cluster eventually (default within 400 ms) will learn about
its monitoring domain.
- Whenever a node discovers a change in its local domain, e.g., a node
has been added or has gone down, it creates and sends out a new
version of its node record to inform all neighbors about the change.
- A node receiving a domain record from anybody outside its local domain
matches this against its own list (which may not look the same), and
chooses to not actively monitor those members of the received domain
record that are also present in its own list. Instead, it relies on
indications from the direct monitoring nodes if an indirectly
monitored node has gone up or down. If a node is indicated lost, the
receiving node temporarily activates its own direct monitoring towards
that node in order to confirm, or not, that it is actually gone.
- Since each node is actively monitoring sqrt(N) downstream neighbors,
each node is also actively monitored by the same number of upstream
neighbors. This means that all non-direct monitoring nodes normally
will receive sqrt(N) indications that a node is gone.
- A major drawback with ring monitoring is how it handles failures that
cause massive network partitionings. If both a lost node and all its
direct monitoring neighbors are inside the lost partition, the nodes in
the remaining partition will never receive indications about the loss.
To overcome this, each node also chooses to actively monitor some
nodes outside its local domain. Those nodes are called remote domain
"heads", and are selected in such a way that no node in the cluster
will be more than two direct monitoring hops away. Because of this,
each node, apart from monitoring the member of its local domain, will
also typically monitor sqrt(N) remote head nodes.
- As an optimization, local list status, domain status and domain
records are marked with a generation number. This saves senders from
unnecessarily conveying unaltered domain records, and receivers from
performing unneeded re-adaptations of their node monitoring list, such
as re-assigning domain heads.
- As a measure of caution we have added the possibility to disable the
new algorithm through configuration. We do this by keeping a threshold
value for the cluster size; a cluster that grows beyond this value
will switch from full-mesh to ring monitoring, and vice versa when
it shrinks below the value. This means that if the threshold is set to
a value larger than any anticipated cluster size (default size is 32)
the new algorithm is effectively disabled. A patch set for altering the
threshold value and for listing the table contents will follow shortly.
- This change is fully backwards compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-14 08:46:22 +08:00
|
|
|
* @mon_state: cookie with information needed by link monitor
|
2015-11-20 03:30:46 +08:00
|
|
|
* @backlog_limit: backlog queue congestion thresholds (indexed by importance)
|
|
|
|
* @exp_msg_count: # of tunnelled messages expected during link changeover
|
|
|
|
* @reset_rcv_checkpt: seq # of last acknowledged message at time of link reset
|
|
|
|
* @mtu: current maximum packet size for this link
|
|
|
|
* @advertised_mtu: advertised own mtu when link is being established
|
|
|
|
* @transmitq: queue for sent, non-acked messages
|
|
|
|
* @backlogq: queue for messages waiting to be sent
|
|
|
|
* @snt_nxt: next sequence number to use for outbound messages
|
|
|
|
* @ackers: # of peers that needs to ack each packet before it can be released
|
|
|
|
* @acked: # last packet acked by a certain peer. Used for broadcast.
|
|
|
|
* @rcv_nxt: next sequence number to expect for inbound messages
|
|
|
|
* @deferred_queue: deferred queue saved OOS b'cast message received from node
|
|
|
|
* @unacked_window: # of inbound messages rx'd without ack'ing back to peer
|
|
|
|
* @inputq: buffer queue for messages to be delivered upwards
|
|
|
|
* @namedq: buffer queue for name table messages to be delivered upwards
|
|
|
|
* @next_out: ptr to first unsent outbound message in queue
|
|
|
|
* @wakeupq: linked list of wakeup msgs waiting for link congestion to abate
|
|
|
|
* @long_msg_seq_no: next identifier to use for outbound fragmented messages
|
|
|
|
* @reasm_buf: head of partially reassembled inbound message fragments
|
|
|
|
* @bc_rcvr: marks that this is a broadcast receiver link
|
|
|
|
* @stats: collects statistics regarding link activity
|
2020-11-30 02:32:44 +08:00
|
|
|
* @session: session to be used by link
|
|
|
|
* @snd_nxt_state: next send seq number
|
|
|
|
* @rcv_nxt_state: next rcv seq number
|
|
|
|
* @in_session: have received ACTIVATE_MSG from peer
|
|
|
|
* @active: link is active
|
|
|
|
* @if_name: associated interface name
|
|
|
|
* @rst_cnt: link reset counter
|
|
|
|
* @drop_point: seq number for failover handling (FIXME)
|
|
|
|
* @failover_reasm_skb: saved failover msg ptr (FIXME)
|
|
|
|
* @failover_deferdq: deferred message queue for failover processing (FIXME)
|
|
|
|
* @transmq: the link's transmit queue
|
|
|
|
* @backlog: link's backlog by priority (importance)
|
|
|
|
* @snd_nxt: next sequence number to be used
|
|
|
|
* @rcv_unacked: # messages read by user, but not yet acked back to peer
|
|
|
|
* @deferdq: deferred receive queue
|
|
|
|
* @window: sliding window size for congestion handling
|
|
|
|
* @min_win: minimal send window to be used by link
|
|
|
|
* @ssthresh: slow start threshold for congestion handling
|
|
|
|
* @max_win: maximal send window to be used by link
|
|
|
|
* @cong_acks: congestion acks for congestion avoidance (FIXME)
|
|
|
|
* @checkpoint: seq number for congestion window size handling
|
|
|
|
* @reasm_tnlmsg: fragmentation/reassembly area for tunnel protocol message
|
|
|
|
* @last_gap: last gap ack blocks for bcast (FIXME)
|
|
|
|
* @last_ga: ptr to gap ack blocks
|
|
|
|
* @bc_rcvlink: the peer specific link used for broadcast reception
|
|
|
|
* @bc_sndlink: the namespace global link used for broadcast sending
|
|
|
|
* @nack_state: bcast nack state
|
|
|
|
* @bc_peer_is_up: peer has acked the bcast init msg
|
2015-11-20 03:30:46 +08:00
|
|
|
*/
|
|
|
|
struct tipc_link {
|
|
|
|
u32 addr;
|
|
|
|
char name[TIPC_MAX_LINK_NAME];
|
|
|
|
struct net *net;
|
|
|
|
|
|
|
|
/* Management and link supervision data */
|
2018-07-10 07:07:36 +08:00
|
|
|
u16 peer_session;
|
|
|
|
u16 session;
|
2018-07-10 07:07:35 +08:00
|
|
|
u16 snd_nxt_state;
|
|
|
|
u16 rcv_nxt_state;
|
2015-11-20 03:30:46 +08:00
|
|
|
u32 peer_bearer_id;
|
|
|
|
u32 bearer_id;
|
|
|
|
u32 tolerance;
|
|
|
|
u32 abort_limit;
|
|
|
|
u32 state;
|
|
|
|
u16 peer_caps;
|
2018-07-10 07:07:36 +08:00
|
|
|
bool in_session;
|
2015-11-20 03:30:46 +08:00
|
|
|
bool active;
|
|
|
|
u32 silent_intv_cnt;
|
2016-03-04 03:23:21 +08:00
|
|
|
char if_name[TIPC_MAX_IF_NAME];
|
2015-11-20 03:30:46 +08:00
|
|
|
u32 priority;
|
|
|
|
char net_plane;
|
tipc: add neighbor monitoring framework
TIPC based clusters are by default set up with full-mesh link
connectivity between all nodes. Those links are expected to provide
a short failure detection time, by default set to 1500 ms. Because
of this, the background load for neighbor monitoring in an N-node
cluster increases with a factor N on each node, while the overall
monitoring traffic through the network infrastructure increases at
a ~(N * (N - 1)) rate. Experience has shown that such clusters don't
scale well beyond ~100 nodes unless we significantly increase failure
discovery tolerance.
This commit introduces a framework and an algorithm that drastically
reduces this background load, while basically maintaining the original
failure detection times across the whole cluster. Using this algorithm,
background load will now grow at a rate of ~(2 * sqrt(N)) per node, and
at ~(2 * N * sqrt(N)) in traffic overhead. As an example, each node will
now have to actively monitor 38 neighbors in a 400-node cluster, instead
of as before 399.
This "Overlapping Ring Supervision Algorithm" is completely distributed
and employs no centralized or coordinated state. It goes as follows:
- Each node makes up a linearly ascending, circular list of all its N
known neighbors, based on their TIPC node identity. This algorithm
must be the same on all nodes.
- The node then selects the next M = sqrt(N) - 1 nodes downstream from
itself in the list, and chooses to actively monitor those. This is
called its "local monitoring domain".
- It creates a domain record describing the monitoring domain, and
piggy-backs this in the data area of all neighbor monitoring messages
(LINK_PROTOCOL/STATE) leaving that node. This means that all nodes in
the cluster eventually (default within 400 ms) will learn about
its monitoring domain.
- Whenever a node discovers a change in its local domain, e.g., a node
has been added or has gone down, it creates and sends out a new
version of its node record to inform all neighbors about the change.
- A node receiving a domain record from anybody outside its local domain
matches this against its own list (which may not look the same), and
chooses to not actively monitor those members of the received domain
record that are also present in its own list. Instead, it relies on
indications from the direct monitoring nodes if an indirectly
monitored node has gone up or down. If a node is indicated lost, the
receiving node temporarily activates its own direct monitoring towards
that node in order to confirm, or not, that it is actually gone.
- Since each node is actively monitoring sqrt(N) downstream neighbors,
each node is also actively monitored by the same number of upstream
neighbors. This means that all non-direct monitoring nodes normally
will receive sqrt(N) indications that a node is gone.
- A major drawback with ring monitoring is how it handles failures that
cause massive network partitionings. If both a lost node and all its
direct monitoring neighbors are inside the lost partition, the nodes in
the remaining partition will never receive indications about the loss.
To overcome this, each node also chooses to actively monitor some
nodes outside its local domain. Those nodes are called remote domain
"heads", and are selected in such a way that no node in the cluster
will be more than two direct monitoring hops away. Because of this,
each node, apart from monitoring the member of its local domain, will
also typically monitor sqrt(N) remote head nodes.
- As an optimization, local list status, domain status and domain
records are marked with a generation number. This saves senders from
unnecessarily conveying unaltered domain records, and receivers from
performing unneeded re-adaptations of their node monitoring list, such
as re-assigning domain heads.
- As a measure of caution we have added the possibility to disable the
new algorithm through configuration. We do this by keeping a threshold
value for the cluster size; a cluster that grows beyond this value
will switch from full-mesh to ring monitoring, and vice versa when
it shrinks below the value. This means that if the threshold is set to
a value larger than any anticipated cluster size (default size is 32)
the new algorithm is effectively disabled. A patch set for altering the
threshold value and for listing the table contents will follow shortly.
- This change is fully backwards compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-14 08:46:22 +08:00
|
|
|
struct tipc_mon_state mon_state;
|
2016-04-16 01:33:04 +08:00
|
|
|
u16 rst_cnt;
|
2015-11-20 03:30:46 +08:00
|
|
|
|
|
|
|
/* Failover/synch */
|
|
|
|
u16 drop_point;
|
|
|
|
struct sk_buff *failover_reasm_skb;
|
2019-04-04 12:09:53 +08:00
|
|
|
struct sk_buff_head failover_deferdq;
|
2015-11-20 03:30:46 +08:00
|
|
|
|
|
|
|
/* Max packet negotiation */
|
|
|
|
u16 mtu;
|
|
|
|
u16 advertised_mtu;
|
|
|
|
|
|
|
|
/* Sending */
|
|
|
|
struct sk_buff_head transmq;
|
|
|
|
struct sk_buff_head backlogq;
|
|
|
|
struct {
|
|
|
|
u16 len;
|
|
|
|
u16 limit;
|
tipc: fix unlimited bundling of small messages
We have identified a problem with the "oversubscription" policy in the
link transmission code.
When small messages are transmitted, and the sending link has reached
the transmit window limit, those messages will be bundled and put into
the link backlog queue. However, bundles of data messages are counted
at the 'CRITICAL' level, so that the counter for that level, instead of
the counter for the real, bundled message's level is the one being
increased.
Subsequent, to-be-bundled data messages at non-CRITICAL levels continue
to be tested against the unchanged counter for their own level, while
contributing to an unrestrained increase at the CRITICAL backlog level.
This leaves a gap in congestion control algorithm for small messages
that can result in starvation for other users or a "real" CRITICAL
user. Even that eventually can lead to buffer exhaustion & link reset.
We fix this by keeping a 'target_bskb' buffer pointer at each levels,
then when bundling, we only bundle messages at the same importance
level only. This way, we know exactly how many slots a certain level
have occupied in the queue, so can manage level congestion accurately.
By bundling messages at the same level, we even have more benefits. Let
consider this:
- One socket sends 64-byte messages at the 'CRITICAL' level;
- Another sends 4096-byte messages at the 'LOW' level;
When a 64-byte message comes and is bundled the first time, we put the
overhead of message bundle to it (+ 40-byte header, data copy, etc.)
for later use, but the next message can be a 4096-byte one that cannot
be bundled to the previous one. This means the last bundle carries only
one payload message which is totally inefficient, as for the receiver
also! Later on, another 64-byte message comes, now we make a new bundle
and the same story repeats...
With the new bundling algorithm, this will not happen, the 64-byte
messages will be bundled together even when the 4096-byte message(s)
comes in between. However, if the 4096-byte messages are sent at the
same level i.e. 'CRITICAL', the bundling algorithm will again cause the
same overhead.
Also, the same will happen even with only one socket sending small
messages at a rate close to the link transmit's one, so that, when one
message is bundled, it's transmitted shortly. Then, another message
comes, a new bundle is created and so on...
We will solve this issue radically by another patch.
Fixes: 365ad353c256 ("tipc: reduce risk of user starvation during link congestion")
Reported-by: Hoang Le <hoang.h.le@dektech.com.au>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-10-02 19:49:43 +08:00
|
|
|
struct sk_buff *target_bskb;
|
2015-11-20 03:30:46 +08:00
|
|
|
} backlog[5];
|
|
|
|
u16 snd_nxt;
|
|
|
|
|
|
|
|
/* Reception */
|
|
|
|
u16 rcv_nxt;
|
|
|
|
u32 rcv_unacked;
|
|
|
|
struct sk_buff_head deferdq;
|
|
|
|
struct sk_buff_head *inputq;
|
|
|
|
struct sk_buff_head *namedq;
|
|
|
|
|
|
|
|
/* Congestion handling */
|
|
|
|
struct sk_buff_head wakeupq;
|
tipc: introduce variable window congestion control
We introduce a simple variable window congestion control for links.
The algorithm is inspired by the Reno algorithm, covering both 'slow
start', 'congestion avoidance', and 'fast recovery' modes.
- We introduce hard lower and upper window limits per link, still
different and configurable per bearer type.
- We introduce a 'slow start theshold' variable, initially set to
the maximum window size.
- We let a link start at the minimum congestion window, i.e. in slow
start mode, and then let is grow rapidly (+1 per rceived ACK) until
it reaches the slow start threshold and enters congestion avoidance
mode.
- In congestion avoidance mode we increment the congestion window for
each window-size number of acked packets, up to a possible maximum
equal to the configured maximum window.
- For each non-duplicate NACK received, we drop back to fast recovery
mode, by setting the both the slow start threshold to and the
congestion window to (current_congestion_window / 2).
- If the timeout handler finds that the transmit queue has not moved
since the previous timeout, it drops the link back to slow start
and forces a probe containing the last sent sequence number to the
sent to the peer, so that this can discover the stale situation.
This change does in reality have effect only on unicast ethernet
transport, as we have seen that there is no room whatsoever for
increasing the window max size for the UDP bearer.
For now, we also choose to keep the limits for the broadcast link
unchanged and equal.
This algorithm seems to give a 50-100% throughput improvement for
messages larger than MTU.
Suggested-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 07:52:46 +08:00
|
|
|
u16 window;
|
|
|
|
u16 min_win;
|
|
|
|
u16 ssthresh;
|
|
|
|
u16 max_win;
|
|
|
|
u16 cong_acks;
|
|
|
|
u16 checkpoint;
|
2015-11-20 03:30:46 +08:00
|
|
|
|
|
|
|
/* Fragmentation/reassembly */
|
|
|
|
struct sk_buff *reasm_buf;
|
tipc: fix changeover issues due to large packet
In conjunction with changing the interfaces' MTU (e.g. especially in
the case of a bonding) where the TIPC links are brought up and down
in a short time, a couple of issues were detected with the current link
changeover mechanism:
1) When one link is up but immediately forced down again, the failover
procedure will be carried out in order to failover all the messages in
the link's transmq queue onto the other working link. The link and node
state is also set to FAILINGOVER as part of the process. The message
will be transmited in form of a FAILOVER_MSG, so its size is plus of 40
bytes (= the message header size). There is no problem if the original
message size is not larger than the link's MTU - 40, and indeed this is
the max size of a normal payload messages. However, in the situation
above, because the link has just been up, the messages in the link's
transmq are almost SYNCH_MSGs which had been generated by the link
synching procedure, then their size might reach the max value already!
When the FAILOVER_MSG is built on the top of such a SYNCH_MSG, its size
will exceed the link's MTU. As a result, the messages are dropped
silently and the failover procedure will never end up, the link will
not be able to exit the FAILINGOVER state, so cannot be re-established.
2) The same scenario above can happen more easily in case the MTU of
the links is set differently or when changing. In that case, as long as
a large message in the failure link's transmq queue was built and
fragmented with its link's MTU > the other link's one, the issue will
happen (there is no need of a link synching in advance).
3) The link synching procedure also faces with the same issue but since
the link synching is only started upon receipt of a SYNCH_MSG, dropping
the message will not result in a state deadlock, but it is not expected
as design.
The 1) & 3) issues are resolved by the last commit that only a dummy
SYNCH_MSG (i.e. without data) is generated at the link synching, so the
size of a FAILOVER_MSG if any then will never exceed the link's MTU.
For the 2) issue, the only solution is trying to fragment the messages
in the failure link's transmq queue according to the working link's MTU
so they can be failovered then. A new function is made to accomplish
this, it will still be a TUNNEL PROTOCOL/FAILOVER MSG but if the
original message size is too large, it will be fragmented & reassembled
at the receiving side.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-07-24 09:56:12 +08:00
|
|
|
struct sk_buff *reasm_tnlmsg;
|
2015-11-20 03:30:46 +08:00
|
|
|
|
|
|
|
/* Broadcast */
|
|
|
|
u16 ackers;
|
|
|
|
u16 acked;
|
2020-05-26 17:38:34 +08:00
|
|
|
u16 last_gap;
|
|
|
|
struct tipc_gap_ack_blks *last_ga;
|
2015-11-20 03:30:46 +08:00
|
|
|
struct tipc_link *bc_rcvlink;
|
|
|
|
struct tipc_link *bc_sndlink;
|
2016-09-02 01:52:50 +08:00
|
|
|
u8 nack_state;
|
2015-11-20 03:30:46 +08:00
|
|
|
bool bc_peer_is_up;
|
|
|
|
|
|
|
|
/* Statistics */
|
|
|
|
struct tipc_stats stats;
|
|
|
|
};
|
|
|
|
|
2012-06-29 12:16:37 +08:00
|
|
|
/*
|
|
|
|
* Error message prefixes
|
|
|
|
*/
|
2015-07-31 06:24:19 +08:00
|
|
|
static const char *link_co_err = "Link tunneling error, ";
|
2012-06-29 12:16:37 +08:00
|
|
|
static const char *link_rst_msg = "Resetting link ";
|
2014-11-20 17:29:12 +08:00
|
|
|
|
2015-10-22 20:51:41 +08:00
|
|
|
/* Send states for broadcast NACKs
|
|
|
|
*/
|
|
|
|
enum {
|
|
|
|
BC_NACK_SND_CONDITIONAL,
|
|
|
|
BC_NACK_SND_UNCONDITIONAL,
|
|
|
|
BC_NACK_SND_SUPPRESS,
|
|
|
|
};
|
|
|
|
|
2019-06-28 23:06:20 +08:00
|
|
|
#define TIPC_BC_RETR_LIM (jiffies + msecs_to_jiffies(10))
|
tipc: reduce duplicate packets for unicast traffic
For unicast transmission, the current NACK sending althorithm is over-
active that forces the sending side to retransmit a packet that is not
really lost but just arrived at the receiving side with some delay, or
even retransmit same packets that have already been retransmitted
before. As a result, many duplicates are observed also under normal
condition, ie. without packet loss.
One example case is: node1 transmits 1 2 3 4 10 5 6 7 8 9, when node2
receives packet #10, it puts into the deferdq. When the packet #5 comes
it sends NACK with gap [6 - 9]. However, shortly after that, when
packet #6 arrives, it pulls out packet #10 from the deferfq, but it is
still out of order, so it makes another NACK with gap [7 - 9] and so on
... Finally, node1 has to retransmit the packets 5 6 7 8 9 a number of
times, but in fact all the packets are not lost at all, so duplicates!
This commit reduces duplicates by changing the condition to send NACK,
also restricting the retransmissions on individual packets via a timer
of about 1ms. However, it also needs to say that too tricky condition
for NACKs or too long timeout value for retransmissions will result in
performance reducing! The criterias in this commit are found to be
effective for both the requirements to reduce duplicates but not affect
performance.
The tipc_link_rcv() is also improved to only dequeue skb from the link
deferdq if it is expected (ie. its seqno <= rcv_nxt).
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-04 12:09:52 +08:00
|
|
|
#define TIPC_UC_RETR_TIME (jiffies + msecs_to_jiffies(1))
|
2016-09-02 01:52:50 +08:00
|
|
|
|
2015-07-31 06:24:21 +08:00
|
|
|
/* Link FSM states:
|
2006-01-03 02:04:38 +08:00
|
|
|
*/
|
tipc: clean up definitions and usage of link flags
The status flag LINK_STOPPED is not needed any more, since the
mechanism for delayed deletion of links has been removed.
Likewise, LINK_STARTED and LINK_START_EVT are unnecessary,
because we can just as well start the link timer directly from
inside tipc_link_create().
We eliminate these flags in this commit.
Instead of the above flags, we now introduce three new link modes,
TIPC_LINK_OPEN, TIPC_LINK_BLOCKED and TIPC_LINK_TUNNEL. The values
indicate whether, and in the case of TIPC_LINK_TUNNEL, which, messages
the link is allowed to receive in this state. TIPC_LINK_BLOCKED also
blocks timer-driven protocol messages to be sent out, and any change
to the link FSM. Since the modes are mutually exclusive, we convert
them to state values, and rename the 'flags' field in struct tipc_link
to 'exec_mode'.
Finally, we move the #defines for link FSM states and events from link.h
into enums inside the file link.c, which is the real usage scope of
these definitions.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:25 +08:00
|
|
|
enum {
|
2015-07-31 06:24:21 +08:00
|
|
|
LINK_ESTABLISHED = 0xe,
|
|
|
|
LINK_ESTABLISHING = 0xe << 4,
|
|
|
|
LINK_RESET = 0x1 << 8,
|
|
|
|
LINK_RESETTING = 0x2 << 12,
|
|
|
|
LINK_PEER_RESET = 0xd << 16,
|
|
|
|
LINK_FAILINGOVER = 0xf << 20,
|
|
|
|
LINK_SYNCHING = 0xc << 24
|
tipc: clean up definitions and usage of link flags
The status flag LINK_STOPPED is not needed any more, since the
mechanism for delayed deletion of links has been removed.
Likewise, LINK_STARTED and LINK_START_EVT are unnecessary,
because we can just as well start the link timer directly from
inside tipc_link_create().
We eliminate these flags in this commit.
Instead of the above flags, we now introduce three new link modes,
TIPC_LINK_OPEN, TIPC_LINK_BLOCKED and TIPC_LINK_TUNNEL. The values
indicate whether, and in the case of TIPC_LINK_TUNNEL, which, messages
the link is allowed to receive in this state. TIPC_LINK_BLOCKED also
blocks timer-driven protocol messages to be sent out, and any change
to the link FSM. Since the modes are mutually exclusive, we convert
them to state values, and rename the 'flags' field in struct tipc_link
to 'exec_mode'.
Finally, we move the #defines for link FSM states and events from link.h
into enums inside the file link.c, which is the real usage scope of
these definitions.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:25 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
/* Link FSM state checking routines
|
|
|
|
*/
|
2015-07-31 06:24:21 +08:00
|
|
|
static int link_is_up(struct tipc_link *l)
|
tipc: clean up definitions and usage of link flags
The status flag LINK_STOPPED is not needed any more, since the
mechanism for delayed deletion of links has been removed.
Likewise, LINK_STARTED and LINK_START_EVT are unnecessary,
because we can just as well start the link timer directly from
inside tipc_link_create().
We eliminate these flags in this commit.
Instead of the above flags, we now introduce three new link modes,
TIPC_LINK_OPEN, TIPC_LINK_BLOCKED and TIPC_LINK_TUNNEL. The values
indicate whether, and in the case of TIPC_LINK_TUNNEL, which, messages
the link is allowed to receive in this state. TIPC_LINK_BLOCKED also
blocks timer-driven protocol messages to be sent out, and any change
to the link FSM. Since the modes are mutually exclusive, we convert
them to state values, and rename the 'flags' field in struct tipc_link
to 'exec_mode'.
Finally, we move the #defines for link FSM states and events from link.h
into enums inside the file link.c, which is the real usage scope of
these definitions.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:25 +08:00
|
|
|
{
|
2015-07-31 06:24:21 +08:00
|
|
|
return l->state & (LINK_ESTABLISHED | LINK_SYNCHING);
|
tipc: clean up definitions and usage of link flags
The status flag LINK_STOPPED is not needed any more, since the
mechanism for delayed deletion of links has been removed.
Likewise, LINK_STARTED and LINK_START_EVT are unnecessary,
because we can just as well start the link timer directly from
inside tipc_link_create().
We eliminate these flags in this commit.
Instead of the above flags, we now introduce three new link modes,
TIPC_LINK_OPEN, TIPC_LINK_BLOCKED and TIPC_LINK_TUNNEL. The values
indicate whether, and in the case of TIPC_LINK_TUNNEL, which, messages
the link is allowed to receive in this state. TIPC_LINK_BLOCKED also
blocks timer-driven protocol messages to be sent out, and any change
to the link FSM. Since the modes are mutually exclusive, we convert
them to state values, and rename the 'flags' field in struct tipc_link
to 'exec_mode'.
Finally, we move the #defines for link FSM states and events from link.h
into enums inside the file link.c, which is the real usage scope of
these definitions.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:25 +08:00
|
|
|
}
|
2006-01-03 02:04:38 +08:00
|
|
|
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
|
|
|
|
struct sk_buff_head *xmitq);
|
2015-07-17 04:54:26 +08:00
|
|
|
static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
|
tipc: improve link resiliency when rps is activated
Currently, the TIPC RPS dissector is based only on the incoming packets'
source node address, hence steering all traffic from a node to the same
core. We have seen that this makes the links vulnerable to starvation
and unnecessary resets when we turn down the link tolerance to very low
values.
To reduce the risk of this happening, we exempt probe and probe replies
packets from the convergence to one core per source node. Instead, we do
the opposite, - we try to diverge those packets across as many cores as
possible, by randomizing the flow selector key.
To make such packets identifiable to the dissector, we add a new
'is_keepalive' bit to word 0 of the LINK_PROTOCOL header. This bit is
set both for PROBE and PROBE_REPLY messages, and only for those.
It should be noted that these packets are not part of any flow anyway,
and only constitute a minuscule fraction of all packets sent across a
link. Hence, there is no risk that this will affect overall performance.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-11-08 16:59:26 +08:00
|
|
|
bool probe_reply, u16 rcvgap,
|
|
|
|
int tolerance, int priority,
|
2015-07-17 04:54:26 +08:00
|
|
|
struct sk_buff_head *xmitq);
|
2015-11-20 03:30:47 +08:00
|
|
|
static void link_print(struct tipc_link *l, const char *str);
|
2016-09-02 01:52:51 +08:00
|
|
|
static int tipc_link_build_nack_msg(struct tipc_link *l,
|
|
|
|
struct sk_buff_head *xmitq);
|
2015-10-22 20:51:41 +08:00
|
|
|
static void tipc_link_build_bc_init_msg(struct tipc_link *l,
|
|
|
|
struct sk_buff_head *xmitq);
|
2020-05-26 17:38:34 +08:00
|
|
|
static u8 __tipc_build_gap_ack_blks(struct tipc_gap_ack_blks *ga,
|
|
|
|
struct tipc_link *l, u8 start_index);
|
|
|
|
static u16 tipc_build_gap_ack_blks(struct tipc_link *l, struct tipc_msg *hdr);
|
|
|
|
static int tipc_link_advance_transmq(struct tipc_link *l, struct tipc_link *r,
|
|
|
|
u16 acked, u16 gap,
|
2019-06-17 13:15:42 +08:00
|
|
|
struct tipc_gap_ack_blks *ga,
|
2020-05-26 17:38:34 +08:00
|
|
|
struct sk_buff_head *xmitq,
|
|
|
|
bool *retransmitted, int *rc);
|
tipc: introduce variable window congestion control
We introduce a simple variable window congestion control for links.
The algorithm is inspired by the Reno algorithm, covering both 'slow
start', 'congestion avoidance', and 'fast recovery' modes.
- We introduce hard lower and upper window limits per link, still
different and configurable per bearer type.
- We introduce a 'slow start theshold' variable, initially set to
the maximum window size.
- We let a link start at the minimum congestion window, i.e. in slow
start mode, and then let is grow rapidly (+1 per rceived ACK) until
it reaches the slow start threshold and enters congestion avoidance
mode.
- In congestion avoidance mode we increment the congestion window for
each window-size number of acked packets, up to a possible maximum
equal to the configured maximum window.
- For each non-duplicate NACK received, we drop back to fast recovery
mode, by setting the both the slow start threshold to and the
congestion window to (current_congestion_window / 2).
- If the timeout handler finds that the transmit queue has not moved
since the previous timeout, it drops the link back to slow start
and forces a probe containing the last sent sequence number to the
sent to the peer, so that this can discover the stale situation.
This change does in reality have effect only on unicast ethernet
transport, as we have seen that there is no room whatsoever for
increasing the window max size for the UDP bearer.
For now, we also choose to keep the limits for the broadcast link
unchanged and equal.
This algorithm seems to give a 50-100% throughput improvement for
messages larger than MTU.
Suggested-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 07:52:46 +08:00
|
|
|
static void tipc_link_update_cwin(struct tipc_link *l, int released,
|
|
|
|
bool retransmitted);
|
2006-01-03 02:04:38 +08:00
|
|
|
/*
|
2006-03-21 14:37:04 +08:00
|
|
|
* Simple non-static link routines (i.e. referenced outside this file)
|
2006-01-03 02:04:38 +08:00
|
|
|
*/
|
2015-07-31 06:24:21 +08:00
|
|
|
bool tipc_link_is_up(struct tipc_link *l)
|
2006-01-03 02:04:38 +08:00
|
|
|
{
|
2015-07-31 06:24:21 +08:00
|
|
|
return link_is_up(l);
|
|
|
|
}
|
|
|
|
|
2015-10-16 02:52:46 +08:00
|
|
|
bool tipc_link_peer_is_down(struct tipc_link *l)
|
|
|
|
{
|
|
|
|
return l->state == LINK_PEER_RESET;
|
|
|
|
}
|
|
|
|
|
2015-07-31 06:24:21 +08:00
|
|
|
bool tipc_link_is_reset(struct tipc_link *l)
|
|
|
|
{
|
|
|
|
return l->state & (LINK_RESET | LINK_FAILINGOVER | LINK_ESTABLISHING);
|
|
|
|
}
|
|
|
|
|
tipc: delay ESTABLISH state event when link is established
Link establishing, just like link teardown, is a non-atomic action, in
the sense that discovering that conditions are right to establish a link,
and the actual adding of the link to one of the node's send slots is done
in two different lock contexts. The link FSM is designed to help bridging
the gap between the two contexts in a safe manner.
We have now discovered a weakness in the implementaton of this FSM.
Because we directly let the link go from state LINK_ESTABLISHING to
state LINK_ESTABLISHED already in the first lock context, we are unable
to distinguish between a fully established link, i.e., a link that has
been added to its slot, and a link that has not yet reached the second
lock context. It may hence happen that a manual intervention, e.g., when
disabling an interface, causes the function tipc_node_link_down() to try
removing the link from the node slots, decrementing its active link
counter etc, although the link was never added there in the first place.
We solve this by delaying the actual state change until we reach the
second lock context, inside the function tipc_node_link_up(). This
makes it possible for potentail callers of __tipc_node_link_down() to
know if they should proceed or not, and the problem is solved.
Unforunately, the situation described above also has a second problem.
Since there by necessity is a tipc_node_link_up() call pending once
the node lock has been released, we must defuse that call by setting
the link back from LINK_ESTABLISHING to LINK_RESET state. This forces
us to make a slight modification to the link FSM, which will now look
as follows.
+------------------------------------+
|RESET_EVT |
| |
| +--------------+
| +-----------------| SYNCHING |-----------------+
| |FAILURE_EVT +--------------+ PEER_RESET_EVT|
| | A | |
| | | | |
| | | | |
| | |SYNCH_ |SYNCH_ |
| | |BEGIN_EVT |END_EVT |
| | | | |
| V | V V
| +-------------+ +--------------+ +------------+
| | RESETTING |<---------| ESTABLISHED |--------->| PEER_RESET |
| +-------------+ FAILURE_ +--------------+ PEER_ +------------+
| | EVT | A RESET_EVT |
| | | | |
| | +----------------+ | |
| RESET_EVT| |RESET_EVT | |
| | | | |
| | | |ESTABLISH_EVT |
| | | +-------------+ | |
| | | | RESET_EVT | | |
| | | | | | |
| V V V | | |
| +-------------+ +--------------+ RESET_EVT|
+--->| RESET |--------->| ESTABLISHING |<----------------+
+-------------+ PEER_ +--------------+
| A RESET_EVT |
| | |
| | |
|FAILOVER_ |FAILOVER_ |FAILOVER_
|BEGIN_EVT |END_EVT |BEGIN_EVT
| | |
V | |
+-------------+ |
| FAILINGOVER |<----------------+
+-------------+
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-10-16 02:52:44 +08:00
|
|
|
bool tipc_link_is_establishing(struct tipc_link *l)
|
|
|
|
{
|
|
|
|
return l->state == LINK_ESTABLISHING;
|
|
|
|
}
|
|
|
|
|
2015-07-31 06:24:21 +08:00
|
|
|
bool tipc_link_is_synching(struct tipc_link *l)
|
|
|
|
{
|
|
|
|
return l->state == LINK_SYNCHING;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool tipc_link_is_failingover(struct tipc_link *l)
|
|
|
|
{
|
|
|
|
return l->state == LINK_FAILINGOVER;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool tipc_link_is_blocked(struct tipc_link *l)
|
|
|
|
{
|
|
|
|
return l->state & (LINK_RESETTING | LINK_PEER_RESET | LINK_FAILINGOVER);
|
2006-01-03 02:04:38 +08:00
|
|
|
}
|
|
|
|
|
2015-10-24 22:56:01 +08:00
|
|
|
static bool link_is_bc_sndlink(struct tipc_link *l)
|
2015-10-22 20:51:41 +08:00
|
|
|
{
|
|
|
|
return !l->bc_sndlink;
|
|
|
|
}
|
|
|
|
|
2015-10-24 22:56:01 +08:00
|
|
|
static bool link_is_bc_rcvlink(struct tipc_link *l)
|
2015-10-22 20:51:41 +08:00
|
|
|
{
|
|
|
|
return ((l->bc_rcvlink == l) && !link_is_bc_sndlink(l));
|
|
|
|
}
|
|
|
|
|
2015-10-22 20:51:46 +08:00
|
|
|
void tipc_link_set_active(struct tipc_link *l, bool active)
|
|
|
|
{
|
|
|
|
l->active = active;
|
2006-01-03 02:04:38 +08:00
|
|
|
}
|
|
|
|
|
2015-11-20 03:30:46 +08:00
|
|
|
u32 tipc_link_id(struct tipc_link *l)
|
|
|
|
{
|
|
|
|
return l->peer_bearer_id << 16 | l->bearer_id;
|
|
|
|
}
|
|
|
|
|
tipc: introduce variable window congestion control
We introduce a simple variable window congestion control for links.
The algorithm is inspired by the Reno algorithm, covering both 'slow
start', 'congestion avoidance', and 'fast recovery' modes.
- We introduce hard lower and upper window limits per link, still
different and configurable per bearer type.
- We introduce a 'slow start theshold' variable, initially set to
the maximum window size.
- We let a link start at the minimum congestion window, i.e. in slow
start mode, and then let is grow rapidly (+1 per rceived ACK) until
it reaches the slow start threshold and enters congestion avoidance
mode.
- In congestion avoidance mode we increment the congestion window for
each window-size number of acked packets, up to a possible maximum
equal to the configured maximum window.
- For each non-duplicate NACK received, we drop back to fast recovery
mode, by setting the both the slow start threshold to and the
congestion window to (current_congestion_window / 2).
- If the timeout handler finds that the transmit queue has not moved
since the previous timeout, it drops the link back to slow start
and forces a probe containing the last sent sequence number to the
sent to the peer, so that this can discover the stale situation.
This change does in reality have effect only on unicast ethernet
transport, as we have seen that there is no room whatsoever for
increasing the window max size for the UDP bearer.
For now, we also choose to keep the limits for the broadcast link
unchanged and equal.
This algorithm seems to give a 50-100% throughput improvement for
messages larger than MTU.
Suggested-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 07:52:46 +08:00
|
|
|
int tipc_link_min_win(struct tipc_link *l)
|
2015-11-20 03:30:46 +08:00
|
|
|
{
|
tipc: introduce variable window congestion control
We introduce a simple variable window congestion control for links.
The algorithm is inspired by the Reno algorithm, covering both 'slow
start', 'congestion avoidance', and 'fast recovery' modes.
- We introduce hard lower and upper window limits per link, still
different and configurable per bearer type.
- We introduce a 'slow start theshold' variable, initially set to
the maximum window size.
- We let a link start at the minimum congestion window, i.e. in slow
start mode, and then let is grow rapidly (+1 per rceived ACK) until
it reaches the slow start threshold and enters congestion avoidance
mode.
- In congestion avoidance mode we increment the congestion window for
each window-size number of acked packets, up to a possible maximum
equal to the configured maximum window.
- For each non-duplicate NACK received, we drop back to fast recovery
mode, by setting the both the slow start threshold to and the
congestion window to (current_congestion_window / 2).
- If the timeout handler finds that the transmit queue has not moved
since the previous timeout, it drops the link back to slow start
and forces a probe containing the last sent sequence number to the
sent to the peer, so that this can discover the stale situation.
This change does in reality have effect only on unicast ethernet
transport, as we have seen that there is no room whatsoever for
increasing the window max size for the UDP bearer.
For now, we also choose to keep the limits for the broadcast link
unchanged and equal.
This algorithm seems to give a 50-100% throughput improvement for
messages larger than MTU.
Suggested-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 07:52:46 +08:00
|
|
|
return l->min_win;
|
|
|
|
}
|
|
|
|
|
|
|
|
int tipc_link_max_win(struct tipc_link *l)
|
|
|
|
{
|
|
|
|
return l->max_win;
|
2015-11-20 03:30:46 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
int tipc_link_prio(struct tipc_link *l)
|
|
|
|
{
|
|
|
|
return l->priority;
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned long tipc_link_tolerance(struct tipc_link *l)
|
|
|
|
{
|
|
|
|
return l->tolerance;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct sk_buff_head *tipc_link_inputq(struct tipc_link *l)
|
|
|
|
{
|
|
|
|
return l->inputq;
|
|
|
|
}
|
|
|
|
|
|
|
|
char tipc_link_plane(struct tipc_link *l)
|
|
|
|
{
|
|
|
|
return l->net_plane;
|
|
|
|
}
|
|
|
|
|
2021-05-18 10:09:08 +08:00
|
|
|
struct net *tipc_link_net(struct tipc_link *l)
|
|
|
|
{
|
|
|
|
return l->net;
|
|
|
|
}
|
|
|
|
|
2018-07-10 07:07:35 +08:00
|
|
|
void tipc_link_update_caps(struct tipc_link *l, u16 capabilities)
|
|
|
|
{
|
|
|
|
l->peer_caps = capabilities;
|
|
|
|
}
|
|
|
|
|
2015-10-22 20:51:41 +08:00
|
|
|
void tipc_link_add_bc_peer(struct tipc_link *snd_l,
|
|
|
|
struct tipc_link *uc_l,
|
|
|
|
struct sk_buff_head *xmitq)
|
2015-10-22 20:51:39 +08:00
|
|
|
{
|
2015-10-22 20:51:41 +08:00
|
|
|
struct tipc_link *rcv_l = uc_l->bc_rcvlink;
|
|
|
|
|
|
|
|
snd_l->ackers++;
|
|
|
|
rcv_l->acked = snd_l->snd_nxt - 1;
|
tipc: correct settings of broadcast link state
Since commit 5266698661401afc5e ("tipc: let broadcast packet
reception use new link receive function") the broadcast send
link state was meant to always be set to LINK_ESTABLISHED, since
we don't need this link to follow the regular link FSM rules. It
was also the intention that this state anyway shouldn't impact
the run-time working state of the link, since the latter in
reality is controlled by the number of registered peers.
We have now discovered that this assumption is not quite correct.
If the broadcast link is reset because of too many retransmissions,
its state will inadvertently go to LINK_RESETTING, and never go
back to LINK_ESTABLISHED, because the LINK_FAILURE event was not
anticipated. This will work well once, but if it happens a second
time, the reset on a link in LINK_RESETTING has has no effect, and
neither the broadcast link nor the unicast links will go down as
they should.
Furthermore, it is confusing that the management tool shows that
this link is in UP state when that obviously isn't the case.
We now ensure that this state strictly follows the true working
state of the link. The state is set to LINK_ESTABLISHED when
the number of peers is non-zero, and to LINK_RESET otherwise.
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-11-20 03:12:50 +08:00
|
|
|
snd_l->state = LINK_ESTABLISHED;
|
2015-10-22 20:51:41 +08:00
|
|
|
tipc_link_build_bc_init_msg(uc_l, xmitq);
|
2015-10-22 20:51:39 +08:00
|
|
|
}
|
|
|
|
|
2015-10-22 20:51:41 +08:00
|
|
|
void tipc_link_remove_bc_peer(struct tipc_link *snd_l,
|
|
|
|
struct tipc_link *rcv_l,
|
|
|
|
struct sk_buff_head *xmitq)
|
2015-10-22 20:51:39 +08:00
|
|
|
{
|
2015-10-22 20:51:41 +08:00
|
|
|
u16 ack = snd_l->snd_nxt - 1;
|
|
|
|
|
|
|
|
snd_l->ackers--;
|
2016-07-12 04:08:36 +08:00
|
|
|
rcv_l->bc_peer_is_up = true;
|
|
|
|
rcv_l->state = LINK_ESTABLISHED;
|
2020-05-26 17:38:36 +08:00
|
|
|
tipc_link_bc_ack_rcv(rcv_l, ack, 0, NULL, xmitq, NULL);
|
2018-12-19 10:17:57 +08:00
|
|
|
trace_tipc_link_reset(rcv_l, TIPC_DUMP_ALL, "bclink removed!");
|
2015-10-22 20:51:41 +08:00
|
|
|
tipc_link_reset(rcv_l);
|
|
|
|
rcv_l->state = LINK_RESET;
|
|
|
|
if (!snd_l->ackers) {
|
2018-12-19 10:17:57 +08:00
|
|
|
trace_tipc_link_reset(snd_l, TIPC_DUMP_ALL, "zero ackers!");
|
2015-10-22 20:51:41 +08:00
|
|
|
tipc_link_reset(snd_l);
|
tipc: correct settings of broadcast link state
Since commit 5266698661401afc5e ("tipc: let broadcast packet
reception use new link receive function") the broadcast send
link state was meant to always be set to LINK_ESTABLISHED, since
we don't need this link to follow the regular link FSM rules. It
was also the intention that this state anyway shouldn't impact
the run-time working state of the link, since the latter in
reality is controlled by the number of registered peers.
We have now discovered that this assumption is not quite correct.
If the broadcast link is reset because of too many retransmissions,
its state will inadvertently go to LINK_RESETTING, and never go
back to LINK_ESTABLISHED, because the LINK_FAILURE event was not
anticipated. This will work well once, but if it happens a second
time, the reset on a link in LINK_RESETTING has has no effect, and
neither the broadcast link nor the unicast links will go down as
they should.
Furthermore, it is confusing that the management tool shows that
this link is in UP state when that obviously isn't the case.
We now ensure that this state strictly follows the true working
state of the link. The state is set to LINK_ESTABLISHED when
the number of peers is non-zero, and to LINK_RESET otherwise.
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-11-20 03:12:50 +08:00
|
|
|
snd_l->state = LINK_RESET;
|
2015-10-22 20:51:41 +08:00
|
|
|
__skb_queue_purge(xmitq);
|
|
|
|
}
|
2015-10-22 20:51:39 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
int tipc_link_bc_peers(struct tipc_link *l)
|
|
|
|
{
|
|
|
|
return l->ackers;
|
|
|
|
}
|
|
|
|
|
2018-07-19 17:16:59 +08:00
|
|
|
static u16 link_bc_rcv_gap(struct tipc_link *l)
|
2016-09-02 01:52:49 +08:00
|
|
|
{
|
|
|
|
struct sk_buff *skb = skb_peek(&l->deferdq);
|
|
|
|
u16 gap = 0;
|
|
|
|
|
|
|
|
if (more(l->snd_nxt, l->rcv_nxt))
|
|
|
|
gap = l->snd_nxt - l->rcv_nxt;
|
|
|
|
if (skb)
|
|
|
|
gap = buf_seqno(skb) - l->rcv_nxt;
|
|
|
|
return gap;
|
|
|
|
}
|
|
|
|
|
2015-10-22 20:51:43 +08:00
|
|
|
void tipc_link_set_mtu(struct tipc_link *l, int mtu)
|
|
|
|
{
|
|
|
|
l->mtu = mtu;
|
|
|
|
}
|
|
|
|
|
|
|
|
int tipc_link_mtu(struct tipc_link *l)
|
|
|
|
{
|
|
|
|
return l->mtu;
|
|
|
|
}
|
|
|
|
|
2019-11-08 13:05:11 +08:00
|
|
|
int tipc_link_mss(struct tipc_link *l)
|
|
|
|
{
|
|
|
|
#ifdef CONFIG_TIPC_CRYPTO
|
|
|
|
return l->mtu - INT_H_SIZE - EMSG_OVERHEAD;
|
|
|
|
#else
|
|
|
|
return l->mtu - INT_H_SIZE;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2015-11-20 03:30:46 +08:00
|
|
|
u16 tipc_link_rcv_nxt(struct tipc_link *l)
|
|
|
|
{
|
|
|
|
return l->rcv_nxt;
|
|
|
|
}
|
|
|
|
|
|
|
|
u16 tipc_link_acked(struct tipc_link *l)
|
|
|
|
{
|
|
|
|
return l->acked;
|
|
|
|
}
|
|
|
|
|
|
|
|
char *tipc_link_name(struct tipc_link *l)
|
|
|
|
{
|
|
|
|
return l->name;
|
|
|
|
}
|
|
|
|
|
2018-09-27 03:00:54 +08:00
|
|
|
u32 tipc_link_state(struct tipc_link *l)
|
|
|
|
{
|
|
|
|
return l->state;
|
|
|
|
}
|
|
|
|
|
2006-01-03 02:04:38 +08:00
|
|
|
/**
|
2006-01-18 07:38:21 +08:00
|
|
|
* tipc_link_create - create a new link
|
2020-07-13 07:15:14 +08:00
|
|
|
* @net: pointer to associated network namespace
|
2015-10-22 20:51:36 +08:00
|
|
|
* @if_name: associated interface name
|
|
|
|
* @bearer_id: id (index) of associated bearer
|
|
|
|
* @tolerance: link tolerance to be used by link
|
|
|
|
* @net_plane: network plane (A,B,c..) this link belongs to
|
|
|
|
* @mtu: mtu to be advertised by link
|
|
|
|
* @priority: priority to be used by link
|
tipc: introduce variable window congestion control
We introduce a simple variable window congestion control for links.
The algorithm is inspired by the Reno algorithm, covering both 'slow
start', 'congestion avoidance', and 'fast recovery' modes.
- We introduce hard lower and upper window limits per link, still
different and configurable per bearer type.
- We introduce a 'slow start theshold' variable, initially set to
the maximum window size.
- We let a link start at the minimum congestion window, i.e. in slow
start mode, and then let is grow rapidly (+1 per rceived ACK) until
it reaches the slow start threshold and enters congestion avoidance
mode.
- In congestion avoidance mode we increment the congestion window for
each window-size number of acked packets, up to a possible maximum
equal to the configured maximum window.
- For each non-duplicate NACK received, we drop back to fast recovery
mode, by setting the both the slow start threshold to and the
congestion window to (current_congestion_window / 2).
- If the timeout handler finds that the transmit queue has not moved
since the previous timeout, it drops the link back to slow start
and forces a probe containing the last sent sequence number to the
sent to the peer, so that this can discover the stale situation.
This change does in reality have effect only on unicast ethernet
transport, as we have seen that there is no room whatsoever for
increasing the window max size for the UDP bearer.
For now, we also choose to keep the limits for the broadcast link
unchanged and equal.
This algorithm seems to give a 50-100% throughput improvement for
messages larger than MTU.
Suggested-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 07:52:46 +08:00
|
|
|
* @min_win: minimal send window to be used by link
|
|
|
|
* @max_win: maximal send window to be used by link
|
2015-10-22 20:51:36 +08:00
|
|
|
* @session: session to be used by link
|
|
|
|
* @peer: node id of peer node
|
2015-10-22 20:51:40 +08:00
|
|
|
* @peer_caps: bitmap describing peer node capabilities
|
2015-10-22 20:51:41 +08:00
|
|
|
* @bc_sndlink: the namespace global link used for broadcast sending
|
|
|
|
* @bc_rcvlink: the peer specific link used for broadcast reception
|
2015-07-31 06:24:26 +08:00
|
|
|
* @inputq: queue to put messages ready for delivery
|
|
|
|
* @namedq: queue to put binding table update messages ready for delivery
|
|
|
|
* @link: return value, pointer to put the created link
|
2020-11-30 02:32:44 +08:00
|
|
|
* @self: local unicast link id
|
|
|
|
* @peer_id: 128-bit ID of peer
|
2007-02-09 22:25:21 +08:00
|
|
|
*
|
2020-11-30 02:32:48 +08:00
|
|
|
* Return: true if link was created, otherwise false
|
2006-01-03 02:04:38 +08:00
|
|
|
*/
|
2015-10-22 20:51:46 +08:00
|
|
|
bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
|
2015-10-22 20:51:36 +08:00
|
|
|
int tolerance, char net_plane, u32 mtu, int priority,
|
tipc: introduce variable window congestion control
We introduce a simple variable window congestion control for links.
The algorithm is inspired by the Reno algorithm, covering both 'slow
start', 'congestion avoidance', and 'fast recovery' modes.
- We introduce hard lower and upper window limits per link, still
different and configurable per bearer type.
- We introduce a 'slow start theshold' variable, initially set to
the maximum window size.
- We let a link start at the minimum congestion window, i.e. in slow
start mode, and then let is grow rapidly (+1 per rceived ACK) until
it reaches the slow start threshold and enters congestion avoidance
mode.
- In congestion avoidance mode we increment the congestion window for
each window-size number of acked packets, up to a possible maximum
equal to the configured maximum window.
- For each non-duplicate NACK received, we drop back to fast recovery
mode, by setting the both the slow start threshold to and the
congestion window to (current_congestion_window / 2).
- If the timeout handler finds that the transmit queue has not moved
since the previous timeout, it drops the link back to slow start
and forces a probe containing the last sent sequence number to the
sent to the peer, so that this can discover the stale situation.
This change does in reality have effect only on unicast ethernet
transport, as we have seen that there is no room whatsoever for
increasing the window max size for the UDP bearer.
For now, we also choose to keep the limits for the broadcast link
unchanged and equal.
This algorithm seems to give a 50-100% throughput improvement for
messages larger than MTU.
Suggested-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 07:52:46 +08:00
|
|
|
u32 min_win, u32 max_win, u32 session, u32 self,
|
tipc: handle collisions of 32-bit node address hash values
When a 32-bit node address is generated from a 128-bit identifier,
there is a risk of collisions which must be discovered and handled.
We do this as follows:
- We don't apply the generated address immediately to the node, but do
instead initiate a 1 sec trial period to allow other cluster members
to discover and handle such collisions.
- During the trial period the node periodically sends out a new type
of message, DSC_TRIAL_MSG, using broadcast or emulated broadcast,
to all the other nodes in the cluster.
- When a node is receiving such a message, it must check that the
presented 32-bit identifier either is unused, or was used by the very
same peer in a previous session. In both cases it accepts the request
by not responding to it.
- If it finds that the same node has been up before using a different
address, it responds with a DSC_TRIAL_FAIL_MSG containing that
address.
- If it finds that the address has already been taken by some other
node, it generates a new, unused address and returns it to the
requester.
- During the trial period the requesting node must always be prepared
to accept a failure message, i.e., a message where a peer suggests a
different (or equal) address to the one tried. In those cases it
must apply the suggested value as trial address and restart the trial
period.
This algorithm ensures that in the vast majority of cases a node will
have the same address before and after a reboot. If a legacy user
configures the address explicitly, there will be no trial period and
messages, so this protocol addition is completely backwards compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-03-23 03:42:51 +08:00
|
|
|
u32 peer, u8 *peer_id, u16 peer_caps,
|
2015-10-22 20:51:41 +08:00
|
|
|
struct tipc_link *bc_sndlink,
|
|
|
|
struct tipc_link *bc_rcvlink,
|
|
|
|
struct sk_buff_head *inputq,
|
|
|
|
struct sk_buff_head *namedq,
|
2015-07-31 06:24:26 +08:00
|
|
|
struct tipc_link **link)
|
2006-01-03 02:04:38 +08:00
|
|
|
{
|
tipc: handle collisions of 32-bit node address hash values
When a 32-bit node address is generated from a 128-bit identifier,
there is a risk of collisions which must be discovered and handled.
We do this as follows:
- We don't apply the generated address immediately to the node, but do
instead initiate a 1 sec trial period to allow other cluster members
to discover and handle such collisions.
- During the trial period the node periodically sends out a new type
of message, DSC_TRIAL_MSG, using broadcast or emulated broadcast,
to all the other nodes in the cluster.
- When a node is receiving such a message, it must check that the
presented 32-bit identifier either is unused, or was used by the very
same peer in a previous session. In both cases it accepts the request
by not responding to it.
- If it finds that the same node has been up before using a different
address, it responds with a DSC_TRIAL_FAIL_MSG containing that
address.
- If it finds that the address has already been taken by some other
node, it generates a new, unused address and returns it to the
requester.
- During the trial period the requesting node must always be prepared
to accept a failure message, i.e., a message where a peer suggests a
different (or equal) address to the one tried. In those cases it
must apply the suggested value as trial address and restart the trial
period.
This algorithm ensures that in the vast majority of cases a node will
have the same address before and after a reboot. If a legacy user
configures the address explicitly, there will be no trial period and
messages, so this protocol addition is completely backwards compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-03-23 03:42:51 +08:00
|
|
|
char peer_str[NODE_ID_STR_LEN] = {0,};
|
|
|
|
char self_str[NODE_ID_STR_LEN] = {0,};
|
2015-07-31 06:24:26 +08:00
|
|
|
struct tipc_link *l;
|
2011-03-01 00:32:27 +08:00
|
|
|
|
2015-07-31 06:24:26 +08:00
|
|
|
l = kzalloc(sizeof(*l), GFP_ATOMIC);
|
|
|
|
if (!l)
|
|
|
|
return false;
|
|
|
|
*link = l;
|
2016-03-04 03:23:21 +08:00
|
|
|
l->session = session;
|
2011-03-01 00:32:27 +08:00
|
|
|
|
tipc: handle collisions of 32-bit node address hash values
When a 32-bit node address is generated from a 128-bit identifier,
there is a risk of collisions which must be discovered and handled.
We do this as follows:
- We don't apply the generated address immediately to the node, but do
instead initiate a 1 sec trial period to allow other cluster members
to discover and handle such collisions.
- During the trial period the node periodically sends out a new type
of message, DSC_TRIAL_MSG, using broadcast or emulated broadcast,
to all the other nodes in the cluster.
- When a node is receiving such a message, it must check that the
presented 32-bit identifier either is unused, or was used by the very
same peer in a previous session. In both cases it accepts the request
by not responding to it.
- If it finds that the same node has been up before using a different
address, it responds with a DSC_TRIAL_FAIL_MSG containing that
address.
- If it finds that the address has already been taken by some other
node, it generates a new, unused address and returns it to the
requester.
- During the trial period the requesting node must always be prepared
to accept a failure message, i.e., a message where a peer suggests a
different (or equal) address to the one tried. In those cases it
must apply the suggested value as trial address and restart the trial
period.
This algorithm ensures that in the vast majority of cases a node will
have the same address before and after a reboot. If a legacy user
configures the address explicitly, there will be no trial period and
messages, so this protocol addition is completely backwards compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-03-23 03:42:51 +08:00
|
|
|
/* Set link name for unicast links only */
|
|
|
|
if (peer_id) {
|
|
|
|
tipc_nodeid2string(self_str, tipc_own_id(net));
|
|
|
|
if (strlen(self_str) > 16)
|
|
|
|
sprintf(self_str, "%x", self);
|
|
|
|
tipc_nodeid2string(peer_str, peer_id);
|
|
|
|
if (strlen(peer_str) > 16)
|
|
|
|
sprintf(peer_str, "%x", peer);
|
|
|
|
}
|
|
|
|
/* Peer i/f name will be completed by reset/activate message */
|
2018-03-30 05:20:45 +08:00
|
|
|
snprintf(l->name, sizeof(l->name), "%s:%s-%s:unknown",
|
|
|
|
self_str, if_name, peer_str);
|
tipc: handle collisions of 32-bit node address hash values
When a 32-bit node address is generated from a 128-bit identifier,
there is a risk of collisions which must be discovered and handled.
We do this as follows:
- We don't apply the generated address immediately to the node, but do
instead initiate a 1 sec trial period to allow other cluster members
to discover and handle such collisions.
- During the trial period the node periodically sends out a new type
of message, DSC_TRIAL_MSG, using broadcast or emulated broadcast,
to all the other nodes in the cluster.
- When a node is receiving such a message, it must check that the
presented 32-bit identifier either is unused, or was used by the very
same peer in a previous session. In both cases it accepts the request
by not responding to it.
- If it finds that the same node has been up before using a different
address, it responds with a DSC_TRIAL_FAIL_MSG containing that
address.
- If it finds that the address has already been taken by some other
node, it generates a new, unused address and returns it to the
requester.
- During the trial period the requesting node must always be prepared
to accept a failure message, i.e., a message where a peer suggests a
different (or equal) address to the one tried. In those cases it
must apply the suggested value as trial address and restart the trial
period.
This algorithm ensures that in the vast majority of cases a node will
have the same address before and after a reboot. If a legacy user
configures the address explicitly, there will be no trial period and
messages, so this protocol addition is completely backwards compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-03-23 03:42:51 +08:00
|
|
|
|
2016-03-04 03:23:21 +08:00
|
|
|
strcpy(l->if_name, if_name);
|
2015-07-31 06:24:26 +08:00
|
|
|
l->addr = peer;
|
2015-10-22 20:51:40 +08:00
|
|
|
l->peer_caps = peer_caps;
|
2015-10-22 20:51:46 +08:00
|
|
|
l->net = net;
|
2018-07-10 07:07:36 +08:00
|
|
|
l->in_session = false;
|
2015-10-22 20:51:36 +08:00
|
|
|
l->bearer_id = bearer_id;
|
|
|
|
l->tolerance = tolerance;
|
2018-10-10 23:34:01 +08:00
|
|
|
if (bc_rcvlink)
|
|
|
|
bc_rcvlink->tolerance = tolerance;
|
2015-10-22 20:51:36 +08:00
|
|
|
l->net_plane = net_plane;
|
|
|
|
l->advertised_mtu = mtu;
|
|
|
|
l->mtu = mtu;
|
|
|
|
l->priority = priority;
|
tipc: introduce variable window congestion control
We introduce a simple variable window congestion control for links.
The algorithm is inspired by the Reno algorithm, covering both 'slow
start', 'congestion avoidance', and 'fast recovery' modes.
- We introduce hard lower and upper window limits per link, still
different and configurable per bearer type.
- We introduce a 'slow start theshold' variable, initially set to
the maximum window size.
- We let a link start at the minimum congestion window, i.e. in slow
start mode, and then let is grow rapidly (+1 per rceived ACK) until
it reaches the slow start threshold and enters congestion avoidance
mode.
- In congestion avoidance mode we increment the congestion window for
each window-size number of acked packets, up to a possible maximum
equal to the configured maximum window.
- For each non-duplicate NACK received, we drop back to fast recovery
mode, by setting the both the slow start threshold to and the
congestion window to (current_congestion_window / 2).
- If the timeout handler finds that the transmit queue has not moved
since the previous timeout, it drops the link back to slow start
and forces a probe containing the last sent sequence number to the
sent to the peer, so that this can discover the stale situation.
This change does in reality have effect only on unicast ethernet
transport, as we have seen that there is no room whatsoever for
increasing the window max size for the UDP bearer.
For now, we also choose to keep the limits for the broadcast link
unchanged and equal.
This algorithm seems to give a 50-100% throughput improvement for
messages larger than MTU.
Suggested-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 07:52:46 +08:00
|
|
|
tipc_link_set_queue_limits(l, min_win, max_win);
|
tipc: make struct tipc_link generic to support broadcast
Realizing that unicast is just a special case of broadcast, we also see
that we can go in the other direction, i.e., that modest changes to the
current unicast link can make it generic enough to support broadcast.
The following changes are introduced here:
- A new counter ("ackers") in struct tipc_link, to indicate how many
peers need to ack a packet before it can be released.
- A corresponding counter in the skb user area, to keep track of how
many peers a are left to ack before a buffer can be released.
- A new counter ("acked"), to keep persistent track of how far a peer
has acked at the moment, i.e., where in the transmission queue to
start updating buffers when the next ack arrives. This is to avoid
double acknowledgements from a peer, with inadvertent relase of
packets as a result.
- A more generic tipc_link_retrans() function, where retransmit starts
from a given sequence number, instead of the first packet in the
transmision queue. This is to minimize the number of retransmitted
packets on the broadcast media.
When the new functionality is taken into use in the next commits,
we expect it to have minimal effect on unicast mode performance.
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-10-22 20:51:38 +08:00
|
|
|
l->ackers = 1;
|
2015-10-22 20:51:41 +08:00
|
|
|
l->bc_sndlink = bc_sndlink;
|
|
|
|
l->bc_rcvlink = bc_rcvlink;
|
2015-07-31 06:24:26 +08:00
|
|
|
l->inputq = inputq;
|
|
|
|
l->namedq = namedq;
|
|
|
|
l->state = LINK_RESETTING;
|
|
|
|
__skb_queue_head_init(&l->transmq);
|
|
|
|
__skb_queue_head_init(&l->backlogq);
|
|
|
|
__skb_queue_head_init(&l->deferdq);
|
2019-04-04 12:09:53 +08:00
|
|
|
__skb_queue_head_init(&l->failover_deferdq);
|
2015-07-31 06:24:26 +08:00
|
|
|
skb_queue_head_init(&l->wakeupq);
|
|
|
|
skb_queue_head_init(l->inputq);
|
|
|
|
return true;
|
2006-01-03 02:04:38 +08:00
|
|
|
}
|
|
|
|
|
2015-10-22 20:51:37 +08:00
|
|
|
/**
|
|
|
|
* tipc_link_bc_create - create new link to be used for broadcast
|
2020-07-13 07:15:14 +08:00
|
|
|
* @net: pointer to associated network namespace
|
2017-11-30 23:47:25 +08:00
|
|
|
* @mtu: mtu to be used initially if no peers
|
2020-09-15 10:39:55 +08:00
|
|
|
* @min_win: minimal send window to be used by link
|
|
|
|
* @max_win: maximal send window to be used by link
|
2015-10-22 20:51:37 +08:00
|
|
|
* @inputq: queue to put messages ready for delivery
|
|
|
|
* @namedq: queue to put binding table update messages ready for delivery
|
|
|
|
* @link: return value, pointer to put the created link
|
2020-11-30 02:32:44 +08:00
|
|
|
* @ownnode: identity of own node
|
|
|
|
* @peer: node id of peer node
|
|
|
|
* @peer_id: 128-bit ID of peer
|
|
|
|
* @peer_caps: bitmap describing peer node capabilities
|
|
|
|
* @bc_sndlink: the namespace global link used for broadcast sending
|
2015-10-22 20:51:37 +08:00
|
|
|
*
|
2020-11-30 02:32:48 +08:00
|
|
|
* Return: true if link was created, otherwise false
|
2015-10-22 20:51:37 +08:00
|
|
|
*/
|
2020-05-26 17:38:37 +08:00
|
|
|
bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, u8 *peer_id,
|
tipc: introduce variable window congestion control
We introduce a simple variable window congestion control for links.
The algorithm is inspired by the Reno algorithm, covering both 'slow
start', 'congestion avoidance', and 'fast recovery' modes.
- We introduce hard lower and upper window limits per link, still
different and configurable per bearer type.
- We introduce a 'slow start theshold' variable, initially set to
the maximum window size.
- We let a link start at the minimum congestion window, i.e. in slow
start mode, and then let is grow rapidly (+1 per rceived ACK) until
it reaches the slow start threshold and enters congestion avoidance
mode.
- In congestion avoidance mode we increment the congestion window for
each window-size number of acked packets, up to a possible maximum
equal to the configured maximum window.
- For each non-duplicate NACK received, we drop back to fast recovery
mode, by setting the both the slow start threshold to and the
congestion window to (current_congestion_window / 2).
- If the timeout handler finds that the transmit queue has not moved
since the previous timeout, it drops the link back to slow start
and forces a probe containing the last sent sequence number to the
sent to the peer, so that this can discover the stale situation.
This change does in reality have effect only on unicast ethernet
transport, as we have seen that there is no room whatsoever for
increasing the window max size for the UDP bearer.
For now, we also choose to keep the limits for the broadcast link
unchanged and equal.
This algorithm seems to give a 50-100% throughput improvement for
messages larger than MTU.
Suggested-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 07:52:46 +08:00
|
|
|
int mtu, u32 min_win, u32 max_win, u16 peer_caps,
|
2015-10-22 20:51:37 +08:00
|
|
|
struct sk_buff_head *inputq,
|
|
|
|
struct sk_buff_head *namedq,
|
2015-10-22 20:51:41 +08:00
|
|
|
struct tipc_link *bc_sndlink,
|
2015-10-22 20:51:37 +08:00
|
|
|
struct tipc_link **link)
|
|
|
|
{
|
|
|
|
struct tipc_link *l;
|
|
|
|
|
tipc: introduce variable window congestion control
We introduce a simple variable window congestion control for links.
The algorithm is inspired by the Reno algorithm, covering both 'slow
start', 'congestion avoidance', and 'fast recovery' modes.
- We introduce hard lower and upper window limits per link, still
different and configurable per bearer type.
- We introduce a 'slow start theshold' variable, initially set to
the maximum window size.
- We let a link start at the minimum congestion window, i.e. in slow
start mode, and then let is grow rapidly (+1 per rceived ACK) until
it reaches the slow start threshold and enters congestion avoidance
mode.
- In congestion avoidance mode we increment the congestion window for
each window-size number of acked packets, up to a possible maximum
equal to the configured maximum window.
- For each non-duplicate NACK received, we drop back to fast recovery
mode, by setting the both the slow start threshold to and the
congestion window to (current_congestion_window / 2).
- If the timeout handler finds that the transmit queue has not moved
since the previous timeout, it drops the link back to slow start
and forces a probe containing the last sent sequence number to the
sent to the peer, so that this can discover the stale situation.
This change does in reality have effect only on unicast ethernet
transport, as we have seen that there is no room whatsoever for
increasing the window max size for the UDP bearer.
For now, we also choose to keep the limits for the broadcast link
unchanged and equal.
This algorithm seems to give a 50-100% throughput improvement for
messages larger than MTU.
Suggested-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 07:52:46 +08:00
|
|
|
if (!tipc_link_create(net, "", MAX_BEARERS, 0, 'Z', mtu, 0, min_win,
|
|
|
|
max_win, 0, ownnode, peer, NULL, peer_caps,
|
|
|
|
bc_sndlink, NULL, inputq, namedq, link))
|
2015-10-22 20:51:37 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
l = *link;
|
2020-05-26 17:38:37 +08:00
|
|
|
if (peer_id) {
|
|
|
|
char peer_str[NODE_ID_STR_LEN] = {0,};
|
|
|
|
|
|
|
|
tipc_nodeid2string(peer_str, peer_id);
|
|
|
|
if (strlen(peer_str) > 16)
|
|
|
|
sprintf(peer_str, "%x", peer);
|
|
|
|
/* Broadcast receiver link name: "broadcast-link:<peer>" */
|
|
|
|
snprintf(l->name, sizeof(l->name), "%s:%s", tipc_bclink_name,
|
|
|
|
peer_str);
|
|
|
|
} else {
|
|
|
|
strcpy(l->name, tipc_bclink_name);
|
|
|
|
}
|
2018-12-19 10:17:57 +08:00
|
|
|
trace_tipc_link_reset(l, TIPC_DUMP_ALL, "bclink created!");
|
2015-10-22 20:51:37 +08:00
|
|
|
tipc_link_reset(l);
|
2015-10-22 20:51:41 +08:00
|
|
|
l->state = LINK_RESET;
|
2015-10-22 20:51:39 +08:00
|
|
|
l->ackers = 0;
|
2015-10-22 20:51:41 +08:00
|
|
|
l->bc_rcvlink = l;
|
2015-10-22 20:51:37 +08:00
|
|
|
|
2015-10-22 20:51:41 +08:00
|
|
|
/* Broadcast send link is always up */
|
|
|
|
if (link_is_bc_sndlink(l))
|
|
|
|
l->state = LINK_ESTABLISHED;
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
|
2017-01-19 02:50:53 +08:00
|
|
|
/* Disable replicast if even a single peer doesn't support it */
|
|
|
|
if (link_is_bc_rcvlink(l) && !(peer_caps & TIPC_BCAST_RCAST))
|
2019-11-21 11:01:09 +08:00
|
|
|
tipc_bcast_toggle_rcast(net, false);
|
2017-01-19 02:50:53 +08:00
|
|
|
|
2015-10-22 20:51:41 +08:00
|
|
|
return true;
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
}
|
|
|
|
|
tipc: improve link FSM implementation
The link FSM implementation is currently unnecessarily complex.
It sometimes checks for conditional state outside the FSM data
before deciding next state, and often performs actions directly
inside the FSM logics.
In this commit, we create a second, simpler FSM implementation,
that as far as possible acts only on states and events that it is
strictly defined for, and postpone any actions until it is finished
with its decisions. It also returns an event flag field and an a
buffer queue which may potentially contain a protocol message to
be sent by the caller.
Unfortunately, we cannot yet make the FSM "clean", in the sense
that its decisions are only based on FSM state and event, and that
state changes happen only here. That will have to wait until the
activate/reset logics has been cleaned up in a future commit.
We also rename the link states as follows:
WORKING_WORKING -> TIPC_LINK_WORKING
WORKING_UNKNOWN -> TIPC_LINK_PROBING
RESET_UNKNOWN -> TIPC_LINK_RESETTING
RESET_RESET -> TIPC_LINK_ESTABLISHING
The existing FSM function, link_state_event(), is still needed for
a while, so we redesign it to make use of the new function.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:27 +08:00
|
|
|
/**
|
|
|
|
* tipc_link_fsm_evt - link finite state machine
|
|
|
|
* @l: pointer to link
|
|
|
|
* @evt: state machine event to be processed
|
|
|
|
*/
|
2015-07-31 06:24:21 +08:00
|
|
|
int tipc_link_fsm_evt(struct tipc_link *l, int evt)
|
tipc: improve link FSM implementation
The link FSM implementation is currently unnecessarily complex.
It sometimes checks for conditional state outside the FSM data
before deciding next state, and often performs actions directly
inside the FSM logics.
In this commit, we create a second, simpler FSM implementation,
that as far as possible acts only on states and events that it is
strictly defined for, and postpone any actions until it is finished
with its decisions. It also returns an event flag field and an a
buffer queue which may potentially contain a protocol message to
be sent by the caller.
Unfortunately, we cannot yet make the FSM "clean", in the sense
that its decisions are only based on FSM state and event, and that
state changes happen only here. That will have to wait until the
activate/reset logics has been cleaned up in a future commit.
We also rename the link states as follows:
WORKING_WORKING -> TIPC_LINK_WORKING
WORKING_UNKNOWN -> TIPC_LINK_PROBING
RESET_UNKNOWN -> TIPC_LINK_RESETTING
RESET_RESET -> TIPC_LINK_ESTABLISHING
The existing FSM function, link_state_event(), is still needed for
a while, so we redesign it to make use of the new function.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:27 +08:00
|
|
|
{
|
2015-07-31 06:24:20 +08:00
|
|
|
int rc = 0;
|
2018-12-19 10:17:57 +08:00
|
|
|
int old_state = l->state;
|
tipc: improve link FSM implementation
The link FSM implementation is currently unnecessarily complex.
It sometimes checks for conditional state outside the FSM data
before deciding next state, and often performs actions directly
inside the FSM logics.
In this commit, we create a second, simpler FSM implementation,
that as far as possible acts only on states and events that it is
strictly defined for, and postpone any actions until it is finished
with its decisions. It also returns an event flag field and an a
buffer queue which may potentially contain a protocol message to
be sent by the caller.
Unfortunately, we cannot yet make the FSM "clean", in the sense
that its decisions are only based on FSM state and event, and that
state changes happen only here. That will have to wait until the
activate/reset logics has been cleaned up in a future commit.
We also rename the link states as follows:
WORKING_WORKING -> TIPC_LINK_WORKING
WORKING_UNKNOWN -> TIPC_LINK_PROBING
RESET_UNKNOWN -> TIPC_LINK_RESETTING
RESET_RESET -> TIPC_LINK_ESTABLISHING
The existing FSM function, link_state_event(), is still needed for
a while, so we redesign it to make use of the new function.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:27 +08:00
|
|
|
|
|
|
|
switch (l->state) {
|
2015-07-31 06:24:21 +08:00
|
|
|
case LINK_RESETTING:
|
tipc: improve link FSM implementation
The link FSM implementation is currently unnecessarily complex.
It sometimes checks for conditional state outside the FSM data
before deciding next state, and often performs actions directly
inside the FSM logics.
In this commit, we create a second, simpler FSM implementation,
that as far as possible acts only on states and events that it is
strictly defined for, and postpone any actions until it is finished
with its decisions. It also returns an event flag field and an a
buffer queue which may potentially contain a protocol message to
be sent by the caller.
Unfortunately, we cannot yet make the FSM "clean", in the sense
that its decisions are only based on FSM state and event, and that
state changes happen only here. That will have to wait until the
activate/reset logics has been cleaned up in a future commit.
We also rename the link states as follows:
WORKING_WORKING -> TIPC_LINK_WORKING
WORKING_UNKNOWN -> TIPC_LINK_PROBING
RESET_UNKNOWN -> TIPC_LINK_RESETTING
RESET_RESET -> TIPC_LINK_ESTABLISHING
The existing FSM function, link_state_event(), is still needed for
a while, so we redesign it to make use of the new function.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:27 +08:00
|
|
|
switch (evt) {
|
2015-07-31 06:24:21 +08:00
|
|
|
case LINK_PEER_RESET_EVT:
|
|
|
|
l->state = LINK_PEER_RESET;
|
tipc: improve link FSM implementation
The link FSM implementation is currently unnecessarily complex.
It sometimes checks for conditional state outside the FSM data
before deciding next state, and often performs actions directly
inside the FSM logics.
In this commit, we create a second, simpler FSM implementation,
that as far as possible acts only on states and events that it is
strictly defined for, and postpone any actions until it is finished
with its decisions. It also returns an event flag field and an a
buffer queue which may potentially contain a protocol message to
be sent by the caller.
Unfortunately, we cannot yet make the FSM "clean", in the sense
that its decisions are only based on FSM state and event, and that
state changes happen only here. That will have to wait until the
activate/reset logics has been cleaned up in a future commit.
We also rename the link states as follows:
WORKING_WORKING -> TIPC_LINK_WORKING
WORKING_UNKNOWN -> TIPC_LINK_PROBING
RESET_UNKNOWN -> TIPC_LINK_RESETTING
RESET_RESET -> TIPC_LINK_ESTABLISHING
The existing FSM function, link_state_event(), is still needed for
a while, so we redesign it to make use of the new function.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:27 +08:00
|
|
|
break;
|
2015-07-31 06:24:21 +08:00
|
|
|
case LINK_RESET_EVT:
|
|
|
|
l->state = LINK_RESET;
|
|
|
|
break;
|
|
|
|
case LINK_FAILURE_EVT:
|
|
|
|
case LINK_FAILOVER_BEGIN_EVT:
|
|
|
|
case LINK_ESTABLISH_EVT:
|
|
|
|
case LINK_FAILOVER_END_EVT:
|
|
|
|
case LINK_SYNCH_BEGIN_EVT:
|
|
|
|
case LINK_SYNCH_END_EVT:
|
|
|
|
default:
|
|
|
|
goto illegal_evt;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case LINK_RESET:
|
|
|
|
switch (evt) {
|
|
|
|
case LINK_PEER_RESET_EVT:
|
|
|
|
l->state = LINK_ESTABLISHING;
|
tipc: improve link FSM implementation
The link FSM implementation is currently unnecessarily complex.
It sometimes checks for conditional state outside the FSM data
before deciding next state, and often performs actions directly
inside the FSM logics.
In this commit, we create a second, simpler FSM implementation,
that as far as possible acts only on states and events that it is
strictly defined for, and postpone any actions until it is finished
with its decisions. It also returns an event flag field and an a
buffer queue which may potentially contain a protocol message to
be sent by the caller.
Unfortunately, we cannot yet make the FSM "clean", in the sense
that its decisions are only based on FSM state and event, and that
state changes happen only here. That will have to wait until the
activate/reset logics has been cleaned up in a future commit.
We also rename the link states as follows:
WORKING_WORKING -> TIPC_LINK_WORKING
WORKING_UNKNOWN -> TIPC_LINK_PROBING
RESET_UNKNOWN -> TIPC_LINK_RESETTING
RESET_RESET -> TIPC_LINK_ESTABLISHING
The existing FSM function, link_state_event(), is still needed for
a while, so we redesign it to make use of the new function.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:27 +08:00
|
|
|
break;
|
2015-07-31 06:24:21 +08:00
|
|
|
case LINK_FAILOVER_BEGIN_EVT:
|
|
|
|
l->state = LINK_FAILINGOVER;
|
2020-11-21 02:40:08 +08:00
|
|
|
break;
|
2015-07-31 06:24:21 +08:00
|
|
|
case LINK_FAILURE_EVT:
|
|
|
|
case LINK_RESET_EVT:
|
|
|
|
case LINK_ESTABLISH_EVT:
|
|
|
|
case LINK_FAILOVER_END_EVT:
|
tipc: improve link FSM implementation
The link FSM implementation is currently unnecessarily complex.
It sometimes checks for conditional state outside the FSM data
before deciding next state, and often performs actions directly
inside the FSM logics.
In this commit, we create a second, simpler FSM implementation,
that as far as possible acts only on states and events that it is
strictly defined for, and postpone any actions until it is finished
with its decisions. It also returns an event flag field and an a
buffer queue which may potentially contain a protocol message to
be sent by the caller.
Unfortunately, we cannot yet make the FSM "clean", in the sense
that its decisions are only based on FSM state and event, and that
state changes happen only here. That will have to wait until the
activate/reset logics has been cleaned up in a future commit.
We also rename the link states as follows:
WORKING_WORKING -> TIPC_LINK_WORKING
WORKING_UNKNOWN -> TIPC_LINK_PROBING
RESET_UNKNOWN -> TIPC_LINK_RESETTING
RESET_RESET -> TIPC_LINK_ESTABLISHING
The existing FSM function, link_state_event(), is still needed for
a while, so we redesign it to make use of the new function.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:27 +08:00
|
|
|
break;
|
2015-07-31 06:24:21 +08:00
|
|
|
case LINK_SYNCH_BEGIN_EVT:
|
|
|
|
case LINK_SYNCH_END_EVT:
|
tipc: improve link FSM implementation
The link FSM implementation is currently unnecessarily complex.
It sometimes checks for conditional state outside the FSM data
before deciding next state, and often performs actions directly
inside the FSM logics.
In this commit, we create a second, simpler FSM implementation,
that as far as possible acts only on states and events that it is
strictly defined for, and postpone any actions until it is finished
with its decisions. It also returns an event flag field and an a
buffer queue which may potentially contain a protocol message to
be sent by the caller.
Unfortunately, we cannot yet make the FSM "clean", in the sense
that its decisions are only based on FSM state and event, and that
state changes happen only here. That will have to wait until the
activate/reset logics has been cleaned up in a future commit.
We also rename the link states as follows:
WORKING_WORKING -> TIPC_LINK_WORKING
WORKING_UNKNOWN -> TIPC_LINK_PROBING
RESET_UNKNOWN -> TIPC_LINK_RESETTING
RESET_RESET -> TIPC_LINK_ESTABLISHING
The existing FSM function, link_state_event(), is still needed for
a while, so we redesign it to make use of the new function.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:27 +08:00
|
|
|
default:
|
2015-07-31 06:24:21 +08:00
|
|
|
goto illegal_evt;
|
tipc: improve link FSM implementation
The link FSM implementation is currently unnecessarily complex.
It sometimes checks for conditional state outside the FSM data
before deciding next state, and often performs actions directly
inside the FSM logics.
In this commit, we create a second, simpler FSM implementation,
that as far as possible acts only on states and events that it is
strictly defined for, and postpone any actions until it is finished
with its decisions. It also returns an event flag field and an a
buffer queue which may potentially contain a protocol message to
be sent by the caller.
Unfortunately, we cannot yet make the FSM "clean", in the sense
that its decisions are only based on FSM state and event, and that
state changes happen only here. That will have to wait until the
activate/reset logics has been cleaned up in a future commit.
We also rename the link states as follows:
WORKING_WORKING -> TIPC_LINK_WORKING
WORKING_UNKNOWN -> TIPC_LINK_PROBING
RESET_UNKNOWN -> TIPC_LINK_RESETTING
RESET_RESET -> TIPC_LINK_ESTABLISHING
The existing FSM function, link_state_event(), is still needed for
a while, so we redesign it to make use of the new function.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:27 +08:00
|
|
|
}
|
|
|
|
break;
|
2015-07-31 06:24:21 +08:00
|
|
|
case LINK_PEER_RESET:
|
tipc: improve link FSM implementation
The link FSM implementation is currently unnecessarily complex.
It sometimes checks for conditional state outside the FSM data
before deciding next state, and often performs actions directly
inside the FSM logics.
In this commit, we create a second, simpler FSM implementation,
that as far as possible acts only on states and events that it is
strictly defined for, and postpone any actions until it is finished
with its decisions. It also returns an event flag field and an a
buffer queue which may potentially contain a protocol message to
be sent by the caller.
Unfortunately, we cannot yet make the FSM "clean", in the sense
that its decisions are only based on FSM state and event, and that
state changes happen only here. That will have to wait until the
activate/reset logics has been cleaned up in a future commit.
We also rename the link states as follows:
WORKING_WORKING -> TIPC_LINK_WORKING
WORKING_UNKNOWN -> TIPC_LINK_PROBING
RESET_UNKNOWN -> TIPC_LINK_RESETTING
RESET_RESET -> TIPC_LINK_ESTABLISHING
The existing FSM function, link_state_event(), is still needed for
a while, so we redesign it to make use of the new function.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:27 +08:00
|
|
|
switch (evt) {
|
2015-07-31 06:24:21 +08:00
|
|
|
case LINK_RESET_EVT:
|
|
|
|
l->state = LINK_ESTABLISHING;
|
tipc: improve link FSM implementation
The link FSM implementation is currently unnecessarily complex.
It sometimes checks for conditional state outside the FSM data
before deciding next state, and often performs actions directly
inside the FSM logics.
In this commit, we create a second, simpler FSM implementation,
that as far as possible acts only on states and events that it is
strictly defined for, and postpone any actions until it is finished
with its decisions. It also returns an event flag field and an a
buffer queue which may potentially contain a protocol message to
be sent by the caller.
Unfortunately, we cannot yet make the FSM "clean", in the sense
that its decisions are only based on FSM state and event, and that
state changes happen only here. That will have to wait until the
activate/reset logics has been cleaned up in a future commit.
We also rename the link states as follows:
WORKING_WORKING -> TIPC_LINK_WORKING
WORKING_UNKNOWN -> TIPC_LINK_PROBING
RESET_UNKNOWN -> TIPC_LINK_RESETTING
RESET_RESET -> TIPC_LINK_ESTABLISHING
The existing FSM function, link_state_event(), is still needed for
a while, so we redesign it to make use of the new function.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:27 +08:00
|
|
|
break;
|
2015-07-31 06:24:21 +08:00
|
|
|
case LINK_PEER_RESET_EVT:
|
|
|
|
case LINK_ESTABLISH_EVT:
|
|
|
|
case LINK_FAILURE_EVT:
|
tipc: improve link FSM implementation
The link FSM implementation is currently unnecessarily complex.
It sometimes checks for conditional state outside the FSM data
before deciding next state, and often performs actions directly
inside the FSM logics.
In this commit, we create a second, simpler FSM implementation,
that as far as possible acts only on states and events that it is
strictly defined for, and postpone any actions until it is finished
with its decisions. It also returns an event flag field and an a
buffer queue which may potentially contain a protocol message to
be sent by the caller.
Unfortunately, we cannot yet make the FSM "clean", in the sense
that its decisions are only based on FSM state and event, and that
state changes happen only here. That will have to wait until the
activate/reset logics has been cleaned up in a future commit.
We also rename the link states as follows:
WORKING_WORKING -> TIPC_LINK_WORKING
WORKING_UNKNOWN -> TIPC_LINK_PROBING
RESET_UNKNOWN -> TIPC_LINK_RESETTING
RESET_RESET -> TIPC_LINK_ESTABLISHING
The existing FSM function, link_state_event(), is still needed for
a while, so we redesign it to make use of the new function.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:27 +08:00
|
|
|
break;
|
2015-07-31 06:24:21 +08:00
|
|
|
case LINK_SYNCH_BEGIN_EVT:
|
|
|
|
case LINK_SYNCH_END_EVT:
|
|
|
|
case LINK_FAILOVER_BEGIN_EVT:
|
|
|
|
case LINK_FAILOVER_END_EVT:
|
|
|
|
default:
|
|
|
|
goto illegal_evt;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case LINK_FAILINGOVER:
|
|
|
|
switch (evt) {
|
|
|
|
case LINK_FAILOVER_END_EVT:
|
|
|
|
l->state = LINK_RESET;
|
tipc: improve link FSM implementation
The link FSM implementation is currently unnecessarily complex.
It sometimes checks for conditional state outside the FSM data
before deciding next state, and often performs actions directly
inside the FSM logics.
In this commit, we create a second, simpler FSM implementation,
that as far as possible acts only on states and events that it is
strictly defined for, and postpone any actions until it is finished
with its decisions. It also returns an event flag field and an a
buffer queue which may potentially contain a protocol message to
be sent by the caller.
Unfortunately, we cannot yet make the FSM "clean", in the sense
that its decisions are only based on FSM state and event, and that
state changes happen only here. That will have to wait until the
activate/reset logics has been cleaned up in a future commit.
We also rename the link states as follows:
WORKING_WORKING -> TIPC_LINK_WORKING
WORKING_UNKNOWN -> TIPC_LINK_PROBING
RESET_UNKNOWN -> TIPC_LINK_RESETTING
RESET_RESET -> TIPC_LINK_ESTABLISHING
The existing FSM function, link_state_event(), is still needed for
a while, so we redesign it to make use of the new function.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:27 +08:00
|
|
|
break;
|
2015-07-31 06:24:21 +08:00
|
|
|
case LINK_PEER_RESET_EVT:
|
|
|
|
case LINK_RESET_EVT:
|
|
|
|
case LINK_ESTABLISH_EVT:
|
|
|
|
case LINK_FAILURE_EVT:
|
|
|
|
break;
|
|
|
|
case LINK_FAILOVER_BEGIN_EVT:
|
|
|
|
case LINK_SYNCH_BEGIN_EVT:
|
|
|
|
case LINK_SYNCH_END_EVT:
|
tipc: improve link FSM implementation
The link FSM implementation is currently unnecessarily complex.
It sometimes checks for conditional state outside the FSM data
before deciding next state, and often performs actions directly
inside the FSM logics.
In this commit, we create a second, simpler FSM implementation,
that as far as possible acts only on states and events that it is
strictly defined for, and postpone any actions until it is finished
with its decisions. It also returns an event flag field and an a
buffer queue which may potentially contain a protocol message to
be sent by the caller.
Unfortunately, we cannot yet make the FSM "clean", in the sense
that its decisions are only based on FSM state and event, and that
state changes happen only here. That will have to wait until the
activate/reset logics has been cleaned up in a future commit.
We also rename the link states as follows:
WORKING_WORKING -> TIPC_LINK_WORKING
WORKING_UNKNOWN -> TIPC_LINK_PROBING
RESET_UNKNOWN -> TIPC_LINK_RESETTING
RESET_RESET -> TIPC_LINK_ESTABLISHING
The existing FSM function, link_state_event(), is still needed for
a while, so we redesign it to make use of the new function.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:27 +08:00
|
|
|
default:
|
2015-07-31 06:24:21 +08:00
|
|
|
goto illegal_evt;
|
tipc: improve link FSM implementation
The link FSM implementation is currently unnecessarily complex.
It sometimes checks for conditional state outside the FSM data
before deciding next state, and often performs actions directly
inside the FSM logics.
In this commit, we create a second, simpler FSM implementation,
that as far as possible acts only on states and events that it is
strictly defined for, and postpone any actions until it is finished
with its decisions. It also returns an event flag field and an a
buffer queue which may potentially contain a protocol message to
be sent by the caller.
Unfortunately, we cannot yet make the FSM "clean", in the sense
that its decisions are only based on FSM state and event, and that
state changes happen only here. That will have to wait until the
activate/reset logics has been cleaned up in a future commit.
We also rename the link states as follows:
WORKING_WORKING -> TIPC_LINK_WORKING
WORKING_UNKNOWN -> TIPC_LINK_PROBING
RESET_UNKNOWN -> TIPC_LINK_RESETTING
RESET_RESET -> TIPC_LINK_ESTABLISHING
The existing FSM function, link_state_event(), is still needed for
a while, so we redesign it to make use of the new function.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:27 +08:00
|
|
|
}
|
|
|
|
break;
|
2015-07-31 06:24:21 +08:00
|
|
|
case LINK_ESTABLISHING:
|
tipc: improve link FSM implementation
The link FSM implementation is currently unnecessarily complex.
It sometimes checks for conditional state outside the FSM data
before deciding next state, and often performs actions directly
inside the FSM logics.
In this commit, we create a second, simpler FSM implementation,
that as far as possible acts only on states and events that it is
strictly defined for, and postpone any actions until it is finished
with its decisions. It also returns an event flag field and an a
buffer queue which may potentially contain a protocol message to
be sent by the caller.
Unfortunately, we cannot yet make the FSM "clean", in the sense
that its decisions are only based on FSM state and event, and that
state changes happen only here. That will have to wait until the
activate/reset logics has been cleaned up in a future commit.
We also rename the link states as follows:
WORKING_WORKING -> TIPC_LINK_WORKING
WORKING_UNKNOWN -> TIPC_LINK_PROBING
RESET_UNKNOWN -> TIPC_LINK_RESETTING
RESET_RESET -> TIPC_LINK_ESTABLISHING
The existing FSM function, link_state_event(), is still needed for
a while, so we redesign it to make use of the new function.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:27 +08:00
|
|
|
switch (evt) {
|
2015-07-31 06:24:21 +08:00
|
|
|
case LINK_ESTABLISH_EVT:
|
|
|
|
l->state = LINK_ESTABLISHED;
|
tipc: improve link FSM implementation
The link FSM implementation is currently unnecessarily complex.
It sometimes checks for conditional state outside the FSM data
before deciding next state, and often performs actions directly
inside the FSM logics.
In this commit, we create a second, simpler FSM implementation,
that as far as possible acts only on states and events that it is
strictly defined for, and postpone any actions until it is finished
with its decisions. It also returns an event flag field and an a
buffer queue which may potentially contain a protocol message to
be sent by the caller.
Unfortunately, we cannot yet make the FSM "clean", in the sense
that its decisions are only based on FSM state and event, and that
state changes happen only here. That will have to wait until the
activate/reset logics has been cleaned up in a future commit.
We also rename the link states as follows:
WORKING_WORKING -> TIPC_LINK_WORKING
WORKING_UNKNOWN -> TIPC_LINK_PROBING
RESET_UNKNOWN -> TIPC_LINK_RESETTING
RESET_RESET -> TIPC_LINK_ESTABLISHING
The existing FSM function, link_state_event(), is still needed for
a while, so we redesign it to make use of the new function.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:27 +08:00
|
|
|
break;
|
2015-07-31 06:24:21 +08:00
|
|
|
case LINK_FAILOVER_BEGIN_EVT:
|
|
|
|
l->state = LINK_FAILINGOVER;
|
|
|
|
break;
|
|
|
|
case LINK_RESET_EVT:
|
tipc: delay ESTABLISH state event when link is established
Link establishing, just like link teardown, is a non-atomic action, in
the sense that discovering that conditions are right to establish a link,
and the actual adding of the link to one of the node's send slots is done
in two different lock contexts. The link FSM is designed to help bridging
the gap between the two contexts in a safe manner.
We have now discovered a weakness in the implementaton of this FSM.
Because we directly let the link go from state LINK_ESTABLISHING to
state LINK_ESTABLISHED already in the first lock context, we are unable
to distinguish between a fully established link, i.e., a link that has
been added to its slot, and a link that has not yet reached the second
lock context. It may hence happen that a manual intervention, e.g., when
disabling an interface, causes the function tipc_node_link_down() to try
removing the link from the node slots, decrementing its active link
counter etc, although the link was never added there in the first place.
We solve this by delaying the actual state change until we reach the
second lock context, inside the function tipc_node_link_up(). This
makes it possible for potentail callers of __tipc_node_link_down() to
know if they should proceed or not, and the problem is solved.
Unforunately, the situation described above also has a second problem.
Since there by necessity is a tipc_node_link_up() call pending once
the node lock has been released, we must defuse that call by setting
the link back from LINK_ESTABLISHING to LINK_RESET state. This forces
us to make a slight modification to the link FSM, which will now look
as follows.
+------------------------------------+
|RESET_EVT |
| |
| +--------------+
| +-----------------| SYNCHING |-----------------+
| |FAILURE_EVT +--------------+ PEER_RESET_EVT|
| | A | |
| | | | |
| | | | |
| | |SYNCH_ |SYNCH_ |
| | |BEGIN_EVT |END_EVT |
| | | | |
| V | V V
| +-------------+ +--------------+ +------------+
| | RESETTING |<---------| ESTABLISHED |--------->| PEER_RESET |
| +-------------+ FAILURE_ +--------------+ PEER_ +------------+
| | EVT | A RESET_EVT |
| | | | |
| | +----------------+ | |
| RESET_EVT| |RESET_EVT | |
| | | | |
| | | |ESTABLISH_EVT |
| | | +-------------+ | |
| | | | RESET_EVT | | |
| | | | | | |
| V V V | | |
| +-------------+ +--------------+ RESET_EVT|
+--->| RESET |--------->| ESTABLISHING |<----------------+
+-------------+ PEER_ +--------------+
| A RESET_EVT |
| | |
| | |
|FAILOVER_ |FAILOVER_ |FAILOVER_
|BEGIN_EVT |END_EVT |BEGIN_EVT
| | |
V | |
+-------------+ |
| FAILINGOVER |<----------------+
+-------------+
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-10-16 02:52:44 +08:00
|
|
|
l->state = LINK_RESET;
|
|
|
|
break;
|
2015-07-31 06:24:21 +08:00
|
|
|
case LINK_FAILURE_EVT:
|
tipc: delay ESTABLISH state event when link is established
Link establishing, just like link teardown, is a non-atomic action, in
the sense that discovering that conditions are right to establish a link,
and the actual adding of the link to one of the node's send slots is done
in two different lock contexts. The link FSM is designed to help bridging
the gap between the two contexts in a safe manner.
We have now discovered a weakness in the implementaton of this FSM.
Because we directly let the link go from state LINK_ESTABLISHING to
state LINK_ESTABLISHED already in the first lock context, we are unable
to distinguish between a fully established link, i.e., a link that has
been added to its slot, and a link that has not yet reached the second
lock context. It may hence happen that a manual intervention, e.g., when
disabling an interface, causes the function tipc_node_link_down() to try
removing the link from the node slots, decrementing its active link
counter etc, although the link was never added there in the first place.
We solve this by delaying the actual state change until we reach the
second lock context, inside the function tipc_node_link_up(). This
makes it possible for potentail callers of __tipc_node_link_down() to
know if they should proceed or not, and the problem is solved.
Unforunately, the situation described above also has a second problem.
Since there by necessity is a tipc_node_link_up() call pending once
the node lock has been released, we must defuse that call by setting
the link back from LINK_ESTABLISHING to LINK_RESET state. This forces
us to make a slight modification to the link FSM, which will now look
as follows.
+------------------------------------+
|RESET_EVT |
| |
| +--------------+
| +-----------------| SYNCHING |-----------------+
| |FAILURE_EVT +--------------+ PEER_RESET_EVT|
| | A | |
| | | | |
| | | | |
| | |SYNCH_ |SYNCH_ |
| | |BEGIN_EVT |END_EVT |
| | | | |
| V | V V
| +-------------+ +--------------+ +------------+
| | RESETTING |<---------| ESTABLISHED |--------->| PEER_RESET |
| +-------------+ FAILURE_ +--------------+ PEER_ +------------+
| | EVT | A RESET_EVT |
| | | | |
| | +----------------+ | |
| RESET_EVT| |RESET_EVT | |
| | | | |
| | | |ESTABLISH_EVT |
| | | +-------------+ | |
| | | | RESET_EVT | | |
| | | | | | |
| V V V | | |
| +-------------+ +--------------+ RESET_EVT|
+--->| RESET |--------->| ESTABLISHING |<----------------+
+-------------+ PEER_ +--------------+
| A RESET_EVT |
| | |
| | |
|FAILOVER_ |FAILOVER_ |FAILOVER_
|BEGIN_EVT |END_EVT |BEGIN_EVT
| | |
V | |
+-------------+ |
| FAILINGOVER |<----------------+
+-------------+
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-10-16 02:52:44 +08:00
|
|
|
case LINK_PEER_RESET_EVT:
|
2015-07-31 06:24:21 +08:00
|
|
|
case LINK_SYNCH_BEGIN_EVT:
|
|
|
|
case LINK_FAILOVER_END_EVT:
|
|
|
|
break;
|
|
|
|
case LINK_SYNCH_END_EVT:
|
|
|
|
default:
|
|
|
|
goto illegal_evt;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case LINK_ESTABLISHED:
|
|
|
|
switch (evt) {
|
|
|
|
case LINK_PEER_RESET_EVT:
|
|
|
|
l->state = LINK_PEER_RESET;
|
|
|
|
rc |= TIPC_LINK_DOWN_EVT;
|
|
|
|
break;
|
|
|
|
case LINK_FAILURE_EVT:
|
|
|
|
l->state = LINK_RESETTING;
|
|
|
|
rc |= TIPC_LINK_DOWN_EVT;
|
tipc: improve link FSM implementation
The link FSM implementation is currently unnecessarily complex.
It sometimes checks for conditional state outside the FSM data
before deciding next state, and often performs actions directly
inside the FSM logics.
In this commit, we create a second, simpler FSM implementation,
that as far as possible acts only on states and events that it is
strictly defined for, and postpone any actions until it is finished
with its decisions. It also returns an event flag field and an a
buffer queue which may potentially contain a protocol message to
be sent by the caller.
Unfortunately, we cannot yet make the FSM "clean", in the sense
that its decisions are only based on FSM state and event, and that
state changes happen only here. That will have to wait until the
activate/reset logics has been cleaned up in a future commit.
We also rename the link states as follows:
WORKING_WORKING -> TIPC_LINK_WORKING
WORKING_UNKNOWN -> TIPC_LINK_PROBING
RESET_UNKNOWN -> TIPC_LINK_RESETTING
RESET_RESET -> TIPC_LINK_ESTABLISHING
The existing FSM function, link_state_event(), is still needed for
a while, so we redesign it to make use of the new function.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:27 +08:00
|
|
|
break;
|
2015-07-31 06:24:21 +08:00
|
|
|
case LINK_RESET_EVT:
|
|
|
|
l->state = LINK_RESET;
|
tipc: improve link FSM implementation
The link FSM implementation is currently unnecessarily complex.
It sometimes checks for conditional state outside the FSM data
before deciding next state, and often performs actions directly
inside the FSM logics.
In this commit, we create a second, simpler FSM implementation,
that as far as possible acts only on states and events that it is
strictly defined for, and postpone any actions until it is finished
with its decisions. It also returns an event flag field and an a
buffer queue which may potentially contain a protocol message to
be sent by the caller.
Unfortunately, we cannot yet make the FSM "clean", in the sense
that its decisions are only based on FSM state and event, and that
state changes happen only here. That will have to wait until the
activate/reset logics has been cleaned up in a future commit.
We also rename the link states as follows:
WORKING_WORKING -> TIPC_LINK_WORKING
WORKING_UNKNOWN -> TIPC_LINK_PROBING
RESET_UNKNOWN -> TIPC_LINK_RESETTING
RESET_RESET -> TIPC_LINK_ESTABLISHING
The existing FSM function, link_state_event(), is still needed for
a while, so we redesign it to make use of the new function.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:27 +08:00
|
|
|
break;
|
2015-07-31 06:24:21 +08:00
|
|
|
case LINK_ESTABLISH_EVT:
|
2015-08-20 14:12:55 +08:00
|
|
|
case LINK_SYNCH_END_EVT:
|
tipc: improve link FSM implementation
The link FSM implementation is currently unnecessarily complex.
It sometimes checks for conditional state outside the FSM data
before deciding next state, and often performs actions directly
inside the FSM logics.
In this commit, we create a second, simpler FSM implementation,
that as far as possible acts only on states and events that it is
strictly defined for, and postpone any actions until it is finished
with its decisions. It also returns an event flag field and an a
buffer queue which may potentially contain a protocol message to
be sent by the caller.
Unfortunately, we cannot yet make the FSM "clean", in the sense
that its decisions are only based on FSM state and event, and that
state changes happen only here. That will have to wait until the
activate/reset logics has been cleaned up in a future commit.
We also rename the link states as follows:
WORKING_WORKING -> TIPC_LINK_WORKING
WORKING_UNKNOWN -> TIPC_LINK_PROBING
RESET_UNKNOWN -> TIPC_LINK_RESETTING
RESET_RESET -> TIPC_LINK_ESTABLISHING
The existing FSM function, link_state_event(), is still needed for
a while, so we redesign it to make use of the new function.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:27 +08:00
|
|
|
break;
|
2015-07-31 06:24:21 +08:00
|
|
|
case LINK_SYNCH_BEGIN_EVT:
|
|
|
|
l->state = LINK_SYNCHING;
|
|
|
|
break;
|
|
|
|
case LINK_FAILOVER_BEGIN_EVT:
|
|
|
|
case LINK_FAILOVER_END_EVT:
|
tipc: improve link FSM implementation
The link FSM implementation is currently unnecessarily complex.
It sometimes checks for conditional state outside the FSM data
before deciding next state, and often performs actions directly
inside the FSM logics.
In this commit, we create a second, simpler FSM implementation,
that as far as possible acts only on states and events that it is
strictly defined for, and postpone any actions until it is finished
with its decisions. It also returns an event flag field and an a
buffer queue which may potentially contain a protocol message to
be sent by the caller.
Unfortunately, we cannot yet make the FSM "clean", in the sense
that its decisions are only based on FSM state and event, and that
state changes happen only here. That will have to wait until the
activate/reset logics has been cleaned up in a future commit.
We also rename the link states as follows:
WORKING_WORKING -> TIPC_LINK_WORKING
WORKING_UNKNOWN -> TIPC_LINK_PROBING
RESET_UNKNOWN -> TIPC_LINK_RESETTING
RESET_RESET -> TIPC_LINK_ESTABLISHING
The existing FSM function, link_state_event(), is still needed for
a while, so we redesign it to make use of the new function.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:27 +08:00
|
|
|
default:
|
2015-07-31 06:24:21 +08:00
|
|
|
goto illegal_evt;
|
tipc: improve link FSM implementation
The link FSM implementation is currently unnecessarily complex.
It sometimes checks for conditional state outside the FSM data
before deciding next state, and often performs actions directly
inside the FSM logics.
In this commit, we create a second, simpler FSM implementation,
that as far as possible acts only on states and events that it is
strictly defined for, and postpone any actions until it is finished
with its decisions. It also returns an event flag field and an a
buffer queue which may potentially contain a protocol message to
be sent by the caller.
Unfortunately, we cannot yet make the FSM "clean", in the sense
that its decisions are only based on FSM state and event, and that
state changes happen only here. That will have to wait until the
activate/reset logics has been cleaned up in a future commit.
We also rename the link states as follows:
WORKING_WORKING -> TIPC_LINK_WORKING
WORKING_UNKNOWN -> TIPC_LINK_PROBING
RESET_UNKNOWN -> TIPC_LINK_RESETTING
RESET_RESET -> TIPC_LINK_ESTABLISHING
The existing FSM function, link_state_event(), is still needed for
a while, so we redesign it to make use of the new function.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:27 +08:00
|
|
|
}
|
|
|
|
break;
|
2015-07-31 06:24:21 +08:00
|
|
|
case LINK_SYNCHING:
|
tipc: improve link FSM implementation
The link FSM implementation is currently unnecessarily complex.
It sometimes checks for conditional state outside the FSM data
before deciding next state, and often performs actions directly
inside the FSM logics.
In this commit, we create a second, simpler FSM implementation,
that as far as possible acts only on states and events that it is
strictly defined for, and postpone any actions until it is finished
with its decisions. It also returns an event flag field and an a
buffer queue which may potentially contain a protocol message to
be sent by the caller.
Unfortunately, we cannot yet make the FSM "clean", in the sense
that its decisions are only based on FSM state and event, and that
state changes happen only here. That will have to wait until the
activate/reset logics has been cleaned up in a future commit.
We also rename the link states as follows:
WORKING_WORKING -> TIPC_LINK_WORKING
WORKING_UNKNOWN -> TIPC_LINK_PROBING
RESET_UNKNOWN -> TIPC_LINK_RESETTING
RESET_RESET -> TIPC_LINK_ESTABLISHING
The existing FSM function, link_state_event(), is still needed for
a while, so we redesign it to make use of the new function.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:27 +08:00
|
|
|
switch (evt) {
|
2015-07-31 06:24:21 +08:00
|
|
|
case LINK_PEER_RESET_EVT:
|
|
|
|
l->state = LINK_PEER_RESET;
|
|
|
|
rc |= TIPC_LINK_DOWN_EVT;
|
|
|
|
break;
|
|
|
|
case LINK_FAILURE_EVT:
|
|
|
|
l->state = LINK_RESETTING;
|
|
|
|
rc |= TIPC_LINK_DOWN_EVT;
|
tipc: improve link FSM implementation
The link FSM implementation is currently unnecessarily complex.
It sometimes checks for conditional state outside the FSM data
before deciding next state, and often performs actions directly
inside the FSM logics.
In this commit, we create a second, simpler FSM implementation,
that as far as possible acts only on states and events that it is
strictly defined for, and postpone any actions until it is finished
with its decisions. It also returns an event flag field and an a
buffer queue which may potentially contain a protocol message to
be sent by the caller.
Unfortunately, we cannot yet make the FSM "clean", in the sense
that its decisions are only based on FSM state and event, and that
state changes happen only here. That will have to wait until the
activate/reset logics has been cleaned up in a future commit.
We also rename the link states as follows:
WORKING_WORKING -> TIPC_LINK_WORKING
WORKING_UNKNOWN -> TIPC_LINK_PROBING
RESET_UNKNOWN -> TIPC_LINK_RESETTING
RESET_RESET -> TIPC_LINK_ESTABLISHING
The existing FSM function, link_state_event(), is still needed for
a while, so we redesign it to make use of the new function.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:27 +08:00
|
|
|
break;
|
2015-07-31 06:24:21 +08:00
|
|
|
case LINK_RESET_EVT:
|
|
|
|
l->state = LINK_RESET;
|
tipc: improve link FSM implementation
The link FSM implementation is currently unnecessarily complex.
It sometimes checks for conditional state outside the FSM data
before deciding next state, and often performs actions directly
inside the FSM logics.
In this commit, we create a second, simpler FSM implementation,
that as far as possible acts only on states and events that it is
strictly defined for, and postpone any actions until it is finished
with its decisions. It also returns an event flag field and an a
buffer queue which may potentially contain a protocol message to
be sent by the caller.
Unfortunately, we cannot yet make the FSM "clean", in the sense
that its decisions are only based on FSM state and event, and that
state changes happen only here. That will have to wait until the
activate/reset logics has been cleaned up in a future commit.
We also rename the link states as follows:
WORKING_WORKING -> TIPC_LINK_WORKING
WORKING_UNKNOWN -> TIPC_LINK_PROBING
RESET_UNKNOWN -> TIPC_LINK_RESETTING
RESET_RESET -> TIPC_LINK_ESTABLISHING
The existing FSM function, link_state_event(), is still needed for
a while, so we redesign it to make use of the new function.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:27 +08:00
|
|
|
break;
|
2015-07-31 06:24:21 +08:00
|
|
|
case LINK_ESTABLISH_EVT:
|
|
|
|
case LINK_SYNCH_BEGIN_EVT:
|
tipc: improve link FSM implementation
The link FSM implementation is currently unnecessarily complex.
It sometimes checks for conditional state outside the FSM data
before deciding next state, and often performs actions directly
inside the FSM logics.
In this commit, we create a second, simpler FSM implementation,
that as far as possible acts only on states and events that it is
strictly defined for, and postpone any actions until it is finished
with its decisions. It also returns an event flag field and an a
buffer queue which may potentially contain a protocol message to
be sent by the caller.
Unfortunately, we cannot yet make the FSM "clean", in the sense
that its decisions are only based on FSM state and event, and that
state changes happen only here. That will have to wait until the
activate/reset logics has been cleaned up in a future commit.
We also rename the link states as follows:
WORKING_WORKING -> TIPC_LINK_WORKING
WORKING_UNKNOWN -> TIPC_LINK_PROBING
RESET_UNKNOWN -> TIPC_LINK_RESETTING
RESET_RESET -> TIPC_LINK_ESTABLISHING
The existing FSM function, link_state_event(), is still needed for
a while, so we redesign it to make use of the new function.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:27 +08:00
|
|
|
break;
|
2015-07-31 06:24:21 +08:00
|
|
|
case LINK_SYNCH_END_EVT:
|
|
|
|
l->state = LINK_ESTABLISHED;
|
|
|
|
break;
|
|
|
|
case LINK_FAILOVER_BEGIN_EVT:
|
|
|
|
case LINK_FAILOVER_END_EVT:
|
tipc: improve link FSM implementation
The link FSM implementation is currently unnecessarily complex.
It sometimes checks for conditional state outside the FSM data
before deciding next state, and often performs actions directly
inside the FSM logics.
In this commit, we create a second, simpler FSM implementation,
that as far as possible acts only on states and events that it is
strictly defined for, and postpone any actions until it is finished
with its decisions. It also returns an event flag field and an a
buffer queue which may potentially contain a protocol message to
be sent by the caller.
Unfortunately, we cannot yet make the FSM "clean", in the sense
that its decisions are only based on FSM state and event, and that
state changes happen only here. That will have to wait until the
activate/reset logics has been cleaned up in a future commit.
We also rename the link states as follows:
WORKING_WORKING -> TIPC_LINK_WORKING
WORKING_UNKNOWN -> TIPC_LINK_PROBING
RESET_UNKNOWN -> TIPC_LINK_RESETTING
RESET_RESET -> TIPC_LINK_ESTABLISHING
The existing FSM function, link_state_event(), is still needed for
a while, so we redesign it to make use of the new function.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:27 +08:00
|
|
|
default:
|
2015-07-31 06:24:21 +08:00
|
|
|
goto illegal_evt;
|
tipc: improve link FSM implementation
The link FSM implementation is currently unnecessarily complex.
It sometimes checks for conditional state outside the FSM data
before deciding next state, and often performs actions directly
inside the FSM logics.
In this commit, we create a second, simpler FSM implementation,
that as far as possible acts only on states and events that it is
strictly defined for, and postpone any actions until it is finished
with its decisions. It also returns an event flag field and an a
buffer queue which may potentially contain a protocol message to
be sent by the caller.
Unfortunately, we cannot yet make the FSM "clean", in the sense
that its decisions are only based on FSM state and event, and that
state changes happen only here. That will have to wait until the
activate/reset logics has been cleaned up in a future commit.
We also rename the link states as follows:
WORKING_WORKING -> TIPC_LINK_WORKING
WORKING_UNKNOWN -> TIPC_LINK_PROBING
RESET_UNKNOWN -> TIPC_LINK_RESETTING
RESET_RESET -> TIPC_LINK_ESTABLISHING
The existing FSM function, link_state_event(), is still needed for
a while, so we redesign it to make use of the new function.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:27 +08:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
default:
|
2015-07-31 06:24:21 +08:00
|
|
|
pr_err("Unknown FSM state %x in %s\n", l->state, l->name);
|
tipc: improve link FSM implementation
The link FSM implementation is currently unnecessarily complex.
It sometimes checks for conditional state outside the FSM data
before deciding next state, and often performs actions directly
inside the FSM logics.
In this commit, we create a second, simpler FSM implementation,
that as far as possible acts only on states and events that it is
strictly defined for, and postpone any actions until it is finished
with its decisions. It also returns an event flag field and an a
buffer queue which may potentially contain a protocol message to
be sent by the caller.
Unfortunately, we cannot yet make the FSM "clean", in the sense
that its decisions are only based on FSM state and event, and that
state changes happen only here. That will have to wait until the
activate/reset logics has been cleaned up in a future commit.
We also rename the link states as follows:
WORKING_WORKING -> TIPC_LINK_WORKING
WORKING_UNKNOWN -> TIPC_LINK_PROBING
RESET_UNKNOWN -> TIPC_LINK_RESETTING
RESET_RESET -> TIPC_LINK_ESTABLISHING
The existing FSM function, link_state_event(), is still needed for
a while, so we redesign it to make use of the new function.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:27 +08:00
|
|
|
}
|
2018-12-19 10:17:57 +08:00
|
|
|
trace_tipc_link_fsm(l->name, old_state, l->state, evt);
|
2015-07-31 06:24:21 +08:00
|
|
|
return rc;
|
|
|
|
illegal_evt:
|
|
|
|
pr_err("Illegal FSM event %x in state %x on link %s\n",
|
|
|
|
evt, l->state, l->name);
|
2018-12-19 10:17:57 +08:00
|
|
|
trace_tipc_link_fsm(l->name, old_state, l->state, evt);
|
tipc: improve link FSM implementation
The link FSM implementation is currently unnecessarily complex.
It sometimes checks for conditional state outside the FSM data
before deciding next state, and often performs actions directly
inside the FSM logics.
In this commit, we create a second, simpler FSM implementation,
that as far as possible acts only on states and events that it is
strictly defined for, and postpone any actions until it is finished
with its decisions. It also returns an event flag field and an a
buffer queue which may potentially contain a protocol message to
be sent by the caller.
Unfortunately, we cannot yet make the FSM "clean", in the sense
that its decisions are only based on FSM state and event, and that
state changes happen only here. That will have to wait until the
activate/reset logics has been cleaned up in a future commit.
We also rename the link states as follows:
WORKING_WORKING -> TIPC_LINK_WORKING
WORKING_UNKNOWN -> TIPC_LINK_PROBING
RESET_UNKNOWN -> TIPC_LINK_RESETTING
RESET_RESET -> TIPC_LINK_ESTABLISHING
The existing FSM function, link_state_event(), is still needed for
a while, so we redesign it to make use of the new function.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:27 +08:00
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2015-07-17 04:54:28 +08:00
|
|
|
/* link_profile_stats - update statistical profiling of traffic
|
|
|
|
*/
|
|
|
|
static void link_profile_stats(struct tipc_link *l)
|
|
|
|
{
|
|
|
|
struct sk_buff *skb;
|
|
|
|
struct tipc_msg *msg;
|
|
|
|
int length;
|
|
|
|
|
|
|
|
/* Update counters used in statistical profiling of send traffic */
|
|
|
|
l->stats.accu_queue_sz += skb_queue_len(&l->transmq);
|
|
|
|
l->stats.queue_sz_counts++;
|
|
|
|
|
|
|
|
skb = skb_peek(&l->transmq);
|
|
|
|
if (!skb)
|
|
|
|
return;
|
|
|
|
msg = buf_msg(skb);
|
|
|
|
length = msg_size(msg);
|
|
|
|
|
|
|
|
if (msg_user(msg) == MSG_FRAGMENTER) {
|
|
|
|
if (msg_type(msg) != FIRST_FRAGMENT)
|
|
|
|
return;
|
2019-06-26 01:37:00 +08:00
|
|
|
length = msg_size(msg_inner_hdr(msg));
|
2015-07-17 04:54:28 +08:00
|
|
|
}
|
|
|
|
l->stats.msg_lengths_total += length;
|
|
|
|
l->stats.msg_length_counts++;
|
|
|
|
if (length <= 64)
|
|
|
|
l->stats.msg_length_profile[0]++;
|
|
|
|
else if (length <= 256)
|
|
|
|
l->stats.msg_length_profile[1]++;
|
|
|
|
else if (length <= 1024)
|
|
|
|
l->stats.msg_length_profile[2]++;
|
|
|
|
else if (length <= 4096)
|
|
|
|
l->stats.msg_length_profile[3]++;
|
|
|
|
else if (length <= 16384)
|
|
|
|
l->stats.msg_length_profile[4]++;
|
|
|
|
else if (length <= 32768)
|
|
|
|
l->stats.msg_length_profile[5]++;
|
|
|
|
else
|
|
|
|
l->stats.msg_length_profile[6]++;
|
|
|
|
}
|
|
|
|
|
2018-12-19 10:17:57 +08:00
|
|
|
/**
|
|
|
|
* tipc_link_too_silent - check if link is "too silent"
|
|
|
|
* @l: tipc link to be checked
|
|
|
|
*
|
2020-11-30 02:32:48 +08:00
|
|
|
* Return: true if the link 'silent_intv_cnt' is about to reach the
|
2018-12-19 10:17:57 +08:00
|
|
|
* 'abort_limit' value, otherwise false
|
|
|
|
*/
|
|
|
|
bool tipc_link_too_silent(struct tipc_link *l)
|
|
|
|
{
|
|
|
|
return (l->silent_intv_cnt + 2 > l->abort_limit);
|
|
|
|
}
|
|
|
|
|
2015-07-17 04:54:28 +08:00
|
|
|
/* tipc_link_timeout - perform periodic task as instructed from node timeout
|
|
|
|
*/
|
|
|
|
int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq)
|
|
|
|
{
|
2016-06-15 14:11:31 +08:00
|
|
|
int mtyp = 0;
|
|
|
|
int rc = 0;
|
2016-04-16 01:33:05 +08:00
|
|
|
bool state = false;
|
|
|
|
bool probe = false;
|
|
|
|
bool setup = false;
|
2015-10-22 20:51:41 +08:00
|
|
|
u16 bc_snt = l->bc_sndlink->snd_nxt - 1;
|
|
|
|
u16 bc_acked = l->bc_rcvlink->acked;
|
tipc: add neighbor monitoring framework
TIPC based clusters are by default set up with full-mesh link
connectivity between all nodes. Those links are expected to provide
a short failure detection time, by default set to 1500 ms. Because
of this, the background load for neighbor monitoring in an N-node
cluster increases with a factor N on each node, while the overall
monitoring traffic through the network infrastructure increases at
a ~(N * (N - 1)) rate. Experience has shown that such clusters don't
scale well beyond ~100 nodes unless we significantly increase failure
discovery tolerance.
This commit introduces a framework and an algorithm that drastically
reduces this background load, while basically maintaining the original
failure detection times across the whole cluster. Using this algorithm,
background load will now grow at a rate of ~(2 * sqrt(N)) per node, and
at ~(2 * N * sqrt(N)) in traffic overhead. As an example, each node will
now have to actively monitor 38 neighbors in a 400-node cluster, instead
of as before 399.
This "Overlapping Ring Supervision Algorithm" is completely distributed
and employs no centralized or coordinated state. It goes as follows:
- Each node makes up a linearly ascending, circular list of all its N
known neighbors, based on their TIPC node identity. This algorithm
must be the same on all nodes.
- The node then selects the next M = sqrt(N) - 1 nodes downstream from
itself in the list, and chooses to actively monitor those. This is
called its "local monitoring domain".
- It creates a domain record describing the monitoring domain, and
piggy-backs this in the data area of all neighbor monitoring messages
(LINK_PROTOCOL/STATE) leaving that node. This means that all nodes in
the cluster eventually (default within 400 ms) will learn about
its monitoring domain.
- Whenever a node discovers a change in its local domain, e.g., a node
has been added or has gone down, it creates and sends out a new
version of its node record to inform all neighbors about the change.
- A node receiving a domain record from anybody outside its local domain
matches this against its own list (which may not look the same), and
chooses to not actively monitor those members of the received domain
record that are also present in its own list. Instead, it relies on
indications from the direct monitoring nodes if an indirectly
monitored node has gone up or down. If a node is indicated lost, the
receiving node temporarily activates its own direct monitoring towards
that node in order to confirm, or not, that it is actually gone.
- Since each node is actively monitoring sqrt(N) downstream neighbors,
each node is also actively monitored by the same number of upstream
neighbors. This means that all non-direct monitoring nodes normally
will receive sqrt(N) indications that a node is gone.
- A major drawback with ring monitoring is how it handles failures that
cause massive network partitionings. If both a lost node and all its
direct monitoring neighbors are inside the lost partition, the nodes in
the remaining partition will never receive indications about the loss.
To overcome this, each node also chooses to actively monitor some
nodes outside its local domain. Those nodes are called remote domain
"heads", and are selected in such a way that no node in the cluster
will be more than two direct monitoring hops away. Because of this,
each node, apart from monitoring the member of its local domain, will
also typically monitor sqrt(N) remote head nodes.
- As an optimization, local list status, domain status and domain
records are marked with a generation number. This saves senders from
unnecessarily conveying unaltered domain records, and receivers from
performing unneeded re-adaptations of their node monitoring list, such
as re-assigning domain heads.
- As a measure of caution we have added the possibility to disable the
new algorithm through configuration. We do this by keeping a threshold
value for the cluster size; a cluster that grows beyond this value
will switch from full-mesh to ring monitoring, and vice versa when
it shrinks below the value. This means that if the threshold is set to
a value larger than any anticipated cluster size (default size is 32)
the new algorithm is effectively disabled. A patch set for altering the
threshold value and for listing the table contents will follow shortly.
- This change is fully backwards compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-14 08:46:22 +08:00
|
|
|
struct tipc_mon_state *mstate = &l->mon_state;
|
2015-07-31 06:24:20 +08:00
|
|
|
|
2018-12-19 10:17:57 +08:00
|
|
|
trace_tipc_link_timeout(l, TIPC_DUMP_NONE, " ");
|
|
|
|
trace_tipc_link_too_silent(l, TIPC_DUMP_ALL, " ");
|
2015-07-31 06:24:21 +08:00
|
|
|
switch (l->state) {
|
|
|
|
case LINK_ESTABLISHED:
|
|
|
|
case LINK_SYNCHING:
|
2016-04-16 01:33:05 +08:00
|
|
|
mtyp = STATE_MSG;
|
tipc: add neighbor monitoring framework
TIPC based clusters are by default set up with full-mesh link
connectivity between all nodes. Those links are expected to provide
a short failure detection time, by default set to 1500 ms. Because
of this, the background load for neighbor monitoring in an N-node
cluster increases with a factor N on each node, while the overall
monitoring traffic through the network infrastructure increases at
a ~(N * (N - 1)) rate. Experience has shown that such clusters don't
scale well beyond ~100 nodes unless we significantly increase failure
discovery tolerance.
This commit introduces a framework and an algorithm that drastically
reduces this background load, while basically maintaining the original
failure detection times across the whole cluster. Using this algorithm,
background load will now grow at a rate of ~(2 * sqrt(N)) per node, and
at ~(2 * N * sqrt(N)) in traffic overhead. As an example, each node will
now have to actively monitor 38 neighbors in a 400-node cluster, instead
of as before 399.
This "Overlapping Ring Supervision Algorithm" is completely distributed
and employs no centralized or coordinated state. It goes as follows:
- Each node makes up a linearly ascending, circular list of all its N
known neighbors, based on their TIPC node identity. This algorithm
must be the same on all nodes.
- The node then selects the next M = sqrt(N) - 1 nodes downstream from
itself in the list, and chooses to actively monitor those. This is
called its "local monitoring domain".
- It creates a domain record describing the monitoring domain, and
piggy-backs this in the data area of all neighbor monitoring messages
(LINK_PROTOCOL/STATE) leaving that node. This means that all nodes in
the cluster eventually (default within 400 ms) will learn about
its monitoring domain.
- Whenever a node discovers a change in its local domain, e.g., a node
has been added or has gone down, it creates and sends out a new
version of its node record to inform all neighbors about the change.
- A node receiving a domain record from anybody outside its local domain
matches this against its own list (which may not look the same), and
chooses to not actively monitor those members of the received domain
record that are also present in its own list. Instead, it relies on
indications from the direct monitoring nodes if an indirectly
monitored node has gone up or down. If a node is indicated lost, the
receiving node temporarily activates its own direct monitoring towards
that node in order to confirm, or not, that it is actually gone.
- Since each node is actively monitoring sqrt(N) downstream neighbors,
each node is also actively monitored by the same number of upstream
neighbors. This means that all non-direct monitoring nodes normally
will receive sqrt(N) indications that a node is gone.
- A major drawback with ring monitoring is how it handles failures that
cause massive network partitionings. If both a lost node and all its
direct monitoring neighbors are inside the lost partition, the nodes in
the remaining partition will never receive indications about the loss.
To overcome this, each node also chooses to actively monitor some
nodes outside its local domain. Those nodes are called remote domain
"heads", and are selected in such a way that no node in the cluster
will be more than two direct monitoring hops away. Because of this,
each node, apart from monitoring the member of its local domain, will
also typically monitor sqrt(N) remote head nodes.
- As an optimization, local list status, domain status and domain
records are marked with a generation number. This saves senders from
unnecessarily conveying unaltered domain records, and receivers from
performing unneeded re-adaptations of their node monitoring list, such
as re-assigning domain heads.
- As a measure of caution we have added the possibility to disable the
new algorithm through configuration. We do this by keeping a threshold
value for the cluster size; a cluster that grows beyond this value
will switch from full-mesh to ring monitoring, and vice versa when
it shrinks below the value. This means that if the threshold is set to
a value larger than any anticipated cluster size (default size is 32)
the new algorithm is effectively disabled. A patch set for altering the
threshold value and for listing the table contents will follow shortly.
- This change is fully backwards compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-14 08:46:22 +08:00
|
|
|
link_profile_stats(l);
|
|
|
|
tipc_mon_get_state(l->net, l->addr, mstate, l->bearer_id);
|
|
|
|
if (mstate->reset || (l->silent_intv_cnt > l->abort_limit))
|
|
|
|
return tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
|
2016-04-16 01:33:05 +08:00
|
|
|
state = bc_acked != bc_snt;
|
tipc: add neighbor monitoring framework
TIPC based clusters are by default set up with full-mesh link
connectivity between all nodes. Those links are expected to provide
a short failure detection time, by default set to 1500 ms. Because
of this, the background load for neighbor monitoring in an N-node
cluster increases with a factor N on each node, while the overall
monitoring traffic through the network infrastructure increases at
a ~(N * (N - 1)) rate. Experience has shown that such clusters don't
scale well beyond ~100 nodes unless we significantly increase failure
discovery tolerance.
This commit introduces a framework and an algorithm that drastically
reduces this background load, while basically maintaining the original
failure detection times across the whole cluster. Using this algorithm,
background load will now grow at a rate of ~(2 * sqrt(N)) per node, and
at ~(2 * N * sqrt(N)) in traffic overhead. As an example, each node will
now have to actively monitor 38 neighbors in a 400-node cluster, instead
of as before 399.
This "Overlapping Ring Supervision Algorithm" is completely distributed
and employs no centralized or coordinated state. It goes as follows:
- Each node makes up a linearly ascending, circular list of all its N
known neighbors, based on their TIPC node identity. This algorithm
must be the same on all nodes.
- The node then selects the next M = sqrt(N) - 1 nodes downstream from
itself in the list, and chooses to actively monitor those. This is
called its "local monitoring domain".
- It creates a domain record describing the monitoring domain, and
piggy-backs this in the data area of all neighbor monitoring messages
(LINK_PROTOCOL/STATE) leaving that node. This means that all nodes in
the cluster eventually (default within 400 ms) will learn about
its monitoring domain.
- Whenever a node discovers a change in its local domain, e.g., a node
has been added or has gone down, it creates and sends out a new
version of its node record to inform all neighbors about the change.
- A node receiving a domain record from anybody outside its local domain
matches this against its own list (which may not look the same), and
chooses to not actively monitor those members of the received domain
record that are also present in its own list. Instead, it relies on
indications from the direct monitoring nodes if an indirectly
monitored node has gone up or down. If a node is indicated lost, the
receiving node temporarily activates its own direct monitoring towards
that node in order to confirm, or not, that it is actually gone.
- Since each node is actively monitoring sqrt(N) downstream neighbors,
each node is also actively monitored by the same number of upstream
neighbors. This means that all non-direct monitoring nodes normally
will receive sqrt(N) indications that a node is gone.
- A major drawback with ring monitoring is how it handles failures that
cause massive network partitionings. If both a lost node and all its
direct monitoring neighbors are inside the lost partition, the nodes in
the remaining partition will never receive indications about the loss.
To overcome this, each node also chooses to actively monitor some
nodes outside its local domain. Those nodes are called remote domain
"heads", and are selected in such a way that no node in the cluster
will be more than two direct monitoring hops away. Because of this,
each node, apart from monitoring the member of its local domain, will
also typically monitor sqrt(N) remote head nodes.
- As an optimization, local list status, domain status and domain
records are marked with a generation number. This saves senders from
unnecessarily conveying unaltered domain records, and receivers from
performing unneeded re-adaptations of their node monitoring list, such
as re-assigning domain heads.
- As a measure of caution we have added the possibility to disable the
new algorithm through configuration. We do this by keeping a threshold
value for the cluster size; a cluster that grows beyond this value
will switch from full-mesh to ring monitoring, and vice versa when
it shrinks below the value. This means that if the threshold is set to
a value larger than any anticipated cluster size (default size is 32)
the new algorithm is effectively disabled. A patch set for altering the
threshold value and for listing the table contents will follow shortly.
- This change is fully backwards compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-14 08:46:22 +08:00
|
|
|
state |= l->bc_rcvlink->rcv_unacked;
|
|
|
|
state |= l->rcv_unacked;
|
|
|
|
state |= !skb_queue_empty(&l->transmq);
|
|
|
|
probe = mstate->probing;
|
|
|
|
probe |= l->silent_intv_cnt;
|
|
|
|
if (probe || mstate->monitoring)
|
|
|
|
l->silent_intv_cnt++;
|
2020-07-21 09:57:05 +08:00
|
|
|
probe |= !skb_queue_empty(&l->deferdq);
|
tipc: introduce variable window congestion control
We introduce a simple variable window congestion control for links.
The algorithm is inspired by the Reno algorithm, covering both 'slow
start', 'congestion avoidance', and 'fast recovery' modes.
- We introduce hard lower and upper window limits per link, still
different and configurable per bearer type.
- We introduce a 'slow start theshold' variable, initially set to
the maximum window size.
- We let a link start at the minimum congestion window, i.e. in slow
start mode, and then let is grow rapidly (+1 per rceived ACK) until
it reaches the slow start threshold and enters congestion avoidance
mode.
- In congestion avoidance mode we increment the congestion window for
each window-size number of acked packets, up to a possible maximum
equal to the configured maximum window.
- For each non-duplicate NACK received, we drop back to fast recovery
mode, by setting the both the slow start threshold to and the
congestion window to (current_congestion_window / 2).
- If the timeout handler finds that the transmit queue has not moved
since the previous timeout, it drops the link back to slow start
and forces a probe containing the last sent sequence number to the
sent to the peer, so that this can discover the stale situation.
This change does in reality have effect only on unicast ethernet
transport, as we have seen that there is no room whatsoever for
increasing the window max size for the UDP bearer.
For now, we also choose to keep the limits for the broadcast link
unchanged and equal.
This algorithm seems to give a 50-100% throughput improvement for
messages larger than MTU.
Suggested-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 07:52:46 +08:00
|
|
|
if (l->snd_nxt == l->checkpoint) {
|
|
|
|
tipc_link_update_cwin(l, 0, 0);
|
|
|
|
probe = true;
|
|
|
|
}
|
|
|
|
l->checkpoint = l->snd_nxt;
|
2015-07-31 06:24:21 +08:00
|
|
|
break;
|
|
|
|
case LINK_RESET:
|
2016-04-16 01:33:05 +08:00
|
|
|
setup = l->rst_cnt++ <= 4;
|
|
|
|
setup |= !(l->rst_cnt % 16);
|
2015-07-31 06:24:20 +08:00
|
|
|
mtyp = RESET_MSG;
|
2015-07-31 06:24:21 +08:00
|
|
|
break;
|
|
|
|
case LINK_ESTABLISHING:
|
2016-04-16 01:33:05 +08:00
|
|
|
setup = true;
|
2015-07-31 06:24:20 +08:00
|
|
|
mtyp = ACTIVATE_MSG;
|
2015-07-31 06:24:21 +08:00
|
|
|
break;
|
|
|
|
case LINK_PEER_RESET:
|
2015-07-31 06:24:23 +08:00
|
|
|
case LINK_RESETTING:
|
2015-07-31 06:24:21 +08:00
|
|
|
case LINK_FAILINGOVER:
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
break;
|
2015-07-31 06:24:20 +08:00
|
|
|
}
|
2015-07-31 06:24:21 +08:00
|
|
|
|
2016-04-16 01:33:05 +08:00
|
|
|
if (state || probe || setup)
|
tipc: improve link resiliency when rps is activated
Currently, the TIPC RPS dissector is based only on the incoming packets'
source node address, hence steering all traffic from a node to the same
core. We have seen that this makes the links vulnerable to starvation
and unnecessary resets when we turn down the link tolerance to very low
values.
To reduce the risk of this happening, we exempt probe and probe replies
packets from the convergence to one core per source node. Instead, we do
the opposite, - we try to diverge those packets across as many cores as
possible, by randomizing the flow selector key.
To make such packets identifiable to the dissector, we add a new
'is_keepalive' bit to word 0 of the LINK_PROTOCOL header. This bit is
set both for PROBE and PROBE_REPLY messages, and only for those.
It should be noted that these packets are not part of any flow anyway,
and only constitute a minuscule fraction of all packets sent across a
link. Hence, there is no risk that this will affect overall performance.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-11-08 16:59:26 +08:00
|
|
|
tipc_link_build_proto_msg(l, mtyp, probe, 0, 0, 0, 0, xmitq);
|
2015-07-31 06:24:20 +08:00
|
|
|
|
2015-07-17 04:54:28 +08:00
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2006-01-03 02:04:38 +08:00
|
|
|
/**
|
2015-03-26 00:07:25 +08:00
|
|
|
* link_schedule_user - schedule a message sender for wakeup after congestion
|
tipc: reduce risk of user starvation during link congestion
The socket code currently handles link congestion by either blocking
and trying to send again when the congestion has abated, or just
returning to the user with -EAGAIN and let him re-try later.
This mechanism is prone to starvation, because the wakeup algorithm is
non-atomic. During the time the link issues a wakeup signal, until the
socket wakes up and re-attempts sending, other senders may have come
in between and occupied the free buffer space in the link. This in turn
may lead to a socket having to make many send attempts before it is
successful. In extremely loaded systems we have observed latency times
of several seconds before a low-priority socket is able to send out a
message.
In this commit, we simplify this mechanism and reduce the risk of the
described scenario happening. When a message is attempted sent via a
congested link, we now let it be added to the link's backlog queue
anyway, thus permitting an oversubscription of one message per source
socket. We still create a wakeup item and return an error code, hence
instructing the sender to block or stop sending. Only when enough space
has been freed up in the link's backlog queue do we issue a wakeup event
that allows the sender to continue with the next message, if any.
The fact that a socket now can consider a message sent even when the
link returns a congestion code means that the sending socket code can
be simplified. Also, since this is a good opportunity to get rid of the
obsolete 'mtu change' condition in the three socket send functions, we
now choose to refactor those functions completely.
Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-01-03 23:55:11 +08:00
|
|
|
* @l: congested link
|
|
|
|
* @hdr: header of message that is being sent
|
2014-08-23 06:09:07 +08:00
|
|
|
* Create pseudo msg to send back to user when congestion abates
|
2006-01-03 02:04:38 +08:00
|
|
|
*/
|
tipc: reduce risk of user starvation during link congestion
The socket code currently handles link congestion by either blocking
and trying to send again when the congestion has abated, or just
returning to the user with -EAGAIN and let him re-try later.
This mechanism is prone to starvation, because the wakeup algorithm is
non-atomic. During the time the link issues a wakeup signal, until the
socket wakes up and re-attempts sending, other senders may have come
in between and occupied the free buffer space in the link. This in turn
may lead to a socket having to make many send attempts before it is
successful. In extremely loaded systems we have observed latency times
of several seconds before a low-priority socket is able to send out a
message.
In this commit, we simplify this mechanism and reduce the risk of the
described scenario happening. When a message is attempted sent via a
congested link, we now let it be added to the link's backlog queue
anyway, thus permitting an oversubscription of one message per source
socket. We still create a wakeup item and return an error code, hence
instructing the sender to block or stop sending. Only when enough space
has been freed up in the link's backlog queue do we issue a wakeup event
that allows the sender to continue with the next message, if any.
The fact that a socket now can consider a message sent even when the
link returns a congestion code means that the sending socket code can
be simplified. Also, since this is a good opportunity to get rid of the
obsolete 'mtu change' condition in the three socket send functions, we
now choose to refactor those functions completely.
Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-01-03 23:55:11 +08:00
|
|
|
static int link_schedule_user(struct tipc_link *l, struct tipc_msg *hdr)
|
2006-01-03 02:04:38 +08:00
|
|
|
{
|
tipc: reduce risk of user starvation during link congestion
The socket code currently handles link congestion by either blocking
and trying to send again when the congestion has abated, or just
returning to the user with -EAGAIN and let him re-try later.
This mechanism is prone to starvation, because the wakeup algorithm is
non-atomic. During the time the link issues a wakeup signal, until the
socket wakes up and re-attempts sending, other senders may have come
in between and occupied the free buffer space in the link. This in turn
may lead to a socket having to make many send attempts before it is
successful. In extremely loaded systems we have observed latency times
of several seconds before a low-priority socket is able to send out a
message.
In this commit, we simplify this mechanism and reduce the risk of the
described scenario happening. When a message is attempted sent via a
congested link, we now let it be added to the link's backlog queue
anyway, thus permitting an oversubscription of one message per source
socket. We still create a wakeup item and return an error code, hence
instructing the sender to block or stop sending. Only when enough space
has been freed up in the link's backlog queue do we issue a wakeup event
that allows the sender to continue with the next message, if any.
The fact that a socket now can consider a message sent even when the
link returns a congestion code means that the sending socket code can
be simplified. Also, since this is a good opportunity to get rid of the
obsolete 'mtu change' condition in the three socket send functions, we
now choose to refactor those functions completely.
Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-01-03 23:55:11 +08:00
|
|
|
u32 dnode = tipc_own_addr(l->net);
|
|
|
|
u32 dport = msg_origport(hdr);
|
2015-03-26 00:07:25 +08:00
|
|
|
struct sk_buff *skb;
|
|
|
|
|
|
|
|
/* Create and schedule wakeup pseudo message */
|
|
|
|
skb = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0,
|
tipc: reduce risk of user starvation during link congestion
The socket code currently handles link congestion by either blocking
and trying to send again when the congestion has abated, or just
returning to the user with -EAGAIN and let him re-try later.
This mechanism is prone to starvation, because the wakeup algorithm is
non-atomic. During the time the link issues a wakeup signal, until the
socket wakes up and re-attempts sending, other senders may have come
in between and occupied the free buffer space in the link. This in turn
may lead to a socket having to make many send attempts before it is
successful. In extremely loaded systems we have observed latency times
of several seconds before a low-priority socket is able to send out a
message.
In this commit, we simplify this mechanism and reduce the risk of the
described scenario happening. When a message is attempted sent via a
congested link, we now let it be added to the link's backlog queue
anyway, thus permitting an oversubscription of one message per source
socket. We still create a wakeup item and return an error code, hence
instructing the sender to block or stop sending. Only when enough space
has been freed up in the link's backlog queue do we issue a wakeup event
that allows the sender to continue with the next message, if any.
The fact that a socket now can consider a message sent even when the
link returns a congestion code means that the sending socket code can
be simplified. Also, since this is a good opportunity to get rid of the
obsolete 'mtu change' condition in the three socket send functions, we
now choose to refactor those functions completely.
Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-01-03 23:55:11 +08:00
|
|
|
dnode, l->addr, dport, 0, 0);
|
2015-03-26 00:07:25 +08:00
|
|
|
if (!skb)
|
2021-08-11 09:22:09 +08:00
|
|
|
return -ENOBUFS;
|
tipc: reduce risk of user starvation during link congestion
The socket code currently handles link congestion by either blocking
and trying to send again when the congestion has abated, or just
returning to the user with -EAGAIN and let him re-try later.
This mechanism is prone to starvation, because the wakeup algorithm is
non-atomic. During the time the link issues a wakeup signal, until the
socket wakes up and re-attempts sending, other senders may have come
in between and occupied the free buffer space in the link. This in turn
may lead to a socket having to make many send attempts before it is
successful. In extremely loaded systems we have observed latency times
of several seconds before a low-priority socket is able to send out a
message.
In this commit, we simplify this mechanism and reduce the risk of the
described scenario happening. When a message is attempted sent via a
congested link, we now let it be added to the link's backlog queue
anyway, thus permitting an oversubscription of one message per source
socket. We still create a wakeup item and return an error code, hence
instructing the sender to block or stop sending. Only when enough space
has been freed up in the link's backlog queue do we issue a wakeup event
that allows the sender to continue with the next message, if any.
The fact that a socket now can consider a message sent even when the
link returns a congestion code means that the sending socket code can
be simplified. Also, since this is a good opportunity to get rid of the
obsolete 'mtu change' condition in the three socket send functions, we
now choose to refactor those functions completely.
Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-01-03 23:55:11 +08:00
|
|
|
msg_set_dest_droppable(buf_msg(skb), true);
|
|
|
|
TIPC_SKB_CB(skb)->chain_imp = msg_importance(hdr);
|
|
|
|
skb_queue_tail(&l->wakeupq, skb);
|
|
|
|
l->stats.link_congs++;
|
2018-12-19 10:17:57 +08:00
|
|
|
trace_tipc_link_conges(l, TIPC_DUMP_ALL, "wakeup scheduled!");
|
2015-03-26 00:07:25 +08:00
|
|
|
return -ELINKCONG;
|
2006-01-03 02:04:38 +08:00
|
|
|
}
|
|
|
|
|
2014-08-23 06:09:07 +08:00
|
|
|
/**
|
|
|
|
* link_prepare_wakeup - prepare users for wakeup after congestion
|
tipc: reduce risk of user starvation during link congestion
The socket code currently handles link congestion by either blocking
and trying to send again when the congestion has abated, or just
returning to the user with -EAGAIN and let him re-try later.
This mechanism is prone to starvation, because the wakeup algorithm is
non-atomic. During the time the link issues a wakeup signal, until the
socket wakes up and re-attempts sending, other senders may have come
in between and occupied the free buffer space in the link. This in turn
may lead to a socket having to make many send attempts before it is
successful. In extremely loaded systems we have observed latency times
of several seconds before a low-priority socket is able to send out a
message.
In this commit, we simplify this mechanism and reduce the risk of the
described scenario happening. When a message is attempted sent via a
congested link, we now let it be added to the link's backlog queue
anyway, thus permitting an oversubscription of one message per source
socket. We still create a wakeup item and return an error code, hence
instructing the sender to block or stop sending. Only when enough space
has been freed up in the link's backlog queue do we issue a wakeup event
that allows the sender to continue with the next message, if any.
The fact that a socket now can consider a message sent even when the
link returns a congestion code means that the sending socket code can
be simplified. Also, since this is a good opportunity to get rid of the
obsolete 'mtu change' condition in the three socket send functions, we
now choose to refactor those functions completely.
Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-01-03 23:55:11 +08:00
|
|
|
* @l: congested link
|
|
|
|
* Wake up a number of waiting users, as permitted by available space
|
|
|
|
* in the send queue
|
2014-08-23 06:09:07 +08:00
|
|
|
*/
|
2018-07-19 17:16:59 +08:00
|
|
|
static void link_prepare_wakeup(struct tipc_link *l)
|
2006-01-03 02:04:38 +08:00
|
|
|
{
|
2019-07-30 22:23:18 +08:00
|
|
|
struct sk_buff_head *wakeupq = &l->wakeupq;
|
|
|
|
struct sk_buff_head *inputq = l->inputq;
|
2014-11-26 11:41:51 +08:00
|
|
|
struct sk_buff *skb, *tmp;
|
2019-07-30 22:23:18 +08:00
|
|
|
struct sk_buff_head tmpq;
|
|
|
|
int avail[5] = {0,};
|
|
|
|
int imp = 0;
|
2014-08-23 06:09:07 +08:00
|
|
|
|
2019-07-30 22:23:18 +08:00
|
|
|
__skb_queue_head_init(&tmpq);
|
|
|
|
|
|
|
|
for (; imp <= TIPC_SYSTEM_IMPORTANCE; imp++)
|
|
|
|
avail[imp] = l->backlog[imp].limit - l->backlog[imp].len;
|
|
|
|
|
|
|
|
skb_queue_walk_safe(wakeupq, skb, tmp) {
|
tipc: introduce starvation free send algorithm
Currently, we only use a single counter; the length of the backlog
queue, to determine whether a message should be accepted to the queue
or not. Each time a message is being sent, the queue length is compared
to a threshold value for the message's importance priority. If the queue
length is beyond this threshold, the message is rejected. This algorithm
implies a risk of starvation of low importance senders during very high
load, because it may take a long time before the backlog queue has
decreased enough to accept a lower level message.
We now eliminate this risk by introducing a counter for each importance
priority. When a message is sent, we check only the queue level for that
particular message's priority. If that is ok, the message can be added
to the backlog, irrespective of the queue level for other priorities.
This way, each level is guaranteed a certain portion of the total
bandwidth, and any risk of starvation is eliminated.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Reviewed-by: Erik Hugne <erik.hugne@ericsson.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-03-26 00:07:24 +08:00
|
|
|
imp = TIPC_SKB_CB(skb)->chain_imp;
|
2019-07-30 22:23:18 +08:00
|
|
|
if (avail[imp] <= 0)
|
|
|
|
continue;
|
|
|
|
avail[imp]--;
|
|
|
|
__skb_unlink(skb, wakeupq);
|
|
|
|
__skb_queue_tail(&tmpq, skb);
|
2006-01-03 02:04:38 +08:00
|
|
|
}
|
2019-07-30 22:23:18 +08:00
|
|
|
|
|
|
|
spin_lock_bh(&inputq->lock);
|
|
|
|
skb_queue_splice_tail(&tmpq, inputq);
|
|
|
|
spin_unlock_bh(&inputq->lock);
|
|
|
|
|
2006-01-03 02:04:38 +08:00
|
|
|
}
|
|
|
|
|
2020-07-09 05:06:44 +08:00
|
|
|
/**
|
|
|
|
* tipc_link_set_skb_retransmit_time - set the time at which retransmission of
|
|
|
|
* the given skb should be next attempted
|
|
|
|
* @skb: skb to set a future retransmission time for
|
|
|
|
* @l: link the skb will be transmitted on
|
|
|
|
*/
|
|
|
|
static void tipc_link_set_skb_retransmit_time(struct sk_buff *skb,
|
|
|
|
struct tipc_link *l)
|
|
|
|
{
|
|
|
|
if (link_is_bc_sndlink(l))
|
|
|
|
TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM;
|
|
|
|
else
|
|
|
|
TIPC_SKB_CB(skb)->nxt_retr = TIPC_UC_RETR_TIME;
|
|
|
|
}
|
|
|
|
|
2015-07-31 06:24:19 +08:00
|
|
|
void tipc_link_reset(struct tipc_link *l)
|
2006-01-03 02:04:38 +08:00
|
|
|
{
|
tipc: eliminate possible recursive locking detected by LOCKDEP
When booting kernel with LOCKDEP option, below warning info was found:
WARNING: possible recursive locking detected
4.19.0-rc7+ #14 Not tainted
--------------------------------------------
swapper/0/1 is trying to acquire lock:
00000000dcfc0fc8 (&(&list->lock)->rlock#4){+...}, at: spin_lock_bh
include/linux/spinlock.h:334 [inline]
00000000dcfc0fc8 (&(&list->lock)->rlock#4){+...}, at:
tipc_link_reset+0x125/0xdf0 net/tipc/link.c:850
but task is already holding lock:
00000000cbb9b036 (&(&list->lock)->rlock#4){+...}, at: spin_lock_bh
include/linux/spinlock.h:334 [inline]
00000000cbb9b036 (&(&list->lock)->rlock#4){+...}, at:
tipc_link_reset+0xfa/0xdf0 net/tipc/link.c:849
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0
----
lock(&(&list->lock)->rlock#4);
lock(&(&list->lock)->rlock#4);
*** DEADLOCK ***
May be due to missing lock nesting notation
2 locks held by swapper/0/1:
#0: 00000000f7539d34 (pernet_ops_rwsem){+.+.}, at:
register_pernet_subsys+0x19/0x40 net/core/net_namespace.c:1051
#1: 00000000cbb9b036 (&(&list->lock)->rlock#4){+...}, at:
spin_lock_bh include/linux/spinlock.h:334 [inline]
#1: 00000000cbb9b036 (&(&list->lock)->rlock#4){+...}, at:
tipc_link_reset+0xfa/0xdf0 net/tipc/link.c:849
stack backtrace:
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.19.0-rc7+ #14
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
Call Trace:
__dump_stack lib/dump_stack.c:77 [inline]
dump_stack+0x1af/0x295 lib/dump_stack.c:113
print_deadlock_bug kernel/locking/lockdep.c:1759 [inline]
check_deadlock kernel/locking/lockdep.c:1803 [inline]
validate_chain kernel/locking/lockdep.c:2399 [inline]
__lock_acquire+0xf1e/0x3c60 kernel/locking/lockdep.c:3411
lock_acquire+0x1db/0x520 kernel/locking/lockdep.c:3900
__raw_spin_lock_bh include/linux/spinlock_api_smp.h:135 [inline]
_raw_spin_lock_bh+0x31/0x40 kernel/locking/spinlock.c:168
spin_lock_bh include/linux/spinlock.h:334 [inline]
tipc_link_reset+0x125/0xdf0 net/tipc/link.c:850
tipc_link_bc_create+0xb5/0x1f0 net/tipc/link.c:526
tipc_bcast_init+0x59b/0xab0 net/tipc/bcast.c:521
tipc_init_net+0x472/0x610 net/tipc/core.c:82
ops_init+0xf7/0x520 net/core/net_namespace.c:129
__register_pernet_operations net/core/net_namespace.c:940 [inline]
register_pernet_operations+0x453/0xac0 net/core/net_namespace.c:1011
register_pernet_subsys+0x28/0x40 net/core/net_namespace.c:1052
tipc_init+0x83/0x104 net/tipc/core.c:140
do_one_initcall+0x109/0x70a init/main.c:885
do_initcall_level init/main.c:953 [inline]
do_initcalls init/main.c:961 [inline]
do_basic_setup init/main.c:979 [inline]
kernel_init_freeable+0x4bd/0x57f init/main.c:1144
kernel_init+0x13/0x180 init/main.c:1063
ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:413
The reason why the noise above was complained by LOCKDEP is because we
nested to hold l->wakeupq.lock and l->inputq->lock in tipc_link_reset
function. In fact it's unnecessary to move skb buffer from l->wakeupq
queue to l->inputq queue while holding the two locks at the same time.
Instead, we can move skb buffers in l->wakeupq queue to a temporary
list first and then move the buffers of the temporary list to l->inputq
queue, which is also safe for us.
Fixes: 3f32d0be6c16 ("tipc: lock wakeup & inputq at tipc_link_reset()")
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-10-11 19:57:56 +08:00
|
|
|
struct sk_buff_head list;
|
tipc: fix unlimited bundling of small messages
We have identified a problem with the "oversubscription" policy in the
link transmission code.
When small messages are transmitted, and the sending link has reached
the transmit window limit, those messages will be bundled and put into
the link backlog queue. However, bundles of data messages are counted
at the 'CRITICAL' level, so that the counter for that level, instead of
the counter for the real, bundled message's level is the one being
increased.
Subsequent, to-be-bundled data messages at non-CRITICAL levels continue
to be tested against the unchanged counter for their own level, while
contributing to an unrestrained increase at the CRITICAL backlog level.
This leaves a gap in congestion control algorithm for small messages
that can result in starvation for other users or a "real" CRITICAL
user. Even that eventually can lead to buffer exhaustion & link reset.
We fix this by keeping a 'target_bskb' buffer pointer at each levels,
then when bundling, we only bundle messages at the same importance
level only. This way, we know exactly how many slots a certain level
have occupied in the queue, so can manage level congestion accurately.
By bundling messages at the same level, we even have more benefits. Let
consider this:
- One socket sends 64-byte messages at the 'CRITICAL' level;
- Another sends 4096-byte messages at the 'LOW' level;
When a 64-byte message comes and is bundled the first time, we put the
overhead of message bundle to it (+ 40-byte header, data copy, etc.)
for later use, but the next message can be a 4096-byte one that cannot
be bundled to the previous one. This means the last bundle carries only
one payload message which is totally inefficient, as for the receiver
also! Later on, another 64-byte message comes, now we make a new bundle
and the same story repeats...
With the new bundling algorithm, this will not happen, the 64-byte
messages will be bundled together even when the 4096-byte message(s)
comes in between. However, if the 4096-byte messages are sent at the
same level i.e. 'CRITICAL', the bundling algorithm will again cause the
same overhead.
Also, the same will happen even with only one socket sending small
messages at a rate close to the link transmit's one, so that, when one
message is bundled, it's transmitted shortly. Then, another message
comes, a new bundle is created and so on...
We will solve this issue radically by another patch.
Fixes: 365ad353c256 ("tipc: reduce risk of user starvation during link congestion")
Reported-by: Hoang Le <hoang.h.le@dektech.com.au>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-10-02 19:49:43 +08:00
|
|
|
u32 imp;
|
tipc: eliminate possible recursive locking detected by LOCKDEP
When booting kernel with LOCKDEP option, below warning info was found:
WARNING: possible recursive locking detected
4.19.0-rc7+ #14 Not tainted
--------------------------------------------
swapper/0/1 is trying to acquire lock:
00000000dcfc0fc8 (&(&list->lock)->rlock#4){+...}, at: spin_lock_bh
include/linux/spinlock.h:334 [inline]
00000000dcfc0fc8 (&(&list->lock)->rlock#4){+...}, at:
tipc_link_reset+0x125/0xdf0 net/tipc/link.c:850
but task is already holding lock:
00000000cbb9b036 (&(&list->lock)->rlock#4){+...}, at: spin_lock_bh
include/linux/spinlock.h:334 [inline]
00000000cbb9b036 (&(&list->lock)->rlock#4){+...}, at:
tipc_link_reset+0xfa/0xdf0 net/tipc/link.c:849
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0
----
lock(&(&list->lock)->rlock#4);
lock(&(&list->lock)->rlock#4);
*** DEADLOCK ***
May be due to missing lock nesting notation
2 locks held by swapper/0/1:
#0: 00000000f7539d34 (pernet_ops_rwsem){+.+.}, at:
register_pernet_subsys+0x19/0x40 net/core/net_namespace.c:1051
#1: 00000000cbb9b036 (&(&list->lock)->rlock#4){+...}, at:
spin_lock_bh include/linux/spinlock.h:334 [inline]
#1: 00000000cbb9b036 (&(&list->lock)->rlock#4){+...}, at:
tipc_link_reset+0xfa/0xdf0 net/tipc/link.c:849
stack backtrace:
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.19.0-rc7+ #14
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
Call Trace:
__dump_stack lib/dump_stack.c:77 [inline]
dump_stack+0x1af/0x295 lib/dump_stack.c:113
print_deadlock_bug kernel/locking/lockdep.c:1759 [inline]
check_deadlock kernel/locking/lockdep.c:1803 [inline]
validate_chain kernel/locking/lockdep.c:2399 [inline]
__lock_acquire+0xf1e/0x3c60 kernel/locking/lockdep.c:3411
lock_acquire+0x1db/0x520 kernel/locking/lockdep.c:3900
__raw_spin_lock_bh include/linux/spinlock_api_smp.h:135 [inline]
_raw_spin_lock_bh+0x31/0x40 kernel/locking/spinlock.c:168
spin_lock_bh include/linux/spinlock.h:334 [inline]
tipc_link_reset+0x125/0xdf0 net/tipc/link.c:850
tipc_link_bc_create+0xb5/0x1f0 net/tipc/link.c:526
tipc_bcast_init+0x59b/0xab0 net/tipc/bcast.c:521
tipc_init_net+0x472/0x610 net/tipc/core.c:82
ops_init+0xf7/0x520 net/core/net_namespace.c:129
__register_pernet_operations net/core/net_namespace.c:940 [inline]
register_pernet_operations+0x453/0xac0 net/core/net_namespace.c:1011
register_pernet_subsys+0x28/0x40 net/core/net_namespace.c:1052
tipc_init+0x83/0x104 net/tipc/core.c:140
do_one_initcall+0x109/0x70a init/main.c:885
do_initcall_level init/main.c:953 [inline]
do_initcalls init/main.c:961 [inline]
do_basic_setup init/main.c:979 [inline]
kernel_init_freeable+0x4bd/0x57f init/main.c:1144
kernel_init+0x13/0x180 init/main.c:1063
ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:413
The reason why the noise above was complained by LOCKDEP is because we
nested to hold l->wakeupq.lock and l->inputq->lock in tipc_link_reset
function. In fact it's unnecessary to move skb buffer from l->wakeupq
queue to l->inputq queue while holding the two locks at the same time.
Instead, we can move skb buffers in l->wakeupq queue to a temporary
list first and then move the buffers of the temporary list to l->inputq
queue, which is also safe for us.
Fixes: 3f32d0be6c16 ("tipc: lock wakeup & inputq at tipc_link_reset()")
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-10-11 19:57:56 +08:00
|
|
|
|
|
|
|
__skb_queue_head_init(&list);
|
|
|
|
|
2018-07-10 07:07:36 +08:00
|
|
|
l->in_session = false;
|
2019-04-16 11:48:07 +08:00
|
|
|
/* Force re-synch of peer session number before establishing */
|
|
|
|
l->peer_session--;
|
2016-03-04 03:23:21 +08:00
|
|
|
l->session++;
|
2015-07-31 06:24:19 +08:00
|
|
|
l->mtu = l->advertised_mtu;
|
tipc: eliminate possible recursive locking detected by LOCKDEP
When booting kernel with LOCKDEP option, below warning info was found:
WARNING: possible recursive locking detected
4.19.0-rc7+ #14 Not tainted
--------------------------------------------
swapper/0/1 is trying to acquire lock:
00000000dcfc0fc8 (&(&list->lock)->rlock#4){+...}, at: spin_lock_bh
include/linux/spinlock.h:334 [inline]
00000000dcfc0fc8 (&(&list->lock)->rlock#4){+...}, at:
tipc_link_reset+0x125/0xdf0 net/tipc/link.c:850
but task is already holding lock:
00000000cbb9b036 (&(&list->lock)->rlock#4){+...}, at: spin_lock_bh
include/linux/spinlock.h:334 [inline]
00000000cbb9b036 (&(&list->lock)->rlock#4){+...}, at:
tipc_link_reset+0xfa/0xdf0 net/tipc/link.c:849
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0
----
lock(&(&list->lock)->rlock#4);
lock(&(&list->lock)->rlock#4);
*** DEADLOCK ***
May be due to missing lock nesting notation
2 locks held by swapper/0/1:
#0: 00000000f7539d34 (pernet_ops_rwsem){+.+.}, at:
register_pernet_subsys+0x19/0x40 net/core/net_namespace.c:1051
#1: 00000000cbb9b036 (&(&list->lock)->rlock#4){+...}, at:
spin_lock_bh include/linux/spinlock.h:334 [inline]
#1: 00000000cbb9b036 (&(&list->lock)->rlock#4){+...}, at:
tipc_link_reset+0xfa/0xdf0 net/tipc/link.c:849
stack backtrace:
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.19.0-rc7+ #14
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
Call Trace:
__dump_stack lib/dump_stack.c:77 [inline]
dump_stack+0x1af/0x295 lib/dump_stack.c:113
print_deadlock_bug kernel/locking/lockdep.c:1759 [inline]
check_deadlock kernel/locking/lockdep.c:1803 [inline]
validate_chain kernel/locking/lockdep.c:2399 [inline]
__lock_acquire+0xf1e/0x3c60 kernel/locking/lockdep.c:3411
lock_acquire+0x1db/0x520 kernel/locking/lockdep.c:3900
__raw_spin_lock_bh include/linux/spinlock_api_smp.h:135 [inline]
_raw_spin_lock_bh+0x31/0x40 kernel/locking/spinlock.c:168
spin_lock_bh include/linux/spinlock.h:334 [inline]
tipc_link_reset+0x125/0xdf0 net/tipc/link.c:850
tipc_link_bc_create+0xb5/0x1f0 net/tipc/link.c:526
tipc_bcast_init+0x59b/0xab0 net/tipc/bcast.c:521
tipc_init_net+0x472/0x610 net/tipc/core.c:82
ops_init+0xf7/0x520 net/core/net_namespace.c:129
__register_pernet_operations net/core/net_namespace.c:940 [inline]
register_pernet_operations+0x453/0xac0 net/core/net_namespace.c:1011
register_pernet_subsys+0x28/0x40 net/core/net_namespace.c:1052
tipc_init+0x83/0x104 net/tipc/core.c:140
do_one_initcall+0x109/0x70a init/main.c:885
do_initcall_level init/main.c:953 [inline]
do_initcalls init/main.c:961 [inline]
do_basic_setup init/main.c:979 [inline]
kernel_init_freeable+0x4bd/0x57f init/main.c:1144
kernel_init+0x13/0x180 init/main.c:1063
ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:413
The reason why the noise above was complained by LOCKDEP is because we
nested to hold l->wakeupq.lock and l->inputq->lock in tipc_link_reset
function. In fact it's unnecessary to move skb buffer from l->wakeupq
queue to l->inputq queue while holding the two locks at the same time.
Instead, we can move skb buffers in l->wakeupq queue to a temporary
list first and then move the buffers of the temporary list to l->inputq
queue, which is also safe for us.
Fixes: 3f32d0be6c16 ("tipc: lock wakeup & inputq at tipc_link_reset()")
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-10-11 19:57:56 +08:00
|
|
|
|
2018-09-26 04:09:10 +08:00
|
|
|
spin_lock_bh(&l->wakeupq.lock);
|
tipc: eliminate possible recursive locking detected by LOCKDEP
When booting kernel with LOCKDEP option, below warning info was found:
WARNING: possible recursive locking detected
4.19.0-rc7+ #14 Not tainted
--------------------------------------------
swapper/0/1 is trying to acquire lock:
00000000dcfc0fc8 (&(&list->lock)->rlock#4){+...}, at: spin_lock_bh
include/linux/spinlock.h:334 [inline]
00000000dcfc0fc8 (&(&list->lock)->rlock#4){+...}, at:
tipc_link_reset+0x125/0xdf0 net/tipc/link.c:850
but task is already holding lock:
00000000cbb9b036 (&(&list->lock)->rlock#4){+...}, at: spin_lock_bh
include/linux/spinlock.h:334 [inline]
00000000cbb9b036 (&(&list->lock)->rlock#4){+...}, at:
tipc_link_reset+0xfa/0xdf0 net/tipc/link.c:849
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0
----
lock(&(&list->lock)->rlock#4);
lock(&(&list->lock)->rlock#4);
*** DEADLOCK ***
May be due to missing lock nesting notation
2 locks held by swapper/0/1:
#0: 00000000f7539d34 (pernet_ops_rwsem){+.+.}, at:
register_pernet_subsys+0x19/0x40 net/core/net_namespace.c:1051
#1: 00000000cbb9b036 (&(&list->lock)->rlock#4){+...}, at:
spin_lock_bh include/linux/spinlock.h:334 [inline]
#1: 00000000cbb9b036 (&(&list->lock)->rlock#4){+...}, at:
tipc_link_reset+0xfa/0xdf0 net/tipc/link.c:849
stack backtrace:
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.19.0-rc7+ #14
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
Call Trace:
__dump_stack lib/dump_stack.c:77 [inline]
dump_stack+0x1af/0x295 lib/dump_stack.c:113
print_deadlock_bug kernel/locking/lockdep.c:1759 [inline]
check_deadlock kernel/locking/lockdep.c:1803 [inline]
validate_chain kernel/locking/lockdep.c:2399 [inline]
__lock_acquire+0xf1e/0x3c60 kernel/locking/lockdep.c:3411
lock_acquire+0x1db/0x520 kernel/locking/lockdep.c:3900
__raw_spin_lock_bh include/linux/spinlock_api_smp.h:135 [inline]
_raw_spin_lock_bh+0x31/0x40 kernel/locking/spinlock.c:168
spin_lock_bh include/linux/spinlock.h:334 [inline]
tipc_link_reset+0x125/0xdf0 net/tipc/link.c:850
tipc_link_bc_create+0xb5/0x1f0 net/tipc/link.c:526
tipc_bcast_init+0x59b/0xab0 net/tipc/bcast.c:521
tipc_init_net+0x472/0x610 net/tipc/core.c:82
ops_init+0xf7/0x520 net/core/net_namespace.c:129
__register_pernet_operations net/core/net_namespace.c:940 [inline]
register_pernet_operations+0x453/0xac0 net/core/net_namespace.c:1011
register_pernet_subsys+0x28/0x40 net/core/net_namespace.c:1052
tipc_init+0x83/0x104 net/tipc/core.c:140
do_one_initcall+0x109/0x70a init/main.c:885
do_initcall_level init/main.c:953 [inline]
do_initcalls init/main.c:961 [inline]
do_basic_setup init/main.c:979 [inline]
kernel_init_freeable+0x4bd/0x57f init/main.c:1144
kernel_init+0x13/0x180 init/main.c:1063
ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:413
The reason why the noise above was complained by LOCKDEP is because we
nested to hold l->wakeupq.lock and l->inputq->lock in tipc_link_reset
function. In fact it's unnecessary to move skb buffer from l->wakeupq
queue to l->inputq queue while holding the two locks at the same time.
Instead, we can move skb buffers in l->wakeupq queue to a temporary
list first and then move the buffers of the temporary list to l->inputq
queue, which is also safe for us.
Fixes: 3f32d0be6c16 ("tipc: lock wakeup & inputq at tipc_link_reset()")
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-10-11 19:57:56 +08:00
|
|
|
skb_queue_splice_init(&l->wakeupq, &list);
|
|
|
|
spin_unlock_bh(&l->wakeupq.lock);
|
|
|
|
|
2018-09-26 04:09:10 +08:00
|
|
|
spin_lock_bh(&l->inputq->lock);
|
tipc: eliminate possible recursive locking detected by LOCKDEP
When booting kernel with LOCKDEP option, below warning info was found:
WARNING: possible recursive locking detected
4.19.0-rc7+ #14 Not tainted
--------------------------------------------
swapper/0/1 is trying to acquire lock:
00000000dcfc0fc8 (&(&list->lock)->rlock#4){+...}, at: spin_lock_bh
include/linux/spinlock.h:334 [inline]
00000000dcfc0fc8 (&(&list->lock)->rlock#4){+...}, at:
tipc_link_reset+0x125/0xdf0 net/tipc/link.c:850
but task is already holding lock:
00000000cbb9b036 (&(&list->lock)->rlock#4){+...}, at: spin_lock_bh
include/linux/spinlock.h:334 [inline]
00000000cbb9b036 (&(&list->lock)->rlock#4){+...}, at:
tipc_link_reset+0xfa/0xdf0 net/tipc/link.c:849
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0
----
lock(&(&list->lock)->rlock#4);
lock(&(&list->lock)->rlock#4);
*** DEADLOCK ***
May be due to missing lock nesting notation
2 locks held by swapper/0/1:
#0: 00000000f7539d34 (pernet_ops_rwsem){+.+.}, at:
register_pernet_subsys+0x19/0x40 net/core/net_namespace.c:1051
#1: 00000000cbb9b036 (&(&list->lock)->rlock#4){+...}, at:
spin_lock_bh include/linux/spinlock.h:334 [inline]
#1: 00000000cbb9b036 (&(&list->lock)->rlock#4){+...}, at:
tipc_link_reset+0xfa/0xdf0 net/tipc/link.c:849
stack backtrace:
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.19.0-rc7+ #14
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
Call Trace:
__dump_stack lib/dump_stack.c:77 [inline]
dump_stack+0x1af/0x295 lib/dump_stack.c:113
print_deadlock_bug kernel/locking/lockdep.c:1759 [inline]
check_deadlock kernel/locking/lockdep.c:1803 [inline]
validate_chain kernel/locking/lockdep.c:2399 [inline]
__lock_acquire+0xf1e/0x3c60 kernel/locking/lockdep.c:3411
lock_acquire+0x1db/0x520 kernel/locking/lockdep.c:3900
__raw_spin_lock_bh include/linux/spinlock_api_smp.h:135 [inline]
_raw_spin_lock_bh+0x31/0x40 kernel/locking/spinlock.c:168
spin_lock_bh include/linux/spinlock.h:334 [inline]
tipc_link_reset+0x125/0xdf0 net/tipc/link.c:850
tipc_link_bc_create+0xb5/0x1f0 net/tipc/link.c:526
tipc_bcast_init+0x59b/0xab0 net/tipc/bcast.c:521
tipc_init_net+0x472/0x610 net/tipc/core.c:82
ops_init+0xf7/0x520 net/core/net_namespace.c:129
__register_pernet_operations net/core/net_namespace.c:940 [inline]
register_pernet_operations+0x453/0xac0 net/core/net_namespace.c:1011
register_pernet_subsys+0x28/0x40 net/core/net_namespace.c:1052
tipc_init+0x83/0x104 net/tipc/core.c:140
do_one_initcall+0x109/0x70a init/main.c:885
do_initcall_level init/main.c:953 [inline]
do_initcalls init/main.c:961 [inline]
do_basic_setup init/main.c:979 [inline]
kernel_init_freeable+0x4bd/0x57f init/main.c:1144
kernel_init+0x13/0x180 init/main.c:1063
ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:413
The reason why the noise above was complained by LOCKDEP is because we
nested to hold l->wakeupq.lock and l->inputq->lock in tipc_link_reset
function. In fact it's unnecessary to move skb buffer from l->wakeupq
queue to l->inputq queue while holding the two locks at the same time.
Instead, we can move skb buffers in l->wakeupq queue to a temporary
list first and then move the buffers of the temporary list to l->inputq
queue, which is also safe for us.
Fixes: 3f32d0be6c16 ("tipc: lock wakeup & inputq at tipc_link_reset()")
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-10-11 19:57:56 +08:00
|
|
|
skb_queue_splice_init(&list, l->inputq);
|
2018-09-26 04:09:10 +08:00
|
|
|
spin_unlock_bh(&l->inputq->lock);
|
|
|
|
|
2015-07-31 06:24:19 +08:00
|
|
|
__skb_queue_purge(&l->transmq);
|
|
|
|
__skb_queue_purge(&l->deferdq);
|
2015-10-22 20:51:48 +08:00
|
|
|
__skb_queue_purge(&l->backlogq);
|
2019-04-04 12:09:53 +08:00
|
|
|
__skb_queue_purge(&l->failover_deferdq);
|
tipc: fix unlimited bundling of small messages
We have identified a problem with the "oversubscription" policy in the
link transmission code.
When small messages are transmitted, and the sending link has reached
the transmit window limit, those messages will be bundled and put into
the link backlog queue. However, bundles of data messages are counted
at the 'CRITICAL' level, so that the counter for that level, instead of
the counter for the real, bundled message's level is the one being
increased.
Subsequent, to-be-bundled data messages at non-CRITICAL levels continue
to be tested against the unchanged counter for their own level, while
contributing to an unrestrained increase at the CRITICAL backlog level.
This leaves a gap in congestion control algorithm for small messages
that can result in starvation for other users or a "real" CRITICAL
user. Even that eventually can lead to buffer exhaustion & link reset.
We fix this by keeping a 'target_bskb' buffer pointer at each levels,
then when bundling, we only bundle messages at the same importance
level only. This way, we know exactly how many slots a certain level
have occupied in the queue, so can manage level congestion accurately.
By bundling messages at the same level, we even have more benefits. Let
consider this:
- One socket sends 64-byte messages at the 'CRITICAL' level;
- Another sends 4096-byte messages at the 'LOW' level;
When a 64-byte message comes and is bundled the first time, we put the
overhead of message bundle to it (+ 40-byte header, data copy, etc.)
for later use, but the next message can be a 4096-byte one that cannot
be bundled to the previous one. This means the last bundle carries only
one payload message which is totally inefficient, as for the receiver
also! Later on, another 64-byte message comes, now we make a new bundle
and the same story repeats...
With the new bundling algorithm, this will not happen, the 64-byte
messages will be bundled together even when the 4096-byte message(s)
comes in between. However, if the 4096-byte messages are sent at the
same level i.e. 'CRITICAL', the bundling algorithm will again cause the
same overhead.
Also, the same will happen even with only one socket sending small
messages at a rate close to the link transmit's one, so that, when one
message is bundled, it's transmitted shortly. Then, another message
comes, a new bundle is created and so on...
We will solve this issue radically by another patch.
Fixes: 365ad353c256 ("tipc: reduce risk of user starvation during link congestion")
Reported-by: Hoang Le <hoang.h.le@dektech.com.au>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-10-02 19:49:43 +08:00
|
|
|
for (imp = 0; imp <= TIPC_SYSTEM_IMPORTANCE; imp++) {
|
|
|
|
l->backlog[imp].len = 0;
|
|
|
|
l->backlog[imp].target_bskb = NULL;
|
|
|
|
}
|
2015-07-31 06:24:19 +08:00
|
|
|
kfree_skb(l->reasm_buf);
|
tipc: fix changeover issues due to large packet
In conjunction with changing the interfaces' MTU (e.g. especially in
the case of a bonding) where the TIPC links are brought up and down
in a short time, a couple of issues were detected with the current link
changeover mechanism:
1) When one link is up but immediately forced down again, the failover
procedure will be carried out in order to failover all the messages in
the link's transmq queue onto the other working link. The link and node
state is also set to FAILINGOVER as part of the process. The message
will be transmited in form of a FAILOVER_MSG, so its size is plus of 40
bytes (= the message header size). There is no problem if the original
message size is not larger than the link's MTU - 40, and indeed this is
the max size of a normal payload messages. However, in the situation
above, because the link has just been up, the messages in the link's
transmq are almost SYNCH_MSGs which had been generated by the link
synching procedure, then their size might reach the max value already!
When the FAILOVER_MSG is built on the top of such a SYNCH_MSG, its size
will exceed the link's MTU. As a result, the messages are dropped
silently and the failover procedure will never end up, the link will
not be able to exit the FAILINGOVER state, so cannot be re-established.
2) The same scenario above can happen more easily in case the MTU of
the links is set differently or when changing. In that case, as long as
a large message in the failure link's transmq queue was built and
fragmented with its link's MTU > the other link's one, the issue will
happen (there is no need of a link synching in advance).
3) The link synching procedure also faces with the same issue but since
the link synching is only started upon receipt of a SYNCH_MSG, dropping
the message will not result in a state deadlock, but it is not expected
as design.
The 1) & 3) issues are resolved by the last commit that only a dummy
SYNCH_MSG (i.e. without data) is generated at the link synching, so the
size of a FAILOVER_MSG if any then will never exceed the link's MTU.
For the 2) issue, the only solution is trying to fragment the messages
in the failure link's transmq queue according to the working link's MTU
so they can be failovered then. A new function is made to accomplish
this, it will still be a TUNNEL PROTOCOL/FAILOVER MSG but if the
original message size is too large, it will be fragmented & reassembled
at the receiving side.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-07-24 09:56:12 +08:00
|
|
|
kfree_skb(l->reasm_tnlmsg);
|
2015-07-31 06:24:19 +08:00
|
|
|
kfree_skb(l->failover_reasm_skb);
|
|
|
|
l->reasm_buf = NULL;
|
tipc: fix changeover issues due to large packet
In conjunction with changing the interfaces' MTU (e.g. especially in
the case of a bonding) where the TIPC links are brought up and down
in a short time, a couple of issues were detected with the current link
changeover mechanism:
1) When one link is up but immediately forced down again, the failover
procedure will be carried out in order to failover all the messages in
the link's transmq queue onto the other working link. The link and node
state is also set to FAILINGOVER as part of the process. The message
will be transmited in form of a FAILOVER_MSG, so its size is plus of 40
bytes (= the message header size). There is no problem if the original
message size is not larger than the link's MTU - 40, and indeed this is
the max size of a normal payload messages. However, in the situation
above, because the link has just been up, the messages in the link's
transmq are almost SYNCH_MSGs which had been generated by the link
synching procedure, then their size might reach the max value already!
When the FAILOVER_MSG is built on the top of such a SYNCH_MSG, its size
will exceed the link's MTU. As a result, the messages are dropped
silently and the failover procedure will never end up, the link will
not be able to exit the FAILINGOVER state, so cannot be re-established.
2) The same scenario above can happen more easily in case the MTU of
the links is set differently or when changing. In that case, as long as
a large message in the failure link's transmq queue was built and
fragmented with its link's MTU > the other link's one, the issue will
happen (there is no need of a link synching in advance).
3) The link synching procedure also faces with the same issue but since
the link synching is only started upon receipt of a SYNCH_MSG, dropping
the message will not result in a state deadlock, but it is not expected
as design.
The 1) & 3) issues are resolved by the last commit that only a dummy
SYNCH_MSG (i.e. without data) is generated at the link synching, so the
size of a FAILOVER_MSG if any then will never exceed the link's MTU.
For the 2) issue, the only solution is trying to fragment the messages
in the failure link's transmq queue according to the working link's MTU
so they can be failovered then. A new function is made to accomplish
this, it will still be a TUNNEL PROTOCOL/FAILOVER MSG but if the
original message size is too large, it will be fragmented & reassembled
at the receiving side.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-07-24 09:56:12 +08:00
|
|
|
l->reasm_tnlmsg = NULL;
|
2015-07-31 06:24:19 +08:00
|
|
|
l->failover_reasm_skb = NULL;
|
|
|
|
l->rcv_unacked = 0;
|
|
|
|
l->snd_nxt = 1;
|
|
|
|
l->rcv_nxt = 1;
|
2018-07-10 07:07:35 +08:00
|
|
|
l->snd_nxt_state = 1;
|
|
|
|
l->rcv_nxt_state = 1;
|
tipc: make struct tipc_link generic to support broadcast
Realizing that unicast is just a special case of broadcast, we also see
that we can go in the other direction, i.e., that modest changes to the
current unicast link can make it generic enough to support broadcast.
The following changes are introduced here:
- A new counter ("ackers") in struct tipc_link, to indicate how many
peers need to ack a packet before it can be released.
- A corresponding counter in the skb user area, to keep track of how
many peers a are left to ack before a buffer can be released.
- A new counter ("acked"), to keep persistent track of how far a peer
has acked at the moment, i.e., where in the transmission queue to
start updating buffers when the next ack arrives. This is to avoid
double acknowledgements from a peer, with inadvertent relase of
packets as a result.
- A more generic tipc_link_retrans() function, where retransmit starts
from a given sequence number, instead of the first packet in the
transmision queue. This is to minimize the number of retransmitted
packets on the broadcast media.
When the new functionality is taken into use in the next commits,
we expect it to have minimal effect on unicast mode performance.
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-10-22 20:51:38 +08:00
|
|
|
l->acked = 0;
|
2020-05-26 17:38:34 +08:00
|
|
|
l->last_gap = 0;
|
|
|
|
kfree(l->last_ga);
|
|
|
|
l->last_ga = NULL;
|
2015-07-31 06:24:19 +08:00
|
|
|
l->silent_intv_cnt = 0;
|
2016-04-16 01:33:04 +08:00
|
|
|
l->rst_cnt = 0;
|
2015-10-22 20:51:41 +08:00
|
|
|
l->bc_peer_is_up = false;
|
tipc: add neighbor monitoring framework
TIPC based clusters are by default set up with full-mesh link
connectivity between all nodes. Those links are expected to provide
a short failure detection time, by default set to 1500 ms. Because
of this, the background load for neighbor monitoring in an N-node
cluster increases with a factor N on each node, while the overall
monitoring traffic through the network infrastructure increases at
a ~(N * (N - 1)) rate. Experience has shown that such clusters don't
scale well beyond ~100 nodes unless we significantly increase failure
discovery tolerance.
This commit introduces a framework and an algorithm that drastically
reduces this background load, while basically maintaining the original
failure detection times across the whole cluster. Using this algorithm,
background load will now grow at a rate of ~(2 * sqrt(N)) per node, and
at ~(2 * N * sqrt(N)) in traffic overhead. As an example, each node will
now have to actively monitor 38 neighbors in a 400-node cluster, instead
of as before 399.
This "Overlapping Ring Supervision Algorithm" is completely distributed
and employs no centralized or coordinated state. It goes as follows:
- Each node makes up a linearly ascending, circular list of all its N
known neighbors, based on their TIPC node identity. This algorithm
must be the same on all nodes.
- The node then selects the next M = sqrt(N) - 1 nodes downstream from
itself in the list, and chooses to actively monitor those. This is
called its "local monitoring domain".
- It creates a domain record describing the monitoring domain, and
piggy-backs this in the data area of all neighbor monitoring messages
(LINK_PROTOCOL/STATE) leaving that node. This means that all nodes in
the cluster eventually (default within 400 ms) will learn about
its monitoring domain.
- Whenever a node discovers a change in its local domain, e.g., a node
has been added or has gone down, it creates and sends out a new
version of its node record to inform all neighbors about the change.
- A node receiving a domain record from anybody outside its local domain
matches this against its own list (which may not look the same), and
chooses to not actively monitor those members of the received domain
record that are also present in its own list. Instead, it relies on
indications from the direct monitoring nodes if an indirectly
monitored node has gone up or down. If a node is indicated lost, the
receiving node temporarily activates its own direct monitoring towards
that node in order to confirm, or not, that it is actually gone.
- Since each node is actively monitoring sqrt(N) downstream neighbors,
each node is also actively monitored by the same number of upstream
neighbors. This means that all non-direct monitoring nodes normally
will receive sqrt(N) indications that a node is gone.
- A major drawback with ring monitoring is how it handles failures that
cause massive network partitionings. If both a lost node and all its
direct monitoring neighbors are inside the lost partition, the nodes in
the remaining partition will never receive indications about the loss.
To overcome this, each node also chooses to actively monitor some
nodes outside its local domain. Those nodes are called remote domain
"heads", and are selected in such a way that no node in the cluster
will be more than two direct monitoring hops away. Because of this,
each node, apart from monitoring the member of its local domain, will
also typically monitor sqrt(N) remote head nodes.
- As an optimization, local list status, domain status and domain
records are marked with a generation number. This saves senders from
unnecessarily conveying unaltered domain records, and receivers from
performing unneeded re-adaptations of their node monitoring list, such
as re-assigning domain heads.
- As a measure of caution we have added the possibility to disable the
new algorithm through configuration. We do this by keeping a threshold
value for the cluster size; a cluster that grows beyond this value
will switch from full-mesh to ring monitoring, and vice versa when
it shrinks below the value. This means that if the threshold is set to
a value larger than any anticipated cluster size (default size is 32)
the new algorithm is effectively disabled. A patch set for altering the
threshold value and for listing the table contents will follow shortly.
- This change is fully backwards compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-14 08:46:22 +08:00
|
|
|
memset(&l->mon_state, 0, sizeof(l->mon_state));
|
2015-11-20 03:30:46 +08:00
|
|
|
tipc_link_reset_stats(l);
|
2006-01-03 02:04:38 +08:00
|
|
|
}
|
|
|
|
|
2015-07-17 04:54:24 +08:00
|
|
|
/**
|
|
|
|
* tipc_link_xmit(): enqueue buffer list according to queue situation
|
2020-07-13 07:15:14 +08:00
|
|
|
* @l: link to use
|
2015-07-17 04:54:24 +08:00
|
|
|
* @list: chain of buffers containing message
|
|
|
|
* @xmitq: returned list of packets to be sent by caller
|
|
|
|
*
|
tipc: reduce risk of user starvation during link congestion
The socket code currently handles link congestion by either blocking
and trying to send again when the congestion has abated, or just
returning to the user with -EAGAIN and let him re-try later.
This mechanism is prone to starvation, because the wakeup algorithm is
non-atomic. During the time the link issues a wakeup signal, until the
socket wakes up and re-attempts sending, other senders may have come
in between and occupied the free buffer space in the link. This in turn
may lead to a socket having to make many send attempts before it is
successful. In extremely loaded systems we have observed latency times
of several seconds before a low-priority socket is able to send out a
message.
In this commit, we simplify this mechanism and reduce the risk of the
described scenario happening. When a message is attempted sent via a
congested link, we now let it be added to the link's backlog queue
anyway, thus permitting an oversubscription of one message per source
socket. We still create a wakeup item and return an error code, hence
instructing the sender to block or stop sending. Only when enough space
has been freed up in the link's backlog queue do we issue a wakeup event
that allows the sender to continue with the next message, if any.
The fact that a socket now can consider a message sent even when the
link returns a congestion code means that the sending socket code can
be simplified. Also, since this is a good opportunity to get rid of the
obsolete 'mtu change' condition in the three socket send functions, we
now choose to refactor those functions completely.
Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-01-03 23:55:11 +08:00
|
|
|
* Consumes the buffer chain.
|
2015-07-17 04:54:24 +08:00
|
|
|
* Messages at TIPC_SYSTEM_IMPORTANCE are always accepted
|
2021-08-11 09:22:09 +08:00
|
|
|
* Return: 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS
|
2015-07-17 04:54:24 +08:00
|
|
|
*/
|
|
|
|
int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
|
|
|
|
struct sk_buff_head *xmitq)
|
|
|
|
{
|
tipc: improve message bundling algorithm
As mentioned in commit e95584a889e1 ("tipc: fix unlimited bundling of
small messages"), the current message bundling algorithm is inefficient
that can generate bundles of only one payload message, that causes
unnecessary overheads for both the sender and receiver.
This commit re-designs the 'tipc_msg_make_bundle()' function (now named
as 'tipc_msg_try_bundle()'), so that when a message comes at the first
place, we will just check & keep a reference to it if the message is
suitable for bundling. The message buffer will be put into the link
backlog queue and processed as normal. Later on, when another one comes
we will make a bundle with the first message if possible and so on...
This way, a bundle if really needed will always consist of at least two
payload messages. Otherwise, we let the first buffer go its way without
any need of bundling, so reduce the overheads to zero.
Moreover, since now we have both the messages in hand, we can even
optimize the 'tipc_msg_bundle()' function, make bundle of a very large
(size ~ MSS) and small messages which is not with the current algorithm
e.g. [1400-byte message] + [10-byte message] (MTU = 1500).
Acked-by: Ying Xue <ying.xue@windreiver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-11-01 10:58:57 +08:00
|
|
|
struct sk_buff_head *backlogq = &l->backlogq;
|
|
|
|
struct sk_buff_head *transmq = &l->transmq;
|
|
|
|
struct sk_buff *skb, *_skb;
|
|
|
|
u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1;
|
2015-07-17 04:54:24 +08:00
|
|
|
u16 ack = l->rcv_nxt - 1;
|
|
|
|
u16 seqno = l->snd_nxt;
|
2016-11-25 23:35:02 +08:00
|
|
|
int pkt_cnt = skb_queue_len(list);
|
2019-11-08 13:05:11 +08:00
|
|
|
unsigned int mss = tipc_link_mss(l);
|
tipc: introduce variable window congestion control
We introduce a simple variable window congestion control for links.
The algorithm is inspired by the Reno algorithm, covering both 'slow
start', 'congestion avoidance', and 'fast recovery' modes.
- We introduce hard lower and upper window limits per link, still
different and configurable per bearer type.
- We introduce a 'slow start theshold' variable, initially set to
the maximum window size.
- We let a link start at the minimum congestion window, i.e. in slow
start mode, and then let is grow rapidly (+1 per rceived ACK) until
it reaches the slow start threshold and enters congestion avoidance
mode.
- In congestion avoidance mode we increment the congestion window for
each window-size number of acked packets, up to a possible maximum
equal to the configured maximum window.
- For each non-duplicate NACK received, we drop back to fast recovery
mode, by setting the both the slow start threshold to and the
congestion window to (current_congestion_window / 2).
- If the timeout handler finds that the transmit queue has not moved
since the previous timeout, it drops the link back to slow start
and forces a probe containing the last sent sequence number to the
sent to the peer, so that this can discover the stale situation.
This change does in reality have effect only on unicast ethernet
transport, as we have seen that there is no room whatsoever for
increasing the window max size for the UDP bearer.
For now, we also choose to keep the limits for the broadcast link
unchanged and equal.
This algorithm seems to give a 50-100% throughput improvement for
messages larger than MTU.
Suggested-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 07:52:46 +08:00
|
|
|
unsigned int cwin = l->window;
|
tipc: improve message bundling algorithm
As mentioned in commit e95584a889e1 ("tipc: fix unlimited bundling of
small messages"), the current message bundling algorithm is inefficient
that can generate bundles of only one payload message, that causes
unnecessary overheads for both the sender and receiver.
This commit re-designs the 'tipc_msg_make_bundle()' function (now named
as 'tipc_msg_try_bundle()'), so that when a message comes at the first
place, we will just check & keep a reference to it if the message is
suitable for bundling. The message buffer will be put into the link
backlog queue and processed as normal. Later on, when another one comes
we will make a bundle with the first message if possible and so on...
This way, a bundle if really needed will always consist of at least two
payload messages. Otherwise, we let the first buffer go its way without
any need of bundling, so reduce the overheads to zero.
Moreover, since now we have both the messages in hand, we can even
optimize the 'tipc_msg_bundle()' function, make bundle of a very large
(size ~ MSS) and small messages which is not with the current algorithm
e.g. [1400-byte message] + [10-byte message] (MTU = 1500).
Acked-by: Ying Xue <ying.xue@windreiver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-11-01 10:58:57 +08:00
|
|
|
unsigned int mtu = l->mtu;
|
2021-01-08 15:13:37 +08:00
|
|
|
struct tipc_msg *hdr;
|
tipc: improve message bundling algorithm
As mentioned in commit e95584a889e1 ("tipc: fix unlimited bundling of
small messages"), the current message bundling algorithm is inefficient
that can generate bundles of only one payload message, that causes
unnecessary overheads for both the sender and receiver.
This commit re-designs the 'tipc_msg_make_bundle()' function (now named
as 'tipc_msg_try_bundle()'), so that when a message comes at the first
place, we will just check & keep a reference to it if the message is
suitable for bundling. The message buffer will be put into the link
backlog queue and processed as normal. Later on, when another one comes
we will make a bundle with the first message if possible and so on...
This way, a bundle if really needed will always consist of at least two
payload messages. Otherwise, we let the first buffer go its way without
any need of bundling, so reduce the overheads to zero.
Moreover, since now we have both the messages in hand, we can even
optimize the 'tipc_msg_bundle()' function, make bundle of a very large
(size ~ MSS) and small messages which is not with the current algorithm
e.g. [1400-byte message] + [10-byte message] (MTU = 1500).
Acked-by: Ying Xue <ying.xue@windreiver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-11-01 10:58:57 +08:00
|
|
|
bool new_bundle;
|
tipc: reduce risk of user starvation during link congestion
The socket code currently handles link congestion by either blocking
and trying to send again when the congestion has abated, or just
returning to the user with -EAGAIN and let him re-try later.
This mechanism is prone to starvation, because the wakeup algorithm is
non-atomic. During the time the link issues a wakeup signal, until the
socket wakes up and re-attempts sending, other senders may have come
in between and occupied the free buffer space in the link. This in turn
may lead to a socket having to make many send attempts before it is
successful. In extremely loaded systems we have observed latency times
of several seconds before a low-priority socket is able to send out a
message.
In this commit, we simplify this mechanism and reduce the risk of the
described scenario happening. When a message is attempted sent via a
congested link, we now let it be added to the link's backlog queue
anyway, thus permitting an oversubscription of one message per source
socket. We still create a wakeup item and return an error code, hence
instructing the sender to block or stop sending. Only when enough space
has been freed up in the link's backlog queue do we issue a wakeup event
that allows the sender to continue with the next message, if any.
The fact that a socket now can consider a message sent even when the
link returns a congestion code means that the sending socket code can
be simplified. Also, since this is a good opportunity to get rid of the
obsolete 'mtu change' condition in the three socket send functions, we
now choose to refactor those functions completely.
Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-01-03 23:55:11 +08:00
|
|
|
int rc = 0;
|
2021-01-08 15:13:37 +08:00
|
|
|
int imp;
|
2015-07-17 04:54:24 +08:00
|
|
|
|
2021-01-08 15:13:37 +08:00
|
|
|
if (pkt_cnt <= 0)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
hdr = buf_msg(skb_peek(list));
|
2016-02-11 17:43:15 +08:00
|
|
|
if (unlikely(msg_size(hdr) > mtu)) {
|
tipc: fix changeover issues due to large packet
In conjunction with changing the interfaces' MTU (e.g. especially in
the case of a bonding) where the TIPC links are brought up and down
in a short time, a couple of issues were detected with the current link
changeover mechanism:
1) When one link is up but immediately forced down again, the failover
procedure will be carried out in order to failover all the messages in
the link's transmq queue onto the other working link. The link and node
state is also set to FAILINGOVER as part of the process. The message
will be transmited in form of a FAILOVER_MSG, so its size is plus of 40
bytes (= the message header size). There is no problem if the original
message size is not larger than the link's MTU - 40, and indeed this is
the max size of a normal payload messages. However, in the situation
above, because the link has just been up, the messages in the link's
transmq are almost SYNCH_MSGs which had been generated by the link
synching procedure, then their size might reach the max value already!
When the FAILOVER_MSG is built on the top of such a SYNCH_MSG, its size
will exceed the link's MTU. As a result, the messages are dropped
silently and the failover procedure will never end up, the link will
not be able to exit the FAILINGOVER state, so cannot be re-established.
2) The same scenario above can happen more easily in case the MTU of
the links is set differently or when changing. In that case, as long as
a large message in the failure link's transmq queue was built and
fragmented with its link's MTU > the other link's one, the issue will
happen (there is no need of a link synching in advance).
3) The link synching procedure also faces with the same issue but since
the link synching is only started upon receipt of a SYNCH_MSG, dropping
the message will not result in a state deadlock, but it is not expected
as design.
The 1) & 3) issues are resolved by the last commit that only a dummy
SYNCH_MSG (i.e. without data) is generated at the link synching, so the
size of a FAILOVER_MSG if any then will never exceed the link's MTU.
For the 2) issue, the only solution is trying to fragment the messages
in the failure link's transmq queue according to the working link's MTU
so they can be failovered then. A new function is made to accomplish
this, it will still be a TUNNEL PROTOCOL/FAILOVER MSG but if the
original message size is too large, it will be fragmented & reassembled
at the receiving side.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-07-24 09:56:12 +08:00
|
|
|
pr_warn("Too large msg, purging xmit list %d %d %d %d %d!\n",
|
|
|
|
skb_queue_len(list), msg_user(hdr),
|
|
|
|
msg_type(hdr), msg_size(hdr), mtu);
|
2019-08-15 22:42:50 +08:00
|
|
|
__skb_queue_purge(list);
|
2015-07-17 04:54:24 +08:00
|
|
|
return -EMSGSIZE;
|
2016-02-11 17:43:15 +08:00
|
|
|
}
|
2015-07-17 04:54:24 +08:00
|
|
|
|
2021-01-08 15:13:37 +08:00
|
|
|
imp = msg_importance(hdr);
|
tipc: reduce risk of user starvation during link congestion
The socket code currently handles link congestion by either blocking
and trying to send again when the congestion has abated, or just
returning to the user with -EAGAIN and let him re-try later.
This mechanism is prone to starvation, because the wakeup algorithm is
non-atomic. During the time the link issues a wakeup signal, until the
socket wakes up and re-attempts sending, other senders may have come
in between and occupied the free buffer space in the link. This in turn
may lead to a socket having to make many send attempts before it is
successful. In extremely loaded systems we have observed latency times
of several seconds before a low-priority socket is able to send out a
message.
In this commit, we simplify this mechanism and reduce the risk of the
described scenario happening. When a message is attempted sent via a
congested link, we now let it be added to the link's backlog queue
anyway, thus permitting an oversubscription of one message per source
socket. We still create a wakeup item and return an error code, hence
instructing the sender to block or stop sending. Only when enough space
has been freed up in the link's backlog queue do we issue a wakeup event
that allows the sender to continue with the next message, if any.
The fact that a socket now can consider a message sent even when the
link returns a congestion code means that the sending socket code can
be simplified. Also, since this is a good opportunity to get rid of the
obsolete 'mtu change' condition in the three socket send functions, we
now choose to refactor those functions completely.
Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-01-03 23:55:11 +08:00
|
|
|
/* Allow oversubscription of one data msg per source at congestion */
|
|
|
|
if (unlikely(l->backlog[imp].len >= l->backlog[imp].limit)) {
|
|
|
|
if (imp == TIPC_SYSTEM_IMPORTANCE) {
|
|
|
|
pr_warn("%s<%s>, link overflow", link_rst_msg, l->name);
|
|
|
|
return -ENOBUFS;
|
|
|
|
}
|
|
|
|
rc = link_schedule_user(l, hdr);
|
|
|
|
}
|
|
|
|
|
2016-11-25 23:35:02 +08:00
|
|
|
if (pkt_cnt > 1) {
|
|
|
|
l->stats.sent_fragmented++;
|
|
|
|
l->stats.sent_fragments += pkt_cnt;
|
|
|
|
}
|
|
|
|
|
2015-07-17 04:54:24 +08:00
|
|
|
/* Prepare each packet for sending, and add to relevant queue: */
|
tipc: improve message bundling algorithm
As mentioned in commit e95584a889e1 ("tipc: fix unlimited bundling of
small messages"), the current message bundling algorithm is inefficient
that can generate bundles of only one payload message, that causes
unnecessary overheads for both the sender and receiver.
This commit re-designs the 'tipc_msg_make_bundle()' function (now named
as 'tipc_msg_try_bundle()'), so that when a message comes at the first
place, we will just check & keep a reference to it if the message is
suitable for bundling. The message buffer will be put into the link
backlog queue and processed as normal. Later on, when another one comes
we will make a bundle with the first message if possible and so on...
This way, a bundle if really needed will always consist of at least two
payload messages. Otherwise, we let the first buffer go its way without
any need of bundling, so reduce the overheads to zero.
Moreover, since now we have both the messages in hand, we can even
optimize the 'tipc_msg_bundle()' function, make bundle of a very large
(size ~ MSS) and small messages which is not with the current algorithm
e.g. [1400-byte message] + [10-byte message] (MTU = 1500).
Acked-by: Ying Xue <ying.xue@windreiver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-11-01 10:58:57 +08:00
|
|
|
while ((skb = __skb_dequeue(list))) {
|
tipc: introduce variable window congestion control
We introduce a simple variable window congestion control for links.
The algorithm is inspired by the Reno algorithm, covering both 'slow
start', 'congestion avoidance', and 'fast recovery' modes.
- We introduce hard lower and upper window limits per link, still
different and configurable per bearer type.
- We introduce a 'slow start theshold' variable, initially set to
the maximum window size.
- We let a link start at the minimum congestion window, i.e. in slow
start mode, and then let is grow rapidly (+1 per rceived ACK) until
it reaches the slow start threshold and enters congestion avoidance
mode.
- In congestion avoidance mode we increment the congestion window for
each window-size number of acked packets, up to a possible maximum
equal to the configured maximum window.
- For each non-duplicate NACK received, we drop back to fast recovery
mode, by setting the both the slow start threshold to and the
congestion window to (current_congestion_window / 2).
- If the timeout handler finds that the transmit queue has not moved
since the previous timeout, it drops the link back to slow start
and forces a probe containing the last sent sequence number to the
sent to the peer, so that this can discover the stale situation.
This change does in reality have effect only on unicast ethernet
transport, as we have seen that there is no room whatsoever for
increasing the window max size for the UDP bearer.
For now, we also choose to keep the limits for the broadcast link
unchanged and equal.
This algorithm seems to give a 50-100% throughput improvement for
messages larger than MTU.
Suggested-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 07:52:46 +08:00
|
|
|
if (likely(skb_queue_len(transmq) < cwin)) {
|
tipc: improve message bundling algorithm
As mentioned in commit e95584a889e1 ("tipc: fix unlimited bundling of
small messages"), the current message bundling algorithm is inefficient
that can generate bundles of only one payload message, that causes
unnecessary overheads for both the sender and receiver.
This commit re-designs the 'tipc_msg_make_bundle()' function (now named
as 'tipc_msg_try_bundle()'), so that when a message comes at the first
place, we will just check & keep a reference to it if the message is
suitable for bundling. The message buffer will be put into the link
backlog queue and processed as normal. Later on, when another one comes
we will make a bundle with the first message if possible and so on...
This way, a bundle if really needed will always consist of at least two
payload messages. Otherwise, we let the first buffer go its way without
any need of bundling, so reduce the overheads to zero.
Moreover, since now we have both the messages in hand, we can even
optimize the 'tipc_msg_bundle()' function, make bundle of a very large
(size ~ MSS) and small messages which is not with the current algorithm
e.g. [1400-byte message] + [10-byte message] (MTU = 1500).
Acked-by: Ying Xue <ying.xue@windreiver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-11-01 10:58:57 +08:00
|
|
|
hdr = buf_msg(skb);
|
|
|
|
msg_set_seqno(hdr, seqno);
|
|
|
|
msg_set_ack(hdr, ack);
|
|
|
|
msg_set_bcast_ack(hdr, bc_ack);
|
2015-07-17 04:54:24 +08:00
|
|
|
_skb = skb_clone(skb, GFP_ATOMIC);
|
2016-02-11 17:43:15 +08:00
|
|
|
if (!_skb) {
|
tipc: improve message bundling algorithm
As mentioned in commit e95584a889e1 ("tipc: fix unlimited bundling of
small messages"), the current message bundling algorithm is inefficient
that can generate bundles of only one payload message, that causes
unnecessary overheads for both the sender and receiver.
This commit re-designs the 'tipc_msg_make_bundle()' function (now named
as 'tipc_msg_try_bundle()'), so that when a message comes at the first
place, we will just check & keep a reference to it if the message is
suitable for bundling. The message buffer will be put into the link
backlog queue and processed as normal. Later on, when another one comes
we will make a bundle with the first message if possible and so on...
This way, a bundle if really needed will always consist of at least two
payload messages. Otherwise, we let the first buffer go its way without
any need of bundling, so reduce the overheads to zero.
Moreover, since now we have both the messages in hand, we can even
optimize the 'tipc_msg_bundle()' function, make bundle of a very large
(size ~ MSS) and small messages which is not with the current algorithm
e.g. [1400-byte message] + [10-byte message] (MTU = 1500).
Acked-by: Ying Xue <ying.xue@windreiver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-11-01 10:58:57 +08:00
|
|
|
kfree_skb(skb);
|
2019-08-15 22:42:50 +08:00
|
|
|
__skb_queue_purge(list);
|
2021-08-11 09:22:09 +08:00
|
|
|
return -ENOBUFS;
|
2016-02-11 17:43:15 +08:00
|
|
|
}
|
2015-07-17 04:54:24 +08:00
|
|
|
__skb_queue_tail(transmq, skb);
|
2020-07-09 05:06:44 +08:00
|
|
|
tipc_link_set_skb_retransmit_time(skb, l);
|
2015-07-17 04:54:24 +08:00
|
|
|
__skb_queue_tail(xmitq, _skb);
|
tipc: make struct tipc_link generic to support broadcast
Realizing that unicast is just a special case of broadcast, we also see
that we can go in the other direction, i.e., that modest changes to the
current unicast link can make it generic enough to support broadcast.
The following changes are introduced here:
- A new counter ("ackers") in struct tipc_link, to indicate how many
peers need to ack a packet before it can be released.
- A corresponding counter in the skb user area, to keep track of how
many peers a are left to ack before a buffer can be released.
- A new counter ("acked"), to keep persistent track of how far a peer
has acked at the moment, i.e., where in the transmission queue to
start updating buffers when the next ack arrives. This is to avoid
double acknowledgements from a peer, with inadvertent relase of
packets as a result.
- A more generic tipc_link_retrans() function, where retransmit starts
from a given sequence number, instead of the first packet in the
transmision queue. This is to minimize the number of retransmitted
packets on the broadcast media.
When the new functionality is taken into use in the next commits,
we expect it to have minimal effect on unicast mode performance.
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-10-22 20:51:38 +08:00
|
|
|
TIPC_SKB_CB(skb)->ackers = l->ackers;
|
2015-07-17 04:54:24 +08:00
|
|
|
l->rcv_unacked = 0;
|
2016-11-25 23:35:02 +08:00
|
|
|
l->stats.sent_pkts++;
|
2015-07-17 04:54:24 +08:00
|
|
|
seqno++;
|
|
|
|
continue;
|
|
|
|
}
|
tipc: improve message bundling algorithm
As mentioned in commit e95584a889e1 ("tipc: fix unlimited bundling of
small messages"), the current message bundling algorithm is inefficient
that can generate bundles of only one payload message, that causes
unnecessary overheads for both the sender and receiver.
This commit re-designs the 'tipc_msg_make_bundle()' function (now named
as 'tipc_msg_try_bundle()'), so that when a message comes at the first
place, we will just check & keep a reference to it if the message is
suitable for bundling. The message buffer will be put into the link
backlog queue and processed as normal. Later on, when another one comes
we will make a bundle with the first message if possible and so on...
This way, a bundle if really needed will always consist of at least two
payload messages. Otherwise, we let the first buffer go its way without
any need of bundling, so reduce the overheads to zero.
Moreover, since now we have both the messages in hand, we can even
optimize the 'tipc_msg_bundle()' function, make bundle of a very large
(size ~ MSS) and small messages which is not with the current algorithm
e.g. [1400-byte message] + [10-byte message] (MTU = 1500).
Acked-by: Ying Xue <ying.xue@windreiver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-11-01 10:58:57 +08:00
|
|
|
if (tipc_msg_try_bundle(l->backlog[imp].target_bskb, &skb,
|
2019-11-08 13:05:11 +08:00
|
|
|
mss, l->addr, &new_bundle)) {
|
tipc: improve message bundling algorithm
As mentioned in commit e95584a889e1 ("tipc: fix unlimited bundling of
small messages"), the current message bundling algorithm is inefficient
that can generate bundles of only one payload message, that causes
unnecessary overheads for both the sender and receiver.
This commit re-designs the 'tipc_msg_make_bundle()' function (now named
as 'tipc_msg_try_bundle()'), so that when a message comes at the first
place, we will just check & keep a reference to it if the message is
suitable for bundling. The message buffer will be put into the link
backlog queue and processed as normal. Later on, when another one comes
we will make a bundle with the first message if possible and so on...
This way, a bundle if really needed will always consist of at least two
payload messages. Otherwise, we let the first buffer go its way without
any need of bundling, so reduce the overheads to zero.
Moreover, since now we have both the messages in hand, we can even
optimize the 'tipc_msg_bundle()' function, make bundle of a very large
(size ~ MSS) and small messages which is not with the current algorithm
e.g. [1400-byte message] + [10-byte message] (MTU = 1500).
Acked-by: Ying Xue <ying.xue@windreiver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-11-01 10:58:57 +08:00
|
|
|
if (skb) {
|
|
|
|
/* Keep a ref. to the skb for next try */
|
|
|
|
l->backlog[imp].target_bskb = skb;
|
|
|
|
l->backlog[imp].len++;
|
|
|
|
__skb_queue_tail(backlogq, skb);
|
|
|
|
} else {
|
|
|
|
if (new_bundle) {
|
|
|
|
l->stats.sent_bundles++;
|
|
|
|
l->stats.sent_bundled++;
|
|
|
|
}
|
|
|
|
l->stats.sent_bundled++;
|
|
|
|
}
|
2015-07-17 04:54:24 +08:00
|
|
|
continue;
|
|
|
|
}
|
tipc: fix unlimited bundling of small messages
We have identified a problem with the "oversubscription" policy in the
link transmission code.
When small messages are transmitted, and the sending link has reached
the transmit window limit, those messages will be bundled and put into
the link backlog queue. However, bundles of data messages are counted
at the 'CRITICAL' level, so that the counter for that level, instead of
the counter for the real, bundled message's level is the one being
increased.
Subsequent, to-be-bundled data messages at non-CRITICAL levels continue
to be tested against the unchanged counter for their own level, while
contributing to an unrestrained increase at the CRITICAL backlog level.
This leaves a gap in congestion control algorithm for small messages
that can result in starvation for other users or a "real" CRITICAL
user. Even that eventually can lead to buffer exhaustion & link reset.
We fix this by keeping a 'target_bskb' buffer pointer at each levels,
then when bundling, we only bundle messages at the same importance
level only. This way, we know exactly how many slots a certain level
have occupied in the queue, so can manage level congestion accurately.
By bundling messages at the same level, we even have more benefits. Let
consider this:
- One socket sends 64-byte messages at the 'CRITICAL' level;
- Another sends 4096-byte messages at the 'LOW' level;
When a 64-byte message comes and is bundled the first time, we put the
overhead of message bundle to it (+ 40-byte header, data copy, etc.)
for later use, but the next message can be a 4096-byte one that cannot
be bundled to the previous one. This means the last bundle carries only
one payload message which is totally inefficient, as for the receiver
also! Later on, another 64-byte message comes, now we make a new bundle
and the same story repeats...
With the new bundling algorithm, this will not happen, the 64-byte
messages will be bundled together even when the 4096-byte message(s)
comes in between. However, if the 4096-byte messages are sent at the
same level i.e. 'CRITICAL', the bundling algorithm will again cause the
same overhead.
Also, the same will happen even with only one socket sending small
messages at a rate close to the link transmit's one, so that, when one
message is bundled, it's transmitted shortly. Then, another message
comes, a new bundle is created and so on...
We will solve this issue radically by another patch.
Fixes: 365ad353c256 ("tipc: reduce risk of user starvation during link congestion")
Reported-by: Hoang Le <hoang.h.le@dektech.com.au>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-10-02 19:49:43 +08:00
|
|
|
l->backlog[imp].target_bskb = NULL;
|
tipc: improve message bundling algorithm
As mentioned in commit e95584a889e1 ("tipc: fix unlimited bundling of
small messages"), the current message bundling algorithm is inefficient
that can generate bundles of only one payload message, that causes
unnecessary overheads for both the sender and receiver.
This commit re-designs the 'tipc_msg_make_bundle()' function (now named
as 'tipc_msg_try_bundle()'), so that when a message comes at the first
place, we will just check & keep a reference to it if the message is
suitable for bundling. The message buffer will be put into the link
backlog queue and processed as normal. Later on, when another one comes
we will make a bundle with the first message if possible and so on...
This way, a bundle if really needed will always consist of at least two
payload messages. Otherwise, we let the first buffer go its way without
any need of bundling, so reduce the overheads to zero.
Moreover, since now we have both the messages in hand, we can even
optimize the 'tipc_msg_bundle()' function, make bundle of a very large
(size ~ MSS) and small messages which is not with the current algorithm
e.g. [1400-byte message] + [10-byte message] (MTU = 1500).
Acked-by: Ying Xue <ying.xue@windreiver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-11-01 10:58:57 +08:00
|
|
|
l->backlog[imp].len += (1 + skb_queue_len(list));
|
|
|
|
__skb_queue_tail(backlogq, skb);
|
2015-07-17 04:54:24 +08:00
|
|
|
skb_queue_splice_tail_init(list, backlogq);
|
|
|
|
}
|
|
|
|
l->snd_nxt = seqno;
|
tipc: reduce risk of user starvation during link congestion
The socket code currently handles link congestion by either blocking
and trying to send again when the congestion has abated, or just
returning to the user with -EAGAIN and let him re-try later.
This mechanism is prone to starvation, because the wakeup algorithm is
non-atomic. During the time the link issues a wakeup signal, until the
socket wakes up and re-attempts sending, other senders may have come
in between and occupied the free buffer space in the link. This in turn
may lead to a socket having to make many send attempts before it is
successful. In extremely loaded systems we have observed latency times
of several seconds before a low-priority socket is able to send out a
message.
In this commit, we simplify this mechanism and reduce the risk of the
described scenario happening. When a message is attempted sent via a
congested link, we now let it be added to the link's backlog queue
anyway, thus permitting an oversubscription of one message per source
socket. We still create a wakeup item and return an error code, hence
instructing the sender to block or stop sending. Only when enough space
has been freed up in the link's backlog queue do we issue a wakeup event
that allows the sender to continue with the next message, if any.
The fact that a socket now can consider a message sent even when the
link returns a congestion code means that the sending socket code can
be simplified. Also, since this is a good opportunity to get rid of the
obsolete 'mtu change' condition in the three socket send functions, we
now choose to refactor those functions completely.
Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-01-03 23:55:11 +08:00
|
|
|
return rc;
|
2015-07-17 04:54:24 +08:00
|
|
|
}
|
|
|
|
|
tipc: introduce variable window congestion control
We introduce a simple variable window congestion control for links.
The algorithm is inspired by the Reno algorithm, covering both 'slow
start', 'congestion avoidance', and 'fast recovery' modes.
- We introduce hard lower and upper window limits per link, still
different and configurable per bearer type.
- We introduce a 'slow start theshold' variable, initially set to
the maximum window size.
- We let a link start at the minimum congestion window, i.e. in slow
start mode, and then let is grow rapidly (+1 per rceived ACK) until
it reaches the slow start threshold and enters congestion avoidance
mode.
- In congestion avoidance mode we increment the congestion window for
each window-size number of acked packets, up to a possible maximum
equal to the configured maximum window.
- For each non-duplicate NACK received, we drop back to fast recovery
mode, by setting the both the slow start threshold to and the
congestion window to (current_congestion_window / 2).
- If the timeout handler finds that the transmit queue has not moved
since the previous timeout, it drops the link back to slow start
and forces a probe containing the last sent sequence number to the
sent to the peer, so that this can discover the stale situation.
This change does in reality have effect only on unicast ethernet
transport, as we have seen that there is no room whatsoever for
increasing the window max size for the UDP bearer.
For now, we also choose to keep the limits for the broadcast link
unchanged and equal.
This algorithm seems to give a 50-100% throughput improvement for
messages larger than MTU.
Suggested-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 07:52:46 +08:00
|
|
|
static void tipc_link_update_cwin(struct tipc_link *l, int released,
|
|
|
|
bool retransmitted)
|
|
|
|
{
|
|
|
|
int bklog_len = skb_queue_len(&l->backlogq);
|
|
|
|
struct sk_buff_head *txq = &l->transmq;
|
|
|
|
int txq_len = skb_queue_len(txq);
|
|
|
|
u16 cwin = l->window;
|
|
|
|
|
|
|
|
/* Enter fast recovery */
|
|
|
|
if (unlikely(retransmitted)) {
|
|
|
|
l->ssthresh = max_t(u16, l->window / 2, 300);
|
2020-04-15 19:34:49 +08:00
|
|
|
l->window = min_t(u16, l->ssthresh, l->window);
|
tipc: introduce variable window congestion control
We introduce a simple variable window congestion control for links.
The algorithm is inspired by the Reno algorithm, covering both 'slow
start', 'congestion avoidance', and 'fast recovery' modes.
- We introduce hard lower and upper window limits per link, still
different and configurable per bearer type.
- We introduce a 'slow start theshold' variable, initially set to
the maximum window size.
- We let a link start at the minimum congestion window, i.e. in slow
start mode, and then let is grow rapidly (+1 per rceived ACK) until
it reaches the slow start threshold and enters congestion avoidance
mode.
- In congestion avoidance mode we increment the congestion window for
each window-size number of acked packets, up to a possible maximum
equal to the configured maximum window.
- For each non-duplicate NACK received, we drop back to fast recovery
mode, by setting the both the slow start threshold to and the
congestion window to (current_congestion_window / 2).
- If the timeout handler finds that the transmit queue has not moved
since the previous timeout, it drops the link back to slow start
and forces a probe containing the last sent sequence number to the
sent to the peer, so that this can discover the stale situation.
This change does in reality have effect only on unicast ethernet
transport, as we have seen that there is no room whatsoever for
increasing the window max size for the UDP bearer.
For now, we also choose to keep the limits for the broadcast link
unchanged and equal.
This algorithm seems to give a 50-100% throughput improvement for
messages larger than MTU.
Suggested-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 07:52:46 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
/* Enter slow start */
|
|
|
|
if (unlikely(!released)) {
|
|
|
|
l->ssthresh = max_t(u16, l->window / 2, 300);
|
|
|
|
l->window = l->min_win;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
/* Don't increase window if no pressure on the transmit queue */
|
|
|
|
if (txq_len + bklog_len < cwin)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/* Don't increase window if there are holes the transmit queue */
|
|
|
|
if (txq_len && l->snd_nxt - buf_seqno(skb_peek(txq)) != txq_len)
|
|
|
|
return;
|
|
|
|
|
|
|
|
l->cong_acks += released;
|
|
|
|
|
|
|
|
/* Slow start */
|
|
|
|
if (cwin <= l->ssthresh) {
|
|
|
|
l->window = min_t(u16, cwin + released, l->max_win);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
/* Congestion avoidance */
|
|
|
|
if (l->cong_acks < cwin)
|
|
|
|
return;
|
|
|
|
l->window = min_t(u16, ++cwin, l->max_win);
|
|
|
|
l->cong_acks = 0;
|
|
|
|
}
|
|
|
|
|
2018-07-19 17:16:59 +08:00
|
|
|
static void tipc_link_advance_backlog(struct tipc_link *l,
|
|
|
|
struct sk_buff_head *xmitq)
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
{
|
tipc: introduce variable window congestion control
We introduce a simple variable window congestion control for links.
The algorithm is inspired by the Reno algorithm, covering both 'slow
start', 'congestion avoidance', and 'fast recovery' modes.
- We introduce hard lower and upper window limits per link, still
different and configurable per bearer type.
- We introduce a 'slow start theshold' variable, initially set to
the maximum window size.
- We let a link start at the minimum congestion window, i.e. in slow
start mode, and then let is grow rapidly (+1 per rceived ACK) until
it reaches the slow start threshold and enters congestion avoidance
mode.
- In congestion avoidance mode we increment the congestion window for
each window-size number of acked packets, up to a possible maximum
equal to the configured maximum window.
- For each non-duplicate NACK received, we drop back to fast recovery
mode, by setting the both the slow start threshold to and the
congestion window to (current_congestion_window / 2).
- If the timeout handler finds that the transmit queue has not moved
since the previous timeout, it drops the link back to slow start
and forces a probe containing the last sent sequence number to the
sent to the peer, so that this can discover the stale situation.
This change does in reality have effect only on unicast ethernet
transport, as we have seen that there is no room whatsoever for
increasing the window max size for the UDP bearer.
For now, we also choose to keep the limits for the broadcast link
unchanged and equal.
This algorithm seems to give a 50-100% throughput improvement for
messages larger than MTU.
Suggested-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 07:52:46 +08:00
|
|
|
u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1;
|
|
|
|
struct sk_buff_head *txq = &l->transmq;
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
struct sk_buff *skb, *_skb;
|
|
|
|
u16 ack = l->rcv_nxt - 1;
|
tipc: introduce variable window congestion control
We introduce a simple variable window congestion control for links.
The algorithm is inspired by the Reno algorithm, covering both 'slow
start', 'congestion avoidance', and 'fast recovery' modes.
- We introduce hard lower and upper window limits per link, still
different and configurable per bearer type.
- We introduce a 'slow start theshold' variable, initially set to
the maximum window size.
- We let a link start at the minimum congestion window, i.e. in slow
start mode, and then let is grow rapidly (+1 per rceived ACK) until
it reaches the slow start threshold and enters congestion avoidance
mode.
- In congestion avoidance mode we increment the congestion window for
each window-size number of acked packets, up to a possible maximum
equal to the configured maximum window.
- For each non-duplicate NACK received, we drop back to fast recovery
mode, by setting the both the slow start threshold to and the
congestion window to (current_congestion_window / 2).
- If the timeout handler finds that the transmit queue has not moved
since the previous timeout, it drops the link back to slow start
and forces a probe containing the last sent sequence number to the
sent to the peer, so that this can discover the stale situation.
This change does in reality have effect only on unicast ethernet
transport, as we have seen that there is no room whatsoever for
increasing the window max size for the UDP bearer.
For now, we also choose to keep the limits for the broadcast link
unchanged and equal.
This algorithm seems to give a 50-100% throughput improvement for
messages larger than MTU.
Suggested-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 07:52:46 +08:00
|
|
|
u16 seqno = l->snd_nxt;
|
|
|
|
struct tipc_msg *hdr;
|
|
|
|
u16 cwin = l->window;
|
tipc: fix unlimited bundling of small messages
We have identified a problem with the "oversubscription" policy in the
link transmission code.
When small messages are transmitted, and the sending link has reached
the transmit window limit, those messages will be bundled and put into
the link backlog queue. However, bundles of data messages are counted
at the 'CRITICAL' level, so that the counter for that level, instead of
the counter for the real, bundled message's level is the one being
increased.
Subsequent, to-be-bundled data messages at non-CRITICAL levels continue
to be tested against the unchanged counter for their own level, while
contributing to an unrestrained increase at the CRITICAL backlog level.
This leaves a gap in congestion control algorithm for small messages
that can result in starvation for other users or a "real" CRITICAL
user. Even that eventually can lead to buffer exhaustion & link reset.
We fix this by keeping a 'target_bskb' buffer pointer at each levels,
then when bundling, we only bundle messages at the same importance
level only. This way, we know exactly how many slots a certain level
have occupied in the queue, so can manage level congestion accurately.
By bundling messages at the same level, we even have more benefits. Let
consider this:
- One socket sends 64-byte messages at the 'CRITICAL' level;
- Another sends 4096-byte messages at the 'LOW' level;
When a 64-byte message comes and is bundled the first time, we put the
overhead of message bundle to it (+ 40-byte header, data copy, etc.)
for later use, but the next message can be a 4096-byte one that cannot
be bundled to the previous one. This means the last bundle carries only
one payload message which is totally inefficient, as for the receiver
also! Later on, another 64-byte message comes, now we make a new bundle
and the same story repeats...
With the new bundling algorithm, this will not happen, the 64-byte
messages will be bundled together even when the 4096-byte message(s)
comes in between. However, if the 4096-byte messages are sent at the
same level i.e. 'CRITICAL', the bundling algorithm will again cause the
same overhead.
Also, the same will happen even with only one socket sending small
messages at a rate close to the link transmit's one, so that, when one
message is bundled, it's transmitted shortly. Then, another message
comes, a new bundle is created and so on...
We will solve this issue radically by another patch.
Fixes: 365ad353c256 ("tipc: reduce risk of user starvation during link congestion")
Reported-by: Hoang Le <hoang.h.le@dektech.com.au>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-10-02 19:49:43 +08:00
|
|
|
u32 imp;
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
|
tipc: introduce variable window congestion control
We introduce a simple variable window congestion control for links.
The algorithm is inspired by the Reno algorithm, covering both 'slow
start', 'congestion avoidance', and 'fast recovery' modes.
- We introduce hard lower and upper window limits per link, still
different and configurable per bearer type.
- We introduce a 'slow start theshold' variable, initially set to
the maximum window size.
- We let a link start at the minimum congestion window, i.e. in slow
start mode, and then let is grow rapidly (+1 per rceived ACK) until
it reaches the slow start threshold and enters congestion avoidance
mode.
- In congestion avoidance mode we increment the congestion window for
each window-size number of acked packets, up to a possible maximum
equal to the configured maximum window.
- For each non-duplicate NACK received, we drop back to fast recovery
mode, by setting the both the slow start threshold to and the
congestion window to (current_congestion_window / 2).
- If the timeout handler finds that the transmit queue has not moved
since the previous timeout, it drops the link back to slow start
and forces a probe containing the last sent sequence number to the
sent to the peer, so that this can discover the stale situation.
This change does in reality have effect only on unicast ethernet
transport, as we have seen that there is no room whatsoever for
increasing the window max size for the UDP bearer.
For now, we also choose to keep the limits for the broadcast link
unchanged and equal.
This algorithm seems to give a 50-100% throughput improvement for
messages larger than MTU.
Suggested-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 07:52:46 +08:00
|
|
|
while (skb_queue_len(txq) < cwin) {
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
skb = skb_peek(&l->backlogq);
|
|
|
|
if (!skb)
|
|
|
|
break;
|
|
|
|
_skb = skb_clone(skb, GFP_ATOMIC);
|
|
|
|
if (!_skb)
|
|
|
|
break;
|
|
|
|
__skb_dequeue(&l->backlogq);
|
|
|
|
hdr = buf_msg(skb);
|
tipc: fix unlimited bundling of small messages
We have identified a problem with the "oversubscription" policy in the
link transmission code.
When small messages are transmitted, and the sending link has reached
the transmit window limit, those messages will be bundled and put into
the link backlog queue. However, bundles of data messages are counted
at the 'CRITICAL' level, so that the counter for that level, instead of
the counter for the real, bundled message's level is the one being
increased.
Subsequent, to-be-bundled data messages at non-CRITICAL levels continue
to be tested against the unchanged counter for their own level, while
contributing to an unrestrained increase at the CRITICAL backlog level.
This leaves a gap in congestion control algorithm for small messages
that can result in starvation for other users or a "real" CRITICAL
user. Even that eventually can lead to buffer exhaustion & link reset.
We fix this by keeping a 'target_bskb' buffer pointer at each levels,
then when bundling, we only bundle messages at the same importance
level only. This way, we know exactly how many slots a certain level
have occupied in the queue, so can manage level congestion accurately.
By bundling messages at the same level, we even have more benefits. Let
consider this:
- One socket sends 64-byte messages at the 'CRITICAL' level;
- Another sends 4096-byte messages at the 'LOW' level;
When a 64-byte message comes and is bundled the first time, we put the
overhead of message bundle to it (+ 40-byte header, data copy, etc.)
for later use, but the next message can be a 4096-byte one that cannot
be bundled to the previous one. This means the last bundle carries only
one payload message which is totally inefficient, as for the receiver
also! Later on, another 64-byte message comes, now we make a new bundle
and the same story repeats...
With the new bundling algorithm, this will not happen, the 64-byte
messages will be bundled together even when the 4096-byte message(s)
comes in between. However, if the 4096-byte messages are sent at the
same level i.e. 'CRITICAL', the bundling algorithm will again cause the
same overhead.
Also, the same will happen even with only one socket sending small
messages at a rate close to the link transmit's one, so that, when one
message is bundled, it's transmitted shortly. Then, another message
comes, a new bundle is created and so on...
We will solve this issue radically by another patch.
Fixes: 365ad353c256 ("tipc: reduce risk of user starvation during link congestion")
Reported-by: Hoang Le <hoang.h.le@dektech.com.au>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-10-02 19:49:43 +08:00
|
|
|
imp = msg_importance(hdr);
|
|
|
|
l->backlog[imp].len--;
|
|
|
|
if (unlikely(skb == l->backlog[imp].target_bskb))
|
|
|
|
l->backlog[imp].target_bskb = NULL;
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
__skb_queue_tail(&l->transmq, skb);
|
2020-07-09 05:06:44 +08:00
|
|
|
tipc_link_set_skb_retransmit_time(skb, l);
|
2018-12-19 12:42:19 +08:00
|
|
|
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
__skb_queue_tail(xmitq, _skb);
|
tipc: make struct tipc_link generic to support broadcast
Realizing that unicast is just a special case of broadcast, we also see
that we can go in the other direction, i.e., that modest changes to the
current unicast link can make it generic enough to support broadcast.
The following changes are introduced here:
- A new counter ("ackers") in struct tipc_link, to indicate how many
peers need to ack a packet before it can be released.
- A corresponding counter in the skb user area, to keep track of how
many peers a are left to ack before a buffer can be released.
- A new counter ("acked"), to keep persistent track of how far a peer
has acked at the moment, i.e., where in the transmission queue to
start updating buffers when the next ack arrives. This is to avoid
double acknowledgements from a peer, with inadvertent relase of
packets as a result.
- A more generic tipc_link_retrans() function, where retransmit starts
from a given sequence number, instead of the first packet in the
transmision queue. This is to minimize the number of retransmitted
packets on the broadcast media.
When the new functionality is taken into use in the next commits,
we expect it to have minimal effect on unicast mode performance.
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-10-22 20:51:38 +08:00
|
|
|
TIPC_SKB_CB(skb)->ackers = l->ackers;
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
msg_set_seqno(hdr, seqno);
|
2015-10-22 20:51:41 +08:00
|
|
|
msg_set_ack(hdr, ack);
|
|
|
|
msg_set_bcast_ack(hdr, bc_ack);
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
l->rcv_unacked = 0;
|
2016-11-25 23:35:02 +08:00
|
|
|
l->stats.sent_pkts++;
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
seqno++;
|
|
|
|
}
|
|
|
|
l->snd_nxt = seqno;
|
|
|
|
}
|
|
|
|
|
2019-06-17 13:15:42 +08:00
|
|
|
/**
|
|
|
|
* link_retransmit_failure() - Detect repeated retransmit failures
|
|
|
|
* @l: tipc link sender
|
|
|
|
* @r: tipc link receiver (= l in case of unicast)
|
|
|
|
* @rc: returned code
|
|
|
|
*
|
|
|
|
* Return: true if the repeated retransmit failures happens, otherwise
|
|
|
|
* false
|
|
|
|
*/
|
|
|
|
static bool link_retransmit_failure(struct tipc_link *l, struct tipc_link *r,
|
2019-08-15 11:24:08 +08:00
|
|
|
int *rc)
|
2006-06-26 14:40:01 +08:00
|
|
|
{
|
2019-06-17 13:15:42 +08:00
|
|
|
struct sk_buff *skb = skb_peek(&l->transmq);
|
|
|
|
struct tipc_msg *hdr;
|
|
|
|
|
|
|
|
if (!skb)
|
|
|
|
return false;
|
|
|
|
|
2019-08-15 11:24:08 +08:00
|
|
|
if (!TIPC_SKB_CB(skb)->retr_cnt)
|
|
|
|
return false;
|
2019-06-17 13:15:42 +08:00
|
|
|
|
2019-08-15 11:24:08 +08:00
|
|
|
if (!time_after(jiffies, TIPC_SKB_CB(skb)->retr_stamp +
|
2019-11-06 14:26:10 +08:00
|
|
|
msecs_to_jiffies(r->tolerance * 10)))
|
2019-08-15 11:24:08 +08:00
|
|
|
return false;
|
|
|
|
|
|
|
|
hdr = buf_msg(skb);
|
|
|
|
if (link_is_bc_sndlink(l) && !less(r->acked, msg_seqno(hdr)))
|
|
|
|
return false;
|
2019-06-17 13:15:42 +08:00
|
|
|
|
2019-08-15 11:24:08 +08:00
|
|
|
pr_warn("Retransmission failure on link <%s>\n", l->name);
|
|
|
|
link_print(l, "State of link ");
|
|
|
|
pr_info("Failed msg: usr %u, typ %u, len %u, err %u\n",
|
|
|
|
msg_user(hdr), msg_type(hdr), msg_size(hdr), msg_errcode(hdr));
|
|
|
|
pr_info("sqno %u, prev: %x, dest: %x\n",
|
|
|
|
msg_seqno(hdr), msg_prevnode(hdr), msg_destnode(hdr));
|
|
|
|
pr_info("retr_stamp %d, retr_cnt %d\n",
|
|
|
|
jiffies_to_msecs(TIPC_SKB_CB(skb)->retr_stamp),
|
|
|
|
TIPC_SKB_CB(skb)->retr_cnt);
|
|
|
|
|
|
|
|
trace_tipc_list_dump(&l->transmq, true, "retrans failure!");
|
|
|
|
trace_tipc_link_dump(l, TIPC_DUMP_NONE, "retrans failure!");
|
|
|
|
trace_tipc_link_dump(r, TIPC_DUMP_NONE, "retrans failure!");
|
|
|
|
|
|
|
|
if (link_is_bc_sndlink(l)) {
|
|
|
|
r->state = LINK_RESET;
|
2020-05-26 17:38:34 +08:00
|
|
|
*rc |= TIPC_LINK_DOWN_EVT;
|
2019-08-15 11:24:08 +08:00
|
|
|
} else {
|
2020-05-26 17:38:34 +08:00
|
|
|
*rc |= tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
|
2019-06-17 13:15:42 +08:00
|
|
|
}
|
2015-10-22 20:51:41 +08:00
|
|
|
|
2019-08-15 11:24:08 +08:00
|
|
|
return true;
|
2006-06-26 14:40:01 +08:00
|
|
|
}
|
|
|
|
|
tipc: resolve race problem at unicast message reception
TIPC handles message cardinality and sequencing at the link layer,
before passing messages upwards to the destination sockets. During the
upcall from link to socket no locks are held. It is therefore possible,
and we see it happen occasionally, that messages arriving in different
threads and delivered in sequence still bypass each other before they
reach the destination socket. This must not happen, since it violates
the sequentiality guarantee.
We solve this by adding a new input buffer queue to the link structure.
Arriving messages are added safely to the tail of that queue by the
link, while the head of the queue is consumed, also safely, by the
receiving socket. Sequentiality is secured per socket by only allowing
buffers to be dequeued inside the socket lock. Since there may be multiple
simultaneous readers of the queue, we use a 'filter' parameter to reduce
the risk that they peek the same buffer from the queue, hence also
reducing the risk of contention on the receiving socket locks.
This solves the sequentiality problem, and seems to cause no measurable
performance degradation.
A nice side effect of this change is that lock handling in the functions
tipc_rcv() and tipc_bcast_rcv() now becomes uniform, something that
will enable future simplifications of those functions.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-02-05 21:36:41 +08:00
|
|
|
/* tipc_data_input - deliver data and name distr msgs to upper layer
|
2014-07-01 16:22:40 +08:00
|
|
|
*
|
tipc: resolve race problem at unicast message reception
TIPC handles message cardinality and sequencing at the link layer,
before passing messages upwards to the destination sockets. During the
upcall from link to socket no locks are held. It is therefore possible,
and we see it happen occasionally, that messages arriving in different
threads and delivered in sequence still bypass each other before they
reach the destination socket. This must not happen, since it violates
the sequentiality guarantee.
We solve this by adding a new input buffer queue to the link structure.
Arriving messages are added safely to the tail of that queue by the
link, while the head of the queue is consumed, also safely, by the
receiving socket. Sequentiality is secured per socket by only allowing
buffers to be dequeued inside the socket lock. Since there may be multiple
simultaneous readers of the queue, we use a 'filter' parameter to reduce
the risk that they peek the same buffer from the queue, hence also
reducing the risk of contention on the receiving socket locks.
This solves the sequentiality problem, and seems to cause no measurable
performance degradation.
A nice side effect of this change is that lock handling in the functions
tipc_rcv() and tipc_bcast_rcv() now becomes uniform, something that
will enable future simplifications of those functions.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-02-05 21:36:41 +08:00
|
|
|
* Consumes buffer if message is of right type
|
2014-07-01 16:22:40 +08:00
|
|
|
* Node lock must be held
|
|
|
|
*/
|
2015-10-22 20:51:41 +08:00
|
|
|
static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb,
|
2015-07-31 06:24:25 +08:00
|
|
|
struct sk_buff_head *inputq)
|
2014-07-01 16:22:40 +08:00
|
|
|
{
|
2017-10-13 17:04:32 +08:00
|
|
|
struct sk_buff_head *mc_inputq = l->bc_rcvlink->inputq;
|
2017-01-19 02:50:52 +08:00
|
|
|
struct tipc_msg *hdr = buf_msg(skb);
|
|
|
|
|
|
|
|
switch (msg_user(hdr)) {
|
tipc: resolve race problem at unicast message reception
TIPC handles message cardinality and sequencing at the link layer,
before passing messages upwards to the destination sockets. During the
upcall from link to socket no locks are held. It is therefore possible,
and we see it happen occasionally, that messages arriving in different
threads and delivered in sequence still bypass each other before they
reach the destination socket. This must not happen, since it violates
the sequentiality guarantee.
We solve this by adding a new input buffer queue to the link structure.
Arriving messages are added safely to the tail of that queue by the
link, while the head of the queue is consumed, also safely, by the
receiving socket. Sequentiality is secured per socket by only allowing
buffers to be dequeued inside the socket lock. Since there may be multiple
simultaneous readers of the queue, we use a 'filter' parameter to reduce
the risk that they peek the same buffer from the queue, hence also
reducing the risk of contention on the receiving socket locks.
This solves the sequentiality problem, and seems to cause no measurable
performance degradation.
A nice side effect of this change is that lock handling in the functions
tipc_rcv() and tipc_bcast_rcv() now becomes uniform, something that
will enable future simplifications of those functions.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-02-05 21:36:41 +08:00
|
|
|
case TIPC_LOW_IMPORTANCE:
|
|
|
|
case TIPC_MEDIUM_IMPORTANCE:
|
|
|
|
case TIPC_HIGH_IMPORTANCE:
|
|
|
|
case TIPC_CRITICAL_IMPORTANCE:
|
tipc: guarantee that group broadcast doesn't bypass group unicast
We need a mechanism guaranteeing that group unicasts sent out from a
socket are not bypassed by later sent broadcasts from the same socket.
We do this as follows:
- Each time a unicast is sent, we set a the broadcast method for the
socket to "replicast" and "mandatory". This forces the first
subsequent broadcast message to follow the same network and data path
as the preceding unicast to a destination, hence preventing it from
overtaking the latter.
- In order to make the 'same data path' statement above true, we let
group unicasts pass through the multicast link input queue, instead
of as previously through the unicast link input queue.
- In the first broadcast following a unicast, we set a new header flag,
requiring all recipients to immediately acknowledge its reception.
- During the period before all the expected acknowledges are received,
the socket refuses to accept any more broadcast attempts, i.e., by
blocking or returning EAGAIN. This period should typically not be
longer than a few microseconds.
- When all acknowledges have been received, the sending socket will
open up for subsequent broadcasts, this time giving the link layer
freedom to itself select the best transmission method.
- The forced and/or abrupt transmission method changes described above
may lead to broadcasts arriving out of order to the recipients. We
remedy this by introducing code that checks and if necessary
re-orders such messages at the receiving end.
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-13 17:04:31 +08:00
|
|
|
if (unlikely(msg_in_group(hdr) || msg_mcast(hdr))) {
|
2017-10-13 17:04:32 +08:00
|
|
|
skb_queue_tail(mc_inputq, skb);
|
2017-01-19 02:50:52 +08:00
|
|
|
return true;
|
|
|
|
}
|
2020-08-18 20:07:13 +08:00
|
|
|
fallthrough;
|
tipc: guarantee that group broadcast doesn't bypass group unicast
We need a mechanism guaranteeing that group unicasts sent out from a
socket are not bypassed by later sent broadcasts from the same socket.
We do this as follows:
- Each time a unicast is sent, we set a the broadcast method for the
socket to "replicast" and "mandatory". This forces the first
subsequent broadcast message to follow the same network and data path
as the preceding unicast to a destination, hence preventing it from
overtaking the latter.
- In order to make the 'same data path' statement above true, we let
group unicasts pass through the multicast link input queue, instead
of as previously through the unicast link input queue.
- In the first broadcast following a unicast, we set a new header flag,
requiring all recipients to immediately acknowledge its reception.
- During the period before all the expected acknowledges are received,
the socket refuses to accept any more broadcast attempts, i.e., by
blocking or returning EAGAIN. This period should typically not be
longer than a few microseconds.
- When all acknowledges have been received, the sending socket will
open up for subsequent broadcasts, this time giving the link layer
freedom to itself select the best transmission method.
- The forced and/or abrupt transmission method changes described above
may lead to broadcasts arriving out of order to the recipients. We
remedy this by introducing code that checks and if necessary
re-orders such messages at the receiving end.
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-10-13 17:04:31 +08:00
|
|
|
case CONN_MANAGER:
|
2017-10-16 22:04:51 +08:00
|
|
|
skb_queue_tail(inputq, skb);
|
tipc: resolve race problem at unicast message reception
TIPC handles message cardinality and sequencing at the link layer,
before passing messages upwards to the destination sockets. During the
upcall from link to socket no locks are held. It is therefore possible,
and we see it happen occasionally, that messages arriving in different
threads and delivered in sequence still bypass each other before they
reach the destination socket. This must not happen, since it violates
the sequentiality guarantee.
We solve this by adding a new input buffer queue to the link structure.
Arriving messages are added safely to the tail of that queue by the
link, while the head of the queue is consumed, also safely, by the
receiving socket. Sequentiality is secured per socket by only allowing
buffers to be dequeued inside the socket lock. Since there may be multiple
simultaneous readers of the queue, we use a 'filter' parameter to reduce
the risk that they peek the same buffer from the queue, hence also
reducing the risk of contention on the receiving socket locks.
This solves the sequentiality problem, and seems to cause no measurable
performance degradation.
A nice side effect of this change is that lock handling in the functions
tipc_rcv() and tipc_bcast_rcv() now becomes uniform, something that
will enable future simplifications of those functions.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-02-05 21:36:41 +08:00
|
|
|
return true;
|
2017-10-13 17:04:32 +08:00
|
|
|
case GROUP_PROTOCOL:
|
|
|
|
skb_queue_tail(mc_inputq, skb);
|
|
|
|
return true;
|
2014-07-01 16:22:40 +08:00
|
|
|
case NAME_DISTRIBUTOR:
|
2015-10-22 20:51:41 +08:00
|
|
|
l->bc_rcvlink->state = LINK_ESTABLISHED;
|
|
|
|
skb_queue_tail(l->namedq, skb);
|
tipc: resolve race problem at unicast message reception
TIPC handles message cardinality and sequencing at the link layer,
before passing messages upwards to the destination sockets. During the
upcall from link to socket no locks are held. It is therefore possible,
and we see it happen occasionally, that messages arriving in different
threads and delivered in sequence still bypass each other before they
reach the destination socket. This must not happen, since it violates
the sequentiality guarantee.
We solve this by adding a new input buffer queue to the link structure.
Arriving messages are added safely to the tail of that queue by the
link, while the head of the queue is consumed, also safely, by the
receiving socket. Sequentiality is secured per socket by only allowing
buffers to be dequeued inside the socket lock. Since there may be multiple
simultaneous readers of the queue, we use a 'filter' parameter to reduce
the risk that they peek the same buffer from the queue, hence also
reducing the risk of contention on the receiving socket locks.
This solves the sequentiality problem, and seems to cause no measurable
performance degradation.
A nice side effect of this change is that lock handling in the functions
tipc_rcv() and tipc_bcast_rcv() now becomes uniform, something that
will enable future simplifications of those functions.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-02-05 21:36:41 +08:00
|
|
|
return true;
|
|
|
|
case MSG_BUNDLER:
|
tipc: eliminate delayed link deletion at link failover
When a bearer is disabled manually, all its links have to be reset
and deleted. However, if there is a remaining, parallel link ready
to take over a deleted link's traffic, we currently delay the delete
of the removed link until the failover procedure is finished. This
is because the remaining link needs to access state from the reset
link, such as the last received packet number, and any partially
reassembled buffer, in order to perform a successful failover.
In this commit, we do instead move the state data over to the new
link, so that it can fulfill the procedure autonomously, without
accessing any data on the old link. This means that we can now
proceed and delete all pertaining links immediately when a bearer
is disabled. This saves us from some unnecessary complexity in such
situations.
We also choose to change the confusing definitions CHANGEOVER_PROTOCOL,
ORIGINAL_MSG and DUPLICATE_MSG to the more descriptive TUNNEL_PROTOCOL,
FAILOVER_MSG and SYNCH_MSG respectively.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-04-02 21:33:01 +08:00
|
|
|
case TUNNEL_PROTOCOL:
|
tipc: resolve race problem at unicast message reception
TIPC handles message cardinality and sequencing at the link layer,
before passing messages upwards to the destination sockets. During the
upcall from link to socket no locks are held. It is therefore possible,
and we see it happen occasionally, that messages arriving in different
threads and delivered in sequence still bypass each other before they
reach the destination socket. This must not happen, since it violates
the sequentiality guarantee.
We solve this by adding a new input buffer queue to the link structure.
Arriving messages are added safely to the tail of that queue by the
link, while the head of the queue is consumed, also safely, by the
receiving socket. Sequentiality is secured per socket by only allowing
buffers to be dequeued inside the socket lock. Since there may be multiple
simultaneous readers of the queue, we use a 'filter' parameter to reduce
the risk that they peek the same buffer from the queue, hence also
reducing the risk of contention on the receiving socket locks.
This solves the sequentiality problem, and seems to cause no measurable
performance degradation.
A nice side effect of this change is that lock handling in the functions
tipc_rcv() and tipc_bcast_rcv() now becomes uniform, something that
will enable future simplifications of those functions.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-02-05 21:36:41 +08:00
|
|
|
case MSG_FRAGMENTER:
|
2014-07-01 16:22:40 +08:00
|
|
|
case BCAST_PROTOCOL:
|
tipc: resolve race problem at unicast message reception
TIPC handles message cardinality and sequencing at the link layer,
before passing messages upwards to the destination sockets. During the
upcall from link to socket no locks are held. It is therefore possible,
and we see it happen occasionally, that messages arriving in different
threads and delivered in sequence still bypass each other before they
reach the destination socket. This must not happen, since it violates
the sequentiality guarantee.
We solve this by adding a new input buffer queue to the link structure.
Arriving messages are added safely to the tail of that queue by the
link, while the head of the queue is consumed, also safely, by the
receiving socket. Sequentiality is secured per socket by only allowing
buffers to be dequeued inside the socket lock. Since there may be multiple
simultaneous readers of the queue, we use a 'filter' parameter to reduce
the risk that they peek the same buffer from the queue, hence also
reducing the risk of contention on the receiving socket locks.
This solves the sequentiality problem, and seems to cause no measurable
performance degradation.
A nice side effect of this change is that lock handling in the functions
tipc_rcv() and tipc_bcast_rcv() now becomes uniform, something that
will enable future simplifications of those functions.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-02-05 21:36:41 +08:00
|
|
|
return false;
|
tipc: add automatic session key exchange
With support from the master key option in the previous commit, it
becomes easy to make frequent updates/exchanges of session keys between
authenticated cluster nodes.
Basically, there are two situations where the key exchange will take in
place:
- When a new node joins the cluster (with the master key), it will need
to get its peer's TX key, so that be able to decrypt further messages
from that peer.
- When a new session key is generated (by either user manual setting or
later automatic rekeying feature), the key will be distributed to all
peer nodes in the cluster.
A key to be exchanged is encapsulated in the data part of a 'MSG_CRYPTO
/KEY_DISTR_MSG' TIPC v2 message, then xmit-ed as usual and encrypted by
using the master key before sending out. Upon receipt of the message it
will be decrypted in the same way as regular messages, then attached as
the sender's RX key in the receiver node.
In this way, the key exchange is reliable by the link layer, as well as
security, integrity and authenticity by the crypto layer.
Also, the forward security will be easily achieved by user changing the
master key actively but this should not be required very frequently.
The key exchange feature is independent on the presence of a master key
Note however that the master key still is needed for new nodes to be
able to join the cluster. It is also optional, and can be turned off/on
via the sysfs: 'net/tipc/key_exchange_enabled' [default 1: enabled].
Backward compatibility is guaranteed because for nodes that do not have
master key support, key exchange using master key ie. tx_key = 0 if any
will be shortly discarded at the message validation step. In other
words, the key exchange feature will be automatically disabled to those
nodes.
v2: fix the "implicit declaration of function 'tipc_crypto_key_flush'"
error in node.c. The function only exists when built with the TIPC
"CONFIG_TIPC_CRYPTO" option.
v3: use 'info->extack' for a message emitted due to netlink operations
instead (- David's comment).
Reported-by: kernel test robot <lkp@intel.com>
Acked-by: Jon Maloy <jmaloy@redhat.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-09-18 09:17:28 +08:00
|
|
|
#ifdef CONFIG_TIPC_CRYPTO
|
|
|
|
case MSG_CRYPTO:
|
2021-12-11 02:50:40 +08:00
|
|
|
if (sysctl_tipc_key_exchange_enabled &&
|
|
|
|
TIPC_SKB_CB(skb)->decrypted) {
|
2021-11-15 20:45:24 +08:00
|
|
|
tipc_crypto_msg_rcv(l->net, skb);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
fallthrough;
|
tipc: add automatic session key exchange
With support from the master key option in the previous commit, it
becomes easy to make frequent updates/exchanges of session keys between
authenticated cluster nodes.
Basically, there are two situations where the key exchange will take in
place:
- When a new node joins the cluster (with the master key), it will need
to get its peer's TX key, so that be able to decrypt further messages
from that peer.
- When a new session key is generated (by either user manual setting or
later automatic rekeying feature), the key will be distributed to all
peer nodes in the cluster.
A key to be exchanged is encapsulated in the data part of a 'MSG_CRYPTO
/KEY_DISTR_MSG' TIPC v2 message, then xmit-ed as usual and encrypted by
using the master key before sending out. Upon receipt of the message it
will be decrypted in the same way as regular messages, then attached as
the sender's RX key in the receiver node.
In this way, the key exchange is reliable by the link layer, as well as
security, integrity and authenticity by the crypto layer.
Also, the forward security will be easily achieved by user changing the
master key actively but this should not be required very frequently.
The key exchange feature is independent on the presence of a master key
Note however that the master key still is needed for new nodes to be
able to join the cluster. It is also optional, and can be turned off/on
via the sysfs: 'net/tipc/key_exchange_enabled' [default 1: enabled].
Backward compatibility is guaranteed because for nodes that do not have
master key support, key exchange using master key ie. tx_key = 0 if any
will be shortly discarded at the message validation step. In other
words, the key exchange feature will be automatically disabled to those
nodes.
v2: fix the "implicit declaration of function 'tipc_crypto_key_flush'"
error in node.c. The function only exists when built with the TIPC
"CONFIG_TIPC_CRYPTO" option.
v3: use 'info->extack' for a message emitted due to netlink operations
instead (- David's comment).
Reported-by: kernel test robot <lkp@intel.com>
Acked-by: Jon Maloy <jmaloy@redhat.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-09-18 09:17:28 +08:00
|
|
|
#endif
|
2014-07-01 16:22:40 +08:00
|
|
|
default:
|
tipc: resolve race problem at unicast message reception
TIPC handles message cardinality and sequencing at the link layer,
before passing messages upwards to the destination sockets. During the
upcall from link to socket no locks are held. It is therefore possible,
and we see it happen occasionally, that messages arriving in different
threads and delivered in sequence still bypass each other before they
reach the destination socket. This must not happen, since it violates
the sequentiality guarantee.
We solve this by adding a new input buffer queue to the link structure.
Arriving messages are added safely to the tail of that queue by the
link, while the head of the queue is consumed, also safely, by the
receiving socket. Sequentiality is secured per socket by only allowing
buffers to be dequeued inside the socket lock. Since there may be multiple
simultaneous readers of the queue, we use a 'filter' parameter to reduce
the risk that they peek the same buffer from the queue, hence also
reducing the risk of contention on the receiving socket locks.
This solves the sequentiality problem, and seems to cause no measurable
performance degradation.
A nice side effect of this change is that lock handling in the functions
tipc_rcv() and tipc_bcast_rcv() now becomes uniform, something that
will enable future simplifications of those functions.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-02-05 21:36:41 +08:00
|
|
|
pr_warn("Dropping received illegal msg type\n");
|
|
|
|
kfree_skb(skb);
|
2019-02-11 10:18:28 +08:00
|
|
|
return true;
|
2020-11-01 23:58:22 +08:00
|
|
|
}
|
2014-07-01 16:22:40 +08:00
|
|
|
}
|
tipc: resolve race problem at unicast message reception
TIPC handles message cardinality and sequencing at the link layer,
before passing messages upwards to the destination sockets. During the
upcall from link to socket no locks are held. It is therefore possible,
and we see it happen occasionally, that messages arriving in different
threads and delivered in sequence still bypass each other before they
reach the destination socket. This must not happen, since it violates
the sequentiality guarantee.
We solve this by adding a new input buffer queue to the link structure.
Arriving messages are added safely to the tail of that queue by the
link, while the head of the queue is consumed, also safely, by the
receiving socket. Sequentiality is secured per socket by only allowing
buffers to be dequeued inside the socket lock. Since there may be multiple
simultaneous readers of the queue, we use a 'filter' parameter to reduce
the risk that they peek the same buffer from the queue, hence also
reducing the risk of contention on the receiving socket locks.
This solves the sequentiality problem, and seems to cause no measurable
performance degradation.
A nice side effect of this change is that lock handling in the functions
tipc_rcv() and tipc_bcast_rcv() now becomes uniform, something that
will enable future simplifications of those functions.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-02-05 21:36:41 +08:00
|
|
|
|
|
|
|
/* tipc_link_input - process packet that has passed link protocol check
|
|
|
|
*
|
|
|
|
* Consumes buffer
|
2014-07-01 16:22:40 +08:00
|
|
|
*/
|
2015-07-31 06:24:25 +08:00
|
|
|
static int tipc_link_input(struct tipc_link *l, struct sk_buff *skb,
|
2019-04-04 12:09:53 +08:00
|
|
|
struct sk_buff_head *inputq,
|
|
|
|
struct sk_buff **reasm_skb)
|
2014-07-01 16:22:40 +08:00
|
|
|
{
|
2015-07-31 06:24:19 +08:00
|
|
|
struct tipc_msg *hdr = buf_msg(skb);
|
tipc: resolve race problem at unicast message reception
TIPC handles message cardinality and sequencing at the link layer,
before passing messages upwards to the destination sockets. During the
upcall from link to socket no locks are held. It is therefore possible,
and we see it happen occasionally, that messages arriving in different
threads and delivered in sequence still bypass each other before they
reach the destination socket. This must not happen, since it violates
the sequentiality guarantee.
We solve this by adding a new input buffer queue to the link structure.
Arriving messages are added safely to the tail of that queue by the
link, while the head of the queue is consumed, also safely, by the
receiving socket. Sequentiality is secured per socket by only allowing
buffers to be dequeued inside the socket lock. Since there may be multiple
simultaneous readers of the queue, we use a 'filter' parameter to reduce
the risk that they peek the same buffer from the queue, hence also
reducing the risk of contention on the receiving socket locks.
This solves the sequentiality problem, and seems to cause no measurable
performance degradation.
A nice side effect of this change is that lock handling in the functions
tipc_rcv() and tipc_bcast_rcv() now becomes uniform, something that
will enable future simplifications of those functions.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-02-05 21:36:41 +08:00
|
|
|
struct sk_buff *iskb;
|
2015-10-16 02:52:40 +08:00
|
|
|
struct sk_buff_head tmpq;
|
2015-07-31 06:24:19 +08:00
|
|
|
int usr = msg_user(hdr);
|
|
|
|
int pos = 0;
|
tipc: resolve race problem at unicast message reception
TIPC handles message cardinality and sequencing at the link layer,
before passing messages upwards to the destination sockets. During the
upcall from link to socket no locks are held. It is therefore possible,
and we see it happen occasionally, that messages arriving in different
threads and delivered in sequence still bypass each other before they
reach the destination socket. This must not happen, since it violates
the sequentiality guarantee.
We solve this by adding a new input buffer queue to the link structure.
Arriving messages are added safely to the tail of that queue by the
link, while the head of the queue is consumed, also safely, by the
receiving socket. Sequentiality is secured per socket by only allowing
buffers to be dequeued inside the socket lock. Since there may be multiple
simultaneous readers of the queue, we use a 'filter' parameter to reduce
the risk that they peek the same buffer from the queue, hence also
reducing the risk of contention on the receiving socket locks.
This solves the sequentiality problem, and seems to cause no measurable
performance degradation.
A nice side effect of this change is that lock handling in the functions
tipc_rcv() and tipc_bcast_rcv() now becomes uniform, something that
will enable future simplifications of those functions.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-02-05 21:36:41 +08:00
|
|
|
|
2015-07-31 06:24:19 +08:00
|
|
|
if (usr == MSG_BUNDLER) {
|
2015-10-16 02:52:40 +08:00
|
|
|
skb_queue_head_init(&tmpq);
|
2015-07-31 06:24:19 +08:00
|
|
|
l->stats.recv_bundles++;
|
|
|
|
l->stats.recv_bundled += msg_msgcnt(hdr);
|
tipc: resolve race problem at unicast message reception
TIPC handles message cardinality and sequencing at the link layer,
before passing messages upwards to the destination sockets. During the
upcall from link to socket no locks are held. It is therefore possible,
and we see it happen occasionally, that messages arriving in different
threads and delivered in sequence still bypass each other before they
reach the destination socket. This must not happen, since it violates
the sequentiality guarantee.
We solve this by adding a new input buffer queue to the link structure.
Arriving messages are added safely to the tail of that queue by the
link, while the head of the queue is consumed, also safely, by the
receiving socket. Sequentiality is secured per socket by only allowing
buffers to be dequeued inside the socket lock. Since there may be multiple
simultaneous readers of the queue, we use a 'filter' parameter to reduce
the risk that they peek the same buffer from the queue, hence also
reducing the risk of contention on the receiving socket locks.
This solves the sequentiality problem, and seems to cause no measurable
performance degradation.
A nice side effect of this change is that lock handling in the functions
tipc_rcv() and tipc_bcast_rcv() now becomes uniform, something that
will enable future simplifications of those functions.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-02-05 21:36:41 +08:00
|
|
|
while (tipc_msg_extract(skb, &iskb, &pos))
|
2015-10-16 02:52:40 +08:00
|
|
|
tipc_data_input(l, iskb, &tmpq);
|
|
|
|
tipc_skb_queue_splice_tail(&tmpq, inputq);
|
2015-07-31 06:24:21 +08:00
|
|
|
return 0;
|
2015-07-31 06:24:19 +08:00
|
|
|
} else if (usr == MSG_FRAGMENTER) {
|
|
|
|
l->stats.recv_fragments++;
|
|
|
|
if (tipc_buf_append(reasm_skb, &skb)) {
|
|
|
|
l->stats.recv_fragmented++;
|
2015-07-31 06:24:25 +08:00
|
|
|
tipc_data_input(l, skb, inputq);
|
2015-10-22 20:51:41 +08:00
|
|
|
} else if (!*reasm_skb && !link_is_bc_rcvlink(l)) {
|
|
|
|
pr_warn_ratelimited("Unable to build fragment list\n");
|
2015-07-31 06:24:21 +08:00
|
|
|
return tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
|
tipc: resolve race problem at unicast message reception
TIPC handles message cardinality and sequencing at the link layer,
before passing messages upwards to the destination sockets. During the
upcall from link to socket no locks are held. It is therefore possible,
and we see it happen occasionally, that messages arriving in different
threads and delivered in sequence still bypass each other before they
reach the destination socket. This must not happen, since it violates
the sequentiality guarantee.
We solve this by adding a new input buffer queue to the link structure.
Arriving messages are added safely to the tail of that queue by the
link, while the head of the queue is consumed, also safely, by the
receiving socket. Sequentiality is secured per socket by only allowing
buffers to be dequeued inside the socket lock. Since there may be multiple
simultaneous readers of the queue, we use a 'filter' parameter to reduce
the risk that they peek the same buffer from the queue, hence also
reducing the risk of contention on the receiving socket locks.
This solves the sequentiality problem, and seems to cause no measurable
performance degradation.
A nice side effect of this change is that lock handling in the functions
tipc_rcv() and tipc_bcast_rcv() now becomes uniform, something that
will enable future simplifications of those functions.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-02-05 21:36:41 +08:00
|
|
|
}
|
2015-07-31 06:24:21 +08:00
|
|
|
return 0;
|
2015-07-31 06:24:19 +08:00
|
|
|
} else if (usr == BCAST_PROTOCOL) {
|
2015-10-22 20:51:46 +08:00
|
|
|
tipc_bcast_lock(l->net);
|
2015-10-22 20:51:41 +08:00
|
|
|
tipc_link_bc_init_rcv(l->bc_rcvlink, hdr);
|
2015-10-22 20:51:46 +08:00
|
|
|
tipc_bcast_unlock(l->net);
|
2015-07-31 06:24:19 +08:00
|
|
|
}
|
2019-04-04 12:09:53 +08:00
|
|
|
|
2015-07-31 06:24:19 +08:00
|
|
|
kfree_skb(skb);
|
2015-07-31 06:24:21 +08:00
|
|
|
return 0;
|
2014-07-01 16:22:40 +08:00
|
|
|
}
|
|
|
|
|
2019-04-04 12:09:53 +08:00
|
|
|
/* tipc_link_tnl_rcv() - receive TUNNEL_PROTOCOL message, drop or process the
|
|
|
|
* inner message along with the ones in the old link's
|
|
|
|
* deferdq
|
|
|
|
* @l: tunnel link
|
|
|
|
* @skb: TUNNEL_PROTOCOL message
|
|
|
|
* @inputq: queue to put messages ready for delivery
|
|
|
|
*/
|
|
|
|
static int tipc_link_tnl_rcv(struct tipc_link *l, struct sk_buff *skb,
|
|
|
|
struct sk_buff_head *inputq)
|
|
|
|
{
|
|
|
|
struct sk_buff **reasm_skb = &l->failover_reasm_skb;
|
tipc: fix changeover issues due to large packet
In conjunction with changing the interfaces' MTU (e.g. especially in
the case of a bonding) where the TIPC links are brought up and down
in a short time, a couple of issues were detected with the current link
changeover mechanism:
1) When one link is up but immediately forced down again, the failover
procedure will be carried out in order to failover all the messages in
the link's transmq queue onto the other working link. The link and node
state is also set to FAILINGOVER as part of the process. The message
will be transmited in form of a FAILOVER_MSG, so its size is plus of 40
bytes (= the message header size). There is no problem if the original
message size is not larger than the link's MTU - 40, and indeed this is
the max size of a normal payload messages. However, in the situation
above, because the link has just been up, the messages in the link's
transmq are almost SYNCH_MSGs which had been generated by the link
synching procedure, then their size might reach the max value already!
When the FAILOVER_MSG is built on the top of such a SYNCH_MSG, its size
will exceed the link's MTU. As a result, the messages are dropped
silently and the failover procedure will never end up, the link will
not be able to exit the FAILINGOVER state, so cannot be re-established.
2) The same scenario above can happen more easily in case the MTU of
the links is set differently or when changing. In that case, as long as
a large message in the failure link's transmq queue was built and
fragmented with its link's MTU > the other link's one, the issue will
happen (there is no need of a link synching in advance).
3) The link synching procedure also faces with the same issue but since
the link synching is only started upon receipt of a SYNCH_MSG, dropping
the message will not result in a state deadlock, but it is not expected
as design.
The 1) & 3) issues are resolved by the last commit that only a dummy
SYNCH_MSG (i.e. without data) is generated at the link synching, so the
size of a FAILOVER_MSG if any then will never exceed the link's MTU.
For the 2) issue, the only solution is trying to fragment the messages
in the failure link's transmq queue according to the working link's MTU
so they can be failovered then. A new function is made to accomplish
this, it will still be a TUNNEL PROTOCOL/FAILOVER MSG but if the
original message size is too large, it will be fragmented & reassembled
at the receiving side.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-07-24 09:56:12 +08:00
|
|
|
struct sk_buff **reasm_tnlmsg = &l->reasm_tnlmsg;
|
2019-04-04 12:09:53 +08:00
|
|
|
struct sk_buff_head *fdefq = &l->failover_deferdq;
|
|
|
|
struct tipc_msg *hdr = buf_msg(skb);
|
|
|
|
struct sk_buff *iskb;
|
|
|
|
int ipos = 0;
|
|
|
|
int rc = 0;
|
|
|
|
u16 seqno;
|
|
|
|
|
tipc: fix changeover issues due to large packet
In conjunction with changing the interfaces' MTU (e.g. especially in
the case of a bonding) where the TIPC links are brought up and down
in a short time, a couple of issues were detected with the current link
changeover mechanism:
1) When one link is up but immediately forced down again, the failover
procedure will be carried out in order to failover all the messages in
the link's transmq queue onto the other working link. The link and node
state is also set to FAILINGOVER as part of the process. The message
will be transmited in form of a FAILOVER_MSG, so its size is plus of 40
bytes (= the message header size). There is no problem if the original
message size is not larger than the link's MTU - 40, and indeed this is
the max size of a normal payload messages. However, in the situation
above, because the link has just been up, the messages in the link's
transmq are almost SYNCH_MSGs which had been generated by the link
synching procedure, then their size might reach the max value already!
When the FAILOVER_MSG is built on the top of such a SYNCH_MSG, its size
will exceed the link's MTU. As a result, the messages are dropped
silently and the failover procedure will never end up, the link will
not be able to exit the FAILINGOVER state, so cannot be re-established.
2) The same scenario above can happen more easily in case the MTU of
the links is set differently or when changing. In that case, as long as
a large message in the failure link's transmq queue was built and
fragmented with its link's MTU > the other link's one, the issue will
happen (there is no need of a link synching in advance).
3) The link synching procedure also faces with the same issue but since
the link synching is only started upon receipt of a SYNCH_MSG, dropping
the message will not result in a state deadlock, but it is not expected
as design.
The 1) & 3) issues are resolved by the last commit that only a dummy
SYNCH_MSG (i.e. without data) is generated at the link synching, so the
size of a FAILOVER_MSG if any then will never exceed the link's MTU.
For the 2) issue, the only solution is trying to fragment the messages
in the failure link's transmq queue according to the working link's MTU
so they can be failovered then. A new function is made to accomplish
this, it will still be a TUNNEL PROTOCOL/FAILOVER MSG but if the
original message size is too large, it will be fragmented & reassembled
at the receiving side.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-07-24 09:56:12 +08:00
|
|
|
if (msg_type(hdr) == SYNCH_MSG) {
|
|
|
|
kfree_skb(skb);
|
|
|
|
return 0;
|
|
|
|
}
|
2019-04-04 12:09:53 +08:00
|
|
|
|
tipc: fix changeover issues due to large packet
In conjunction with changing the interfaces' MTU (e.g. especially in
the case of a bonding) where the TIPC links are brought up and down
in a short time, a couple of issues were detected with the current link
changeover mechanism:
1) When one link is up but immediately forced down again, the failover
procedure will be carried out in order to failover all the messages in
the link's transmq queue onto the other working link. The link and node
state is also set to FAILINGOVER as part of the process. The message
will be transmited in form of a FAILOVER_MSG, so its size is plus of 40
bytes (= the message header size). There is no problem if the original
message size is not larger than the link's MTU - 40, and indeed this is
the max size of a normal payload messages. However, in the situation
above, because the link has just been up, the messages in the link's
transmq are almost SYNCH_MSGs which had been generated by the link
synching procedure, then their size might reach the max value already!
When the FAILOVER_MSG is built on the top of such a SYNCH_MSG, its size
will exceed the link's MTU. As a result, the messages are dropped
silently and the failover procedure will never end up, the link will
not be able to exit the FAILINGOVER state, so cannot be re-established.
2) The same scenario above can happen more easily in case the MTU of
the links is set differently or when changing. In that case, as long as
a large message in the failure link's transmq queue was built and
fragmented with its link's MTU > the other link's one, the issue will
happen (there is no need of a link synching in advance).
3) The link synching procedure also faces with the same issue but since
the link synching is only started upon receipt of a SYNCH_MSG, dropping
the message will not result in a state deadlock, but it is not expected
as design.
The 1) & 3) issues are resolved by the last commit that only a dummy
SYNCH_MSG (i.e. without data) is generated at the link synching, so the
size of a FAILOVER_MSG if any then will never exceed the link's MTU.
For the 2) issue, the only solution is trying to fragment the messages
in the failure link's transmq queue according to the working link's MTU
so they can be failovered then. A new function is made to accomplish
this, it will still be a TUNNEL PROTOCOL/FAILOVER MSG but if the
original message size is too large, it will be fragmented & reassembled
at the receiving side.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-07-24 09:56:12 +08:00
|
|
|
/* Not a fragment? */
|
|
|
|
if (likely(!msg_nof_fragms(hdr))) {
|
|
|
|
if (unlikely(!tipc_msg_extract(skb, &iskb, &ipos))) {
|
|
|
|
pr_warn_ratelimited("Unable to extract msg, defq: %d\n",
|
|
|
|
skb_queue_len(fdefq));
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
kfree_skb(skb);
|
|
|
|
} else {
|
|
|
|
/* Set fragment type for buf_append */
|
|
|
|
if (msg_fragm_no(hdr) == 1)
|
|
|
|
msg_set_type(hdr, FIRST_FRAGMENT);
|
|
|
|
else if (msg_fragm_no(hdr) < msg_nof_fragms(hdr))
|
|
|
|
msg_set_type(hdr, FRAGMENT);
|
|
|
|
else
|
|
|
|
msg_set_type(hdr, LAST_FRAGMENT);
|
|
|
|
|
|
|
|
if (!tipc_buf_append(reasm_tnlmsg, &skb)) {
|
|
|
|
/* Successful but non-complete reassembly? */
|
|
|
|
if (*reasm_tnlmsg || link_is_bc_rcvlink(l))
|
|
|
|
return 0;
|
|
|
|
pr_warn_ratelimited("Unable to reassemble tunnel msg\n");
|
|
|
|
return tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
|
|
|
|
}
|
|
|
|
iskb = skb;
|
2019-04-04 12:09:53 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
do {
|
|
|
|
seqno = buf_seqno(iskb);
|
|
|
|
if (unlikely(less(seqno, l->drop_point))) {
|
|
|
|
kfree_skb(iskb);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (unlikely(seqno != l->drop_point)) {
|
|
|
|
__tipc_skb_queue_sorted(fdefq, seqno, iskb);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
l->drop_point++;
|
|
|
|
if (!tipc_data_input(l, iskb, inputq))
|
|
|
|
rc |= tipc_link_input(l, iskb, inputq, reasm_skb);
|
|
|
|
if (unlikely(rc))
|
|
|
|
break;
|
|
|
|
} while ((iskb = __tipc_skb_dequeue(fdefq, l->drop_point)));
|
|
|
|
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2020-05-26 17:38:34 +08:00
|
|
|
/**
|
|
|
|
* tipc_get_gap_ack_blks - get Gap ACK blocks from PROTOCOL/STATE_MSG
|
|
|
|
* @ga: returned pointer to the Gap ACK blocks if any
|
|
|
|
* @l: the tipc link
|
|
|
|
* @hdr: the PROTOCOL/STATE_MSG header
|
|
|
|
* @uc: desired Gap ACK blocks type, i.e. unicast (= 1) or broadcast (= 0)
|
|
|
|
*
|
|
|
|
* Return: the total Gap ACK blocks size
|
|
|
|
*/
|
|
|
|
u16 tipc_get_gap_ack_blks(struct tipc_gap_ack_blks **ga, struct tipc_link *l,
|
|
|
|
struct tipc_msg *hdr, bool uc)
|
|
|
|
{
|
|
|
|
struct tipc_gap_ack_blks *p;
|
|
|
|
u16 sz = 0;
|
|
|
|
|
|
|
|
/* Does peer support the Gap ACK blocks feature? */
|
|
|
|
if (l->peer_caps & TIPC_GAP_ACK_BLOCK) {
|
|
|
|
p = (struct tipc_gap_ack_blks *)msg_data(hdr);
|
|
|
|
sz = ntohs(p->len);
|
|
|
|
/* Sanity check */
|
2023-09-16 03:16:26 +08:00
|
|
|
if (sz == struct_size(p, gacks, size_add(p->ugack_cnt, p->bgack_cnt))) {
|
2020-05-26 17:38:34 +08:00
|
|
|
/* Good, check if the desired type exists */
|
|
|
|
if ((uc && p->ugack_cnt) || (!uc && p->bgack_cnt))
|
|
|
|
goto ok;
|
|
|
|
/* Backward compatible: peer might not support bc, but uc? */
|
2020-06-18 21:35:00 +08:00
|
|
|
} else if (uc && sz == struct_size(p, gacks, p->ugack_cnt)) {
|
2020-05-26 17:38:34 +08:00
|
|
|
if (p->ugack_cnt) {
|
|
|
|
p->bgack_cnt = 0;
|
|
|
|
goto ok;
|
|
|
|
}
|
|
|
|
}
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
}
|
2020-05-26 17:38:34 +08:00
|
|
|
/* Other cases: ignore! */
|
|
|
|
p = NULL;
|
|
|
|
|
|
|
|
ok:
|
|
|
|
*ga = p;
|
|
|
|
return sz;
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
}
|
|
|
|
|
2020-05-26 17:38:34 +08:00
|
|
|
static u8 __tipc_build_gap_ack_blks(struct tipc_gap_ack_blks *ga,
|
|
|
|
struct tipc_link *l, u8 start_index)
|
tipc: improve TIPC throughput by Gap ACK blocks
During unicast link transmission, it's observed very often that because
of one or a few lost/dis-ordered packets, the sending side will fastly
reach the send window limit and must wait for the packets to be arrived
at the receiving side or in the worst case, a retransmission must be
done first. The sending side cannot release a lot of subsequent packets
in its transmq even though all of them might have already been received
by the receiving side.
That is, one or two packets dis-ordered/lost and dozens of packets have
to wait, this obviously reduces the overall throughput!
This commit introduces an algorithm to overcome this by using "Gap ACK
blocks". Basically, a Gap ACK block will consist of <ack, gap> numbers
that describes the link deferdq where packets have been got by the
receiving side but with gaps, for example:
link deferdq: [1 2 3 4 10 11 13 14 15 20]
--> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0>
The Gap ACK blocks will be sent to the sending side along with the
traditional ACK or NACK message. Immediately when receiving the message
the sending side will now not only release from its transmq the packets
ack-ed by the ACK but also by the Gap ACK blocks! So, more packets can
be enqueued and transmitted.
In addition, the sending side can now do "multi-retransmissions"
according to the Gaps reported in the Gap ACK blocks.
The new algorithm as verified helps greatly improve the TIPC throughput
especially under packet loss condition.
So far, a maximum of 32 blocks is quite enough without any "Too few Gap
ACK blocks" reports with a 5.0% packet loss rate, however this number
can be increased in the furture if needed.
Also, the patch is backward compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-04 12:09:51 +08:00
|
|
|
{
|
2020-05-26 17:38:34 +08:00
|
|
|
struct tipc_gap_ack *gacks = &ga->gacks[start_index];
|
tipc: improve TIPC throughput by Gap ACK blocks
During unicast link transmission, it's observed very often that because
of one or a few lost/dis-ordered packets, the sending side will fastly
reach the send window limit and must wait for the packets to be arrived
at the receiving side or in the worst case, a retransmission must be
done first. The sending side cannot release a lot of subsequent packets
in its transmq even though all of them might have already been received
by the receiving side.
That is, one or two packets dis-ordered/lost and dozens of packets have
to wait, this obviously reduces the overall throughput!
This commit introduces an algorithm to overcome this by using "Gap ACK
blocks". Basically, a Gap ACK block will consist of <ack, gap> numbers
that describes the link deferdq where packets have been got by the
receiving side but with gaps, for example:
link deferdq: [1 2 3 4 10 11 13 14 15 20]
--> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0>
The Gap ACK blocks will be sent to the sending side along with the
traditional ACK or NACK message. Immediately when receiving the message
the sending side will now not only release from its transmq the packets
ack-ed by the ACK but also by the Gap ACK blocks! So, more packets can
be enqueued and transmitted.
In addition, the sending side can now do "multi-retransmissions"
according to the Gaps reported in the Gap ACK blocks.
The new algorithm as verified helps greatly improve the TIPC throughput
especially under packet loss condition.
So far, a maximum of 32 blocks is quite enough without any "Too few Gap
ACK blocks" reports with a 5.0% packet loss rate, however this number
can be increased in the furture if needed.
Also, the patch is backward compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-04 12:09:51 +08:00
|
|
|
struct sk_buff *skb = skb_peek(&l->deferdq);
|
2020-05-26 17:38:34 +08:00
|
|
|
u16 expect, seqno = 0;
|
tipc: improve TIPC throughput by Gap ACK blocks
During unicast link transmission, it's observed very often that because
of one or a few lost/dis-ordered packets, the sending side will fastly
reach the send window limit and must wait for the packets to be arrived
at the receiving side or in the worst case, a retransmission must be
done first. The sending side cannot release a lot of subsequent packets
in its transmq even though all of them might have already been received
by the receiving side.
That is, one or two packets dis-ordered/lost and dozens of packets have
to wait, this obviously reduces the overall throughput!
This commit introduces an algorithm to overcome this by using "Gap ACK
blocks". Basically, a Gap ACK block will consist of <ack, gap> numbers
that describes the link deferdq where packets have been got by the
receiving side but with gaps, for example:
link deferdq: [1 2 3 4 10 11 13 14 15 20]
--> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0>
The Gap ACK blocks will be sent to the sending side along with the
traditional ACK or NACK message. Immediately when receiving the message
the sending side will now not only release from its transmq the packets
ack-ed by the ACK but also by the Gap ACK blocks! So, more packets can
be enqueued and transmitted.
In addition, the sending side can now do "multi-retransmissions"
according to the Gaps reported in the Gap ACK blocks.
The new algorithm as verified helps greatly improve the TIPC throughput
especially under packet loss condition.
So far, a maximum of 32 blocks is quite enough without any "Too few Gap
ACK blocks" reports with a 5.0% packet loss rate, however this number
can be increased in the furture if needed.
Also, the patch is backward compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-04 12:09:51 +08:00
|
|
|
u8 n = 0;
|
|
|
|
|
2020-05-26 17:38:34 +08:00
|
|
|
if (!skb)
|
|
|
|
return 0;
|
tipc: improve TIPC throughput by Gap ACK blocks
During unicast link transmission, it's observed very often that because
of one or a few lost/dis-ordered packets, the sending side will fastly
reach the send window limit and must wait for the packets to be arrived
at the receiving side or in the worst case, a retransmission must be
done first. The sending side cannot release a lot of subsequent packets
in its transmq even though all of them might have already been received
by the receiving side.
That is, one or two packets dis-ordered/lost and dozens of packets have
to wait, this obviously reduces the overall throughput!
This commit introduces an algorithm to overcome this by using "Gap ACK
blocks". Basically, a Gap ACK block will consist of <ack, gap> numbers
that describes the link deferdq where packets have been got by the
receiving side but with gaps, for example:
link deferdq: [1 2 3 4 10 11 13 14 15 20]
--> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0>
The Gap ACK blocks will be sent to the sending side along with the
traditional ACK or NACK message. Immediately when receiving the message
the sending side will now not only release from its transmq the packets
ack-ed by the ACK but also by the Gap ACK blocks! So, more packets can
be enqueued and transmitted.
In addition, the sending side can now do "multi-retransmissions"
according to the Gaps reported in the Gap ACK blocks.
The new algorithm as verified helps greatly improve the TIPC throughput
especially under packet loss condition.
So far, a maximum of 32 blocks is quite enough without any "Too few Gap
ACK blocks" reports with a 5.0% packet loss rate, however this number
can be increased in the furture if needed.
Also, the patch is backward compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-04 12:09:51 +08:00
|
|
|
|
|
|
|
expect = buf_seqno(skb);
|
|
|
|
skb_queue_walk(&l->deferdq, skb) {
|
|
|
|
seqno = buf_seqno(skb);
|
|
|
|
if (unlikely(more(seqno, expect))) {
|
2020-05-26 17:38:34 +08:00
|
|
|
gacks[n].ack = htons(expect - 1);
|
|
|
|
gacks[n].gap = htons(seqno - expect);
|
|
|
|
if (++n >= MAX_GAP_ACK_BLKS / 2) {
|
|
|
|
pr_info_ratelimited("Gacks on %s: %d, ql: %d!\n",
|
2020-05-26 17:38:37 +08:00
|
|
|
l->name, n,
|
2020-05-26 17:38:34 +08:00
|
|
|
skb_queue_len(&l->deferdq));
|
|
|
|
return n;
|
tipc: improve TIPC throughput by Gap ACK blocks
During unicast link transmission, it's observed very often that because
of one or a few lost/dis-ordered packets, the sending side will fastly
reach the send window limit and must wait for the packets to be arrived
at the receiving side or in the worst case, a retransmission must be
done first. The sending side cannot release a lot of subsequent packets
in its transmq even though all of them might have already been received
by the receiving side.
That is, one or two packets dis-ordered/lost and dozens of packets have
to wait, this obviously reduces the overall throughput!
This commit introduces an algorithm to overcome this by using "Gap ACK
blocks". Basically, a Gap ACK block will consist of <ack, gap> numbers
that describes the link deferdq where packets have been got by the
receiving side but with gaps, for example:
link deferdq: [1 2 3 4 10 11 13 14 15 20]
--> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0>
The Gap ACK blocks will be sent to the sending side along with the
traditional ACK or NACK message. Immediately when receiving the message
the sending side will now not only release from its transmq the packets
ack-ed by the ACK but also by the Gap ACK blocks! So, more packets can
be enqueued and transmitted.
In addition, the sending side can now do "multi-retransmissions"
according to the Gaps reported in the Gap ACK blocks.
The new algorithm as verified helps greatly improve the TIPC throughput
especially under packet loss condition.
So far, a maximum of 32 blocks is quite enough without any "Too few Gap
ACK blocks" reports with a 5.0% packet loss rate, however this number
can be increased in the furture if needed.
Also, the patch is backward compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-04 12:09:51 +08:00
|
|
|
}
|
|
|
|
} else if (unlikely(less(seqno, expect))) {
|
|
|
|
pr_warn("Unexpected skb in deferdq!\n");
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
expect = seqno + 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* last block */
|
2020-05-26 17:38:34 +08:00
|
|
|
gacks[n].ack = htons(seqno);
|
|
|
|
gacks[n].gap = 0;
|
tipc: improve TIPC throughput by Gap ACK blocks
During unicast link transmission, it's observed very often that because
of one or a few lost/dis-ordered packets, the sending side will fastly
reach the send window limit and must wait for the packets to be arrived
at the receiving side or in the worst case, a retransmission must be
done first. The sending side cannot release a lot of subsequent packets
in its transmq even though all of them might have already been received
by the receiving side.
That is, one or two packets dis-ordered/lost and dozens of packets have
to wait, this obviously reduces the overall throughput!
This commit introduces an algorithm to overcome this by using "Gap ACK
blocks". Basically, a Gap ACK block will consist of <ack, gap> numbers
that describes the link deferdq where packets have been got by the
receiving side but with gaps, for example:
link deferdq: [1 2 3 4 10 11 13 14 15 20]
--> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0>
The Gap ACK blocks will be sent to the sending side along with the
traditional ACK or NACK message. Immediately when receiving the message
the sending side will now not only release from its transmq the packets
ack-ed by the ACK but also by the Gap ACK blocks! So, more packets can
be enqueued and transmitted.
In addition, the sending side can now do "multi-retransmissions"
according to the Gaps reported in the Gap ACK blocks.
The new algorithm as verified helps greatly improve the TIPC throughput
especially under packet loss condition.
So far, a maximum of 32 blocks is quite enough without any "Too few Gap
ACK blocks" reports with a 5.0% packet loss rate, however this number
can be increased in the furture if needed.
Also, the patch is backward compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-04 12:09:51 +08:00
|
|
|
n++;
|
2020-05-26 17:38:34 +08:00
|
|
|
return n;
|
|
|
|
}
|
tipc: improve TIPC throughput by Gap ACK blocks
During unicast link transmission, it's observed very often that because
of one or a few lost/dis-ordered packets, the sending side will fastly
reach the send window limit and must wait for the packets to be arrived
at the receiving side or in the worst case, a retransmission must be
done first. The sending side cannot release a lot of subsequent packets
in its transmq even though all of them might have already been received
by the receiving side.
That is, one or two packets dis-ordered/lost and dozens of packets have
to wait, this obviously reduces the overall throughput!
This commit introduces an algorithm to overcome this by using "Gap ACK
blocks". Basically, a Gap ACK block will consist of <ack, gap> numbers
that describes the link deferdq where packets have been got by the
receiving side but with gaps, for example:
link deferdq: [1 2 3 4 10 11 13 14 15 20]
--> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0>
The Gap ACK blocks will be sent to the sending side along with the
traditional ACK or NACK message. Immediately when receiving the message
the sending side will now not only release from its transmq the packets
ack-ed by the ACK but also by the Gap ACK blocks! So, more packets can
be enqueued and transmitted.
In addition, the sending side can now do "multi-retransmissions"
according to the Gaps reported in the Gap ACK blocks.
The new algorithm as verified helps greatly improve the TIPC throughput
especially under packet loss condition.
So far, a maximum of 32 blocks is quite enough without any "Too few Gap
ACK blocks" reports with a 5.0% packet loss rate, however this number
can be increased in the furture if needed.
Also, the patch is backward compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-04 12:09:51 +08:00
|
|
|
|
2020-05-26 17:38:34 +08:00
|
|
|
/* tipc_build_gap_ack_blks - build Gap ACK blocks
|
|
|
|
* @l: tipc unicast link
|
|
|
|
* @hdr: the tipc message buffer to store the Gap ACK blocks after built
|
|
|
|
*
|
|
|
|
* The function builds Gap ACK blocks for both the unicast & broadcast receiver
|
|
|
|
* links of a certain peer, the buffer after built has the network data format
|
|
|
|
* as found at the struct tipc_gap_ack_blks definition.
|
|
|
|
*
|
|
|
|
* returns the actual allocated memory size
|
|
|
|
*/
|
|
|
|
static u16 tipc_build_gap_ack_blks(struct tipc_link *l, struct tipc_msg *hdr)
|
|
|
|
{
|
|
|
|
struct tipc_link *bcl = l->bc_rcvlink;
|
|
|
|
struct tipc_gap_ack_blks *ga;
|
|
|
|
u16 len;
|
|
|
|
|
|
|
|
ga = (struct tipc_gap_ack_blks *)msg_data(hdr);
|
|
|
|
|
|
|
|
/* Start with broadcast link first */
|
|
|
|
tipc_bcast_lock(bcl->net);
|
|
|
|
msg_set_bcast_ack(hdr, bcl->rcv_nxt - 1);
|
|
|
|
msg_set_bc_gap(hdr, link_bc_rcv_gap(bcl));
|
|
|
|
ga->bgack_cnt = __tipc_build_gap_ack_blks(ga, bcl, 0);
|
|
|
|
tipc_bcast_unlock(bcl->net);
|
|
|
|
|
|
|
|
/* Now for unicast link, but an explicit NACK only (???) */
|
|
|
|
ga->ugack_cnt = (msg_seq_gap(hdr)) ?
|
|
|
|
__tipc_build_gap_ack_blks(ga, l, ga->bgack_cnt) : 0;
|
|
|
|
|
|
|
|
/* Total len */
|
2023-09-16 03:16:26 +08:00
|
|
|
len = struct_size(ga, gacks, size_add(ga->bgack_cnt, ga->ugack_cnt));
|
tipc: improve TIPC throughput by Gap ACK blocks
During unicast link transmission, it's observed very often that because
of one or a few lost/dis-ordered packets, the sending side will fastly
reach the send window limit and must wait for the packets to be arrived
at the receiving side or in the worst case, a retransmission must be
done first. The sending side cannot release a lot of subsequent packets
in its transmq even though all of them might have already been received
by the receiving side.
That is, one or two packets dis-ordered/lost and dozens of packets have
to wait, this obviously reduces the overall throughput!
This commit introduces an algorithm to overcome this by using "Gap ACK
blocks". Basically, a Gap ACK block will consist of <ack, gap> numbers
that describes the link deferdq where packets have been got by the
receiving side but with gaps, for example:
link deferdq: [1 2 3 4 10 11 13 14 15 20]
--> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0>
The Gap ACK blocks will be sent to the sending side along with the
traditional ACK or NACK message. Immediately when receiving the message
the sending side will now not only release from its transmq the packets
ack-ed by the ACK but also by the Gap ACK blocks! So, more packets can
be enqueued and transmitted.
In addition, the sending side can now do "multi-retransmissions"
according to the Gaps reported in the Gap ACK blocks.
The new algorithm as verified helps greatly improve the TIPC throughput
especially under packet loss condition.
So far, a maximum of 32 blocks is quite enough without any "Too few Gap
ACK blocks" reports with a 5.0% packet loss rate, however this number
can be increased in the furture if needed.
Also, the patch is backward compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-04 12:09:51 +08:00
|
|
|
ga->len = htons(len);
|
|
|
|
return len;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* tipc_link_advance_transmq - advance TIPC link transmq queue by releasing
|
|
|
|
* acked packets, also doing retransmissions if
|
|
|
|
* gaps found
|
|
|
|
* @l: tipc link with transmq queue to be advanced
|
2020-05-26 17:38:34 +08:00
|
|
|
* @r: tipc link "receiver" i.e. in case of broadcast (= "l" if unicast)
|
tipc: improve TIPC throughput by Gap ACK blocks
During unicast link transmission, it's observed very often that because
of one or a few lost/dis-ordered packets, the sending side will fastly
reach the send window limit and must wait for the packets to be arrived
at the receiving side or in the worst case, a retransmission must be
done first. The sending side cannot release a lot of subsequent packets
in its transmq even though all of them might have already been received
by the receiving side.
That is, one or two packets dis-ordered/lost and dozens of packets have
to wait, this obviously reduces the overall throughput!
This commit introduces an algorithm to overcome this by using "Gap ACK
blocks". Basically, a Gap ACK block will consist of <ack, gap> numbers
that describes the link deferdq where packets have been got by the
receiving side but with gaps, for example:
link deferdq: [1 2 3 4 10 11 13 14 15 20]
--> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0>
The Gap ACK blocks will be sent to the sending side along with the
traditional ACK or NACK message. Immediately when receiving the message
the sending side will now not only release from its transmq the packets
ack-ed by the ACK but also by the Gap ACK blocks! So, more packets can
be enqueued and transmitted.
In addition, the sending side can now do "multi-retransmissions"
according to the Gaps reported in the Gap ACK blocks.
The new algorithm as verified helps greatly improve the TIPC throughput
especially under packet loss condition.
So far, a maximum of 32 blocks is quite enough without any "Too few Gap
ACK blocks" reports with a 5.0% packet loss rate, however this number
can be increased in the furture if needed.
Also, the patch is backward compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-04 12:09:51 +08:00
|
|
|
* @acked: seqno of last packet acked by peer without any gaps before
|
|
|
|
* @gap: # of gap packets
|
|
|
|
* @ga: buffer pointer to Gap ACK blocks from peer
|
|
|
|
* @xmitq: queue for accumulating the retransmitted packets if any
|
2020-05-26 17:38:34 +08:00
|
|
|
* @retransmitted: returned boolean value if a retransmission is really issued
|
|
|
|
* @rc: returned code e.g. TIPC_LINK_DOWN_EVT if a repeated retransmit failures
|
|
|
|
* happens (- unlikely case)
|
2019-06-17 13:15:42 +08:00
|
|
|
*
|
2020-05-26 17:38:34 +08:00
|
|
|
* Return: the number of packets released from the link transmq
|
tipc: improve TIPC throughput by Gap ACK blocks
During unicast link transmission, it's observed very often that because
of one or a few lost/dis-ordered packets, the sending side will fastly
reach the send window limit and must wait for the packets to be arrived
at the receiving side or in the worst case, a retransmission must be
done first. The sending side cannot release a lot of subsequent packets
in its transmq even though all of them might have already been received
by the receiving side.
That is, one or two packets dis-ordered/lost and dozens of packets have
to wait, this obviously reduces the overall throughput!
This commit introduces an algorithm to overcome this by using "Gap ACK
blocks". Basically, a Gap ACK block will consist of <ack, gap> numbers
that describes the link deferdq where packets have been got by the
receiving side but with gaps, for example:
link deferdq: [1 2 3 4 10 11 13 14 15 20]
--> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0>
The Gap ACK blocks will be sent to the sending side along with the
traditional ACK or NACK message. Immediately when receiving the message
the sending side will now not only release from its transmq the packets
ack-ed by the ACK but also by the Gap ACK blocks! So, more packets can
be enqueued and transmitted.
In addition, the sending side can now do "multi-retransmissions"
according to the Gaps reported in the Gap ACK blocks.
The new algorithm as verified helps greatly improve the TIPC throughput
especially under packet loss condition.
So far, a maximum of 32 blocks is quite enough without any "Too few Gap
ACK blocks" reports with a 5.0% packet loss rate, however this number
can be increased in the furture if needed.
Also, the patch is backward compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-04 12:09:51 +08:00
|
|
|
*/
|
2020-05-26 17:38:34 +08:00
|
|
|
static int tipc_link_advance_transmq(struct tipc_link *l, struct tipc_link *r,
|
|
|
|
u16 acked, u16 gap,
|
2019-06-17 13:15:42 +08:00
|
|
|
struct tipc_gap_ack_blks *ga,
|
2020-05-26 17:38:34 +08:00
|
|
|
struct sk_buff_head *xmitq,
|
|
|
|
bool *retransmitted, int *rc)
|
tipc: improve TIPC throughput by Gap ACK blocks
During unicast link transmission, it's observed very often that because
of one or a few lost/dis-ordered packets, the sending side will fastly
reach the send window limit and must wait for the packets to be arrived
at the receiving side or in the worst case, a retransmission must be
done first. The sending side cannot release a lot of subsequent packets
in its transmq even though all of them might have already been received
by the receiving side.
That is, one or two packets dis-ordered/lost and dozens of packets have
to wait, this obviously reduces the overall throughput!
This commit introduces an algorithm to overcome this by using "Gap ACK
blocks". Basically, a Gap ACK block will consist of <ack, gap> numbers
that describes the link deferdq where packets have been got by the
receiving side but with gaps, for example:
link deferdq: [1 2 3 4 10 11 13 14 15 20]
--> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0>
The Gap ACK blocks will be sent to the sending side along with the
traditional ACK or NACK message. Immediately when receiving the message
the sending side will now not only release from its transmq the packets
ack-ed by the ACK but also by the Gap ACK blocks! So, more packets can
be enqueued and transmitted.
In addition, the sending side can now do "multi-retransmissions"
according to the Gaps reported in the Gap ACK blocks.
The new algorithm as verified helps greatly improve the TIPC throughput
especially under packet loss condition.
So far, a maximum of 32 blocks is quite enough without any "Too few Gap
ACK blocks" reports with a 5.0% packet loss rate, however this number
can be increased in the furture if needed.
Also, the patch is backward compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-04 12:09:51 +08:00
|
|
|
{
|
2020-05-26 17:38:34 +08:00
|
|
|
struct tipc_gap_ack_blks *last_ga = r->last_ga, *this_ga = NULL;
|
|
|
|
struct tipc_gap_ack *gacks = NULL;
|
tipc: improve TIPC throughput by Gap ACK blocks
During unicast link transmission, it's observed very often that because
of one or a few lost/dis-ordered packets, the sending side will fastly
reach the send window limit and must wait for the packets to be arrived
at the receiving side or in the worst case, a retransmission must be
done first. The sending side cannot release a lot of subsequent packets
in its transmq even though all of them might have already been received
by the receiving side.
That is, one or two packets dis-ordered/lost and dozens of packets have
to wait, this obviously reduces the overall throughput!
This commit introduces an algorithm to overcome this by using "Gap ACK
blocks". Basically, a Gap ACK block will consist of <ack, gap> numbers
that describes the link deferdq where packets have been got by the
receiving side but with gaps, for example:
link deferdq: [1 2 3 4 10 11 13 14 15 20]
--> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0>
The Gap ACK blocks will be sent to the sending side along with the
traditional ACK or NACK message. Immediately when receiving the message
the sending side will now not only release from its transmq the packets
ack-ed by the ACK but also by the Gap ACK blocks! So, more packets can
be enqueued and transmitted.
In addition, the sending side can now do "multi-retransmissions"
according to the Gaps reported in the Gap ACK blocks.
The new algorithm as verified helps greatly improve the TIPC throughput
especially under packet loss condition.
So far, a maximum of 32 blocks is quite enough without any "Too few Gap
ACK blocks" reports with a 5.0% packet loss rate, however this number
can be increased in the furture if needed.
Also, the patch is backward compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-04 12:09:51 +08:00
|
|
|
struct sk_buff *skb, *_skb, *tmp;
|
|
|
|
struct tipc_msg *hdr;
|
2020-05-26 17:38:34 +08:00
|
|
|
u32 qlen = skb_queue_len(&l->transmq);
|
|
|
|
u16 nacked = acked, ngap = gap, gack_cnt = 0;
|
tipc: improve TIPC throughput by Gap ACK blocks
During unicast link transmission, it's observed very often that because
of one or a few lost/dis-ordered packets, the sending side will fastly
reach the send window limit and must wait for the packets to be arrived
at the receiving side or in the worst case, a retransmission must be
done first. The sending side cannot release a lot of subsequent packets
in its transmq even though all of them might have already been received
by the receiving side.
That is, one or two packets dis-ordered/lost and dozens of packets have
to wait, this obviously reduces the overall throughput!
This commit introduces an algorithm to overcome this by using "Gap ACK
blocks". Basically, a Gap ACK block will consist of <ack, gap> numbers
that describes the link deferdq where packets have been got by the
receiving side but with gaps, for example:
link deferdq: [1 2 3 4 10 11 13 14 15 20]
--> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0>
The Gap ACK blocks will be sent to the sending side along with the
traditional ACK or NACK message. Immediately when receiving the message
the sending side will now not only release from its transmq the packets
ack-ed by the ACK but also by the Gap ACK blocks! So, more packets can
be enqueued and transmitted.
In addition, the sending side can now do "multi-retransmissions"
according to the Gaps reported in the Gap ACK blocks.
The new algorithm as verified helps greatly improve the TIPC throughput
especially under packet loss condition.
So far, a maximum of 32 blocks is quite enough without any "Too few Gap
ACK blocks" reports with a 5.0% packet loss rate, however this number
can be increased in the furture if needed.
Also, the patch is backward compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-04 12:09:51 +08:00
|
|
|
u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1;
|
|
|
|
u16 ack = l->rcv_nxt - 1;
|
2019-06-17 13:15:42 +08:00
|
|
|
u16 seqno, n = 0;
|
2020-05-26 17:38:34 +08:00
|
|
|
u16 end = r->acked, start = end, offset = r->last_gap;
|
|
|
|
u16 si = (last_ga) ? last_ga->start_index : 0;
|
|
|
|
bool is_uc = !link_is_bc_sndlink(l);
|
|
|
|
bool bc_has_acked = false;
|
|
|
|
|
2020-05-26 17:38:35 +08:00
|
|
|
trace_tipc_link_retrans(r, acked + 1, acked + gap, &l->transmq);
|
|
|
|
|
2020-05-26 17:38:34 +08:00
|
|
|
/* Determine Gap ACK blocks if any for the particular link */
|
|
|
|
if (ga && is_uc) {
|
|
|
|
/* Get the Gap ACKs, uc part */
|
|
|
|
gack_cnt = ga->ugack_cnt;
|
|
|
|
gacks = &ga->gacks[ga->bgack_cnt];
|
|
|
|
} else if (ga) {
|
|
|
|
/* Copy the Gap ACKs, bc part, for later renewal if needed */
|
2020-06-18 21:35:00 +08:00
|
|
|
this_ga = kmemdup(ga, struct_size(ga, gacks, ga->bgack_cnt),
|
2020-05-26 17:38:34 +08:00
|
|
|
GFP_ATOMIC);
|
|
|
|
if (likely(this_ga)) {
|
|
|
|
this_ga->start_index = 0;
|
|
|
|
/* Start with the bc Gap ACKs */
|
|
|
|
gack_cnt = this_ga->bgack_cnt;
|
|
|
|
gacks = &this_ga->gacks[0];
|
|
|
|
} else {
|
|
|
|
/* Hmm, we can get in trouble..., simply ignore it */
|
|
|
|
pr_warn_ratelimited("Ignoring bc Gap ACKs, no memory\n");
|
|
|
|
}
|
|
|
|
}
|
2019-06-17 13:15:42 +08:00
|
|
|
|
2020-05-26 17:38:34 +08:00
|
|
|
/* Advance the link transmq */
|
tipc: improve TIPC throughput by Gap ACK blocks
During unicast link transmission, it's observed very often that because
of one or a few lost/dis-ordered packets, the sending side will fastly
reach the send window limit and must wait for the packets to be arrived
at the receiving side or in the worst case, a retransmission must be
done first. The sending side cannot release a lot of subsequent packets
in its transmq even though all of them might have already been received
by the receiving side.
That is, one or two packets dis-ordered/lost and dozens of packets have
to wait, this obviously reduces the overall throughput!
This commit introduces an algorithm to overcome this by using "Gap ACK
blocks". Basically, a Gap ACK block will consist of <ack, gap> numbers
that describes the link deferdq where packets have been got by the
receiving side but with gaps, for example:
link deferdq: [1 2 3 4 10 11 13 14 15 20]
--> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0>
The Gap ACK blocks will be sent to the sending side along with the
traditional ACK or NACK message. Immediately when receiving the message
the sending side will now not only release from its transmq the packets
ack-ed by the ACK but also by the Gap ACK blocks! So, more packets can
be enqueued and transmitted.
In addition, the sending side can now do "multi-retransmissions"
according to the Gaps reported in the Gap ACK blocks.
The new algorithm as verified helps greatly improve the TIPC throughput
especially under packet loss condition.
So far, a maximum of 32 blocks is quite enough without any "Too few Gap
ACK blocks" reports with a 5.0% packet loss rate, however this number
can be increased in the furture if needed.
Also, the patch is backward compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-04 12:09:51 +08:00
|
|
|
skb_queue_walk_safe(&l->transmq, skb, tmp) {
|
|
|
|
seqno = buf_seqno(skb);
|
|
|
|
|
|
|
|
next_gap_ack:
|
2020-05-26 17:38:34 +08:00
|
|
|
if (less_eq(seqno, nacked)) {
|
|
|
|
if (is_uc)
|
|
|
|
goto release;
|
|
|
|
/* Skip packets peer has already acked */
|
|
|
|
if (!more(seqno, r->acked))
|
|
|
|
continue;
|
|
|
|
/* Get the next of last Gap ACK blocks */
|
|
|
|
while (more(seqno, end)) {
|
|
|
|
if (!last_ga || si >= last_ga->bgack_cnt)
|
|
|
|
break;
|
|
|
|
start = end + offset + 1;
|
|
|
|
end = ntohs(last_ga->gacks[si].ack);
|
|
|
|
offset = ntohs(last_ga->gacks[si].gap);
|
|
|
|
si++;
|
|
|
|
WARN_ONCE(more(start, end) ||
|
|
|
|
(!offset &&
|
|
|
|
si < last_ga->bgack_cnt) ||
|
|
|
|
si > MAX_GAP_ACK_BLKS,
|
|
|
|
"Corrupted Gap ACK: %d %d %d %d %d\n",
|
|
|
|
start, end, offset, si,
|
|
|
|
last_ga->bgack_cnt);
|
|
|
|
}
|
|
|
|
/* Check against the last Gap ACK block */
|
minmax: add in_range() macro
Patch series "New page table range API", v6.
This patchset changes the API used by the MM to set up page table entries.
The four APIs are:
set_ptes(mm, addr, ptep, pte, nr)
update_mmu_cache_range(vma, addr, ptep, nr)
flush_dcache_folio(folio)
flush_icache_pages(vma, page, nr)
flush_dcache_folio() isn't technically new, but no architecture
implemented it, so I've done that for them. The old APIs remain around
but are mostly implemented by calling the new interfaces.
The new APIs are based around setting up N page table entries at once.
The N entries belong to the same PMD, the same folio and the same VMA, so
ptep++ is a legitimate operation, and locking is taken care of for you.
Some architectures can do a better job of it than just a loop, but I have
hesitated to make too deep a change to architectures I don't understand
well.
One thing I have changed in every architecture is that PG_arch_1 is now a
per-folio bit instead of a per-page bit when used for dcache clean/dirty
tracking. This was something that would have to happen eventually, and it
makes sense to do it now rather than iterate over every page involved in a
cache flush and figure out if it needs to happen.
The point of all this is better performance, and Fengwei Yin has measured
improvement on x86. I suspect you'll see improvement on your architecture
too. Try the new will-it-scale test mentioned here:
https://lore.kernel.org/linux-mm/20230206140639.538867-5-fengwei.yin@intel.com/
You'll need to run it on an XFS filesystem and have
CONFIG_TRANSPARENT_HUGEPAGE set.
This patchset is the basis for much of the anonymous large folio work
being done by Ryan, so it's received quite a lot of testing over the last
few months.
This patch (of 38):
Determine if a value lies within a range more efficiently (subtraction +
comparison vs two comparisons and an AND). It also has useful (under some
circumstances) behaviour if the range exceeds the maximum value of the
type. Convert all the conflicting definitions of in_range() within the
kernel; some can use the generic definition while others need their own
definition.
Link: https://lkml.kernel.org/r/20230802151406.3735276-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20230802151406.3735276-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-08-02 23:13:29 +08:00
|
|
|
if (tipc_in_range(seqno, start, end))
|
2020-05-26 17:38:34 +08:00
|
|
|
continue;
|
|
|
|
/* Update/release the packet peer is acking */
|
|
|
|
bc_has_acked = true;
|
|
|
|
if (--TIPC_SKB_CB(skb)->ackers)
|
|
|
|
continue;
|
|
|
|
release:
|
tipc: improve TIPC throughput by Gap ACK blocks
During unicast link transmission, it's observed very often that because
of one or a few lost/dis-ordered packets, the sending side will fastly
reach the send window limit and must wait for the packets to be arrived
at the receiving side or in the worst case, a retransmission must be
done first. The sending side cannot release a lot of subsequent packets
in its transmq even though all of them might have already been received
by the receiving side.
That is, one or two packets dis-ordered/lost and dozens of packets have
to wait, this obviously reduces the overall throughput!
This commit introduces an algorithm to overcome this by using "Gap ACK
blocks". Basically, a Gap ACK block will consist of <ack, gap> numbers
that describes the link deferdq where packets have been got by the
receiving side but with gaps, for example:
link deferdq: [1 2 3 4 10 11 13 14 15 20]
--> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0>
The Gap ACK blocks will be sent to the sending side along with the
traditional ACK or NACK message. Immediately when receiving the message
the sending side will now not only release from its transmq the packets
ack-ed by the ACK but also by the Gap ACK blocks! So, more packets can
be enqueued and transmitted.
In addition, the sending side can now do "multi-retransmissions"
according to the Gaps reported in the Gap ACK blocks.
The new algorithm as verified helps greatly improve the TIPC throughput
especially under packet loss condition.
So far, a maximum of 32 blocks is quite enough without any "Too few Gap
ACK blocks" reports with a 5.0% packet loss rate, however this number
can be increased in the furture if needed.
Also, the patch is backward compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-04 12:09:51 +08:00
|
|
|
/* release skb */
|
|
|
|
__skb_unlink(skb, &l->transmq);
|
|
|
|
kfree_skb(skb);
|
2020-05-26 17:38:34 +08:00
|
|
|
} else if (less_eq(seqno, nacked + ngap)) {
|
|
|
|
/* First gap: check if repeated retrans failures? */
|
|
|
|
if (unlikely(seqno == acked + 1 &&
|
|
|
|
link_retransmit_failure(l, r, rc))) {
|
|
|
|
/* Ignore this bc Gap ACKs if any */
|
|
|
|
kfree(this_ga);
|
|
|
|
this_ga = NULL;
|
|
|
|
break;
|
|
|
|
}
|
2019-08-15 11:24:08 +08:00
|
|
|
/* retransmit skb if unrestricted*/
|
tipc: reduce duplicate packets for unicast traffic
For unicast transmission, the current NACK sending althorithm is over-
active that forces the sending side to retransmit a packet that is not
really lost but just arrived at the receiving side with some delay, or
even retransmit same packets that have already been retransmitted
before. As a result, many duplicates are observed also under normal
condition, ie. without packet loss.
One example case is: node1 transmits 1 2 3 4 10 5 6 7 8 9, when node2
receives packet #10, it puts into the deferdq. When the packet #5 comes
it sends NACK with gap [6 - 9]. However, shortly after that, when
packet #6 arrives, it pulls out packet #10 from the deferfq, but it is
still out of order, so it makes another NACK with gap [7 - 9] and so on
... Finally, node1 has to retransmit the packets 5 6 7 8 9 a number of
times, but in fact all the packets are not lost at all, so duplicates!
This commit reduces duplicates by changing the condition to send NACK,
also restricting the retransmissions on individual packets via a timer
of about 1ms. However, it also needs to say that too tricky condition
for NACKs or too long timeout value for retransmissions will result in
performance reducing! The criterias in this commit are found to be
effective for both the requirements to reduce duplicates but not affect
performance.
The tipc_link_rcv() is also improved to only dequeue skb from the link
deferdq if it is expected (ie. its seqno <= rcv_nxt).
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-04 12:09:52 +08:00
|
|
|
if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr))
|
|
|
|
continue;
|
2020-07-09 05:06:44 +08:00
|
|
|
tipc_link_set_skb_retransmit_time(skb, l);
|
2019-11-08 13:05:11 +08:00
|
|
|
_skb = pskb_copy(skb, GFP_ATOMIC);
|
tipc: improve TIPC throughput by Gap ACK blocks
During unicast link transmission, it's observed very often that because
of one or a few lost/dis-ordered packets, the sending side will fastly
reach the send window limit and must wait for the packets to be arrived
at the receiving side or in the worst case, a retransmission must be
done first. The sending side cannot release a lot of subsequent packets
in its transmq even though all of them might have already been received
by the receiving side.
That is, one or two packets dis-ordered/lost and dozens of packets have
to wait, this obviously reduces the overall throughput!
This commit introduces an algorithm to overcome this by using "Gap ACK
blocks". Basically, a Gap ACK block will consist of <ack, gap> numbers
that describes the link deferdq where packets have been got by the
receiving side but with gaps, for example:
link deferdq: [1 2 3 4 10 11 13 14 15 20]
--> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0>
The Gap ACK blocks will be sent to the sending side along with the
traditional ACK or NACK message. Immediately when receiving the message
the sending side will now not only release from its transmq the packets
ack-ed by the ACK but also by the Gap ACK blocks! So, more packets can
be enqueued and transmitted.
In addition, the sending side can now do "multi-retransmissions"
according to the Gaps reported in the Gap ACK blocks.
The new algorithm as verified helps greatly improve the TIPC throughput
especially under packet loss condition.
So far, a maximum of 32 blocks is quite enough without any "Too few Gap
ACK blocks" reports with a 5.0% packet loss rate, however this number
can be increased in the furture if needed.
Also, the patch is backward compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-04 12:09:51 +08:00
|
|
|
if (!_skb)
|
|
|
|
continue;
|
|
|
|
hdr = buf_msg(_skb);
|
|
|
|
msg_set_ack(hdr, ack);
|
|
|
|
msg_set_bcast_ack(hdr, bc_ack);
|
|
|
|
_skb->priority = TC_PRIO_CONTROL;
|
|
|
|
__skb_queue_tail(xmitq, _skb);
|
|
|
|
l->stats.retransmitted++;
|
2020-05-26 17:38:37 +08:00
|
|
|
if (!is_uc)
|
|
|
|
r->stats.retransmitted++;
|
2020-05-26 17:38:34 +08:00
|
|
|
*retransmitted = true;
|
2019-08-15 11:24:08 +08:00
|
|
|
/* Increase actual retrans counter & mark first time */
|
|
|
|
if (!TIPC_SKB_CB(skb)->retr_cnt++)
|
|
|
|
TIPC_SKB_CB(skb)->retr_stamp = jiffies;
|
tipc: improve TIPC throughput by Gap ACK blocks
During unicast link transmission, it's observed very often that because
of one or a few lost/dis-ordered packets, the sending side will fastly
reach the send window limit and must wait for the packets to be arrived
at the receiving side or in the worst case, a retransmission must be
done first. The sending side cannot release a lot of subsequent packets
in its transmq even though all of them might have already been received
by the receiving side.
That is, one or two packets dis-ordered/lost and dozens of packets have
to wait, this obviously reduces the overall throughput!
This commit introduces an algorithm to overcome this by using "Gap ACK
blocks". Basically, a Gap ACK block will consist of <ack, gap> numbers
that describes the link deferdq where packets have been got by the
receiving side but with gaps, for example:
link deferdq: [1 2 3 4 10 11 13 14 15 20]
--> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0>
The Gap ACK blocks will be sent to the sending side along with the
traditional ACK or NACK message. Immediately when receiving the message
the sending side will now not only release from its transmq the packets
ack-ed by the ACK but also by the Gap ACK blocks! So, more packets can
be enqueued and transmitted.
In addition, the sending side can now do "multi-retransmissions"
according to the Gaps reported in the Gap ACK blocks.
The new algorithm as verified helps greatly improve the TIPC throughput
especially under packet loss condition.
So far, a maximum of 32 blocks is quite enough without any "Too few Gap
ACK blocks" reports with a 5.0% packet loss rate, however this number
can be increased in the furture if needed.
Also, the patch is backward compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-04 12:09:51 +08:00
|
|
|
} else {
|
|
|
|
/* retry with Gap ACK blocks if any */
|
2020-05-26 17:38:34 +08:00
|
|
|
if (n >= gack_cnt)
|
tipc: improve TIPC throughput by Gap ACK blocks
During unicast link transmission, it's observed very often that because
of one or a few lost/dis-ordered packets, the sending side will fastly
reach the send window limit and must wait for the packets to be arrived
at the receiving side or in the worst case, a retransmission must be
done first. The sending side cannot release a lot of subsequent packets
in its transmq even though all of them might have already been received
by the receiving side.
That is, one or two packets dis-ordered/lost and dozens of packets have
to wait, this obviously reduces the overall throughput!
This commit introduces an algorithm to overcome this by using "Gap ACK
blocks". Basically, a Gap ACK block will consist of <ack, gap> numbers
that describes the link deferdq where packets have been got by the
receiving side but with gaps, for example:
link deferdq: [1 2 3 4 10 11 13 14 15 20]
--> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0>
The Gap ACK blocks will be sent to the sending side along with the
traditional ACK or NACK message. Immediately when receiving the message
the sending side will now not only release from its transmq the packets
ack-ed by the ACK but also by the Gap ACK blocks! So, more packets can
be enqueued and transmitted.
In addition, the sending side can now do "multi-retransmissions"
according to the Gaps reported in the Gap ACK blocks.
The new algorithm as verified helps greatly improve the TIPC throughput
especially under packet loss condition.
So far, a maximum of 32 blocks is quite enough without any "Too few Gap
ACK blocks" reports with a 5.0% packet loss rate, however this number
can be increased in the furture if needed.
Also, the patch is backward compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-04 12:09:51 +08:00
|
|
|
break;
|
2020-05-26 17:38:34 +08:00
|
|
|
nacked = ntohs(gacks[n].ack);
|
|
|
|
ngap = ntohs(gacks[n].gap);
|
tipc: improve TIPC throughput by Gap ACK blocks
During unicast link transmission, it's observed very often that because
of one or a few lost/dis-ordered packets, the sending side will fastly
reach the send window limit and must wait for the packets to be arrived
at the receiving side or in the worst case, a retransmission must be
done first. The sending side cannot release a lot of subsequent packets
in its transmq even though all of them might have already been received
by the receiving side.
That is, one or two packets dis-ordered/lost and dozens of packets have
to wait, this obviously reduces the overall throughput!
This commit introduces an algorithm to overcome this by using "Gap ACK
blocks". Basically, a Gap ACK block will consist of <ack, gap> numbers
that describes the link deferdq where packets have been got by the
receiving side but with gaps, for example:
link deferdq: [1 2 3 4 10 11 13 14 15 20]
--> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0>
The Gap ACK blocks will be sent to the sending side along with the
traditional ACK or NACK message. Immediately when receiving the message
the sending side will now not only release from its transmq the packets
ack-ed by the ACK but also by the Gap ACK blocks! So, more packets can
be enqueued and transmitted.
In addition, the sending side can now do "multi-retransmissions"
according to the Gaps reported in the Gap ACK blocks.
The new algorithm as verified helps greatly improve the TIPC throughput
especially under packet loss condition.
So far, a maximum of 32 blocks is quite enough without any "Too few Gap
ACK blocks" reports with a 5.0% packet loss rate, however this number
can be increased in the furture if needed.
Also, the patch is backward compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-04 12:09:51 +08:00
|
|
|
n++;
|
|
|
|
goto next_gap_ack;
|
|
|
|
}
|
|
|
|
}
|
2020-05-26 17:38:34 +08:00
|
|
|
|
|
|
|
/* Renew last Gap ACK blocks for bc if needed */
|
|
|
|
if (bc_has_acked) {
|
|
|
|
if (this_ga) {
|
|
|
|
kfree(last_ga);
|
|
|
|
r->last_ga = this_ga;
|
|
|
|
r->last_gap = gap;
|
|
|
|
} else if (last_ga) {
|
|
|
|
if (less(acked, start)) {
|
|
|
|
si--;
|
|
|
|
offset = start - acked - 1;
|
|
|
|
} else if (less(acked, end)) {
|
|
|
|
acked = end;
|
|
|
|
}
|
|
|
|
if (si < last_ga->bgack_cnt) {
|
|
|
|
last_ga->start_index = si;
|
|
|
|
r->last_gap = offset;
|
|
|
|
} else {
|
|
|
|
kfree(last_ga);
|
|
|
|
r->last_ga = NULL;
|
|
|
|
r->last_gap = 0;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
r->last_gap = 0;
|
|
|
|
}
|
|
|
|
r->acked = acked;
|
|
|
|
} else {
|
|
|
|
kfree(this_ga);
|
|
|
|
}
|
|
|
|
|
|
|
|
return qlen - skb_queue_len(&l->transmq);
|
tipc: improve TIPC throughput by Gap ACK blocks
During unicast link transmission, it's observed very often that because
of one or a few lost/dis-ordered packets, the sending side will fastly
reach the send window limit and must wait for the packets to be arrived
at the receiving side or in the worst case, a retransmission must be
done first. The sending side cannot release a lot of subsequent packets
in its transmq even though all of them might have already been received
by the receiving side.
That is, one or two packets dis-ordered/lost and dozens of packets have
to wait, this obviously reduces the overall throughput!
This commit introduces an algorithm to overcome this by using "Gap ACK
blocks". Basically, a Gap ACK block will consist of <ack, gap> numbers
that describes the link deferdq where packets have been got by the
receiving side but with gaps, for example:
link deferdq: [1 2 3 4 10 11 13 14 15 20]
--> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0>
The Gap ACK blocks will be sent to the sending side along with the
traditional ACK or NACK message. Immediately when receiving the message
the sending side will now not only release from its transmq the packets
ack-ed by the ACK but also by the Gap ACK blocks! So, more packets can
be enqueued and transmitted.
In addition, the sending side can now do "multi-retransmissions"
according to the Gaps reported in the Gap ACK blocks.
The new algorithm as verified helps greatly improve the TIPC throughput
especially under packet loss condition.
So far, a maximum of 32 blocks is quite enough without any "Too few Gap
ACK blocks" reports with a 5.0% packet loss rate, however this number
can be increased in the furture if needed.
Also, the patch is backward compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-04 12:09:51 +08:00
|
|
|
}
|
|
|
|
|
2016-04-16 01:33:07 +08:00
|
|
|
/* tipc_link_build_state_msg: prepare link state message for transmission
|
2015-10-22 20:51:41 +08:00
|
|
|
*
|
|
|
|
* Note that sending of broadcast ack is coordinated among nodes, to reduce
|
|
|
|
* risk of ack storms towards the sender
|
2015-10-16 02:52:41 +08:00
|
|
|
*/
|
2016-04-16 01:33:07 +08:00
|
|
|
int tipc_link_build_state_msg(struct tipc_link *l, struct sk_buff_head *xmitq)
|
2015-10-16 02:52:41 +08:00
|
|
|
{
|
2015-10-22 20:51:41 +08:00
|
|
|
if (!l)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/* Broadcast ACK must be sent via a unicast link => defer to caller */
|
|
|
|
if (link_is_bc_rcvlink(l)) {
|
2016-03-04 03:23:21 +08:00
|
|
|
if (((l->rcv_nxt ^ tipc_own_addr(l->net)) & 0xf) != 0xf)
|
2015-10-22 20:51:41 +08:00
|
|
|
return 0;
|
|
|
|
l->rcv_unacked = 0;
|
2016-09-02 01:52:49 +08:00
|
|
|
|
|
|
|
/* Use snd_nxt to store peer's snd_nxt in broadcast rcv link */
|
|
|
|
l->snd_nxt = l->rcv_nxt;
|
|
|
|
return TIPC_LINK_SND_STATE;
|
2015-10-22 20:51:41 +08:00
|
|
|
}
|
|
|
|
/* Unicast ACK */
|
2015-10-16 02:52:41 +08:00
|
|
|
l->rcv_unacked = 0;
|
|
|
|
l->stats.sent_acks++;
|
tipc: improve link resiliency when rps is activated
Currently, the TIPC RPS dissector is based only on the incoming packets'
source node address, hence steering all traffic from a node to the same
core. We have seen that this makes the links vulnerable to starvation
and unnecessary resets when we turn down the link tolerance to very low
values.
To reduce the risk of this happening, we exempt probe and probe replies
packets from the convergence to one core per source node. Instead, we do
the opposite, - we try to diverge those packets across as many cores as
possible, by randomizing the flow selector key.
To make such packets identifiable to the dissector, we add a new
'is_keepalive' bit to word 0 of the LINK_PROTOCOL header. This bit is
set both for PROBE and PROBE_REPLY messages, and only for those.
It should be noted that these packets are not part of any flow anyway,
and only constitute a minuscule fraction of all packets sent across a
link. Hence, there is no risk that this will affect overall performance.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-11-08 16:59:26 +08:00
|
|
|
tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, 0, xmitq);
|
2015-10-22 20:51:41 +08:00
|
|
|
return 0;
|
2015-10-16 02:52:41 +08:00
|
|
|
}
|
|
|
|
|
2015-10-16 02:52:45 +08:00
|
|
|
/* tipc_link_build_reset_msg: prepare link RESET or ACTIVATE message
|
|
|
|
*/
|
|
|
|
void tipc_link_build_reset_msg(struct tipc_link *l, struct sk_buff_head *xmitq)
|
|
|
|
{
|
|
|
|
int mtyp = RESET_MSG;
|
tipc: guarantee peer bearer id exchange after reboot
When a link endpoint is going down locally, e.g., because its interface
is being stopped, it will spontaneously send out a RESET message to
its peer, informing it about this fact. This saves the peer from
detecting the failure via probing, and hence gives both speedier and
less resource consuming failure detection on the peer side.
According to the link FSM, a receiver of a RESET message, ignoring the
reason for it, must now consider the sender ready to come back up, and
starts periodically sending out ACTIVATE messages to the peer in order
to re-establish the link. Also, according to the FSM, the receiver of
an ACTIVATE message can now go directly to state ESTABLISHED and start
sending regular traffic packets. This is a well-proven and robust FSM.
However, in the case of a reboot, there is a small possibilty that link
endpoint on the rebooted node may have been re-created with a new bearer
identity between the moment it sent its (pre-boot) RESET and the moment
it receives the ACTIVATE from the peer. The new bearer identity cannot
be known by the peer according to this scenario, since traffic headers
don't convey such information. This is a problem, because both endpoints
need to know the correct value of the peer's bearer id at any moment in
time in order to be able to produce correct link events for their users.
The only way to guarantee this is to enforce a full setup message
exchange (RESET + ACTIVATE) even after the reboot, since those messages
carry the bearer idientity in their header.
In this commit we do this by introducing and setting a "stopping" bit in
the header of the spontaneously generated RESET messages, informing the
peer that the sender will not be immediately ready to re-establish the
link. A receiver seeing this bit must act as if this were a locally
detected connectivity failure, and hence has to go through a full two-
way setup message exchange before any link can be re-established.
Although never reported, this problem seems to have always been around.
This protocol addition is fully backwards compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-16 01:33:03 +08:00
|
|
|
struct sk_buff *skb;
|
2015-10-16 02:52:45 +08:00
|
|
|
|
|
|
|
if (l->state == LINK_ESTABLISHING)
|
|
|
|
mtyp = ACTIVATE_MSG;
|
|
|
|
|
tipc: improve link resiliency when rps is activated
Currently, the TIPC RPS dissector is based only on the incoming packets'
source node address, hence steering all traffic from a node to the same
core. We have seen that this makes the links vulnerable to starvation
and unnecessary resets when we turn down the link tolerance to very low
values.
To reduce the risk of this happening, we exempt probe and probe replies
packets from the convergence to one core per source node. Instead, we do
the opposite, - we try to diverge those packets across as many cores as
possible, by randomizing the flow selector key.
To make such packets identifiable to the dissector, we add a new
'is_keepalive' bit to word 0 of the LINK_PROTOCOL header. This bit is
set both for PROBE and PROBE_REPLY messages, and only for those.
It should be noted that these packets are not part of any flow anyway,
and only constitute a minuscule fraction of all packets sent across a
link. Hence, there is no risk that this will affect overall performance.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-11-08 16:59:26 +08:00
|
|
|
tipc_link_build_proto_msg(l, mtyp, 0, 0, 0, 0, 0, xmitq);
|
tipc: guarantee peer bearer id exchange after reboot
When a link endpoint is going down locally, e.g., because its interface
is being stopped, it will spontaneously send out a RESET message to
its peer, informing it about this fact. This saves the peer from
detecting the failure via probing, and hence gives both speedier and
less resource consuming failure detection on the peer side.
According to the link FSM, a receiver of a RESET message, ignoring the
reason for it, must now consider the sender ready to come back up, and
starts periodically sending out ACTIVATE messages to the peer in order
to re-establish the link. Also, according to the FSM, the receiver of
an ACTIVATE message can now go directly to state ESTABLISHED and start
sending regular traffic packets. This is a well-proven and robust FSM.
However, in the case of a reboot, there is a small possibilty that link
endpoint on the rebooted node may have been re-created with a new bearer
identity between the moment it sent its (pre-boot) RESET and the moment
it receives the ACTIVATE from the peer. The new bearer identity cannot
be known by the peer according to this scenario, since traffic headers
don't convey such information. This is a problem, because both endpoints
need to know the correct value of the peer's bearer id at any moment in
time in order to be able to produce correct link events for their users.
The only way to guarantee this is to enforce a full setup message
exchange (RESET + ACTIVATE) even after the reboot, since those messages
carry the bearer idientity in their header.
In this commit we do this by introducing and setting a "stopping" bit in
the header of the spontaneously generated RESET messages, informing the
peer that the sender will not be immediately ready to re-establish the
link. A receiver seeing this bit must act as if this were a locally
detected connectivity failure, and hence has to go through a full two-
way setup message exchange before any link can be re-established.
Although never reported, this problem seems to have always been around.
This protocol addition is fully backwards compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-16 01:33:03 +08:00
|
|
|
|
|
|
|
/* Inform peer that this endpoint is going down if applicable */
|
|
|
|
skb = skb_peek_tail(xmitq);
|
|
|
|
if (skb && (l->state == LINK_RESET))
|
|
|
|
msg_set_peer_stopping(buf_msg(skb), 1);
|
2015-10-16 02:52:45 +08:00
|
|
|
}
|
|
|
|
|
2015-10-16 02:52:41 +08:00
|
|
|
/* tipc_link_build_nack_msg: prepare link nack message for transmission
|
2016-09-02 01:52:51 +08:00
|
|
|
* Note that sending of broadcast NACK is coordinated among nodes, to
|
|
|
|
* reduce the risk of NACK storms towards the sender
|
2015-10-16 02:52:41 +08:00
|
|
|
*/
|
2016-09-02 01:52:51 +08:00
|
|
|
static int tipc_link_build_nack_msg(struct tipc_link *l,
|
|
|
|
struct sk_buff_head *xmitq)
|
2015-10-16 02:52:41 +08:00
|
|
|
{
|
|
|
|
u32 def_cnt = ++l->stats.deferred_recv;
|
tipc: eliminate gap indicator from ACK messages
When we increase the link send window we sometimes observe the
following scenario:
1) A packet #N arrives out of order far ahead of a sequence of older
packets which are still under way. The packet is added to the
deferred queue.
2) The missing packets arrive in sequence, and for each 16th of them
an ACK is sent back to the receiver, as it should be.
3) When building those ACK messages, it is checked if there is a gap
between the link's 'rcv_nxt' and the first packet in the deferred
queue. This is always the case until packet number #N-1 arrives, and
a 'gap' indicator is added, effectively turning them into NACK
messages.
4) When those NACKs arrive at the sender, all the requested
retransmissions are done, since it is a first-time request.
This sometimes leads to a huge amount of redundant retransmissions,
causing a drop in max throughput. This problem gets worse when we
in a later commit introduce variable window congestion control,
since it drops the link back to 'fast recovery' much more often
than necessary.
We now fix this by not sending any 'gap' indicator in regular ACK
messages. We already have a mechanism for sending explicit NACKs
in place, and this is sufficient to keep up the packet flow.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 07:52:44 +08:00
|
|
|
struct sk_buff_head *dfq = &l->deferdq;
|
|
|
|
u32 defq_len = skb_queue_len(dfq);
|
2016-09-02 01:52:51 +08:00
|
|
|
int match1, match2;
|
2015-10-16 02:52:41 +08:00
|
|
|
|
2016-09-02 01:52:51 +08:00
|
|
|
if (link_is_bc_rcvlink(l)) {
|
|
|
|
match1 = def_cnt & 0xf;
|
|
|
|
match2 = tipc_own_addr(l->net) & 0xf;
|
|
|
|
if (match1 == match2)
|
|
|
|
return TIPC_LINK_SND_STATE;
|
|
|
|
return 0;
|
|
|
|
}
|
2015-10-22 20:51:41 +08:00
|
|
|
|
tipc: eliminate gap indicator from ACK messages
When we increase the link send window we sometimes observe the
following scenario:
1) A packet #N arrives out of order far ahead of a sequence of older
packets which are still under way. The packet is added to the
deferred queue.
2) The missing packets arrive in sequence, and for each 16th of them
an ACK is sent back to the receiver, as it should be.
3) When building those ACK messages, it is checked if there is a gap
between the link's 'rcv_nxt' and the first packet in the deferred
queue. This is always the case until packet number #N-1 arrives, and
a 'gap' indicator is added, effectively turning them into NACK
messages.
4) When those NACKs arrive at the sender, all the requested
retransmissions are done, since it is a first-time request.
This sometimes leads to a huge amount of redundant retransmissions,
causing a drop in max throughput. This problem gets worse when we
in a later commit introduce variable window congestion control,
since it drops the link back to 'fast recovery' much more often
than necessary.
We now fix this by not sending any 'gap' indicator in regular ACK
messages. We already have a mechanism for sending explicit NACKs
in place, and this is sufficient to keep up the packet flow.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 07:52:44 +08:00
|
|
|
if (defq_len >= 3 && !((defq_len - 3) % 16)) {
|
|
|
|
u16 rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt;
|
|
|
|
|
|
|
|
tipc_link_build_proto_msg(l, STATE_MSG, 0, 0,
|
|
|
|
rcvgap, 0, 0, xmitq);
|
|
|
|
}
|
2016-09-02 01:52:51 +08:00
|
|
|
return 0;
|
2015-10-16 02:52:41 +08:00
|
|
|
}
|
|
|
|
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
/* tipc_link_rcv - process TIPC packets/messages arriving from off-node
|
2015-10-16 02:52:41 +08:00
|
|
|
* @l: the link that should handle the message
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
* @skb: TIPC packet
|
|
|
|
* @xmitq: queue to place packets to be sent after this call
|
|
|
|
*/
|
|
|
|
int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb,
|
|
|
|
struct sk_buff_head *xmitq)
|
|
|
|
{
|
2015-10-16 02:52:41 +08:00
|
|
|
struct sk_buff_head *defq = &l->deferdq;
|
tipc: reduce duplicate packets for unicast traffic
For unicast transmission, the current NACK sending althorithm is over-
active that forces the sending side to retransmit a packet that is not
really lost but just arrived at the receiving side with some delay, or
even retransmit same packets that have already been retransmitted
before. As a result, many duplicates are observed also under normal
condition, ie. without packet loss.
One example case is: node1 transmits 1 2 3 4 10 5 6 7 8 9, when node2
receives packet #10, it puts into the deferdq. When the packet #5 comes
it sends NACK with gap [6 - 9]. However, shortly after that, when
packet #6 arrives, it pulls out packet #10 from the deferfq, but it is
still out of order, so it makes another NACK with gap [7 - 9] and so on
... Finally, node1 has to retransmit the packets 5 6 7 8 9 a number of
times, but in fact all the packets are not lost at all, so duplicates!
This commit reduces duplicates by changing the condition to send NACK,
also restricting the retransmissions on individual packets via a timer
of about 1ms. However, it also needs to say that too tricky condition
for NACKs or too long timeout value for retransmissions will result in
performance reducing! The criterias in this commit are found to be
effective for both the requirements to reduce duplicates but not affect
performance.
The tipc_link_rcv() is also improved to only dequeue skb from the link
deferdq if it is expected (ie. its seqno <= rcv_nxt).
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-04 12:09:52 +08:00
|
|
|
struct tipc_msg *hdr = buf_msg(skb);
|
tipc: improve sequence number checking
The sequence number of an incoming packet is currently only checked
for less than, equality to, or bigger than the next expected number,
meaning that the receive window in practice becomes one half sequence
number cycle, or U16_MAX/2. This does not make sense, and may not even
be safe if there are extreme delays in the network. Any packet sent by
the peer during the ongoing cycle must belong inside his current send
window, or should otherwise be dropped if possible.
Since a link endpoint cannot know its peer's current send window, it
has to base this sanity check on a worst-case assumption, i.e., that
the peer is using a maximum sized window of 8191 packets. Using this
assumption, we now add a check that the sequence number is not bigger
than next_expected + TIPC_MAX_LINK_WIN. We also re-order the checks
done, so that the receive window test is performed before the gap test.
This way, we are guaranteed that no packet with illegal sequence numbers
are ever added to the deferred queue.
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-10-16 02:52:42 +08:00
|
|
|
u16 seqno, rcv_nxt, win_lim;
|
tipc: introduce variable window congestion control
We introduce a simple variable window congestion control for links.
The algorithm is inspired by the Reno algorithm, covering both 'slow
start', 'congestion avoidance', and 'fast recovery' modes.
- We introduce hard lower and upper window limits per link, still
different and configurable per bearer type.
- We introduce a 'slow start theshold' variable, initially set to
the maximum window size.
- We let a link start at the minimum congestion window, i.e. in slow
start mode, and then let is grow rapidly (+1 per rceived ACK) until
it reaches the slow start threshold and enters congestion avoidance
mode.
- In congestion avoidance mode we increment the congestion window for
each window-size number of acked packets, up to a possible maximum
equal to the configured maximum window.
- For each non-duplicate NACK received, we drop back to fast recovery
mode, by setting the both the slow start threshold to and the
congestion window to (current_congestion_window / 2).
- If the timeout handler finds that the transmit queue has not moved
since the previous timeout, it drops the link back to slow start
and forces a probe containing the last sent sequence number to the
sent to the peer, so that this can discover the stale situation.
This change does in reality have effect only on unicast ethernet
transport, as we have seen that there is no room whatsoever for
increasing the window max size for the UDP bearer.
For now, we also choose to keep the limits for the broadcast link
unchanged and equal.
This algorithm seems to give a 50-100% throughput improvement for
messages larger than MTU.
Suggested-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 07:52:46 +08:00
|
|
|
int released = 0;
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
int rc = 0;
|
|
|
|
|
tipc: reduce duplicate packets for unicast traffic
For unicast transmission, the current NACK sending althorithm is over-
active that forces the sending side to retransmit a packet that is not
really lost but just arrived at the receiving side with some delay, or
even retransmit same packets that have already been retransmitted
before. As a result, many duplicates are observed also under normal
condition, ie. without packet loss.
One example case is: node1 transmits 1 2 3 4 10 5 6 7 8 9, when node2
receives packet #10, it puts into the deferdq. When the packet #5 comes
it sends NACK with gap [6 - 9]. However, shortly after that, when
packet #6 arrives, it pulls out packet #10 from the deferfq, but it is
still out of order, so it makes another NACK with gap [7 - 9] and so on
... Finally, node1 has to retransmit the packets 5 6 7 8 9 a number of
times, but in fact all the packets are not lost at all, so duplicates!
This commit reduces duplicates by changing the condition to send NACK,
also restricting the retransmissions on individual packets via a timer
of about 1ms. However, it also needs to say that too tricky condition
for NACKs or too long timeout value for retransmissions will result in
performance reducing! The criterias in this commit are found to be
effective for both the requirements to reduce duplicates but not affect
performance.
The tipc_link_rcv() is also improved to only dequeue skb from the link
deferdq if it is expected (ie. its seqno <= rcv_nxt).
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-04 12:09:52 +08:00
|
|
|
/* Verify and update link state */
|
|
|
|
if (unlikely(msg_user(hdr) == LINK_PROTOCOL))
|
|
|
|
return tipc_link_proto_rcv(l, skb, xmitq);
|
|
|
|
|
|
|
|
/* Don't send probe at next timeout expiration */
|
|
|
|
l->silent_intv_cnt = 0;
|
|
|
|
|
2015-10-16 02:52:41 +08:00
|
|
|
do {
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
hdr = buf_msg(skb);
|
2015-10-16 02:52:41 +08:00
|
|
|
seqno = msg_seqno(hdr);
|
|
|
|
rcv_nxt = l->rcv_nxt;
|
tipc: improve sequence number checking
The sequence number of an incoming packet is currently only checked
for less than, equality to, or bigger than the next expected number,
meaning that the receive window in practice becomes one half sequence
number cycle, or U16_MAX/2. This does not make sense, and may not even
be safe if there are extreme delays in the network. Any packet sent by
the peer during the ongoing cycle must belong inside his current send
window, or should otherwise be dropped if possible.
Since a link endpoint cannot know its peer's current send window, it
has to base this sanity check on a worst-case assumption, i.e., that
the peer is using a maximum sized window of 8191 packets. Using this
assumption, we now add a check that the sequence number is not bigger
than next_expected + TIPC_MAX_LINK_WIN. We also re-order the checks
done, so that the receive window test is performed before the gap test.
This way, we are guaranteed that no packet with illegal sequence numbers
are ever added to the deferred queue.
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-10-16 02:52:42 +08:00
|
|
|
win_lim = rcv_nxt + TIPC_MAX_LINK_WIN;
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
|
2015-07-31 06:24:21 +08:00
|
|
|
if (unlikely(!link_is_up(l))) {
|
tipc: delay ESTABLISH state event when link is established
Link establishing, just like link teardown, is a non-atomic action, in
the sense that discovering that conditions are right to establish a link,
and the actual adding of the link to one of the node's send slots is done
in two different lock contexts. The link FSM is designed to help bridging
the gap between the two contexts in a safe manner.
We have now discovered a weakness in the implementaton of this FSM.
Because we directly let the link go from state LINK_ESTABLISHING to
state LINK_ESTABLISHED already in the first lock context, we are unable
to distinguish between a fully established link, i.e., a link that has
been added to its slot, and a link that has not yet reached the second
lock context. It may hence happen that a manual intervention, e.g., when
disabling an interface, causes the function tipc_node_link_down() to try
removing the link from the node slots, decrementing its active link
counter etc, although the link was never added there in the first place.
We solve this by delaying the actual state change until we reach the
second lock context, inside the function tipc_node_link_up(). This
makes it possible for potentail callers of __tipc_node_link_down() to
know if they should proceed or not, and the problem is solved.
Unforunately, the situation described above also has a second problem.
Since there by necessity is a tipc_node_link_up() call pending once
the node lock has been released, we must defuse that call by setting
the link back from LINK_ESTABLISHING to LINK_RESET state. This forces
us to make a slight modification to the link FSM, which will now look
as follows.
+------------------------------------+
|RESET_EVT |
| |
| +--------------+
| +-----------------| SYNCHING |-----------------+
| |FAILURE_EVT +--------------+ PEER_RESET_EVT|
| | A | |
| | | | |
| | | | |
| | |SYNCH_ |SYNCH_ |
| | |BEGIN_EVT |END_EVT |
| | | | |
| V | V V
| +-------------+ +--------------+ +------------+
| | RESETTING |<---------| ESTABLISHED |--------->| PEER_RESET |
| +-------------+ FAILURE_ +--------------+ PEER_ +------------+
| | EVT | A RESET_EVT |
| | | | |
| | +----------------+ | |
| RESET_EVT| |RESET_EVT | |
| | | | |
| | | |ESTABLISH_EVT |
| | | +-------------+ | |
| | | | RESET_EVT | | |
| | | | | | |
| V V V | | |
| +-------------+ +--------------+ RESET_EVT|
+--->| RESET |--------->| ESTABLISHING |<----------------+
+-------------+ PEER_ +--------------+
| A RESET_EVT |
| | |
| | |
|FAILOVER_ |FAILOVER_ |FAILOVER_
|BEGIN_EVT |END_EVT |BEGIN_EVT
| | |
V | |
+-------------+ |
| FAILINGOVER |<----------------+
+-------------+
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-10-16 02:52:44 +08:00
|
|
|
if (l->state == LINK_ESTABLISHING)
|
|
|
|
rc = TIPC_LINK_UP_EVT;
|
tipc: introduce variable window congestion control
We introduce a simple variable window congestion control for links.
The algorithm is inspired by the Reno algorithm, covering both 'slow
start', 'congestion avoidance', and 'fast recovery' modes.
- We introduce hard lower and upper window limits per link, still
different and configurable per bearer type.
- We introduce a 'slow start theshold' variable, initially set to
the maximum window size.
- We let a link start at the minimum congestion window, i.e. in slow
start mode, and then let is grow rapidly (+1 per rceived ACK) until
it reaches the slow start threshold and enters congestion avoidance
mode.
- In congestion avoidance mode we increment the congestion window for
each window-size number of acked packets, up to a possible maximum
equal to the configured maximum window.
- For each non-duplicate NACK received, we drop back to fast recovery
mode, by setting the both the slow start threshold to and the
congestion window to (current_congestion_window / 2).
- If the timeout handler finds that the transmit queue has not moved
since the previous timeout, it drops the link back to slow start
and forces a probe containing the last sent sequence number to the
sent to the peer, so that this can discover the stale situation.
This change does in reality have effect only on unicast ethernet
transport, as we have seen that there is no room whatsoever for
increasing the window max size for the UDP bearer.
For now, we also choose to keep the limits for the broadcast link
unchanged and equal.
This algorithm seems to give a 50-100% throughput improvement for
messages larger than MTU.
Suggested-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 07:52:46 +08:00
|
|
|
kfree_skb(skb);
|
|
|
|
break;
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
}
|
|
|
|
|
tipc: improve sequence number checking
The sequence number of an incoming packet is currently only checked
for less than, equality to, or bigger than the next expected number,
meaning that the receive window in practice becomes one half sequence
number cycle, or U16_MAX/2. This does not make sense, and may not even
be safe if there are extreme delays in the network. Any packet sent by
the peer during the ongoing cycle must belong inside his current send
window, or should otherwise be dropped if possible.
Since a link endpoint cannot know its peer's current send window, it
has to base this sanity check on a worst-case assumption, i.e., that
the peer is using a maximum sized window of 8191 packets. Using this
assumption, we now add a check that the sequence number is not bigger
than next_expected + TIPC_MAX_LINK_WIN. We also re-order the checks
done, so that the receive window test is performed before the gap test.
This way, we are guaranteed that no packet with illegal sequence numbers
are ever added to the deferred queue.
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-10-16 02:52:42 +08:00
|
|
|
/* Drop if outside receive window */
|
|
|
|
if (unlikely(less(seqno, rcv_nxt) || more(seqno, win_lim))) {
|
|
|
|
l->stats.duplicates++;
|
tipc: introduce variable window congestion control
We introduce a simple variable window congestion control for links.
The algorithm is inspired by the Reno algorithm, covering both 'slow
start', 'congestion avoidance', and 'fast recovery' modes.
- We introduce hard lower and upper window limits per link, still
different and configurable per bearer type.
- We introduce a 'slow start theshold' variable, initially set to
the maximum window size.
- We let a link start at the minimum congestion window, i.e. in slow
start mode, and then let is grow rapidly (+1 per rceived ACK) until
it reaches the slow start threshold and enters congestion avoidance
mode.
- In congestion avoidance mode we increment the congestion window for
each window-size number of acked packets, up to a possible maximum
equal to the configured maximum window.
- For each non-duplicate NACK received, we drop back to fast recovery
mode, by setting the both the slow start threshold to and the
congestion window to (current_congestion_window / 2).
- If the timeout handler finds that the transmit queue has not moved
since the previous timeout, it drops the link back to slow start
and forces a probe containing the last sent sequence number to the
sent to the peer, so that this can discover the stale situation.
This change does in reality have effect only on unicast ethernet
transport, as we have seen that there is no room whatsoever for
increasing the window max size for the UDP bearer.
For now, we also choose to keep the limits for the broadcast link
unchanged and equal.
This algorithm seems to give a 50-100% throughput improvement for
messages larger than MTU.
Suggested-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 07:52:46 +08:00
|
|
|
kfree_skb(skb);
|
|
|
|
break;
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
}
|
2020-05-26 17:38:34 +08:00
|
|
|
released += tipc_link_advance_transmq(l, l, msg_ack(hdr), 0,
|
|
|
|
NULL, NULL, NULL, NULL);
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
|
tipc: improve sequence number checking
The sequence number of an incoming packet is currently only checked
for less than, equality to, or bigger than the next expected number,
meaning that the receive window in practice becomes one half sequence
number cycle, or U16_MAX/2. This does not make sense, and may not even
be safe if there are extreme delays in the network. Any packet sent by
the peer during the ongoing cycle must belong inside his current send
window, or should otherwise be dropped if possible.
Since a link endpoint cannot know its peer's current send window, it
has to base this sanity check on a worst-case assumption, i.e., that
the peer is using a maximum sized window of 8191 packets. Using this
assumption, we now add a check that the sequence number is not bigger
than next_expected + TIPC_MAX_LINK_WIN. We also re-order the checks
done, so that the receive window test is performed before the gap test.
This way, we are guaranteed that no packet with illegal sequence numbers
are ever added to the deferred queue.
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-10-16 02:52:42 +08:00
|
|
|
/* Defer delivery if sequence gap */
|
|
|
|
if (unlikely(seqno != rcv_nxt)) {
|
2020-05-26 17:38:37 +08:00
|
|
|
if (!__tipc_skb_queue_sorted(defq, seqno, skb))
|
|
|
|
l->stats.duplicates++;
|
2016-09-02 01:52:51 +08:00
|
|
|
rc |= tipc_link_build_nack_msg(l, xmitq);
|
2015-10-16 02:52:41 +08:00
|
|
|
break;
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
}
|
|
|
|
|
tipc: improve sequence number checking
The sequence number of an incoming packet is currently only checked
for less than, equality to, or bigger than the next expected number,
meaning that the receive window in practice becomes one half sequence
number cycle, or U16_MAX/2. This does not make sense, and may not even
be safe if there are extreme delays in the network. Any packet sent by
the peer during the ongoing cycle must belong inside his current send
window, or should otherwise be dropped if possible.
Since a link endpoint cannot know its peer's current send window, it
has to base this sanity check on a worst-case assumption, i.e., that
the peer is using a maximum sized window of 8191 packets. Using this
assumption, we now add a check that the sequence number is not bigger
than next_expected + TIPC_MAX_LINK_WIN. We also re-order the checks
done, so that the receive window test is performed before the gap test.
This way, we are guaranteed that no packet with illegal sequence numbers
are ever added to the deferred queue.
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-10-16 02:52:42 +08:00
|
|
|
/* Deliver packet */
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
l->rcv_nxt++;
|
2016-11-25 23:35:02 +08:00
|
|
|
l->stats.recv_pkts++;
|
2019-04-04 12:09:53 +08:00
|
|
|
|
|
|
|
if (unlikely(msg_user(hdr) == TUNNEL_PROTOCOL))
|
|
|
|
rc |= tipc_link_tnl_rcv(l, skb, l->inputq);
|
|
|
|
else if (!tipc_data_input(l, skb, l->inputq))
|
|
|
|
rc |= tipc_link_input(l, skb, l->inputq, &l->reasm_buf);
|
2015-10-16 02:52:41 +08:00
|
|
|
if (unlikely(++l->rcv_unacked >= TIPC_MIN_LINK_WIN))
|
2016-04-16 01:33:07 +08:00
|
|
|
rc |= tipc_link_build_state_msg(l, xmitq);
|
2016-09-02 01:52:49 +08:00
|
|
|
if (unlikely(rc & ~TIPC_LINK_SND_STATE))
|
2015-10-22 20:51:41 +08:00
|
|
|
break;
|
tipc: reduce duplicate packets for unicast traffic
For unicast transmission, the current NACK sending althorithm is over-
active that forces the sending side to retransmit a packet that is not
really lost but just arrived at the receiving side with some delay, or
even retransmit same packets that have already been retransmitted
before. As a result, many duplicates are observed also under normal
condition, ie. without packet loss.
One example case is: node1 transmits 1 2 3 4 10 5 6 7 8 9, when node2
receives packet #10, it puts into the deferdq. When the packet #5 comes
it sends NACK with gap [6 - 9]. However, shortly after that, when
packet #6 arrives, it pulls out packet #10 from the deferfq, but it is
still out of order, so it makes another NACK with gap [7 - 9] and so on
... Finally, node1 has to retransmit the packets 5 6 7 8 9 a number of
times, but in fact all the packets are not lost at all, so duplicates!
This commit reduces duplicates by changing the condition to send NACK,
also restricting the retransmissions on individual packets via a timer
of about 1ms. However, it also needs to say that too tricky condition
for NACKs or too long timeout value for retransmissions will result in
performance reducing! The criterias in this commit are found to be
effective for both the requirements to reduce duplicates but not affect
performance.
The tipc_link_rcv() is also improved to only dequeue skb from the link
deferdq if it is expected (ie. its seqno <= rcv_nxt).
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-04 12:09:52 +08:00
|
|
|
} while ((skb = __tipc_skb_dequeue(defq, l->rcv_nxt)));
|
2015-10-16 02:52:41 +08:00
|
|
|
|
tipc: introduce variable window congestion control
We introduce a simple variable window congestion control for links.
The algorithm is inspired by the Reno algorithm, covering both 'slow
start', 'congestion avoidance', and 'fast recovery' modes.
- We introduce hard lower and upper window limits per link, still
different and configurable per bearer type.
- We introduce a 'slow start theshold' variable, initially set to
the maximum window size.
- We let a link start at the minimum congestion window, i.e. in slow
start mode, and then let is grow rapidly (+1 per rceived ACK) until
it reaches the slow start threshold and enters congestion avoidance
mode.
- In congestion avoidance mode we increment the congestion window for
each window-size number of acked packets, up to a possible maximum
equal to the configured maximum window.
- For each non-duplicate NACK received, we drop back to fast recovery
mode, by setting the both the slow start threshold to and the
congestion window to (current_congestion_window / 2).
- If the timeout handler finds that the transmit queue has not moved
since the previous timeout, it drops the link back to slow start
and forces a probe containing the last sent sequence number to the
sent to the peer, so that this can discover the stale situation.
This change does in reality have effect only on unicast ethernet
transport, as we have seen that there is no room whatsoever for
increasing the window max size for the UDP bearer.
For now, we also choose to keep the limits for the broadcast link
unchanged and equal.
This algorithm seems to give a 50-100% throughput improvement for
messages larger than MTU.
Suggested-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 07:52:46 +08:00
|
|
|
/* Forward queues and wake up waiting users */
|
|
|
|
if (released) {
|
|
|
|
tipc_link_update_cwin(l, released, 0);
|
|
|
|
tipc_link_advance_backlog(l, xmitq);
|
|
|
|
if (unlikely(!skb_queue_empty(&l->wakeupq)))
|
|
|
|
link_prepare_wakeup(l);
|
|
|
|
}
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2015-07-17 04:54:26 +08:00
|
|
|
static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
|
tipc: improve link resiliency when rps is activated
Currently, the TIPC RPS dissector is based only on the incoming packets'
source node address, hence steering all traffic from a node to the same
core. We have seen that this makes the links vulnerable to starvation
and unnecessary resets when we turn down the link tolerance to very low
values.
To reduce the risk of this happening, we exempt probe and probe replies
packets from the convergence to one core per source node. Instead, we do
the opposite, - we try to diverge those packets across as many cores as
possible, by randomizing the flow selector key.
To make such packets identifiable to the dissector, we add a new
'is_keepalive' bit to word 0 of the LINK_PROTOCOL header. This bit is
set both for PROBE and PROBE_REPLY messages, and only for those.
It should be noted that these packets are not part of any flow anyway,
and only constitute a minuscule fraction of all packets sent across a
link. Hence, there is no risk that this will affect overall performance.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-11-08 16:59:26 +08:00
|
|
|
bool probe_reply, u16 rcvgap,
|
|
|
|
int tolerance, int priority,
|
2015-07-17 04:54:26 +08:00
|
|
|
struct sk_buff_head *xmitq)
|
|
|
|
{
|
2020-05-26 17:38:37 +08:00
|
|
|
struct tipc_mon_state *mstate = &l->mon_state;
|
|
|
|
struct sk_buff_head *dfq = &l->deferdq;
|
2016-09-02 01:52:49 +08:00
|
|
|
struct tipc_link *bcl = l->bc_rcvlink;
|
2016-03-04 03:23:21 +08:00
|
|
|
struct tipc_msg *hdr;
|
2020-05-26 17:38:37 +08:00
|
|
|
struct sk_buff *skb;
|
2016-09-02 01:52:49 +08:00
|
|
|
bool node_up = link_is_up(bcl);
|
2020-05-26 17:38:37 +08:00
|
|
|
u16 glen = 0, bc_rcvgap = 0;
|
tipc: add neighbor monitoring framework
TIPC based clusters are by default set up with full-mesh link
connectivity between all nodes. Those links are expected to provide
a short failure detection time, by default set to 1500 ms. Because
of this, the background load for neighbor monitoring in an N-node
cluster increases with a factor N on each node, while the overall
monitoring traffic through the network infrastructure increases at
a ~(N * (N - 1)) rate. Experience has shown that such clusters don't
scale well beyond ~100 nodes unless we significantly increase failure
discovery tolerance.
This commit introduces a framework and an algorithm that drastically
reduces this background load, while basically maintaining the original
failure detection times across the whole cluster. Using this algorithm,
background load will now grow at a rate of ~(2 * sqrt(N)) per node, and
at ~(2 * N * sqrt(N)) in traffic overhead. As an example, each node will
now have to actively monitor 38 neighbors in a 400-node cluster, instead
of as before 399.
This "Overlapping Ring Supervision Algorithm" is completely distributed
and employs no centralized or coordinated state. It goes as follows:
- Each node makes up a linearly ascending, circular list of all its N
known neighbors, based on their TIPC node identity. This algorithm
must be the same on all nodes.
- The node then selects the next M = sqrt(N) - 1 nodes downstream from
itself in the list, and chooses to actively monitor those. This is
called its "local monitoring domain".
- It creates a domain record describing the monitoring domain, and
piggy-backs this in the data area of all neighbor monitoring messages
(LINK_PROTOCOL/STATE) leaving that node. This means that all nodes in
the cluster eventually (default within 400 ms) will learn about
its monitoring domain.
- Whenever a node discovers a change in its local domain, e.g., a node
has been added or has gone down, it creates and sends out a new
version of its node record to inform all neighbors about the change.
- A node receiving a domain record from anybody outside its local domain
matches this against its own list (which may not look the same), and
chooses to not actively monitor those members of the received domain
record that are also present in its own list. Instead, it relies on
indications from the direct monitoring nodes if an indirectly
monitored node has gone up or down. If a node is indicated lost, the
receiving node temporarily activates its own direct monitoring towards
that node in order to confirm, or not, that it is actually gone.
- Since each node is actively monitoring sqrt(N) downstream neighbors,
each node is also actively monitored by the same number of upstream
neighbors. This means that all non-direct monitoring nodes normally
will receive sqrt(N) indications that a node is gone.
- A major drawback with ring monitoring is how it handles failures that
cause massive network partitionings. If both a lost node and all its
direct monitoring neighbors are inside the lost partition, the nodes in
the remaining partition will never receive indications about the loss.
To overcome this, each node also chooses to actively monitor some
nodes outside its local domain. Those nodes are called remote domain
"heads", and are selected in such a way that no node in the cluster
will be more than two direct monitoring hops away. Because of this,
each node, apart from monitoring the member of its local domain, will
also typically monitor sqrt(N) remote head nodes.
- As an optimization, local list status, domain status and domain
records are marked with a generation number. This saves senders from
unnecessarily conveying unaltered domain records, and receivers from
performing unneeded re-adaptations of their node monitoring list, such
as re-assigning domain heads.
- As a measure of caution we have added the possibility to disable the
new algorithm through configuration. We do this by keeping a threshold
value for the cluster size; a cluster that grows beyond this value
will switch from full-mesh to ring monitoring, and vice versa when
it shrinks below the value. This means that if the threshold is set to
a value larger than any anticipated cluster size (default size is 32)
the new algorithm is effectively disabled. A patch set for altering the
threshold value and for listing the table contents will follow shortly.
- This change is fully backwards compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-14 08:46:22 +08:00
|
|
|
int dlen = 0;
|
|
|
|
void *data;
|
2015-07-17 04:54:26 +08:00
|
|
|
|
|
|
|
/* Don't send protocol message during reset or link failover */
|
2015-07-31 06:24:21 +08:00
|
|
|
if (tipc_link_is_blocked(l))
|
2015-07-17 04:54:26 +08:00
|
|
|
return;
|
|
|
|
|
2016-03-04 03:23:21 +08:00
|
|
|
if (!tipc_link_is_up(l) && (mtyp == STATE_MSG))
|
|
|
|
return;
|
|
|
|
|
tipc: eliminate gap indicator from ACK messages
When we increase the link send window we sometimes observe the
following scenario:
1) A packet #N arrives out of order far ahead of a sequence of older
packets which are still under way. The packet is added to the
deferred queue.
2) The missing packets arrive in sequence, and for each 16th of them
an ACK is sent back to the receiver, as it should be.
3) When building those ACK messages, it is checked if there is a gap
between the link's 'rcv_nxt' and the first packet in the deferred
queue. This is always the case until packet number #N-1 arrives, and
a 'gap' indicator is added, effectively turning them into NACK
messages.
4) When those NACKs arrive at the sender, all the requested
retransmissions are done, since it is a first-time request.
This sometimes leads to a huge amount of redundant retransmissions,
causing a drop in max throughput. This problem gets worse when we
in a later commit introduce variable window congestion control,
since it drops the link back to 'fast recovery' much more often
than necessary.
We now fix this by not sending any 'gap' indicator in regular ACK
messages. We already have a mechanism for sending explicit NACKs
in place, and this is sufficient to keep up the packet flow.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 07:52:44 +08:00
|
|
|
if ((probe || probe_reply) && !skb_queue_empty(dfq))
|
2016-03-04 03:23:21 +08:00
|
|
|
rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt;
|
|
|
|
|
|
|
|
skb = tipc_msg_create(LINK_PROTOCOL, mtyp, INT_H_SIZE,
|
tipc: improve TIPC throughput by Gap ACK blocks
During unicast link transmission, it's observed very often that because
of one or a few lost/dis-ordered packets, the sending side will fastly
reach the send window limit and must wait for the packets to be arrived
at the receiving side or in the worst case, a retransmission must be
done first. The sending side cannot release a lot of subsequent packets
in its transmq even though all of them might have already been received
by the receiving side.
That is, one or two packets dis-ordered/lost and dozens of packets have
to wait, this obviously reduces the overall throughput!
This commit introduces an algorithm to overcome this by using "Gap ACK
blocks". Basically, a Gap ACK block will consist of <ack, gap> numbers
that describes the link deferdq where packets have been got by the
receiving side but with gaps, for example:
link deferdq: [1 2 3 4 10 11 13 14 15 20]
--> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0>
The Gap ACK blocks will be sent to the sending side along with the
traditional ACK or NACK message. Immediately when receiving the message
the sending side will now not only release from its transmq the packets
ack-ed by the ACK but also by the Gap ACK blocks! So, more packets can
be enqueued and transmitted.
In addition, the sending side can now do "multi-retransmissions"
according to the Gaps reported in the Gap ACK blocks.
The new algorithm as verified helps greatly improve the TIPC throughput
especially under packet loss condition.
So far, a maximum of 32 blocks is quite enough without any "Too few Gap
ACK blocks" reports with a 5.0% packet loss rate, however this number
can be increased in the furture if needed.
Also, the patch is backward compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-04 12:09:51 +08:00
|
|
|
tipc_max_domain_size + MAX_GAP_ACK_BLKS_SZ,
|
|
|
|
l->addr, tipc_own_addr(l->net), 0, 0, 0);
|
2016-03-04 03:23:21 +08:00
|
|
|
if (!skb)
|
|
|
|
return;
|
|
|
|
|
|
|
|
hdr = buf_msg(skb);
|
tipc: add neighbor monitoring framework
TIPC based clusters are by default set up with full-mesh link
connectivity between all nodes. Those links are expected to provide
a short failure detection time, by default set to 1500 ms. Because
of this, the background load for neighbor monitoring in an N-node
cluster increases with a factor N on each node, while the overall
monitoring traffic through the network infrastructure increases at
a ~(N * (N - 1)) rate. Experience has shown that such clusters don't
scale well beyond ~100 nodes unless we significantly increase failure
discovery tolerance.
This commit introduces a framework and an algorithm that drastically
reduces this background load, while basically maintaining the original
failure detection times across the whole cluster. Using this algorithm,
background load will now grow at a rate of ~(2 * sqrt(N)) per node, and
at ~(2 * N * sqrt(N)) in traffic overhead. As an example, each node will
now have to actively monitor 38 neighbors in a 400-node cluster, instead
of as before 399.
This "Overlapping Ring Supervision Algorithm" is completely distributed
and employs no centralized or coordinated state. It goes as follows:
- Each node makes up a linearly ascending, circular list of all its N
known neighbors, based on their TIPC node identity. This algorithm
must be the same on all nodes.
- The node then selects the next M = sqrt(N) - 1 nodes downstream from
itself in the list, and chooses to actively monitor those. This is
called its "local monitoring domain".
- It creates a domain record describing the monitoring domain, and
piggy-backs this in the data area of all neighbor monitoring messages
(LINK_PROTOCOL/STATE) leaving that node. This means that all nodes in
the cluster eventually (default within 400 ms) will learn about
its monitoring domain.
- Whenever a node discovers a change in its local domain, e.g., a node
has been added or has gone down, it creates and sends out a new
version of its node record to inform all neighbors about the change.
- A node receiving a domain record from anybody outside its local domain
matches this against its own list (which may not look the same), and
chooses to not actively monitor those members of the received domain
record that are also present in its own list. Instead, it relies on
indications from the direct monitoring nodes if an indirectly
monitored node has gone up or down. If a node is indicated lost, the
receiving node temporarily activates its own direct monitoring towards
that node in order to confirm, or not, that it is actually gone.
- Since each node is actively monitoring sqrt(N) downstream neighbors,
each node is also actively monitored by the same number of upstream
neighbors. This means that all non-direct monitoring nodes normally
will receive sqrt(N) indications that a node is gone.
- A major drawback with ring monitoring is how it handles failures that
cause massive network partitionings. If both a lost node and all its
direct monitoring neighbors are inside the lost partition, the nodes in
the remaining partition will never receive indications about the loss.
To overcome this, each node also chooses to actively monitor some
nodes outside its local domain. Those nodes are called remote domain
"heads", and are selected in such a way that no node in the cluster
will be more than two direct monitoring hops away. Because of this,
each node, apart from monitoring the member of its local domain, will
also typically monitor sqrt(N) remote head nodes.
- As an optimization, local list status, domain status and domain
records are marked with a generation number. This saves senders from
unnecessarily conveying unaltered domain records, and receivers from
performing unneeded re-adaptations of their node monitoring list, such
as re-assigning domain heads.
- As a measure of caution we have added the possibility to disable the
new algorithm through configuration. We do this by keeping a threshold
value for the cluster size; a cluster that grows beyond this value
will switch from full-mesh to ring monitoring, and vice versa when
it shrinks below the value. This means that if the threshold is set to
a value larger than any anticipated cluster size (default size is 32)
the new algorithm is effectively disabled. A patch set for altering the
threshold value and for listing the table contents will follow shortly.
- This change is fully backwards compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-14 08:46:22 +08:00
|
|
|
data = msg_data(hdr);
|
2016-03-04 03:23:21 +08:00
|
|
|
msg_set_session(hdr, l->session);
|
|
|
|
msg_set_bearer_id(hdr, l->bearer_id);
|
2015-07-17 04:54:26 +08:00
|
|
|
msg_set_net_plane(hdr, l->net_plane);
|
2015-10-22 20:51:41 +08:00
|
|
|
msg_set_next_sent(hdr, l->snd_nxt);
|
|
|
|
msg_set_ack(hdr, l->rcv_nxt - 1);
|
2016-09-02 01:52:49 +08:00
|
|
|
msg_set_bcast_ack(hdr, bcl->rcv_nxt - 1);
|
2016-10-28 06:51:55 +08:00
|
|
|
msg_set_bc_ack_invalid(hdr, !node_up);
|
2015-10-22 20:51:41 +08:00
|
|
|
msg_set_last_bcast(hdr, l->bc_sndlink->snd_nxt - 1);
|
2015-07-17 04:54:26 +08:00
|
|
|
msg_set_link_tolerance(hdr, tolerance);
|
|
|
|
msg_set_linkprio(hdr, priority);
|
|
|
|
msg_set_redundant_link(hdr, node_up);
|
|
|
|
msg_set_seq_gap(hdr, 0);
|
2015-10-22 20:51:41 +08:00
|
|
|
msg_set_seqno(hdr, l->snd_nxt + U16_MAX / 2);
|
2015-07-17 04:54:26 +08:00
|
|
|
|
|
|
|
if (mtyp == STATE_MSG) {
|
2018-07-10 07:07:35 +08:00
|
|
|
if (l->peer_caps & TIPC_LINK_PROTO_SEQNO)
|
|
|
|
msg_set_seqno(hdr, l->snd_nxt_state++);
|
2016-03-04 03:23:21 +08:00
|
|
|
msg_set_seq_gap(hdr, rcvgap);
|
2020-05-26 17:38:37 +08:00
|
|
|
bc_rcvgap = link_bc_rcv_gap(bcl);
|
|
|
|
msg_set_bc_gap(hdr, bc_rcvgap);
|
2015-07-17 04:54:26 +08:00
|
|
|
msg_set_probe(hdr, probe);
|
tipc: improve link resiliency when rps is activated
Currently, the TIPC RPS dissector is based only on the incoming packets'
source node address, hence steering all traffic from a node to the same
core. We have seen that this makes the links vulnerable to starvation
and unnecessary resets when we turn down the link tolerance to very low
values.
To reduce the risk of this happening, we exempt probe and probe replies
packets from the convergence to one core per source node. Instead, we do
the opposite, - we try to diverge those packets across as many cores as
possible, by randomizing the flow selector key.
To make such packets identifiable to the dissector, we add a new
'is_keepalive' bit to word 0 of the LINK_PROTOCOL header. This bit is
set both for PROBE and PROBE_REPLY messages, and only for those.
It should be noted that these packets are not part of any flow anyway,
and only constitute a minuscule fraction of all packets sent across a
link. Hence, there is no risk that this will affect overall performance.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-11-08 16:59:26 +08:00
|
|
|
msg_set_is_keepalive(hdr, probe || probe_reply);
|
tipc: improve TIPC throughput by Gap ACK blocks
During unicast link transmission, it's observed very often that because
of one or a few lost/dis-ordered packets, the sending side will fastly
reach the send window limit and must wait for the packets to be arrived
at the receiving side or in the worst case, a retransmission must be
done first. The sending side cannot release a lot of subsequent packets
in its transmq even though all of them might have already been received
by the receiving side.
That is, one or two packets dis-ordered/lost and dozens of packets have
to wait, this obviously reduces the overall throughput!
This commit introduces an algorithm to overcome this by using "Gap ACK
blocks". Basically, a Gap ACK block will consist of <ack, gap> numbers
that describes the link deferdq where packets have been got by the
receiving side but with gaps, for example:
link deferdq: [1 2 3 4 10 11 13 14 15 20]
--> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0>
The Gap ACK blocks will be sent to the sending side along with the
traditional ACK or NACK message. Immediately when receiving the message
the sending side will now not only release from its transmq the packets
ack-ed by the ACK but also by the Gap ACK blocks! So, more packets can
be enqueued and transmitted.
In addition, the sending side can now do "multi-retransmissions"
according to the Gaps reported in the Gap ACK blocks.
The new algorithm as verified helps greatly improve the TIPC throughput
especially under packet loss condition.
So far, a maximum of 32 blocks is quite enough without any "Too few Gap
ACK blocks" reports with a 5.0% packet loss rate, however this number
can be increased in the furture if needed.
Also, the patch is backward compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-04 12:09:51 +08:00
|
|
|
if (l->peer_caps & TIPC_GAP_ACK_BLOCK)
|
2020-05-26 17:38:34 +08:00
|
|
|
glen = tipc_build_gap_ack_blks(l, hdr);
|
tipc: improve TIPC throughput by Gap ACK blocks
During unicast link transmission, it's observed very often that because
of one or a few lost/dis-ordered packets, the sending side will fastly
reach the send window limit and must wait for the packets to be arrived
at the receiving side or in the worst case, a retransmission must be
done first. The sending side cannot release a lot of subsequent packets
in its transmq even though all of them might have already been received
by the receiving side.
That is, one or two packets dis-ordered/lost and dozens of packets have
to wait, this obviously reduces the overall throughput!
This commit introduces an algorithm to overcome this by using "Gap ACK
blocks". Basically, a Gap ACK block will consist of <ack, gap> numbers
that describes the link deferdq where packets have been got by the
receiving side but with gaps, for example:
link deferdq: [1 2 3 4 10 11 13 14 15 20]
--> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0>
The Gap ACK blocks will be sent to the sending side along with the
traditional ACK or NACK message. Immediately when receiving the message
the sending side will now not only release from its transmq the packets
ack-ed by the ACK but also by the Gap ACK blocks! So, more packets can
be enqueued and transmitted.
In addition, the sending side can now do "multi-retransmissions"
according to the Gaps reported in the Gap ACK blocks.
The new algorithm as verified helps greatly improve the TIPC throughput
especially under packet loss condition.
So far, a maximum of 32 blocks is quite enough without any "Too few Gap
ACK blocks" reports with a 5.0% packet loss rate, however this number
can be increased in the furture if needed.
Also, the patch is backward compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-04 12:09:51 +08:00
|
|
|
tipc_mon_prep(l->net, data + glen, &dlen, mstate, l->bearer_id);
|
|
|
|
msg_set_size(hdr, INT_H_SIZE + glen + dlen);
|
|
|
|
skb_trim(skb, INT_H_SIZE + glen + dlen);
|
2015-07-17 04:54:26 +08:00
|
|
|
l->stats.sent_states++;
|
2015-10-22 20:51:41 +08:00
|
|
|
l->rcv_unacked = 0;
|
2015-07-17 04:54:26 +08:00
|
|
|
} else {
|
|
|
|
/* RESET_MSG or ACTIVATE_MSG */
|
tipc: fix link session and re-establish issues
When a link endpoint is re-created (e.g. after a node reboot or
interface reset), the link session number is varied by random, the peer
endpoint will be synced with this new session number before the link is
re-established.
However, there is a shortcoming in this mechanism that can lead to the
link never re-established or faced with a failure then. It happens when
the peer endpoint is ready in ESTABLISHING state, the 'peer_session' as
well as the 'in_session' flag have been set, but suddenly this link
endpoint leaves. When it comes back with a random session number, there
are two situations possible:
1/ If the random session number is larger than (or equal to) the
previous one, the peer endpoint will be updated with this new session
upon receipt of a RESET_MSG from this endpoint, and the link can be re-
established as normal. Otherwise, all the RESET_MSGs from this endpoint
will be rejected by the peer. In turn, when this link endpoint receives
one ACTIVATE_MSG from the peer, it will move to ESTABLISHED and start
to send STATE_MSGs, but again these messages will be dropped by the
peer due to wrong session.
The peer link endpoint can still become ESTABLISHED after receiving a
traffic message from this endpoint (e.g. a BCAST_PROTOCOL or
NAME_DISTRIBUTOR), but since all the STATE_MSGs are invalid, the link
will be forced down sooner or later!
Even in case the random session number is larger than the previous one,
it can be that the ACTIVATE_MSG from the peer arrives first, and this
link endpoint moves quickly to ESTABLISHED without sending out any
RESET_MSG yet. Consequently, the peer link will not be updated with the
new session number, and the same link failure scenario as above will
happen.
2/ Another situation can be that, the peer link endpoint was reset due
to any reasons in the meantime, its link state was set to RESET from
ESTABLISHING but still in session, i.e. the 'in_session' flag is not
reset...
Now, if the random session number from this endpoint is less than the
previous one, all the RESET_MSGs from this endpoint will be rejected by
the peer. In the other direction, when this link endpoint receives a
RESET_MSG from the peer, it moves to ESTABLISHING and starts to send
ACTIVATE_MSGs, but all these messages will be rejected by the peer too.
As a result, the link cannot be re-established but gets stuck with this
link endpoint in state ESTABLISHING and the peer in RESET!
Solution:
===========
This link endpoint should not go directly to ESTABLISHED when getting
ACTIVATE_MSG from the peer which may belong to the old session if the
link was re-created. To ensure the session to be correct before the
link is re-established, the peer endpoint in ESTABLISHING state will
send back the last session number in ACTIVATE_MSG for a verification at
this endpoint. Then, if needed, a new and more appropriate session
number will be regenerated to force a re-synch first.
In addition, when a link in ESTABLISHING state is reset, its state will
move to RESET according to the link FSM, along with resetting the
'in_session' flag (and the other data) as a normal link reset, it will
also be deleted if requested.
The solution is backward compatible.
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-02-11 14:29:43 +08:00
|
|
|
if (mtyp == ACTIVATE_MSG) {
|
|
|
|
msg_set_dest_session_valid(hdr, 1);
|
|
|
|
msg_set_dest_session(hdr, l->peer_session);
|
|
|
|
}
|
2015-07-17 04:54:26 +08:00
|
|
|
msg_set_max_pkt(hdr, l->advertised_mtu);
|
tipc: add neighbor monitoring framework
TIPC based clusters are by default set up with full-mesh link
connectivity between all nodes. Those links are expected to provide
a short failure detection time, by default set to 1500 ms. Because
of this, the background load for neighbor monitoring in an N-node
cluster increases with a factor N on each node, while the overall
monitoring traffic through the network infrastructure increases at
a ~(N * (N - 1)) rate. Experience has shown that such clusters don't
scale well beyond ~100 nodes unless we significantly increase failure
discovery tolerance.
This commit introduces a framework and an algorithm that drastically
reduces this background load, while basically maintaining the original
failure detection times across the whole cluster. Using this algorithm,
background load will now grow at a rate of ~(2 * sqrt(N)) per node, and
at ~(2 * N * sqrt(N)) in traffic overhead. As an example, each node will
now have to actively monitor 38 neighbors in a 400-node cluster, instead
of as before 399.
This "Overlapping Ring Supervision Algorithm" is completely distributed
and employs no centralized or coordinated state. It goes as follows:
- Each node makes up a linearly ascending, circular list of all its N
known neighbors, based on their TIPC node identity. This algorithm
must be the same on all nodes.
- The node then selects the next M = sqrt(N) - 1 nodes downstream from
itself in the list, and chooses to actively monitor those. This is
called its "local monitoring domain".
- It creates a domain record describing the monitoring domain, and
piggy-backs this in the data area of all neighbor monitoring messages
(LINK_PROTOCOL/STATE) leaving that node. This means that all nodes in
the cluster eventually (default within 400 ms) will learn about
its monitoring domain.
- Whenever a node discovers a change in its local domain, e.g., a node
has been added or has gone down, it creates and sends out a new
version of its node record to inform all neighbors about the change.
- A node receiving a domain record from anybody outside its local domain
matches this against its own list (which may not look the same), and
chooses to not actively monitor those members of the received domain
record that are also present in its own list. Instead, it relies on
indications from the direct monitoring nodes if an indirectly
monitored node has gone up or down. If a node is indicated lost, the
receiving node temporarily activates its own direct monitoring towards
that node in order to confirm, or not, that it is actually gone.
- Since each node is actively monitoring sqrt(N) downstream neighbors,
each node is also actively monitored by the same number of upstream
neighbors. This means that all non-direct monitoring nodes normally
will receive sqrt(N) indications that a node is gone.
- A major drawback with ring monitoring is how it handles failures that
cause massive network partitionings. If both a lost node and all its
direct monitoring neighbors are inside the lost partition, the nodes in
the remaining partition will never receive indications about the loss.
To overcome this, each node also chooses to actively monitor some
nodes outside its local domain. Those nodes are called remote domain
"heads", and are selected in such a way that no node in the cluster
will be more than two direct monitoring hops away. Because of this,
each node, apart from monitoring the member of its local domain, will
also typically monitor sqrt(N) remote head nodes.
- As an optimization, local list status, domain status and domain
records are marked with a generation number. This saves senders from
unnecessarily conveying unaltered domain records, and receivers from
performing unneeded re-adaptations of their node monitoring list, such
as re-assigning domain heads.
- As a measure of caution we have added the possibility to disable the
new algorithm through configuration. We do this by keeping a threshold
value for the cluster size; a cluster that grows beyond this value
will switch from full-mesh to ring monitoring, and vice versa when
it shrinks below the value. This means that if the threshold is set to
a value larger than any anticipated cluster size (default size is 32)
the new algorithm is effectively disabled. A patch set for altering the
threshold value and for listing the table contents will follow shortly.
- This change is fully backwards compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-14 08:46:22 +08:00
|
|
|
strcpy(data, l->if_name);
|
|
|
|
msg_set_size(hdr, INT_H_SIZE + TIPC_MAX_IF_NAME);
|
|
|
|
skb_trim(skb, INT_H_SIZE + TIPC_MAX_IF_NAME);
|
2015-07-17 04:54:26 +08:00
|
|
|
}
|
2016-03-04 03:23:21 +08:00
|
|
|
if (probe)
|
|
|
|
l->stats.sent_probes++;
|
|
|
|
if (rcvgap)
|
|
|
|
l->stats.sent_nacks++;
|
2020-05-26 17:38:37 +08:00
|
|
|
if (bc_rcvgap)
|
|
|
|
bcl->stats.sent_nacks++;
|
2015-07-17 04:54:26 +08:00
|
|
|
skb->priority = TC_PRIO_CONTROL;
|
2015-07-31 06:24:19 +08:00
|
|
|
__skb_queue_tail(xmitq, skb);
|
2018-12-19 10:17:57 +08:00
|
|
|
trace_tipc_proto_build(skb, false, l->name);
|
2006-01-03 02:04:38 +08:00
|
|
|
}
|
|
|
|
|
2018-09-27 03:00:54 +08:00
|
|
|
void tipc_link_create_dummy_tnl_msg(struct tipc_link *l,
|
|
|
|
struct sk_buff_head *xmitq)
|
|
|
|
{
|
|
|
|
u32 onode = tipc_own_addr(l->net);
|
|
|
|
struct tipc_msg *hdr, *ihdr;
|
|
|
|
struct sk_buff_head tnlq;
|
|
|
|
struct sk_buff *skb;
|
|
|
|
u32 dnode = l->addr;
|
|
|
|
|
2019-08-15 22:42:50 +08:00
|
|
|
__skb_queue_head_init(&tnlq);
|
2018-09-27 03:00:54 +08:00
|
|
|
skb = tipc_msg_create(TUNNEL_PROTOCOL, FAILOVER_MSG,
|
|
|
|
INT_H_SIZE, BASIC_H_SIZE,
|
|
|
|
dnode, onode, 0, 0, 0);
|
|
|
|
if (!skb) {
|
|
|
|
pr_warn("%sunable to create tunnel packet\n", link_co_err);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
hdr = buf_msg(skb);
|
|
|
|
msg_set_msgcnt(hdr, 1);
|
|
|
|
msg_set_bearer_id(hdr, l->peer_bearer_id);
|
|
|
|
|
|
|
|
ihdr = (struct tipc_msg *)msg_data(hdr);
|
|
|
|
tipc_msg_init(onode, ihdr, TIPC_LOW_IMPORTANCE, TIPC_DIRECT_MSG,
|
|
|
|
BASIC_H_SIZE, dnode);
|
|
|
|
msg_set_errcode(ihdr, TIPC_ERR_NO_PORT);
|
|
|
|
__skb_queue_tail(&tnlq, skb);
|
|
|
|
tipc_link_xmit(l, &tnlq, xmitq);
|
|
|
|
}
|
|
|
|
|
2015-07-31 06:24:19 +08:00
|
|
|
/* tipc_link_tnl_prepare(): prepare and return a list of tunnel packets
|
2015-10-16 02:52:41 +08:00
|
|
|
* with contents of the link's transmit and backlog queues.
|
2006-01-03 02:04:38 +08:00
|
|
|
*/
|
2015-07-31 06:24:19 +08:00
|
|
|
void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl,
|
|
|
|
int mtyp, struct sk_buff_head *xmitq)
|
2006-01-03 02:04:38 +08:00
|
|
|
{
|
2019-04-04 12:09:53 +08:00
|
|
|
struct sk_buff_head *fdefq = &tnl->failover_deferdq;
|
2015-07-31 06:24:19 +08:00
|
|
|
struct sk_buff *skb, *tnlskb;
|
|
|
|
struct tipc_msg *hdr, tnlhdr;
|
|
|
|
struct sk_buff_head *queue = &l->transmq;
|
tipc: fix changeover issues due to large packet
In conjunction with changing the interfaces' MTU (e.g. especially in
the case of a bonding) where the TIPC links are brought up and down
in a short time, a couple of issues were detected with the current link
changeover mechanism:
1) When one link is up but immediately forced down again, the failover
procedure will be carried out in order to failover all the messages in
the link's transmq queue onto the other working link. The link and node
state is also set to FAILINGOVER as part of the process. The message
will be transmited in form of a FAILOVER_MSG, so its size is plus of 40
bytes (= the message header size). There is no problem if the original
message size is not larger than the link's MTU - 40, and indeed this is
the max size of a normal payload messages. However, in the situation
above, because the link has just been up, the messages in the link's
transmq are almost SYNCH_MSGs which had been generated by the link
synching procedure, then their size might reach the max value already!
When the FAILOVER_MSG is built on the top of such a SYNCH_MSG, its size
will exceed the link's MTU. As a result, the messages are dropped
silently and the failover procedure will never end up, the link will
not be able to exit the FAILINGOVER state, so cannot be re-established.
2) The same scenario above can happen more easily in case the MTU of
the links is set differently or when changing. In that case, as long as
a large message in the failure link's transmq queue was built and
fragmented with its link's MTU > the other link's one, the issue will
happen (there is no need of a link synching in advance).
3) The link synching procedure also faces with the same issue but since
the link synching is only started upon receipt of a SYNCH_MSG, dropping
the message will not result in a state deadlock, but it is not expected
as design.
The 1) & 3) issues are resolved by the last commit that only a dummy
SYNCH_MSG (i.e. without data) is generated at the link synching, so the
size of a FAILOVER_MSG if any then will never exceed the link's MTU.
For the 2) issue, the only solution is trying to fragment the messages
in the failure link's transmq queue according to the working link's MTU
so they can be failovered then. A new function is made to accomplish
this, it will still be a TUNNEL PROTOCOL/FAILOVER MSG but if the
original message size is too large, it will be fragmented & reassembled
at the receiving side.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-07-24 09:56:12 +08:00
|
|
|
struct sk_buff_head tmpxq, tnlq, frags;
|
2015-07-31 06:24:19 +08:00
|
|
|
u16 pktlen, pktcnt, seqno = l->snd_nxt;
|
tipc: fix changeover issues due to large packet
In conjunction with changing the interfaces' MTU (e.g. especially in
the case of a bonding) where the TIPC links are brought up and down
in a short time, a couple of issues were detected with the current link
changeover mechanism:
1) When one link is up but immediately forced down again, the failover
procedure will be carried out in order to failover all the messages in
the link's transmq queue onto the other working link. The link and node
state is also set to FAILINGOVER as part of the process. The message
will be transmited in form of a FAILOVER_MSG, so its size is plus of 40
bytes (= the message header size). There is no problem if the original
message size is not larger than the link's MTU - 40, and indeed this is
the max size of a normal payload messages. However, in the situation
above, because the link has just been up, the messages in the link's
transmq are almost SYNCH_MSGs which had been generated by the link
synching procedure, then their size might reach the max value already!
When the FAILOVER_MSG is built on the top of such a SYNCH_MSG, its size
will exceed the link's MTU. As a result, the messages are dropped
silently and the failover procedure will never end up, the link will
not be able to exit the FAILINGOVER state, so cannot be re-established.
2) The same scenario above can happen more easily in case the MTU of
the links is set differently or when changing. In that case, as long as
a large message in the failure link's transmq queue was built and
fragmented with its link's MTU > the other link's one, the issue will
happen (there is no need of a link synching in advance).
3) The link synching procedure also faces with the same issue but since
the link synching is only started upon receipt of a SYNCH_MSG, dropping
the message will not result in a state deadlock, but it is not expected
as design.
The 1) & 3) issues are resolved by the last commit that only a dummy
SYNCH_MSG (i.e. without data) is generated at the link synching, so the
size of a FAILOVER_MSG if any then will never exceed the link's MTU.
For the 2) issue, the only solution is trying to fragment the messages
in the failure link's transmq queue according to the working link's MTU
so they can be failovered then. A new function is made to accomplish
this, it will still be a TUNNEL PROTOCOL/FAILOVER MSG but if the
original message size is too large, it will be fragmented & reassembled
at the receiving side.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-07-24 09:56:12 +08:00
|
|
|
bool pktcnt_need_update = false;
|
2019-07-24 09:56:11 +08:00
|
|
|
u16 syncpt;
|
tipc: fix changeover issues due to large packet
In conjunction with changing the interfaces' MTU (e.g. especially in
the case of a bonding) where the TIPC links are brought up and down
in a short time, a couple of issues were detected with the current link
changeover mechanism:
1) When one link is up but immediately forced down again, the failover
procedure will be carried out in order to failover all the messages in
the link's transmq queue onto the other working link. The link and node
state is also set to FAILINGOVER as part of the process. The message
will be transmited in form of a FAILOVER_MSG, so its size is plus of 40
bytes (= the message header size). There is no problem if the original
message size is not larger than the link's MTU - 40, and indeed this is
the max size of a normal payload messages. However, in the situation
above, because the link has just been up, the messages in the link's
transmq are almost SYNCH_MSGs which had been generated by the link
synching procedure, then their size might reach the max value already!
When the FAILOVER_MSG is built on the top of such a SYNCH_MSG, its size
will exceed the link's MTU. As a result, the messages are dropped
silently and the failover procedure will never end up, the link will
not be able to exit the FAILINGOVER state, so cannot be re-established.
2) The same scenario above can happen more easily in case the MTU of
the links is set differently or when changing. In that case, as long as
a large message in the failure link's transmq queue was built and
fragmented with its link's MTU > the other link's one, the issue will
happen (there is no need of a link synching in advance).
3) The link synching procedure also faces with the same issue but since
the link synching is only started upon receipt of a SYNCH_MSG, dropping
the message will not result in a state deadlock, but it is not expected
as design.
The 1) & 3) issues are resolved by the last commit that only a dummy
SYNCH_MSG (i.e. without data) is generated at the link synching, so the
size of a FAILOVER_MSG if any then will never exceed the link's MTU.
For the 2) issue, the only solution is trying to fragment the messages
in the failure link's transmq queue according to the working link's MTU
so they can be failovered then. A new function is made to accomplish
this, it will still be a TUNNEL PROTOCOL/FAILOVER MSG but if the
original message size is too large, it will be fragmented & reassembled
at the receiving side.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-07-24 09:56:12 +08:00
|
|
|
int rc;
|
2006-01-03 02:04:38 +08:00
|
|
|
|
2015-07-31 06:24:19 +08:00
|
|
|
if (!tnl)
|
2006-01-03 02:04:38 +08:00
|
|
|
return;
|
|
|
|
|
2019-08-15 22:42:50 +08:00
|
|
|
__skb_queue_head_init(&tnlq);
|
2019-07-24 09:56:11 +08:00
|
|
|
/* Link Synching:
|
|
|
|
* From now on, send only one single ("dummy") SYNCH message
|
|
|
|
* to peer. The SYNCH message does not contain any data, just
|
|
|
|
* a header conveying the synch point to the peer.
|
|
|
|
*/
|
|
|
|
if (mtyp == SYNCH_MSG && (tnl->peer_caps & TIPC_TUNNEL_ENHANCED)) {
|
|
|
|
tnlskb = tipc_msg_create(TUNNEL_PROTOCOL, SYNCH_MSG,
|
|
|
|
INT_H_SIZE, 0, l->addr,
|
|
|
|
tipc_own_addr(l->net),
|
|
|
|
0, 0, 0);
|
|
|
|
if (!tnlskb) {
|
|
|
|
pr_warn("%sunable to create dummy SYNCH_MSG\n",
|
|
|
|
link_co_err);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
hdr = buf_msg(tnlskb);
|
|
|
|
syncpt = l->snd_nxt + skb_queue_len(&l->backlogq) - 1;
|
|
|
|
msg_set_syncpt(hdr, syncpt);
|
|
|
|
msg_set_bearer_id(hdr, l->peer_bearer_id);
|
|
|
|
__skb_queue_tail(&tnlq, tnlskb);
|
|
|
|
tipc_link_xmit(tnl, &tnlq, xmitq);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2019-11-06 19:12:17 +08:00
|
|
|
__skb_queue_head_init(&tmpxq);
|
|
|
|
__skb_queue_head_init(&frags);
|
|
|
|
/* At least one packet required for safe algorithm => add dummy */
|
|
|
|
skb = tipc_msg_create(TIPC_LOW_IMPORTANCE, TIPC_DIRECT_MSG,
|
|
|
|
BASIC_H_SIZE, 0, l->addr, tipc_own_addr(l->net),
|
|
|
|
0, 0, TIPC_ERR_NO_PORT);
|
|
|
|
if (!skb) {
|
|
|
|
pr_warn("%sunable to create tunnel packet\n", link_co_err);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
__skb_queue_tail(&tnlq, skb);
|
|
|
|
tipc_link_xmit(l, &tnlq, &tmpxq);
|
|
|
|
__skb_queue_purge(&tmpxq);
|
|
|
|
|
2015-07-31 06:24:19 +08:00
|
|
|
/* Initialize reusable tunnel packet header */
|
2016-03-04 03:23:21 +08:00
|
|
|
tipc_msg_init(tipc_own_addr(l->net), &tnlhdr, TUNNEL_PROTOCOL,
|
2015-07-31 06:24:19 +08:00
|
|
|
mtyp, INT_H_SIZE, l->addr);
|
2019-04-04 12:09:53 +08:00
|
|
|
if (mtyp == SYNCH_MSG)
|
|
|
|
pktcnt = l->snd_nxt - buf_seqno(skb_peek(&l->transmq));
|
|
|
|
else
|
|
|
|
pktcnt = skb_queue_len(&l->transmq);
|
|
|
|
pktcnt += skb_queue_len(&l->backlogq);
|
2015-07-31 06:24:19 +08:00
|
|
|
msg_set_msgcnt(&tnlhdr, pktcnt);
|
|
|
|
msg_set_bearer_id(&tnlhdr, l->peer_bearer_id);
|
|
|
|
tnl:
|
|
|
|
/* Wrap each packet into a tunnel packet */
|
2015-03-14 04:08:10 +08:00
|
|
|
skb_queue_walk(queue, skb) {
|
2015-07-31 06:24:19 +08:00
|
|
|
hdr = buf_msg(skb);
|
|
|
|
if (queue == &l->backlogq)
|
|
|
|
msg_set_seqno(hdr, seqno++);
|
|
|
|
pktlen = msg_size(hdr);
|
tipc: fix changeover issues due to large packet
In conjunction with changing the interfaces' MTU (e.g. especially in
the case of a bonding) where the TIPC links are brought up and down
in a short time, a couple of issues were detected with the current link
changeover mechanism:
1) When one link is up but immediately forced down again, the failover
procedure will be carried out in order to failover all the messages in
the link's transmq queue onto the other working link. The link and node
state is also set to FAILINGOVER as part of the process. The message
will be transmited in form of a FAILOVER_MSG, so its size is plus of 40
bytes (= the message header size). There is no problem if the original
message size is not larger than the link's MTU - 40, and indeed this is
the max size of a normal payload messages. However, in the situation
above, because the link has just been up, the messages in the link's
transmq are almost SYNCH_MSGs which had been generated by the link
synching procedure, then their size might reach the max value already!
When the FAILOVER_MSG is built on the top of such a SYNCH_MSG, its size
will exceed the link's MTU. As a result, the messages are dropped
silently and the failover procedure will never end up, the link will
not be able to exit the FAILINGOVER state, so cannot be re-established.
2) The same scenario above can happen more easily in case the MTU of
the links is set differently or when changing. In that case, as long as
a large message in the failure link's transmq queue was built and
fragmented with its link's MTU > the other link's one, the issue will
happen (there is no need of a link synching in advance).
3) The link synching procedure also faces with the same issue but since
the link synching is only started upon receipt of a SYNCH_MSG, dropping
the message will not result in a state deadlock, but it is not expected
as design.
The 1) & 3) issues are resolved by the last commit that only a dummy
SYNCH_MSG (i.e. without data) is generated at the link synching, so the
size of a FAILOVER_MSG if any then will never exceed the link's MTU.
For the 2) issue, the only solution is trying to fragment the messages
in the failure link's transmq queue according to the working link's MTU
so they can be failovered then. A new function is made to accomplish
this, it will still be a TUNNEL PROTOCOL/FAILOVER MSG but if the
original message size is too large, it will be fragmented & reassembled
at the receiving side.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-07-24 09:56:12 +08:00
|
|
|
|
|
|
|
/* Tunnel link MTU is not large enough? This could be
|
|
|
|
* due to:
|
|
|
|
* 1) Link MTU has just changed or set differently;
|
|
|
|
* 2) Or FAILOVER on the top of a SYNCH message
|
|
|
|
*
|
|
|
|
* The 2nd case should not happen if peer supports
|
|
|
|
* TIPC_TUNNEL_ENHANCED
|
|
|
|
*/
|
|
|
|
if (pktlen > tnl->mtu - INT_H_SIZE) {
|
|
|
|
if (mtyp == FAILOVER_MSG &&
|
|
|
|
(tnl->peer_caps & TIPC_TUNNEL_ENHANCED)) {
|
|
|
|
rc = tipc_msg_fragment(skb, &tnlhdr, tnl->mtu,
|
|
|
|
&frags);
|
|
|
|
if (rc) {
|
|
|
|
pr_warn("%sunable to frag msg: rc %d\n",
|
|
|
|
link_co_err, rc);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
pktcnt += skb_queue_len(&frags) - 1;
|
|
|
|
pktcnt_need_update = true;
|
|
|
|
skb_queue_splice_tail_init(&frags, &tnlq);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
/* Unluckily, peer doesn't have TIPC_TUNNEL_ENHANCED
|
|
|
|
* => Just warn it and return!
|
|
|
|
*/
|
|
|
|
pr_warn_ratelimited("%stoo large msg <%d, %d>: %d!\n",
|
|
|
|
link_co_err, msg_user(hdr),
|
|
|
|
msg_type(hdr), msg_size(hdr));
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2015-07-31 06:24:19 +08:00
|
|
|
msg_set_size(&tnlhdr, pktlen + INT_H_SIZE);
|
2017-01-13 22:46:25 +08:00
|
|
|
tnlskb = tipc_buf_acquire(pktlen + INT_H_SIZE, GFP_ATOMIC);
|
2015-07-31 06:24:19 +08:00
|
|
|
if (!tnlskb) {
|
|
|
|
pr_warn("%sunable to send packet\n", link_co_err);
|
2006-01-03 02:04:38 +08:00
|
|
|
return;
|
|
|
|
}
|
2015-07-31 06:24:19 +08:00
|
|
|
skb_copy_to_linear_data(tnlskb, &tnlhdr, INT_H_SIZE);
|
|
|
|
skb_copy_to_linear_data_offset(tnlskb, INT_H_SIZE, hdr, pktlen);
|
|
|
|
__skb_queue_tail(&tnlq, tnlskb);
|
2006-01-03 02:04:38 +08:00
|
|
|
}
|
2015-07-31 06:24:19 +08:00
|
|
|
if (queue != &l->backlogq) {
|
|
|
|
queue = &l->backlogq;
|
|
|
|
goto tnl;
|
tipc: change reception of tunnelled failover packets
When a link is reset, and there is a redundant link available, all
sender sockets will steer their subsequent traffic through the
remaining link. In order to guarantee preserved packet order and
cardinality during the transition, we tunnel the failing link's send
queue through the remaining link before we allow any sockets to use it.
In this commit, we change the algorithm for receiving failover
("ORIGINAL_MSG") packets in tipc_link_tunnel_rcv(), at the same time
delegating it to a new subfuncton, tipc_link_failover_rcv(). Instead
of directly returning an extracted inner packet to the packet reception
loop in tipc_rcv(), we first check if it is a message fragment, in which
case we append it to the reset link's fragment chain. If the fragment
chain is complete, we return the whole chain instead of the individual
buffer, eliminating any need for the tipc_rcv() loop to do reassembly of
tunneled packets.
This change makes it possible to further simplify tipc_link_tunnel_rcv(),
as well as the calling tipc_rcv() loop. We will do that in later
commits. It also makes it possible to identify a single spot in the code
where we can tell that a failover procedure is finished, something that
is useful when we are deleting links after a failover. This will also
be done in a later commit.
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-02-14 06:29:11 +08:00
|
|
|
}
|
tipc: change reception of tunnelled duplicate packets
When a second link to a destination comes up, some sender sockets will
steer their subsequent traffic through the new link. In order to
guarantee preserved packet order and cardinality for those sockets, we
tunnel a duplicate of the old link's send queue through the new link
before we open it for regular traffic. The last arriving packet copy,
on whichever link, will be dropped at the receiving end based on the
original sequence number, to ensure that only one copy is delivered to
the end receiver.
In this commit, we change the algorithm for receiving DUPLICATE_MSG
packets, at the same time delegating it to a new subfunction,
tipc_link_dup_rcv(). Instead of returning an extracted inner packet to
the packet reception loop in tipc_rcv(), we just add it to the receiving
(new) link's deferred packet queue. The packet will then be processed by
that link when it receives its first non-tunneled packet, i.e., at
latest when the changeover procedure is finished.
Because tipc_link_tunnel_rcv()/tipc_link_dup_rcv() now is consuming all
packets of type DUPLICATE_MSG, the calling tipc_rcv() function can omit
testing for this. This in turn means that the current conditional jump
to the label 'protocol_check' becomes redundant, and we can remove that
label.
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-02-14 06:29:10 +08:00
|
|
|
|
tipc: fix changeover issues due to large packet
In conjunction with changing the interfaces' MTU (e.g. especially in
the case of a bonding) where the TIPC links are brought up and down
in a short time, a couple of issues were detected with the current link
changeover mechanism:
1) When one link is up but immediately forced down again, the failover
procedure will be carried out in order to failover all the messages in
the link's transmq queue onto the other working link. The link and node
state is also set to FAILINGOVER as part of the process. The message
will be transmited in form of a FAILOVER_MSG, so its size is plus of 40
bytes (= the message header size). There is no problem if the original
message size is not larger than the link's MTU - 40, and indeed this is
the max size of a normal payload messages. However, in the situation
above, because the link has just been up, the messages in the link's
transmq are almost SYNCH_MSGs which had been generated by the link
synching procedure, then their size might reach the max value already!
When the FAILOVER_MSG is built on the top of such a SYNCH_MSG, its size
will exceed the link's MTU. As a result, the messages are dropped
silently and the failover procedure will never end up, the link will
not be able to exit the FAILINGOVER state, so cannot be re-established.
2) The same scenario above can happen more easily in case the MTU of
the links is set differently or when changing. In that case, as long as
a large message in the failure link's transmq queue was built and
fragmented with its link's MTU > the other link's one, the issue will
happen (there is no need of a link synching in advance).
3) The link synching procedure also faces with the same issue but since
the link synching is only started upon receipt of a SYNCH_MSG, dropping
the message will not result in a state deadlock, but it is not expected
as design.
The 1) & 3) issues are resolved by the last commit that only a dummy
SYNCH_MSG (i.e. without data) is generated at the link synching, so the
size of a FAILOVER_MSG if any then will never exceed the link's MTU.
For the 2) issue, the only solution is trying to fragment the messages
in the failure link's transmq queue according to the working link's MTU
so they can be failovered then. A new function is made to accomplish
this, it will still be a TUNNEL PROTOCOL/FAILOVER MSG but if the
original message size is too large, it will be fragmented & reassembled
at the receiving side.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-07-24 09:56:12 +08:00
|
|
|
if (pktcnt_need_update)
|
|
|
|
skb_queue_walk(&tnlq, skb) {
|
|
|
|
hdr = buf_msg(skb);
|
|
|
|
msg_set_msgcnt(hdr, pktcnt);
|
|
|
|
}
|
|
|
|
|
2015-07-31 06:24:19 +08:00
|
|
|
tipc_link_xmit(tnl, &tnlq, xmitq);
|
2006-01-03 02:04:38 +08:00
|
|
|
|
2015-07-31 06:24:19 +08:00
|
|
|
if (mtyp == FAILOVER_MSG) {
|
|
|
|
tnl->drop_point = l->rcv_nxt;
|
|
|
|
tnl->failover_reasm_skb = l->reasm_buf;
|
|
|
|
l->reasm_buf = NULL;
|
2019-04-04 12:09:53 +08:00
|
|
|
|
|
|
|
/* Failover the link's deferdq */
|
|
|
|
if (unlikely(!skb_queue_empty(fdefq))) {
|
|
|
|
pr_warn("Link failover deferdq not empty: %d!\n",
|
|
|
|
skb_queue_len(fdefq));
|
|
|
|
__skb_queue_purge(fdefq);
|
|
|
|
}
|
|
|
|
skb_queue_splice_init(&l->deferdq, fdefq);
|
2015-04-02 21:33:00 +08:00
|
|
|
}
|
2006-01-03 02:04:38 +08:00
|
|
|
}
|
|
|
|
|
tipc: fix missing Name entries due to half-failover
TIPC link can temporarily fall into "half-establish" that only one of
the link endpoints is ESTABLISHED and starts to send traffic, PROTOCOL
messages, whereas the other link endpoint is not up (e.g. immediately
when the endpoint receives ACTIVATE_MSG, the network interface goes
down...).
This is a normal situation and will be settled because the link
endpoint will be eventually brought down after the link tolerance time.
However, the situation will become worse when the second link is
established before the first link endpoint goes down,
For example:
1. Both links <1A-2A>, <1B-2B> down
2. Link endpoint 2A up, but 1A still down (e.g. due to network
disturbance, wrong session, etc.)
3. Link <1B-2B> up
4. Link endpoint 2A down (e.g. due to link tolerance timeout)
5. Node B starts failover onto link <1B-2B>
==> Node A does never start link failover.
When the "half-failover" situation happens, two consequences have been
observed:
a) Peer link/node gets stuck in FAILINGOVER state;
b) Traffic or user messages that peer node is trying to failover onto
the second link can be partially or completely dropped by this node.
The consequence a) was actually solved by commit c140eb166d68 ("tipc:
fix failover problem"), but that commit didn't cover the b). It's due
to the fact that the tunnel link endpoint has never been prepared for a
failover, so the 'l->drop_point' (and the other data...) is not set
correctly. When a TUNNEL_MSG from peer node arrives on the link,
depending on the inner message's seqno and the current 'l->drop_point'
value, the message can be dropped (- treated as a duplicate message) or
processed.
At this early stage, the traffic messages from peer are likely to be
NAME_DISTRIBUTORs, this means some name table entries will be missed on
the node forever!
The commit resolves the issue by starting the FAILOVER process on this
node as well. Another benefit from this solution is that we ensure the
link will not be re-established until the failover ends.
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-05-02 18:23:23 +08:00
|
|
|
/**
|
|
|
|
* tipc_link_failover_prepare() - prepare tnl for link failover
|
|
|
|
*
|
|
|
|
* This is a special version of the precursor - tipc_link_tnl_prepare(),
|
|
|
|
* see the tipc_node_link_failover() for details
|
|
|
|
*
|
|
|
|
* @l: failover link
|
|
|
|
* @tnl: tunnel link
|
|
|
|
* @xmitq: queue for messages to be xmited
|
|
|
|
*/
|
|
|
|
void tipc_link_failover_prepare(struct tipc_link *l, struct tipc_link *tnl,
|
|
|
|
struct sk_buff_head *xmitq)
|
|
|
|
{
|
|
|
|
struct sk_buff_head *fdefq = &tnl->failover_deferdq;
|
|
|
|
|
|
|
|
tipc_link_create_dummy_tnl_msg(tnl, xmitq);
|
|
|
|
|
2019-10-24 23:30:43 +08:00
|
|
|
/* This failover link endpoint was never established before,
|
tipc: fix missing Name entries due to half-failover
TIPC link can temporarily fall into "half-establish" that only one of
the link endpoints is ESTABLISHED and starts to send traffic, PROTOCOL
messages, whereas the other link endpoint is not up (e.g. immediately
when the endpoint receives ACTIVATE_MSG, the network interface goes
down...).
This is a normal situation and will be settled because the link
endpoint will be eventually brought down after the link tolerance time.
However, the situation will become worse when the second link is
established before the first link endpoint goes down,
For example:
1. Both links <1A-2A>, <1B-2B> down
2. Link endpoint 2A up, but 1A still down (e.g. due to network
disturbance, wrong session, etc.)
3. Link <1B-2B> up
4. Link endpoint 2A down (e.g. due to link tolerance timeout)
5. Node B starts failover onto link <1B-2B>
==> Node A does never start link failover.
When the "half-failover" situation happens, two consequences have been
observed:
a) Peer link/node gets stuck in FAILINGOVER state;
b) Traffic or user messages that peer node is trying to failover onto
the second link can be partially or completely dropped by this node.
The consequence a) was actually solved by commit c140eb166d68 ("tipc:
fix failover problem"), but that commit didn't cover the b). It's due
to the fact that the tunnel link endpoint has never been prepared for a
failover, so the 'l->drop_point' (and the other data...) is not set
correctly. When a TUNNEL_MSG from peer node arrives on the link,
depending on the inner message's seqno and the current 'l->drop_point'
value, the message can be dropped (- treated as a duplicate message) or
processed.
At this early stage, the traffic messages from peer are likely to be
NAME_DISTRIBUTORs, this means some name table entries will be missed on
the node forever!
The commit resolves the issue by starting the FAILOVER process on this
node as well. Another benefit from this solution is that we ensure the
link will not be re-established until the failover ends.
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-05-02 18:23:23 +08:00
|
|
|
* so it has not received anything from peer.
|
|
|
|
* Otherwise, it must be a normal failover situation or the
|
|
|
|
* node has entered SELF_DOWN_PEER_LEAVING and both peer nodes
|
|
|
|
* would have to start over from scratch instead.
|
|
|
|
*/
|
|
|
|
tnl->drop_point = 1;
|
|
|
|
tnl->failover_reasm_skb = NULL;
|
|
|
|
|
|
|
|
/* Initiate the link's failover deferdq */
|
|
|
|
if (unlikely(!skb_queue_empty(fdefq))) {
|
|
|
|
pr_warn("Link failover deferdq not empty: %d!\n",
|
|
|
|
skb_queue_len(fdefq));
|
|
|
|
__skb_queue_purge(fdefq);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-07-10 07:07:36 +08:00
|
|
|
/* tipc_link_validate_msg(): validate message against current link state
|
|
|
|
* Returns true if message should be accepted, otherwise false
|
|
|
|
*/
|
|
|
|
bool tipc_link_validate_msg(struct tipc_link *l, struct tipc_msg *hdr)
|
|
|
|
{
|
|
|
|
u16 curr_session = l->peer_session;
|
|
|
|
u16 session = msg_session(hdr);
|
|
|
|
int mtyp = msg_type(hdr);
|
|
|
|
|
|
|
|
if (msg_user(hdr) != LINK_PROTOCOL)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
switch (mtyp) {
|
|
|
|
case RESET_MSG:
|
|
|
|
if (!l->in_session)
|
|
|
|
return true;
|
|
|
|
/* Accept only RESET with new session number */
|
|
|
|
return more(session, curr_session);
|
|
|
|
case ACTIVATE_MSG:
|
|
|
|
if (!l->in_session)
|
|
|
|
return true;
|
|
|
|
/* Accept only ACTIVATE with new or current session number */
|
|
|
|
return !less(session, curr_session);
|
|
|
|
case STATE_MSG:
|
|
|
|
/* Accept only STATE with current session number */
|
|
|
|
if (!l->in_session)
|
|
|
|
return false;
|
|
|
|
if (session != curr_session)
|
|
|
|
return false;
|
2018-09-27 04:28:52 +08:00
|
|
|
/* Extra sanity check */
|
|
|
|
if (!link_is_up(l) && msg_ack(hdr))
|
|
|
|
return false;
|
2018-07-10 07:07:36 +08:00
|
|
|
if (!(l->peer_caps & TIPC_LINK_PROTO_SEQNO))
|
|
|
|
return true;
|
|
|
|
/* Accept only STATE with new sequence number */
|
|
|
|
return !less(msg_seqno(hdr), l->rcv_nxt_state);
|
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
/* tipc_link_proto_rcv(): receive link level protocol message :
|
|
|
|
* Note that network plane id propagates through the network, and may
|
|
|
|
* change at any time. The node with lowest numerical id determines
|
|
|
|
* network plane
|
|
|
|
*/
|
|
|
|
static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
|
|
|
|
struct sk_buff_head *xmitq)
|
|
|
|
{
|
|
|
|
struct tipc_msg *hdr = buf_msg(skb);
|
tipc: improve TIPC throughput by Gap ACK blocks
During unicast link transmission, it's observed very often that because
of one or a few lost/dis-ordered packets, the sending side will fastly
reach the send window limit and must wait for the packets to be arrived
at the receiving side or in the worst case, a retransmission must be
done first. The sending side cannot release a lot of subsequent packets
in its transmq even though all of them might have already been received
by the receiving side.
That is, one or two packets dis-ordered/lost and dozens of packets have
to wait, this obviously reduces the overall throughput!
This commit introduces an algorithm to overcome this by using "Gap ACK
blocks". Basically, a Gap ACK block will consist of <ack, gap> numbers
that describes the link deferdq where packets have been got by the
receiving side but with gaps, for example:
link deferdq: [1 2 3 4 10 11 13 14 15 20]
--> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0>
The Gap ACK blocks will be sent to the sending side along with the
traditional ACK or NACK message. Immediately when receiving the message
the sending side will now not only release from its transmq the packets
ack-ed by the ACK but also by the Gap ACK blocks! So, more packets can
be enqueued and transmitted.
In addition, the sending side can now do "multi-retransmissions"
according to the Gaps reported in the Gap ACK blocks.
The new algorithm as verified helps greatly improve the TIPC throughput
especially under packet loss condition.
So far, a maximum of 32 blocks is quite enough without any "Too few Gap
ACK blocks" reports with a 5.0% packet loss rate, however this number
can be increased in the furture if needed.
Also, the patch is backward compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-04 12:09:51 +08:00
|
|
|
struct tipc_gap_ack_blks *ga = NULL;
|
2020-05-26 17:38:34 +08:00
|
|
|
bool reply = msg_probe(hdr), retransmitted = false;
|
tipc: do not update mtu if msg_max is too small in mtu negotiation
When doing link mtu negotiation, a malicious peer may send Activate msg
with a very small mtu, e.g. 4 in Shuang's testing, without checking for
the minimum mtu, l->mtu will be set to 4 in tipc_link_proto_rcv(), then
n->links[bearer_id].mtu is set to 4294967228, which is a overflow of
'4 - INT_H_SIZE - EMSG_OVERHEAD' in tipc_link_mss().
With tipc_link.mtu = 4, tipc_link_xmit() kept printing the warning:
tipc: Too large msg, purging xmit list 1 5 0 40 4!
tipc: Too large msg, purging xmit list 1 15 0 60 4!
And with tipc_link_entry.mtu 4294967228, a huge skb was allocated in
named_distribute(), and when purging it in tipc_link_xmit(), a crash
was even caused:
general protection fault, probably for non-canonical address 0x2100001011000dd: 0000 [#1] PREEMPT SMP PTI
CPU: 0 PID: 0 Comm: swapper/0 Kdump: loaded Not tainted 6.3.0.neta #19
RIP: 0010:kfree_skb_list_reason+0x7e/0x1f0
Call Trace:
<IRQ>
skb_release_data+0xf9/0x1d0
kfree_skb_reason+0x40/0x100
tipc_link_xmit+0x57a/0x740 [tipc]
tipc_node_xmit+0x16c/0x5c0 [tipc]
tipc_named_node_up+0x27f/0x2c0 [tipc]
tipc_node_write_unlock+0x149/0x170 [tipc]
tipc_rcv+0x608/0x740 [tipc]
tipc_udp_recv+0xdc/0x1f0 [tipc]
udp_queue_rcv_one_skb+0x33e/0x620
udp_unicast_rcv_skb.isra.72+0x75/0x90
__udp4_lib_rcv+0x56d/0xc20
ip_protocol_deliver_rcu+0x100/0x2d0
This patch fixes it by checking the new mtu against tipc_bearer_min_mtu(),
and not updating mtu if it is too small.
Fixes: ed193ece2649 ("tipc: simplify link mtu negotiation")
Reported-by: Shuang Li <shuali@redhat.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Jon Maloy <jmaloy@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2023-05-15 03:52:28 +08:00
|
|
|
u32 dlen = msg_data_sz(hdr), glen = 0, msg_max;
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
u16 peers_snd_nxt = msg_next_sent(hdr);
|
|
|
|
u16 peers_tol = msg_link_tolerance(hdr);
|
|
|
|
u16 peers_prio = msg_linkprio(hdr);
|
2020-05-26 17:38:34 +08:00
|
|
|
u16 gap = msg_seq_gap(hdr);
|
|
|
|
u16 ack = msg_ack(hdr);
|
tipc: fix stale link problem during synchronization
Recent changes to the link synchronization means that we can now just
drop packets arriving on the synchronizing link before the synch point
is reached. This has lead to significant simplifications to the
implementation, but also turns out to have a flip side that we need
to consider.
Under unlucky circumstances, the two endpoints may end up
repeatedly dropping each other's packets, while immediately
asking for retransmission of the same packets, just to drop
them once more. This pattern will eventually be broken when
the synch point is reached on the other link, but before that,
the endpoints may have arrived at the retransmission limit
(stale counter) that indicates that the link should be broken.
We see this happen at rare occasions.
The fix for this is to not ask for retransmissions when a link is in
state LINK_SYNCHING. The fact that the link has reached this state
means that it has already received the first SYNCH packet, and that it
knows the synch point. Hence, it doesn't need any more packets until the
other link has reached the synch point, whereafter it can go ahead and
ask for the missing packets.
However, because of the reduced traffic on the synching link that
follows this change, it may now take longer to discover that the
synch point has been reached. We compensate for this by letting all
packets, on any of the links, trig a check for synchronization
termination. This is possible because the packets themselves don't
contain any information that is needed for discovering this condition.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-08-20 14:12:56 +08:00
|
|
|
u16 rcv_nxt = l->rcv_nxt;
|
2020-05-26 17:38:34 +08:00
|
|
|
u16 rcvgap = 0;
|
tipc: delay ESTABLISH state event when link is established
Link establishing, just like link teardown, is a non-atomic action, in
the sense that discovering that conditions are right to establish a link,
and the actual adding of the link to one of the node's send slots is done
in two different lock contexts. The link FSM is designed to help bridging
the gap between the two contexts in a safe manner.
We have now discovered a weakness in the implementaton of this FSM.
Because we directly let the link go from state LINK_ESTABLISHING to
state LINK_ESTABLISHED already in the first lock context, we are unable
to distinguish between a fully established link, i.e., a link that has
been added to its slot, and a link that has not yet reached the second
lock context. It may hence happen that a manual intervention, e.g., when
disabling an interface, causes the function tipc_node_link_down() to try
removing the link from the node slots, decrementing its active link
counter etc, although the link was never added there in the first place.
We solve this by delaying the actual state change until we reach the
second lock context, inside the function tipc_node_link_up(). This
makes it possible for potentail callers of __tipc_node_link_down() to
know if they should proceed or not, and the problem is solved.
Unforunately, the situation described above also has a second problem.
Since there by necessity is a tipc_node_link_up() call pending once
the node lock has been released, we must defuse that call by setting
the link back from LINK_ESTABLISHING to LINK_RESET state. This forces
us to make a slight modification to the link FSM, which will now look
as follows.
+------------------------------------+
|RESET_EVT |
| |
| +--------------+
| +-----------------| SYNCHING |-----------------+
| |FAILURE_EVT +--------------+ PEER_RESET_EVT|
| | A | |
| | | | |
| | | | |
| | |SYNCH_ |SYNCH_ |
| | |BEGIN_EVT |END_EVT |
| | | | |
| V | V V
| +-------------+ +--------------+ +------------+
| | RESETTING |<---------| ESTABLISHED |--------->| PEER_RESET |
| +-------------+ FAILURE_ +--------------+ PEER_ +------------+
| | EVT | A RESET_EVT |
| | | | |
| | +----------------+ | |
| RESET_EVT| |RESET_EVT | |
| | | | |
| | | |ESTABLISH_EVT |
| | | +-------------+ | |
| | | | RESET_EVT | | |
| | | | | | |
| V V V | | |
| +-------------+ +--------------+ RESET_EVT|
+--->| RESET |--------->| ESTABLISHING |<----------------+
+-------------+ PEER_ +--------------+
| A RESET_EVT |
| | |
| | |
|FAILOVER_ |FAILOVER_ |FAILOVER_
|BEGIN_EVT |END_EVT |BEGIN_EVT
| | |
V | |
+-------------+ |
| FAILINGOVER |<----------------+
+-------------+
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-10-16 02:52:44 +08:00
|
|
|
int mtyp = msg_type(hdr);
|
2020-05-26 17:38:34 +08:00
|
|
|
int rc = 0, released;
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
char *if_name;
|
2020-05-26 17:38:34 +08:00
|
|
|
void *data;
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
|
2018-12-19 10:17:57 +08:00
|
|
|
trace_tipc_proto_rcv(skb, false, l->name);
|
2022-02-06 03:11:18 +08:00
|
|
|
|
|
|
|
if (dlen > U16_MAX)
|
|
|
|
goto exit;
|
|
|
|
|
2015-10-22 20:51:41 +08:00
|
|
|
if (tipc_link_is_blocked(l) || !xmitq)
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
goto exit;
|
|
|
|
|
2016-03-04 03:23:21 +08:00
|
|
|
if (tipc_own_addr(l->net) > msg_prevnode(hdr))
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
l->net_plane = msg_net_plane(hdr);
|
|
|
|
|
2022-12-03 17:46:35 +08:00
|
|
|
if (skb_linearize(skb))
|
|
|
|
goto exit;
|
|
|
|
|
tipc: add neighbor monitoring framework
TIPC based clusters are by default set up with full-mesh link
connectivity between all nodes. Those links are expected to provide
a short failure detection time, by default set to 1500 ms. Because
of this, the background load for neighbor monitoring in an N-node
cluster increases with a factor N on each node, while the overall
monitoring traffic through the network infrastructure increases at
a ~(N * (N - 1)) rate. Experience has shown that such clusters don't
scale well beyond ~100 nodes unless we significantly increase failure
discovery tolerance.
This commit introduces a framework and an algorithm that drastically
reduces this background load, while basically maintaining the original
failure detection times across the whole cluster. Using this algorithm,
background load will now grow at a rate of ~(2 * sqrt(N)) per node, and
at ~(2 * N * sqrt(N)) in traffic overhead. As an example, each node will
now have to actively monitor 38 neighbors in a 400-node cluster, instead
of as before 399.
This "Overlapping Ring Supervision Algorithm" is completely distributed
and employs no centralized or coordinated state. It goes as follows:
- Each node makes up a linearly ascending, circular list of all its N
known neighbors, based on their TIPC node identity. This algorithm
must be the same on all nodes.
- The node then selects the next M = sqrt(N) - 1 nodes downstream from
itself in the list, and chooses to actively monitor those. This is
called its "local monitoring domain".
- It creates a domain record describing the monitoring domain, and
piggy-backs this in the data area of all neighbor monitoring messages
(LINK_PROTOCOL/STATE) leaving that node. This means that all nodes in
the cluster eventually (default within 400 ms) will learn about
its monitoring domain.
- Whenever a node discovers a change in its local domain, e.g., a node
has been added or has gone down, it creates and sends out a new
version of its node record to inform all neighbors about the change.
- A node receiving a domain record from anybody outside its local domain
matches this against its own list (which may not look the same), and
chooses to not actively monitor those members of the received domain
record that are also present in its own list. Instead, it relies on
indications from the direct monitoring nodes if an indirectly
monitored node has gone up or down. If a node is indicated lost, the
receiving node temporarily activates its own direct monitoring towards
that node in order to confirm, or not, that it is actually gone.
- Since each node is actively monitoring sqrt(N) downstream neighbors,
each node is also actively monitored by the same number of upstream
neighbors. This means that all non-direct monitoring nodes normally
will receive sqrt(N) indications that a node is gone.
- A major drawback with ring monitoring is how it handles failures that
cause massive network partitionings. If both a lost node and all its
direct monitoring neighbors are inside the lost partition, the nodes in
the remaining partition will never receive indications about the loss.
To overcome this, each node also chooses to actively monitor some
nodes outside its local domain. Those nodes are called remote domain
"heads", and are selected in such a way that no node in the cluster
will be more than two direct monitoring hops away. Because of this,
each node, apart from monitoring the member of its local domain, will
also typically monitor sqrt(N) remote head nodes.
- As an optimization, local list status, domain status and domain
records are marked with a generation number. This saves senders from
unnecessarily conveying unaltered domain records, and receivers from
performing unneeded re-adaptations of their node monitoring list, such
as re-assigning domain heads.
- As a measure of caution we have added the possibility to disable the
new algorithm through configuration. We do this by keeping a threshold
value for the cluster size; a cluster that grows beyond this value
will switch from full-mesh to ring monitoring, and vice versa when
it shrinks below the value. This means that if the threshold is set to
a value larger than any anticipated cluster size (default size is 32)
the new algorithm is effectively disabled. A patch set for altering the
threshold value and for listing the table contents will follow shortly.
- This change is fully backwards compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-14 08:46:22 +08:00
|
|
|
hdr = buf_msg(skb);
|
|
|
|
data = msg_data(hdr);
|
|
|
|
|
2018-12-19 10:17:57 +08:00
|
|
|
if (!tipc_link_validate_msg(l, hdr)) {
|
|
|
|
trace_tipc_skb_dump(skb, false, "PROTO invalid (1)!");
|
|
|
|
trace_tipc_link_dump(l, TIPC_DUMP_NONE, "PROTO invalid (1)!");
|
2018-07-10 07:07:36 +08:00
|
|
|
goto exit;
|
2018-12-19 10:17:57 +08:00
|
|
|
}
|
2018-07-10 07:07:36 +08:00
|
|
|
|
tipc: delay ESTABLISH state event when link is established
Link establishing, just like link teardown, is a non-atomic action, in
the sense that discovering that conditions are right to establish a link,
and the actual adding of the link to one of the node's send slots is done
in two different lock contexts. The link FSM is designed to help bridging
the gap between the two contexts in a safe manner.
We have now discovered a weakness in the implementaton of this FSM.
Because we directly let the link go from state LINK_ESTABLISHING to
state LINK_ESTABLISHED already in the first lock context, we are unable
to distinguish between a fully established link, i.e., a link that has
been added to its slot, and a link that has not yet reached the second
lock context. It may hence happen that a manual intervention, e.g., when
disabling an interface, causes the function tipc_node_link_down() to try
removing the link from the node slots, decrementing its active link
counter etc, although the link was never added there in the first place.
We solve this by delaying the actual state change until we reach the
second lock context, inside the function tipc_node_link_up(). This
makes it possible for potentail callers of __tipc_node_link_down() to
know if they should proceed or not, and the problem is solved.
Unforunately, the situation described above also has a second problem.
Since there by necessity is a tipc_node_link_up() call pending once
the node lock has been released, we must defuse that call by setting
the link back from LINK_ESTABLISHING to LINK_RESET state. This forces
us to make a slight modification to the link FSM, which will now look
as follows.
+------------------------------------+
|RESET_EVT |
| |
| +--------------+
| +-----------------| SYNCHING |-----------------+
| |FAILURE_EVT +--------------+ PEER_RESET_EVT|
| | A | |
| | | | |
| | | | |
| | |SYNCH_ |SYNCH_ |
| | |BEGIN_EVT |END_EVT |
| | | | |
| V | V V
| +-------------+ +--------------+ +------------+
| | RESETTING |<---------| ESTABLISHED |--------->| PEER_RESET |
| +-------------+ FAILURE_ +--------------+ PEER_ +------------+
| | EVT | A RESET_EVT |
| | | | |
| | +----------------+ | |
| RESET_EVT| |RESET_EVT | |
| | | | |
| | | |ESTABLISH_EVT |
| | | +-------------+ | |
| | | | RESET_EVT | | |
| | | | | | |
| V V V | | |
| +-------------+ +--------------+ RESET_EVT|
+--->| RESET |--------->| ESTABLISHING |<----------------+
+-------------+ PEER_ +--------------+
| A RESET_EVT |
| | |
| | |
|FAILOVER_ |FAILOVER_ |FAILOVER_
|BEGIN_EVT |END_EVT |BEGIN_EVT
| | |
V | |
+-------------+ |
| FAILINGOVER |<----------------+
+-------------+
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-10-16 02:52:44 +08:00
|
|
|
switch (mtyp) {
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
case RESET_MSG:
|
|
|
|
case ACTIVATE_MSG:
|
tipc: do not update mtu if msg_max is too small in mtu negotiation
When doing link mtu negotiation, a malicious peer may send Activate msg
with a very small mtu, e.g. 4 in Shuang's testing, without checking for
the minimum mtu, l->mtu will be set to 4 in tipc_link_proto_rcv(), then
n->links[bearer_id].mtu is set to 4294967228, which is a overflow of
'4 - INT_H_SIZE - EMSG_OVERHEAD' in tipc_link_mss().
With tipc_link.mtu = 4, tipc_link_xmit() kept printing the warning:
tipc: Too large msg, purging xmit list 1 5 0 40 4!
tipc: Too large msg, purging xmit list 1 15 0 60 4!
And with tipc_link_entry.mtu 4294967228, a huge skb was allocated in
named_distribute(), and when purging it in tipc_link_xmit(), a crash
was even caused:
general protection fault, probably for non-canonical address 0x2100001011000dd: 0000 [#1] PREEMPT SMP PTI
CPU: 0 PID: 0 Comm: swapper/0 Kdump: loaded Not tainted 6.3.0.neta #19
RIP: 0010:kfree_skb_list_reason+0x7e/0x1f0
Call Trace:
<IRQ>
skb_release_data+0xf9/0x1d0
kfree_skb_reason+0x40/0x100
tipc_link_xmit+0x57a/0x740 [tipc]
tipc_node_xmit+0x16c/0x5c0 [tipc]
tipc_named_node_up+0x27f/0x2c0 [tipc]
tipc_node_write_unlock+0x149/0x170 [tipc]
tipc_rcv+0x608/0x740 [tipc]
tipc_udp_recv+0xdc/0x1f0 [tipc]
udp_queue_rcv_one_skb+0x33e/0x620
udp_unicast_rcv_skb.isra.72+0x75/0x90
__udp4_lib_rcv+0x56d/0xc20
ip_protocol_deliver_rcu+0x100/0x2d0
This patch fixes it by checking the new mtu against tipc_bearer_min_mtu(),
and not updating mtu if it is too small.
Fixes: ed193ece2649 ("tipc: simplify link mtu negotiation")
Reported-by: Shuang Li <shuali@redhat.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Jon Maloy <jmaloy@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2023-05-15 03:52:28 +08:00
|
|
|
msg_max = msg_max_pkt(hdr);
|
|
|
|
if (msg_max < tipc_bearer_min_mtu(l->net, l->bearer_id))
|
|
|
|
break;
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
/* Complete own link name with peer's interface name */
|
|
|
|
if_name = strrchr(l->name, ':') + 1;
|
|
|
|
if (sizeof(l->name) - (if_name - l->name) <= TIPC_MAX_IF_NAME)
|
|
|
|
break;
|
|
|
|
if (msg_data_sz(hdr) < TIPC_MAX_IF_NAME)
|
|
|
|
break;
|
tipc: add neighbor monitoring framework
TIPC based clusters are by default set up with full-mesh link
connectivity between all nodes. Those links are expected to provide
a short failure detection time, by default set to 1500 ms. Because
of this, the background load for neighbor monitoring in an N-node
cluster increases with a factor N on each node, while the overall
monitoring traffic through the network infrastructure increases at
a ~(N * (N - 1)) rate. Experience has shown that such clusters don't
scale well beyond ~100 nodes unless we significantly increase failure
discovery tolerance.
This commit introduces a framework and an algorithm that drastically
reduces this background load, while basically maintaining the original
failure detection times across the whole cluster. Using this algorithm,
background load will now grow at a rate of ~(2 * sqrt(N)) per node, and
at ~(2 * N * sqrt(N)) in traffic overhead. As an example, each node will
now have to actively monitor 38 neighbors in a 400-node cluster, instead
of as before 399.
This "Overlapping Ring Supervision Algorithm" is completely distributed
and employs no centralized or coordinated state. It goes as follows:
- Each node makes up a linearly ascending, circular list of all its N
known neighbors, based on their TIPC node identity. This algorithm
must be the same on all nodes.
- The node then selects the next M = sqrt(N) - 1 nodes downstream from
itself in the list, and chooses to actively monitor those. This is
called its "local monitoring domain".
- It creates a domain record describing the monitoring domain, and
piggy-backs this in the data area of all neighbor monitoring messages
(LINK_PROTOCOL/STATE) leaving that node. This means that all nodes in
the cluster eventually (default within 400 ms) will learn about
its monitoring domain.
- Whenever a node discovers a change in its local domain, e.g., a node
has been added or has gone down, it creates and sends out a new
version of its node record to inform all neighbors about the change.
- A node receiving a domain record from anybody outside its local domain
matches this against its own list (which may not look the same), and
chooses to not actively monitor those members of the received domain
record that are also present in its own list. Instead, it relies on
indications from the direct monitoring nodes if an indirectly
monitored node has gone up or down. If a node is indicated lost, the
receiving node temporarily activates its own direct monitoring towards
that node in order to confirm, or not, that it is actually gone.
- Since each node is actively monitoring sqrt(N) downstream neighbors,
each node is also actively monitored by the same number of upstream
neighbors. This means that all non-direct monitoring nodes normally
will receive sqrt(N) indications that a node is gone.
- A major drawback with ring monitoring is how it handles failures that
cause massive network partitionings. If both a lost node and all its
direct monitoring neighbors are inside the lost partition, the nodes in
the remaining partition will never receive indications about the loss.
To overcome this, each node also chooses to actively monitor some
nodes outside its local domain. Those nodes are called remote domain
"heads", and are selected in such a way that no node in the cluster
will be more than two direct monitoring hops away. Because of this,
each node, apart from monitoring the member of its local domain, will
also typically monitor sqrt(N) remote head nodes.
- As an optimization, local list status, domain status and domain
records are marked with a generation number. This saves senders from
unnecessarily conveying unaltered domain records, and receivers from
performing unneeded re-adaptations of their node monitoring list, such
as re-assigning domain heads.
- As a measure of caution we have added the possibility to disable the
new algorithm through configuration. We do this by keeping a threshold
value for the cluster size; a cluster that grows beyond this value
will switch from full-mesh to ring monitoring, and vice versa when
it shrinks below the value. This means that if the threshold is set to
a value larger than any anticipated cluster size (default size is 32)
the new algorithm is effectively disabled. A patch set for altering the
threshold value and for listing the table contents will follow shortly.
- This change is fully backwards compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-14 08:46:22 +08:00
|
|
|
strncpy(if_name, data, TIPC_MAX_IF_NAME);
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
|
|
|
|
/* Update own tolerance if peer indicates a non-zero value */
|
minmax: add in_range() macro
Patch series "New page table range API", v6.
This patchset changes the API used by the MM to set up page table entries.
The four APIs are:
set_ptes(mm, addr, ptep, pte, nr)
update_mmu_cache_range(vma, addr, ptep, nr)
flush_dcache_folio(folio)
flush_icache_pages(vma, page, nr)
flush_dcache_folio() isn't technically new, but no architecture
implemented it, so I've done that for them. The old APIs remain around
but are mostly implemented by calling the new interfaces.
The new APIs are based around setting up N page table entries at once.
The N entries belong to the same PMD, the same folio and the same VMA, so
ptep++ is a legitimate operation, and locking is taken care of for you.
Some architectures can do a better job of it than just a loop, but I have
hesitated to make too deep a change to architectures I don't understand
well.
One thing I have changed in every architecture is that PG_arch_1 is now a
per-folio bit instead of a per-page bit when used for dcache clean/dirty
tracking. This was something that would have to happen eventually, and it
makes sense to do it now rather than iterate over every page involved in a
cache flush and figure out if it needs to happen.
The point of all this is better performance, and Fengwei Yin has measured
improvement on x86. I suspect you'll see improvement on your architecture
too. Try the new will-it-scale test mentioned here:
https://lore.kernel.org/linux-mm/20230206140639.538867-5-fengwei.yin@intel.com/
You'll need to run it on an XFS filesystem and have
CONFIG_TRANSPARENT_HUGEPAGE set.
This patchset is the basis for much of the anonymous large folio work
being done by Ryan, so it's received quite a lot of testing over the last
few months.
This patch (of 38):
Determine if a value lies within a range more efficiently (subtraction +
comparison vs two comparisons and an AND). It also has useful (under some
circumstances) behaviour if the range exceeds the maximum value of the
type. Convert all the conflicting definitions of in_range() within the
kernel; some can use the generic definition while others need their own
definition.
Link: https://lkml.kernel.org/r/20230802151406.3735276-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20230802151406.3735276-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-08-02 23:13:29 +08:00
|
|
|
if (tipc_in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) {
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
l->tolerance = peers_tol;
|
2018-10-10 23:34:01 +08:00
|
|
|
l->bc_rcvlink->tolerance = peers_tol;
|
|
|
|
}
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
/* Update own priority if peer's priority is higher */
|
minmax: add in_range() macro
Patch series "New page table range API", v6.
This patchset changes the API used by the MM to set up page table entries.
The four APIs are:
set_ptes(mm, addr, ptep, pte, nr)
update_mmu_cache_range(vma, addr, ptep, nr)
flush_dcache_folio(folio)
flush_icache_pages(vma, page, nr)
flush_dcache_folio() isn't technically new, but no architecture
implemented it, so I've done that for them. The old APIs remain around
but are mostly implemented by calling the new interfaces.
The new APIs are based around setting up N page table entries at once.
The N entries belong to the same PMD, the same folio and the same VMA, so
ptep++ is a legitimate operation, and locking is taken care of for you.
Some architectures can do a better job of it than just a loop, but I have
hesitated to make too deep a change to architectures I don't understand
well.
One thing I have changed in every architecture is that PG_arch_1 is now a
per-folio bit instead of a per-page bit when used for dcache clean/dirty
tracking. This was something that would have to happen eventually, and it
makes sense to do it now rather than iterate over every page involved in a
cache flush and figure out if it needs to happen.
The point of all this is better performance, and Fengwei Yin has measured
improvement on x86. I suspect you'll see improvement on your architecture
too. Try the new will-it-scale test mentioned here:
https://lore.kernel.org/linux-mm/20230206140639.538867-5-fengwei.yin@intel.com/
You'll need to run it on an XFS filesystem and have
CONFIG_TRANSPARENT_HUGEPAGE set.
This patchset is the basis for much of the anonymous large folio work
being done by Ryan, so it's received quite a lot of testing over the last
few months.
This patch (of 38):
Determine if a value lies within a range more efficiently (subtraction +
comparison vs two comparisons and an AND). It also has useful (under some
circumstances) behaviour if the range exceeds the maximum value of the
type. Convert all the conflicting definitions of in_range() within the
kernel; some can use the generic definition while others need their own
definition.
Link: https://lkml.kernel.org/r/20230802151406.3735276-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20230802151406.3735276-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-08-02 23:13:29 +08:00
|
|
|
if (tipc_in_range(peers_prio, l->priority + 1, TIPC_MAX_LINK_PRI))
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
l->priority = peers_prio;
|
|
|
|
|
tipc: fix link re-establish failure
When a link failure is detected locally, the link is reset, the flag
link->in_session is set to false, and a RESET_MSG with the 'stopping'
bit set is sent to the peer.
The purpose of this bit is to inform the peer that this endpoint just
is going down, and that the peer should handle the reception of this
particular RESET message as a local failure. This forces the peer to
accept another RESET or ACTIVATE message from this endpoint before it
can re-establish the link. This again is necessary to ensure that
link session numbers are properly exchanged before the link comes up
again.
If a failure is detected locally at the same time at the peer endpoint
this will do the same, which is also a correct behavior.
However, when receiving such messages, the endpoints will not
distinguish between 'stopping' RESETs and ordinary ones when it comes
to updating session numbers. Both endpoints will copy the received
session number and set their 'in_session' flags to true at the
reception, while they are still expecting another RESET from the
peer before they can go ahead and re-establish. This is contradictory,
since, after applying the validation check referred to below, the
'in_session' flag will cause rejection of all such messages, and the
link will never come up again.
We now fix this by not only handling received RESET/STOPPING messages
as a local failure, but also by omitting to set a new session number
and the 'in_session' flag in such cases.
Fixes: 7ea817f4e832 ("tipc: check session number before accepting link protocol messages")
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-11 06:30:24 +08:00
|
|
|
/* If peer is going down we want full re-establish cycle */
|
|
|
|
if (msg_peer_stopping(hdr)) {
|
tipc: guarantee peer bearer id exchange after reboot
When a link endpoint is going down locally, e.g., because its interface
is being stopped, it will spontaneously send out a RESET message to
its peer, informing it about this fact. This saves the peer from
detecting the failure via probing, and hence gives both speedier and
less resource consuming failure detection on the peer side.
According to the link FSM, a receiver of a RESET message, ignoring the
reason for it, must now consider the sender ready to come back up, and
starts periodically sending out ACTIVATE messages to the peer in order
to re-establish the link. Also, according to the FSM, the receiver of
an ACTIVATE message can now go directly to state ESTABLISHED and start
sending regular traffic packets. This is a well-proven and robust FSM.
However, in the case of a reboot, there is a small possibilty that link
endpoint on the rebooted node may have been re-created with a new bearer
identity between the moment it sent its (pre-boot) RESET and the moment
it receives the ACTIVATE from the peer. The new bearer identity cannot
be known by the peer according to this scenario, since traffic headers
don't convey such information. This is a problem, because both endpoints
need to know the correct value of the peer's bearer id at any moment in
time in order to be able to produce correct link events for their users.
The only way to guarantee this is to enforce a full setup message
exchange (RESET + ACTIVATE) even after the reboot, since those messages
carry the bearer idientity in their header.
In this commit we do this by introducing and setting a "stopping" bit in
the header of the spontaneously generated RESET messages, informing the
peer that the sender will not be immediately ready to re-establish the
link. A receiver seeing this bit must act as if this were a locally
detected connectivity failure, and hence has to go through a full two-
way setup message exchange before any link can be re-established.
Although never reported, this problem seems to have always been around.
This protocol addition is fully backwards compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-04-16 01:33:03 +08:00
|
|
|
rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
|
tipc: fix link re-establish failure
When a link failure is detected locally, the link is reset, the flag
link->in_session is set to false, and a RESET_MSG with the 'stopping'
bit set is sent to the peer.
The purpose of this bit is to inform the peer that this endpoint just
is going down, and that the peer should handle the reception of this
particular RESET message as a local failure. This forces the peer to
accept another RESET or ACTIVATE message from this endpoint before it
can re-establish the link. This again is necessary to ensure that
link session numbers are properly exchanged before the link comes up
again.
If a failure is detected locally at the same time at the peer endpoint
this will do the same, which is also a correct behavior.
However, when receiving such messages, the endpoints will not
distinguish between 'stopping' RESETs and ordinary ones when it comes
to updating session numbers. Both endpoints will copy the received
session number and set their 'in_session' flags to true at the
reception, while they are still expecting another RESET from the
peer before they can go ahead and re-establish. This is contradictory,
since, after applying the validation check referred to below, the
'in_session' flag will cause rejection of all such messages, and the
link will never come up again.
We now fix this by not only handling received RESET/STOPPING messages
as a local failure, but also by omitting to set a new session number
and the 'in_session' flag in such cases.
Fixes: 7ea817f4e832 ("tipc: check session number before accepting link protocol messages")
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-11 06:30:24 +08:00
|
|
|
break;
|
|
|
|
}
|
tipc: fix link session and re-establish issues
When a link endpoint is re-created (e.g. after a node reboot or
interface reset), the link session number is varied by random, the peer
endpoint will be synced with this new session number before the link is
re-established.
However, there is a shortcoming in this mechanism that can lead to the
link never re-established or faced with a failure then. It happens when
the peer endpoint is ready in ESTABLISHING state, the 'peer_session' as
well as the 'in_session' flag have been set, but suddenly this link
endpoint leaves. When it comes back with a random session number, there
are two situations possible:
1/ If the random session number is larger than (or equal to) the
previous one, the peer endpoint will be updated with this new session
upon receipt of a RESET_MSG from this endpoint, and the link can be re-
established as normal. Otherwise, all the RESET_MSGs from this endpoint
will be rejected by the peer. In turn, when this link endpoint receives
one ACTIVATE_MSG from the peer, it will move to ESTABLISHED and start
to send STATE_MSGs, but again these messages will be dropped by the
peer due to wrong session.
The peer link endpoint can still become ESTABLISHED after receiving a
traffic message from this endpoint (e.g. a BCAST_PROTOCOL or
NAME_DISTRIBUTOR), but since all the STATE_MSGs are invalid, the link
will be forced down sooner or later!
Even in case the random session number is larger than the previous one,
it can be that the ACTIVATE_MSG from the peer arrives first, and this
link endpoint moves quickly to ESTABLISHED without sending out any
RESET_MSG yet. Consequently, the peer link will not be updated with the
new session number, and the same link failure scenario as above will
happen.
2/ Another situation can be that, the peer link endpoint was reset due
to any reasons in the meantime, its link state was set to RESET from
ESTABLISHING but still in session, i.e. the 'in_session' flag is not
reset...
Now, if the random session number from this endpoint is less than the
previous one, all the RESET_MSGs from this endpoint will be rejected by
the peer. In the other direction, when this link endpoint receives a
RESET_MSG from the peer, it moves to ESTABLISHING and starts to send
ACTIVATE_MSGs, but all these messages will be rejected by the peer too.
As a result, the link cannot be re-established but gets stuck with this
link endpoint in state ESTABLISHING and the peer in RESET!
Solution:
===========
This link endpoint should not go directly to ESTABLISHED when getting
ACTIVATE_MSG from the peer which may belong to the old session if the
link was re-created. To ensure the session to be correct before the
link is re-established, the peer endpoint in ESTABLISHING state will
send back the last session number in ACTIVATE_MSG for a verification at
this endpoint. Then, if needed, a new and more appropriate session
number will be regenerated to force a re-synch first.
In addition, when a link in ESTABLISHING state is reset, its state will
move to RESET according to the link FSM, along with resetting the
'in_session' flag (and the other data) as a normal link reset, it will
also be deleted if requested.
The solution is backward compatible.
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-02-11 14:29:43 +08:00
|
|
|
|
|
|
|
/* If this endpoint was re-created while peer was ESTABLISHING
|
|
|
|
* it doesn't know current session number. Force re-synch.
|
|
|
|
*/
|
|
|
|
if (mtyp == ACTIVATE_MSG && msg_dest_session_valid(hdr) &&
|
|
|
|
l->session != msg_dest_session(hdr)) {
|
|
|
|
if (less(l->session, msg_dest_session(hdr)))
|
|
|
|
l->session = msg_dest_session(hdr) + 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
tipc: fix link re-establish failure
When a link failure is detected locally, the link is reset, the flag
link->in_session is set to false, and a RESET_MSG with the 'stopping'
bit set is sent to the peer.
The purpose of this bit is to inform the peer that this endpoint just
is going down, and that the peer should handle the reception of this
particular RESET message as a local failure. This forces the peer to
accept another RESET or ACTIVATE message from this endpoint before it
can re-establish the link. This again is necessary to ensure that
link session numbers are properly exchanged before the link comes up
again.
If a failure is detected locally at the same time at the peer endpoint
this will do the same, which is also a correct behavior.
However, when receiving such messages, the endpoints will not
distinguish between 'stopping' RESETs and ordinary ones when it comes
to updating session numbers. Both endpoints will copy the received
session number and set their 'in_session' flags to true at the
reception, while they are still expecting another RESET from the
peer before they can go ahead and re-establish. This is contradictory,
since, after applying the validation check referred to below, the
'in_session' flag will cause rejection of all such messages, and the
link will never come up again.
We now fix this by not only handling received RESET/STOPPING messages
as a local failure, but also by omitting to set a new session number
and the 'in_session' flag in such cases.
Fixes: 7ea817f4e832 ("tipc: check session number before accepting link protocol messages")
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-11 06:30:24 +08:00
|
|
|
/* ACTIVATE_MSG serves as PEER_RESET if link is already down */
|
|
|
|
if (mtyp == RESET_MSG || !link_is_up(l))
|
tipc: delay ESTABLISH state event when link is established
Link establishing, just like link teardown, is a non-atomic action, in
the sense that discovering that conditions are right to establish a link,
and the actual adding of the link to one of the node's send slots is done
in two different lock contexts. The link FSM is designed to help bridging
the gap between the two contexts in a safe manner.
We have now discovered a weakness in the implementaton of this FSM.
Because we directly let the link go from state LINK_ESTABLISHING to
state LINK_ESTABLISHED already in the first lock context, we are unable
to distinguish between a fully established link, i.e., a link that has
been added to its slot, and a link that has not yet reached the second
lock context. It may hence happen that a manual intervention, e.g., when
disabling an interface, causes the function tipc_node_link_down() to try
removing the link from the node slots, decrementing its active link
counter etc, although the link was never added there in the first place.
We solve this by delaying the actual state change until we reach the
second lock context, inside the function tipc_node_link_up(). This
makes it possible for potentail callers of __tipc_node_link_down() to
know if they should proceed or not, and the problem is solved.
Unforunately, the situation described above also has a second problem.
Since there by necessity is a tipc_node_link_up() call pending once
the node lock has been released, we must defuse that call by setting
the link back from LINK_ESTABLISHING to LINK_RESET state. This forces
us to make a slight modification to the link FSM, which will now look
as follows.
+------------------------------------+
|RESET_EVT |
| |
| +--------------+
| +-----------------| SYNCHING |-----------------+
| |FAILURE_EVT +--------------+ PEER_RESET_EVT|
| | A | |
| | | | |
| | | | |
| | |SYNCH_ |SYNCH_ |
| | |BEGIN_EVT |END_EVT |
| | | | |
| V | V V
| +-------------+ +--------------+ +------------+
| | RESETTING |<---------| ESTABLISHED |--------->| PEER_RESET |
| +-------------+ FAILURE_ +--------------+ PEER_ +------------+
| | EVT | A RESET_EVT |
| | | | |
| | +----------------+ | |
| RESET_EVT| |RESET_EVT | |
| | | | |
| | | |ESTABLISH_EVT |
| | | +-------------+ | |
| | | | RESET_EVT | | |
| | | | | | |
| V V V | | |
| +-------------+ +--------------+ RESET_EVT|
+--->| RESET |--------->| ESTABLISHING |<----------------+
+-------------+ PEER_ +--------------+
| A RESET_EVT |
| | |
| | |
|FAILOVER_ |FAILOVER_ |FAILOVER_
|BEGIN_EVT |END_EVT |BEGIN_EVT
| | |
V | |
+-------------+ |
| FAILINGOVER |<----------------+
+-------------+
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-10-16 02:52:44 +08:00
|
|
|
rc = tipc_link_fsm_evt(l, LINK_PEER_RESET_EVT);
|
|
|
|
|
|
|
|
/* ACTIVATE_MSG takes up link if it was already locally reset */
|
tipc: fix link re-establish failure
When a link failure is detected locally, the link is reset, the flag
link->in_session is set to false, and a RESET_MSG with the 'stopping'
bit set is sent to the peer.
The purpose of this bit is to inform the peer that this endpoint just
is going down, and that the peer should handle the reception of this
particular RESET message as a local failure. This forces the peer to
accept another RESET or ACTIVATE message from this endpoint before it
can re-establish the link. This again is necessary to ensure that
link session numbers are properly exchanged before the link comes up
again.
If a failure is detected locally at the same time at the peer endpoint
this will do the same, which is also a correct behavior.
However, when receiving such messages, the endpoints will not
distinguish between 'stopping' RESETs and ordinary ones when it comes
to updating session numbers. Both endpoints will copy the received
session number and set their 'in_session' flags to true at the
reception, while they are still expecting another RESET from the
peer before they can go ahead and re-establish. This is contradictory,
since, after applying the validation check referred to below, the
'in_session' flag will cause rejection of all such messages, and the
link will never come up again.
We now fix this by not only handling received RESET/STOPPING messages
as a local failure, but also by omitting to set a new session number
and the 'in_session' flag in such cases.
Fixes: 7ea817f4e832 ("tipc: check session number before accepting link protocol messages")
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-11-11 06:30:24 +08:00
|
|
|
if (mtyp == ACTIVATE_MSG && l->state == LINK_ESTABLISHING)
|
tipc: delay ESTABLISH state event when link is established
Link establishing, just like link teardown, is a non-atomic action, in
the sense that discovering that conditions are right to establish a link,
and the actual adding of the link to one of the node's send slots is done
in two different lock contexts. The link FSM is designed to help bridging
the gap between the two contexts in a safe manner.
We have now discovered a weakness in the implementaton of this FSM.
Because we directly let the link go from state LINK_ESTABLISHING to
state LINK_ESTABLISHED already in the first lock context, we are unable
to distinguish between a fully established link, i.e., a link that has
been added to its slot, and a link that has not yet reached the second
lock context. It may hence happen that a manual intervention, e.g., when
disabling an interface, causes the function tipc_node_link_down() to try
removing the link from the node slots, decrementing its active link
counter etc, although the link was never added there in the first place.
We solve this by delaying the actual state change until we reach the
second lock context, inside the function tipc_node_link_up(). This
makes it possible for potentail callers of __tipc_node_link_down() to
know if they should proceed or not, and the problem is solved.
Unforunately, the situation described above also has a second problem.
Since there by necessity is a tipc_node_link_up() call pending once
the node lock has been released, we must defuse that call by setting
the link back from LINK_ESTABLISHING to LINK_RESET state. This forces
us to make a slight modification to the link FSM, which will now look
as follows.
+------------------------------------+
|RESET_EVT |
| |
| +--------------+
| +-----------------| SYNCHING |-----------------+
| |FAILURE_EVT +--------------+ PEER_RESET_EVT|
| | A | |
| | | | |
| | | | |
| | |SYNCH_ |SYNCH_ |
| | |BEGIN_EVT |END_EVT |
| | | | |
| V | V V
| +-------------+ +--------------+ +------------+
| | RESETTING |<---------| ESTABLISHED |--------->| PEER_RESET |
| +-------------+ FAILURE_ +--------------+ PEER_ +------------+
| | EVT | A RESET_EVT |
| | | | |
| | +----------------+ | |
| RESET_EVT| |RESET_EVT | |
| | | | |
| | | |ESTABLISH_EVT |
| | | +-------------+ | |
| | | | RESET_EVT | | |
| | | | | | |
| V V V | | |
| +-------------+ +--------------+ RESET_EVT|
+--->| RESET |--------->| ESTABLISHING |<----------------+
+-------------+ PEER_ +--------------+
| A RESET_EVT |
| | |
| | |
|FAILOVER_ |FAILOVER_ |FAILOVER_
|BEGIN_EVT |END_EVT |BEGIN_EVT
| | |
V | |
+-------------+ |
| FAILINGOVER |<----------------+
+-------------+
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-10-16 02:52:44 +08:00
|
|
|
rc = TIPC_LINK_UP_EVT;
|
|
|
|
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
l->peer_session = msg_session(hdr);
|
2018-07-10 07:07:36 +08:00
|
|
|
l->in_session = true;
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
l->peer_bearer_id = msg_bearer_id(hdr);
|
tipc: do not update mtu if msg_max is too small in mtu negotiation
When doing link mtu negotiation, a malicious peer may send Activate msg
with a very small mtu, e.g. 4 in Shuang's testing, without checking for
the minimum mtu, l->mtu will be set to 4 in tipc_link_proto_rcv(), then
n->links[bearer_id].mtu is set to 4294967228, which is a overflow of
'4 - INT_H_SIZE - EMSG_OVERHEAD' in tipc_link_mss().
With tipc_link.mtu = 4, tipc_link_xmit() kept printing the warning:
tipc: Too large msg, purging xmit list 1 5 0 40 4!
tipc: Too large msg, purging xmit list 1 15 0 60 4!
And with tipc_link_entry.mtu 4294967228, a huge skb was allocated in
named_distribute(), and when purging it in tipc_link_xmit(), a crash
was even caused:
general protection fault, probably for non-canonical address 0x2100001011000dd: 0000 [#1] PREEMPT SMP PTI
CPU: 0 PID: 0 Comm: swapper/0 Kdump: loaded Not tainted 6.3.0.neta #19
RIP: 0010:kfree_skb_list_reason+0x7e/0x1f0
Call Trace:
<IRQ>
skb_release_data+0xf9/0x1d0
kfree_skb_reason+0x40/0x100
tipc_link_xmit+0x57a/0x740 [tipc]
tipc_node_xmit+0x16c/0x5c0 [tipc]
tipc_named_node_up+0x27f/0x2c0 [tipc]
tipc_node_write_unlock+0x149/0x170 [tipc]
tipc_rcv+0x608/0x740 [tipc]
tipc_udp_recv+0xdc/0x1f0 [tipc]
udp_queue_rcv_one_skb+0x33e/0x620
udp_unicast_rcv_skb.isra.72+0x75/0x90
__udp4_lib_rcv+0x56d/0xc20
ip_protocol_deliver_rcu+0x100/0x2d0
This patch fixes it by checking the new mtu against tipc_bearer_min_mtu(),
and not updating mtu if it is too small.
Fixes: ed193ece2649 ("tipc: simplify link mtu negotiation")
Reported-by: Shuang Li <shuali@redhat.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Jon Maloy <jmaloy@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2023-05-15 03:52:28 +08:00
|
|
|
if (l->mtu > msg_max)
|
|
|
|
l->mtu = msg_max;
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
break;
|
2015-07-31 06:24:21 +08:00
|
|
|
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
case STATE_MSG:
|
2022-03-08 10:11:59 +08:00
|
|
|
/* Validate Gap ACK blocks, drop if invalid */
|
|
|
|
glen = tipc_get_gap_ack_blks(&ga, l, hdr, true);
|
|
|
|
if (glen > dlen)
|
|
|
|
break;
|
|
|
|
|
2018-07-10 07:07:35 +08:00
|
|
|
l->rcv_nxt_state = msg_seqno(hdr) + 1;
|
|
|
|
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
/* Update own tolerance if peer indicates a non-zero value */
|
minmax: add in_range() macro
Patch series "New page table range API", v6.
This patchset changes the API used by the MM to set up page table entries.
The four APIs are:
set_ptes(mm, addr, ptep, pte, nr)
update_mmu_cache_range(vma, addr, ptep, nr)
flush_dcache_folio(folio)
flush_icache_pages(vma, page, nr)
flush_dcache_folio() isn't technically new, but no architecture
implemented it, so I've done that for them. The old APIs remain around
but are mostly implemented by calling the new interfaces.
The new APIs are based around setting up N page table entries at once.
The N entries belong to the same PMD, the same folio and the same VMA, so
ptep++ is a legitimate operation, and locking is taken care of for you.
Some architectures can do a better job of it than just a loop, but I have
hesitated to make too deep a change to architectures I don't understand
well.
One thing I have changed in every architecture is that PG_arch_1 is now a
per-folio bit instead of a per-page bit when used for dcache clean/dirty
tracking. This was something that would have to happen eventually, and it
makes sense to do it now rather than iterate over every page involved in a
cache flush and figure out if it needs to happen.
The point of all this is better performance, and Fengwei Yin has measured
improvement on x86. I suspect you'll see improvement on your architecture
too. Try the new will-it-scale test mentioned here:
https://lore.kernel.org/linux-mm/20230206140639.538867-5-fengwei.yin@intel.com/
You'll need to run it on an XFS filesystem and have
CONFIG_TRANSPARENT_HUGEPAGE set.
This patchset is the basis for much of the anonymous large folio work
being done by Ryan, so it's received quite a lot of testing over the last
few months.
This patch (of 38):
Determine if a value lies within a range more efficiently (subtraction +
comparison vs two comparisons and an AND). It also has useful (under some
circumstances) behaviour if the range exceeds the maximum value of the
type. Convert all the conflicting definitions of in_range() within the
kernel; some can use the generic definition while others need their own
definition.
Link: https://lkml.kernel.org/r/20230802151406.3735276-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20230802151406.3735276-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-08-02 23:13:29 +08:00
|
|
|
if (tipc_in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) {
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
l->tolerance = peers_tol;
|
2018-10-10 23:34:01 +08:00
|
|
|
l->bc_rcvlink->tolerance = peers_tol;
|
|
|
|
}
|
2016-11-24 10:05:26 +08:00
|
|
|
/* Update own prio if peer indicates a different value */
|
|
|
|
if ((peers_prio != l->priority) &&
|
minmax: add in_range() macro
Patch series "New page table range API", v6.
This patchset changes the API used by the MM to set up page table entries.
The four APIs are:
set_ptes(mm, addr, ptep, pte, nr)
update_mmu_cache_range(vma, addr, ptep, nr)
flush_dcache_folio(folio)
flush_icache_pages(vma, page, nr)
flush_dcache_folio() isn't technically new, but no architecture
implemented it, so I've done that for them. The old APIs remain around
but are mostly implemented by calling the new interfaces.
The new APIs are based around setting up N page table entries at once.
The N entries belong to the same PMD, the same folio and the same VMA, so
ptep++ is a legitimate operation, and locking is taken care of for you.
Some architectures can do a better job of it than just a loop, but I have
hesitated to make too deep a change to architectures I don't understand
well.
One thing I have changed in every architecture is that PG_arch_1 is now a
per-folio bit instead of a per-page bit when used for dcache clean/dirty
tracking. This was something that would have to happen eventually, and it
makes sense to do it now rather than iterate over every page involved in a
cache flush and figure out if it needs to happen.
The point of all this is better performance, and Fengwei Yin has measured
improvement on x86. I suspect you'll see improvement on your architecture
too. Try the new will-it-scale test mentioned here:
https://lore.kernel.org/linux-mm/20230206140639.538867-5-fengwei.yin@intel.com/
You'll need to run it on an XFS filesystem and have
CONFIG_TRANSPARENT_HUGEPAGE set.
This patchset is the basis for much of the anonymous large folio work
being done by Ryan, so it's received quite a lot of testing over the last
few months.
This patch (of 38):
Determine if a value lies within a range more efficiently (subtraction +
comparison vs two comparisons and an AND). It also has useful (under some
circumstances) behaviour if the range exceeds the maximum value of the
type. Convert all the conflicting definitions of in_range() within the
kernel; some can use the generic definition while others need their own
definition.
Link: https://lkml.kernel.org/r/20230802151406.3735276-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20230802151406.3735276-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-08-02 23:13:29 +08:00
|
|
|
tipc_in_range(peers_prio, 1, TIPC_MAX_LINK_PRI)) {
|
2016-02-01 15:19:57 +08:00
|
|
|
l->priority = peers_prio;
|
|
|
|
rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
|
|
|
|
}
|
|
|
|
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
l->silent_intv_cnt = 0;
|
|
|
|
l->stats.recv_states++;
|
|
|
|
if (msg_probe(hdr))
|
|
|
|
l->stats.recv_probes++;
|
tipc: delay ESTABLISH state event when link is established
Link establishing, just like link teardown, is a non-atomic action, in
the sense that discovering that conditions are right to establish a link,
and the actual adding of the link to one of the node's send slots is done
in two different lock contexts. The link FSM is designed to help bridging
the gap between the two contexts in a safe manner.
We have now discovered a weakness in the implementaton of this FSM.
Because we directly let the link go from state LINK_ESTABLISHING to
state LINK_ESTABLISHED already in the first lock context, we are unable
to distinguish between a fully established link, i.e., a link that has
been added to its slot, and a link that has not yet reached the second
lock context. It may hence happen that a manual intervention, e.g., when
disabling an interface, causes the function tipc_node_link_down() to try
removing the link from the node slots, decrementing its active link
counter etc, although the link was never added there in the first place.
We solve this by delaying the actual state change until we reach the
second lock context, inside the function tipc_node_link_up(). This
makes it possible for potentail callers of __tipc_node_link_down() to
know if they should proceed or not, and the problem is solved.
Unforunately, the situation described above also has a second problem.
Since there by necessity is a tipc_node_link_up() call pending once
the node lock has been released, we must defuse that call by setting
the link back from LINK_ESTABLISHING to LINK_RESET state. This forces
us to make a slight modification to the link FSM, which will now look
as follows.
+------------------------------------+
|RESET_EVT |
| |
| +--------------+
| +-----------------| SYNCHING |-----------------+
| |FAILURE_EVT +--------------+ PEER_RESET_EVT|
| | A | |
| | | | |
| | | | |
| | |SYNCH_ |SYNCH_ |
| | |BEGIN_EVT |END_EVT |
| | | | |
| V | V V
| +-------------+ +--------------+ +------------+
| | RESETTING |<---------| ESTABLISHED |--------->| PEER_RESET |
| +-------------+ FAILURE_ +--------------+ PEER_ +------------+
| | EVT | A RESET_EVT |
| | | | |
| | +----------------+ | |
| RESET_EVT| |RESET_EVT | |
| | | | |
| | | |ESTABLISH_EVT |
| | | +-------------+ | |
| | | | RESET_EVT | | |
| | | | | | |
| V V V | | |
| +-------------+ +--------------+ RESET_EVT|
+--->| RESET |--------->| ESTABLISHING |<----------------+
+-------------+ PEER_ +--------------+
| A RESET_EVT |
| | |
| | |
|FAILOVER_ |FAILOVER_ |FAILOVER_
|BEGIN_EVT |END_EVT |BEGIN_EVT
| | |
V | |
+-------------+ |
| FAILINGOVER |<----------------+
+-------------+
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-10-16 02:52:44 +08:00
|
|
|
|
|
|
|
if (!link_is_up(l)) {
|
|
|
|
if (l->state == LINK_ESTABLISHING)
|
|
|
|
rc = TIPC_LINK_UP_EVT;
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
break;
|
tipc: delay ESTABLISH state event when link is established
Link establishing, just like link teardown, is a non-atomic action, in
the sense that discovering that conditions are right to establish a link,
and the actual adding of the link to one of the node's send slots is done
in two different lock contexts. The link FSM is designed to help bridging
the gap between the two contexts in a safe manner.
We have now discovered a weakness in the implementaton of this FSM.
Because we directly let the link go from state LINK_ESTABLISHING to
state LINK_ESTABLISHED already in the first lock context, we are unable
to distinguish between a fully established link, i.e., a link that has
been added to its slot, and a link that has not yet reached the second
lock context. It may hence happen that a manual intervention, e.g., when
disabling an interface, causes the function tipc_node_link_down() to try
removing the link from the node slots, decrementing its active link
counter etc, although the link was never added there in the first place.
We solve this by delaying the actual state change until we reach the
second lock context, inside the function tipc_node_link_up(). This
makes it possible for potentail callers of __tipc_node_link_down() to
know if they should proceed or not, and the problem is solved.
Unforunately, the situation described above also has a second problem.
Since there by necessity is a tipc_node_link_up() call pending once
the node lock has been released, we must defuse that call by setting
the link back from LINK_ESTABLISHING to LINK_RESET state. This forces
us to make a slight modification to the link FSM, which will now look
as follows.
+------------------------------------+
|RESET_EVT |
| |
| +--------------+
| +-----------------| SYNCHING |-----------------+
| |FAILURE_EVT +--------------+ PEER_RESET_EVT|
| | A | |
| | | | |
| | | | |
| | |SYNCH_ |SYNCH_ |
| | |BEGIN_EVT |END_EVT |
| | | | |
| V | V V
| +-------------+ +--------------+ +------------+
| | RESETTING |<---------| ESTABLISHED |--------->| PEER_RESET |
| +-------------+ FAILURE_ +--------------+ PEER_ +------------+
| | EVT | A RESET_EVT |
| | | | |
| | +----------------+ | |
| RESET_EVT| |RESET_EVT | |
| | | | |
| | | |ESTABLISH_EVT |
| | | +-------------+ | |
| | | | RESET_EVT | | |
| | | | | | |
| V V V | | |
| +-------------+ +--------------+ RESET_EVT|
+--->| RESET |--------->| ESTABLISHING |<----------------+
+-------------+ PEER_ +--------------+
| A RESET_EVT |
| | |
| | |
|FAILOVER_ |FAILOVER_ |FAILOVER_
|BEGIN_EVT |END_EVT |BEGIN_EVT
| | |
V | |
+-------------+ |
| FAILINGOVER |<----------------+
+-------------+
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-10-16 02:52:44 +08:00
|
|
|
}
|
tipc: improve TIPC throughput by Gap ACK blocks
During unicast link transmission, it's observed very often that because
of one or a few lost/dis-ordered packets, the sending side will fastly
reach the send window limit and must wait for the packets to be arrived
at the receiving side or in the worst case, a retransmission must be
done first. The sending side cannot release a lot of subsequent packets
in its transmq even though all of them might have already been received
by the receiving side.
That is, one or two packets dis-ordered/lost and dozens of packets have
to wait, this obviously reduces the overall throughput!
This commit introduces an algorithm to overcome this by using "Gap ACK
blocks". Basically, a Gap ACK block will consist of <ack, gap> numbers
that describes the link deferdq where packets have been got by the
receiving side but with gaps, for example:
link deferdq: [1 2 3 4 10 11 13 14 15 20]
--> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0>
The Gap ACK blocks will be sent to the sending side along with the
traditional ACK or NACK message. Immediately when receiving the message
the sending side will now not only release from its transmq the packets
ack-ed by the ACK but also by the Gap ACK blocks! So, more packets can
be enqueued and transmitted.
In addition, the sending side can now do "multi-retransmissions"
according to the Gaps reported in the Gap ACK blocks.
The new algorithm as verified helps greatly improve the TIPC throughput
especially under packet loss condition.
So far, a maximum of 32 blocks is quite enough without any "Too few Gap
ACK blocks" reports with a 5.0% packet loss rate, however this number
can be increased in the furture if needed.
Also, the patch is backward compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-04 12:09:51 +08:00
|
|
|
|
|
|
|
tipc_mon_rcv(l->net, data + glen, dlen - glen, l->addr,
|
tipc: add neighbor monitoring framework
TIPC based clusters are by default set up with full-mesh link
connectivity between all nodes. Those links are expected to provide
a short failure detection time, by default set to 1500 ms. Because
of this, the background load for neighbor monitoring in an N-node
cluster increases with a factor N on each node, while the overall
monitoring traffic through the network infrastructure increases at
a ~(N * (N - 1)) rate. Experience has shown that such clusters don't
scale well beyond ~100 nodes unless we significantly increase failure
discovery tolerance.
This commit introduces a framework and an algorithm that drastically
reduces this background load, while basically maintaining the original
failure detection times across the whole cluster. Using this algorithm,
background load will now grow at a rate of ~(2 * sqrt(N)) per node, and
at ~(2 * N * sqrt(N)) in traffic overhead. As an example, each node will
now have to actively monitor 38 neighbors in a 400-node cluster, instead
of as before 399.
This "Overlapping Ring Supervision Algorithm" is completely distributed
and employs no centralized or coordinated state. It goes as follows:
- Each node makes up a linearly ascending, circular list of all its N
known neighbors, based on their TIPC node identity. This algorithm
must be the same on all nodes.
- The node then selects the next M = sqrt(N) - 1 nodes downstream from
itself in the list, and chooses to actively monitor those. This is
called its "local monitoring domain".
- It creates a domain record describing the monitoring domain, and
piggy-backs this in the data area of all neighbor monitoring messages
(LINK_PROTOCOL/STATE) leaving that node. This means that all nodes in
the cluster eventually (default within 400 ms) will learn about
its monitoring domain.
- Whenever a node discovers a change in its local domain, e.g., a node
has been added or has gone down, it creates and sends out a new
version of its node record to inform all neighbors about the change.
- A node receiving a domain record from anybody outside its local domain
matches this against its own list (which may not look the same), and
chooses to not actively monitor those members of the received domain
record that are also present in its own list. Instead, it relies on
indications from the direct monitoring nodes if an indirectly
monitored node has gone up or down. If a node is indicated lost, the
receiving node temporarily activates its own direct monitoring towards
that node in order to confirm, or not, that it is actually gone.
- Since each node is actively monitoring sqrt(N) downstream neighbors,
each node is also actively monitored by the same number of upstream
neighbors. This means that all non-direct monitoring nodes normally
will receive sqrt(N) indications that a node is gone.
- A major drawback with ring monitoring is how it handles failures that
cause massive network partitionings. If both a lost node and all its
direct monitoring neighbors are inside the lost partition, the nodes in
the remaining partition will never receive indications about the loss.
To overcome this, each node also chooses to actively monitor some
nodes outside its local domain. Those nodes are called remote domain
"heads", and are selected in such a way that no node in the cluster
will be more than two direct monitoring hops away. Because of this,
each node, apart from monitoring the member of its local domain, will
also typically monitor sqrt(N) remote head nodes.
- As an optimization, local list status, domain status and domain
records are marked with a generation number. This saves senders from
unnecessarily conveying unaltered domain records, and receivers from
performing unneeded re-adaptations of their node monitoring list, such
as re-assigning domain heads.
- As a measure of caution we have added the possibility to disable the
new algorithm through configuration. We do this by keeping a threshold
value for the cluster size; a cluster that grows beyond this value
will switch from full-mesh to ring monitoring, and vice versa when
it shrinks below the value. This means that if the threshold is set to
a value larger than any anticipated cluster size (default size is 32)
the new algorithm is effectively disabled. A patch set for altering the
threshold value and for listing the table contents will follow shortly.
- This change is fully backwards compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-06-14 08:46:22 +08:00
|
|
|
&l->mon_state, l->bearer_id);
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
|
2015-07-31 06:24:21 +08:00
|
|
|
/* Send NACK if peer has sent pkts we haven't received yet */
|
tipc: eliminate more unnecessary nacks and retransmissions
When we increase the link tranmsit window we often observe the following
scenario:
1) A STATE message bypasses a sequence of traffic packets and arrives
far ahead of those to the receiver. STATE messages contain a
'peers_nxt_snt' field to indicate which was the last packet sent
from the peer. This mechanism is intended as a last resort for the
receiver to detect missing packets, e.g., during very low traffic
when there is no packet flow to help early loss detection.
3) The receiving link compares the 'peer_nxt_snt' field to its own
'rcv_nxt', finds that there is a gap, and immediately sends a
NACK message back to the peer.
4) When this NACKs arrives at the sender, all the requested
retransmissions are performed, since it is a first-time request.
Just like in the scenario described in the previous commit this leads
to many redundant retransmissions, with decreased throughput as a
consequence.
We fix this by adding two more conditions before we send a NACK in
this sitution. First, the deferred queue must be empty, so we cannot
assume that the potential packet loss has already been detected by
other means. Second, we check the 'peers_snd_nxt' field only in probe/
probe_reply messages, thus turning this into a true mechanism of last
resort as it was really meant to be.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 07:52:45 +08:00
|
|
|
if ((reply || msg_is_keepalive(hdr)) &&
|
|
|
|
more(peers_snd_nxt, rcv_nxt) &&
|
|
|
|
!tipc_link_is_synching(l) &&
|
|
|
|
skb_queue_empty(&l->deferdq))
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
rcvgap = peers_snd_nxt - l->rcv_nxt;
|
tipc: improve link resiliency when rps is activated
Currently, the TIPC RPS dissector is based only on the incoming packets'
source node address, hence steering all traffic from a node to the same
core. We have seen that this makes the links vulnerable to starvation
and unnecessary resets when we turn down the link tolerance to very low
values.
To reduce the risk of this happening, we exempt probe and probe replies
packets from the convergence to one core per source node. Instead, we do
the opposite, - we try to diverge those packets across as many cores as
possible, by randomizing the flow selector key.
To make such packets identifiable to the dissector, we add a new
'is_keepalive' bit to word 0 of the LINK_PROTOCOL header. This bit is
set both for PROBE and PROBE_REPLY messages, and only for those.
It should be noted that these packets are not part of any flow anyway,
and only constitute a minuscule fraction of all packets sent across a
link. Hence, there is no risk that this will affect overall performance.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-11-08 16:59:26 +08:00
|
|
|
if (rcvgap || reply)
|
|
|
|
tipc_link_build_proto_msg(l, STATE_MSG, 0, reply,
|
|
|
|
rcvgap, 0, 0, xmitq);
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
|
2020-05-26 17:38:34 +08:00
|
|
|
released = tipc_link_advance_transmq(l, l, ack, gap, ga, xmitq,
|
|
|
|
&retransmitted, &rc);
|
tipc: improve TIPC throughput by Gap ACK blocks
During unicast link transmission, it's observed very often that because
of one or a few lost/dis-ordered packets, the sending side will fastly
reach the send window limit and must wait for the packets to be arrived
at the receiving side or in the worst case, a retransmission must be
done first. The sending side cannot release a lot of subsequent packets
in its transmq even though all of them might have already been received
by the receiving side.
That is, one or two packets dis-ordered/lost and dozens of packets have
to wait, this obviously reduces the overall throughput!
This commit introduces an algorithm to overcome this by using "Gap ACK
blocks". Basically, a Gap ACK block will consist of <ack, gap> numbers
that describes the link deferdq where packets have been got by the
receiving side but with gaps, for example:
link deferdq: [1 2 3 4 10 11 13 14 15 20]
--> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0>
The Gap ACK blocks will be sent to the sending side along with the
traditional ACK or NACK message. Immediately when receiving the message
the sending side will now not only release from its transmq the packets
ack-ed by the ACK but also by the Gap ACK blocks! So, more packets can
be enqueued and transmitted.
In addition, the sending side can now do "multi-retransmissions"
according to the Gaps reported in the Gap ACK blocks.
The new algorithm as verified helps greatly improve the TIPC throughput
especially under packet loss condition.
So far, a maximum of 32 blocks is quite enough without any "Too few Gap
ACK blocks" reports with a 5.0% packet loss rate, however this number
can be increased in the furture if needed.
Also, the patch is backward compatible.
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-04 12:09:51 +08:00
|
|
|
if (gap)
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
l->stats.recv_nacks++;
|
2020-05-26 17:38:34 +08:00
|
|
|
if (released || retransmitted)
|
|
|
|
tipc_link_update_cwin(l, released, retransmitted);
|
|
|
|
if (released)
|
|
|
|
tipc_link_advance_backlog(l, xmitq);
|
tipc: reduce locking scope during packet reception
We convert packet/message reception according to the same principle
we have been using for message sending and timeout handling:
We move the function tipc_rcv() to node.c, hence handling the initial
packet reception at the link aggregation level. The function grabs
the node lock, selects the receiving link, and accesses it via a new
call tipc_link_rcv(). This function appends buffers to the input
queue for delivery upwards, but it may also append outgoing packets
to the xmit queue, just as we do during regular message sending. The
latter will happen when buffers are forwarded from the link backlog,
or when retransmission is requested.
Upon return of this function, and after having released the node lock,
tipc_rcv() delivers/tranmsits the contents of those queues, but it may
also perform actions such as link activation or reset, as indicated by
the return flags from the link.
This reduces the number of cpu cycles spent inside the node spinlock,
and reduces contention on that lock.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-07-17 04:54:31 +08:00
|
|
|
if (unlikely(!skb_queue_empty(&l->wakeupq)))
|
|
|
|
link_prepare_wakeup(l);
|
|
|
|
}
|
|
|
|
exit:
|
|
|
|
kfree_skb(skb);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
2015-10-22 20:51:41 +08:00
|
|
|
/* tipc_link_build_bc_proto_msg() - create broadcast protocol message
|
|
|
|
*/
|
|
|
|
static bool tipc_link_build_bc_proto_msg(struct tipc_link *l, bool bcast,
|
|
|
|
u16 peers_snd_nxt,
|
|
|
|
struct sk_buff_head *xmitq)
|
|
|
|
{
|
|
|
|
struct sk_buff *skb;
|
|
|
|
struct tipc_msg *hdr;
|
|
|
|
struct sk_buff *dfrd_skb = skb_peek(&l->deferdq);
|
|
|
|
u16 ack = l->rcv_nxt - 1;
|
|
|
|
u16 gap_to = peers_snd_nxt - 1;
|
|
|
|
|
|
|
|
skb = tipc_msg_create(BCAST_PROTOCOL, STATE_MSG, INT_H_SIZE,
|
2016-03-04 03:23:21 +08:00
|
|
|
0, l->addr, tipc_own_addr(l->net), 0, 0, 0);
|
2015-10-22 20:51:41 +08:00
|
|
|
if (!skb)
|
|
|
|
return false;
|
|
|
|
hdr = buf_msg(skb);
|
|
|
|
msg_set_last_bcast(hdr, l->bc_sndlink->snd_nxt - 1);
|
|
|
|
msg_set_bcast_ack(hdr, ack);
|
|
|
|
msg_set_bcgap_after(hdr, ack);
|
|
|
|
if (dfrd_skb)
|
|
|
|
gap_to = buf_seqno(dfrd_skb) - 1;
|
|
|
|
msg_set_bcgap_to(hdr, gap_to);
|
|
|
|
msg_set_non_seq(hdr, bcast);
|
|
|
|
__skb_queue_tail(xmitq, skb);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* tipc_link_build_bc_init_msg() - synchronize broadcast link endpoints.
|
|
|
|
*
|
|
|
|
* Give a newly added peer node the sequence number where it should
|
|
|
|
* start receiving and acking broadcast packets.
|
|
|
|
*/
|
2015-10-24 22:56:01 +08:00
|
|
|
static void tipc_link_build_bc_init_msg(struct tipc_link *l,
|
|
|
|
struct sk_buff_head *xmitq)
|
2015-10-22 20:51:41 +08:00
|
|
|
{
|
|
|
|
struct sk_buff_head list;
|
|
|
|
|
|
|
|
__skb_queue_head_init(&list);
|
|
|
|
if (!tipc_link_build_bc_proto_msg(l->bc_rcvlink, false, 0, &list))
|
|
|
|
return;
|
2016-10-28 06:51:55 +08:00
|
|
|
msg_set_bc_ack_invalid(buf_msg(skb_peek(&list)), true);
|
2015-10-22 20:51:41 +08:00
|
|
|
tipc_link_xmit(l, &list, xmitq);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* tipc_link_bc_init_rcv - receive initial broadcast synch data from peer
|
|
|
|
*/
|
|
|
|
void tipc_link_bc_init_rcv(struct tipc_link *l, struct tipc_msg *hdr)
|
|
|
|
{
|
|
|
|
int mtyp = msg_type(hdr);
|
|
|
|
u16 peers_snd_nxt = msg_bc_snd_nxt(hdr);
|
|
|
|
|
|
|
|
if (link_is_up(l))
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (msg_user(hdr) == BCAST_PROTOCOL) {
|
|
|
|
l->rcv_nxt = peers_snd_nxt;
|
|
|
|
l->state = LINK_ESTABLISHED;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (l->peer_caps & TIPC_BCAST_SYNCH)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (msg_peer_node_is_up(hdr))
|
|
|
|
return;
|
|
|
|
|
|
|
|
/* Compatibility: accept older, less safe initial synch data */
|
|
|
|
if ((mtyp == RESET_MSG) || (mtyp == ACTIVATE_MSG))
|
|
|
|
l->rcv_nxt = peers_snd_nxt;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* tipc_link_bc_sync_rcv - update rcv link according to peer's send state
|
|
|
|
*/
|
2016-09-02 01:52:49 +08:00
|
|
|
int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr,
|
|
|
|
struct sk_buff_head *xmitq)
|
2015-10-22 20:51:41 +08:00
|
|
|
{
|
|
|
|
u16 peers_snd_nxt = msg_bc_snd_nxt(hdr);
|
2016-09-02 01:52:49 +08:00
|
|
|
int rc = 0;
|
2015-10-22 20:51:41 +08:00
|
|
|
|
|
|
|
if (!link_is_up(l))
|
2016-09-02 01:52:49 +08:00
|
|
|
return rc;
|
2015-10-22 20:51:41 +08:00
|
|
|
|
|
|
|
if (!msg_peer_node_is_up(hdr))
|
2016-09-02 01:52:49 +08:00
|
|
|
return rc;
|
2015-10-22 20:51:41 +08:00
|
|
|
|
2020-11-30 02:32:44 +08:00
|
|
|
/* Open when peer acknowledges our bcast init msg (pkt #1) */
|
2016-07-12 04:08:35 +08:00
|
|
|
if (msg_ack(hdr))
|
|
|
|
l->bc_peer_is_up = true;
|
|
|
|
|
|
|
|
if (!l->bc_peer_is_up)
|
2016-09-02 01:52:49 +08:00
|
|
|
return rc;
|
2015-10-22 20:51:41 +08:00
|
|
|
|
|
|
|
/* Ignore if peers_snd_nxt goes beyond receive window */
|
|
|
|
if (more(peers_snd_nxt, l->rcv_nxt + l->window))
|
2016-09-02 01:52:49 +08:00
|
|
|
return rc;
|
|
|
|
|
|
|
|
l->snd_nxt = peers_snd_nxt;
|
|
|
|
if (link_bc_rcv_gap(l))
|
|
|
|
rc |= TIPC_LINK_SND_STATE;
|
|
|
|
|
|
|
|
/* Return now if sender supports nack via STATE messages */
|
|
|
|
if (l->peer_caps & TIPC_BCAST_STATE_NACK)
|
|
|
|
return rc;
|
|
|
|
|
|
|
|
/* Otherwise, be backwards compatible */
|
2015-10-22 20:51:41 +08:00
|
|
|
|
|
|
|
if (!more(peers_snd_nxt, l->rcv_nxt)) {
|
|
|
|
l->nack_state = BC_NACK_SND_CONDITIONAL;
|
2016-09-02 01:52:49 +08:00
|
|
|
return 0;
|
2015-10-22 20:51:41 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Don't NACK if one was recently sent or peeked */
|
|
|
|
if (l->nack_state == BC_NACK_SND_SUPPRESS) {
|
|
|
|
l->nack_state = BC_NACK_SND_UNCONDITIONAL;
|
2016-09-02 01:52:49 +08:00
|
|
|
return 0;
|
2015-10-22 20:51:41 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Conditionally delay NACK sending until next synch rcv */
|
|
|
|
if (l->nack_state == BC_NACK_SND_CONDITIONAL) {
|
|
|
|
l->nack_state = BC_NACK_SND_UNCONDITIONAL;
|
|
|
|
if ((peers_snd_nxt - l->rcv_nxt) < TIPC_MIN_LINK_WIN)
|
2016-09-02 01:52:49 +08:00
|
|
|
return 0;
|
2015-10-22 20:51:41 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Send NACK now but suppress next one */
|
|
|
|
tipc_link_build_bc_proto_msg(l, true, peers_snd_nxt, xmitq);
|
|
|
|
l->nack_state = BC_NACK_SND_SUPPRESS;
|
2016-09-02 01:52:49 +08:00
|
|
|
return 0;
|
2015-10-22 20:51:41 +08:00
|
|
|
}
|
|
|
|
|
2020-05-26 17:38:34 +08:00
|
|
|
int tipc_link_bc_ack_rcv(struct tipc_link *r, u16 acked, u16 gap,
|
|
|
|
struct tipc_gap_ack_blks *ga,
|
2020-05-26 17:38:36 +08:00
|
|
|
struct sk_buff_head *xmitq,
|
|
|
|
struct sk_buff_head *retrq)
|
2015-10-22 20:51:41 +08:00
|
|
|
{
|
2020-05-26 17:38:34 +08:00
|
|
|
struct tipc_link *l = r->bc_sndlink;
|
|
|
|
bool unused = false;
|
|
|
|
int rc = 0;
|
2015-10-22 20:51:41 +08:00
|
|
|
|
2020-05-26 17:38:34 +08:00
|
|
|
if (!link_is_up(r) || !r->bc_peer_is_up)
|
|
|
|
return 0;
|
2015-10-22 20:51:41 +08:00
|
|
|
|
2020-05-26 17:38:37 +08:00
|
|
|
if (gap) {
|
|
|
|
l->stats.recv_nacks++;
|
|
|
|
r->stats.recv_nacks++;
|
|
|
|
}
|
|
|
|
|
2020-05-26 17:38:34 +08:00
|
|
|
if (less(acked, r->acked) || (acked == r->acked && !gap && !ga))
|
|
|
|
return 0;
|
2015-10-22 20:51:41 +08:00
|
|
|
|
2020-05-26 17:38:35 +08:00
|
|
|
trace_tipc_link_bc_ack(r, acked, gap, &l->transmq);
|
2020-05-26 17:38:36 +08:00
|
|
|
tipc_link_advance_transmq(l, r, acked, gap, ga, retrq, &unused, &rc);
|
2015-10-22 20:51:41 +08:00
|
|
|
|
2020-05-26 17:38:34 +08:00
|
|
|
tipc_link_advance_backlog(l, xmitq);
|
|
|
|
if (unlikely(!skb_queue_empty(&l->wakeupq)))
|
|
|
|
link_prepare_wakeup(l);
|
|
|
|
|
|
|
|
return rc;
|
2015-10-22 20:51:41 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* tipc_link_bc_nack_rcv(): receive broadcast nack message
|
2016-09-02 01:52:49 +08:00
|
|
|
* This function is here for backwards compatibility, since
|
|
|
|
* no BCAST_PROTOCOL/STATE messages occur from TIPC v2.5.
|
2015-10-22 20:51:41 +08:00
|
|
|
*/
|
|
|
|
int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb,
|
|
|
|
struct sk_buff_head *xmitq)
|
|
|
|
{
|
|
|
|
struct tipc_msg *hdr = buf_msg(skb);
|
|
|
|
u32 dnode = msg_destnode(hdr);
|
|
|
|
int mtyp = msg_type(hdr);
|
|
|
|
u16 acked = msg_bcast_ack(hdr);
|
|
|
|
u16 from = acked + 1;
|
|
|
|
u16 to = msg_bcgap_to(hdr);
|
|
|
|
u16 peers_snd_nxt = to + 1;
|
|
|
|
int rc = 0;
|
|
|
|
|
|
|
|
kfree_skb(skb);
|
|
|
|
|
|
|
|
if (!tipc_link_is_up(l) || !l->bc_peer_is_up)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (mtyp != STATE_MSG)
|
|
|
|
return 0;
|
|
|
|
|
2016-03-04 03:23:21 +08:00
|
|
|
if (dnode == tipc_own_addr(l->net)) {
|
2020-05-26 17:38:36 +08:00
|
|
|
rc = tipc_link_bc_ack_rcv(l, acked, to - acked, NULL, xmitq,
|
|
|
|
xmitq);
|
2015-10-22 20:51:41 +08:00
|
|
|
l->stats.recv_nacks++;
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Msg for other node => suppress own NACK at next sync if applicable */
|
|
|
|
if (more(peers_snd_nxt, l->rcv_nxt) && !less(l->rcv_nxt, from))
|
|
|
|
l->nack_state = BC_NACK_SND_SUPPRESS;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
tipc: introduce variable window congestion control
We introduce a simple variable window congestion control for links.
The algorithm is inspired by the Reno algorithm, covering both 'slow
start', 'congestion avoidance', and 'fast recovery' modes.
- We introduce hard lower and upper window limits per link, still
different and configurable per bearer type.
- We introduce a 'slow start theshold' variable, initially set to
the maximum window size.
- We let a link start at the minimum congestion window, i.e. in slow
start mode, and then let is grow rapidly (+1 per rceived ACK) until
it reaches the slow start threshold and enters congestion avoidance
mode.
- In congestion avoidance mode we increment the congestion window for
each window-size number of acked packets, up to a possible maximum
equal to the configured maximum window.
- For each non-duplicate NACK received, we drop back to fast recovery
mode, by setting the both the slow start threshold to and the
congestion window to (current_congestion_window / 2).
- If the timeout handler finds that the transmit queue has not moved
since the previous timeout, it drops the link back to slow start
and forces a probe containing the last sent sequence number to the
sent to the peer, so that this can discover the stale situation.
This change does in reality have effect only on unicast ethernet
transport, as we have seen that there is no room whatsoever for
increasing the window max size for the UDP bearer.
For now, we also choose to keep the limits for the broadcast link
unchanged and equal.
This algorithm seems to give a 50-100% throughput improvement for
messages larger than MTU.
Suggested-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 07:52:46 +08:00
|
|
|
void tipc_link_set_queue_limits(struct tipc_link *l, u32 min_win, u32 max_win)
|
2006-01-03 02:04:38 +08:00
|
|
|
{
|
2018-03-30 05:20:41 +08:00
|
|
|
int max_bulk = TIPC_MAX_PUBL / (l->mtu / ITEM_SIZE);
|
tipc: clean up handling of message priorities
Messages transferred by TIPC are assigned an "importance priority", -an
integer value indicating how to treat the message when there is link or
destination socket congestion.
There is no separate header field for this value. Instead, the message
user values have been chosen in ascending order according to perceived
importance, so that the message user field can be used for this.
This is not a good solution. First, we have many more users than the
needed priority levels, so we end up with treating more priority
levels than necessary. Second, the user field cannot always
accurately reflect the priority of the message. E.g., a message
fragment packet should really have the priority of the enveloped
user data message, and not the priority of the MSG_FRAGMENTER user.
Until now, we have been working around this problem in different ways,
but it is now time to implement a consistent way of handling such
priorities, although still within the constraint that we cannot
allocate any more bits in the regular data message header for this.
In this commit, we define a new priority level, TIPC_SYSTEM_IMPORTANCE,
that will be the only one used apart from the four (lower) user data
levels. All non-data messages map down to this priority. Furthermore,
we take some free bits from the MSG_FRAGMENTER header and allocate
them to store the priority of the enveloped message. We then adjust
the functions msg_importance()/msg_set_importance() so that they
read/set the correct header fields depending on user type.
This small protocol change is fully compatible, because the code at
the receiving end of a link currently reads the importance level
only from user data messages, where there is no change.
Reviewed-by: Erik Hugne <erik.hugne@ericsson.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-03-14 04:08:11 +08:00
|
|
|
|
tipc: introduce variable window congestion control
We introduce a simple variable window congestion control for links.
The algorithm is inspired by the Reno algorithm, covering both 'slow
start', 'congestion avoidance', and 'fast recovery' modes.
- We introduce hard lower and upper window limits per link, still
different and configurable per bearer type.
- We introduce a 'slow start theshold' variable, initially set to
the maximum window size.
- We let a link start at the minimum congestion window, i.e. in slow
start mode, and then let is grow rapidly (+1 per rceived ACK) until
it reaches the slow start threshold and enters congestion avoidance
mode.
- In congestion avoidance mode we increment the congestion window for
each window-size number of acked packets, up to a possible maximum
equal to the configured maximum window.
- For each non-duplicate NACK received, we drop back to fast recovery
mode, by setting the both the slow start threshold to and the
congestion window to (current_congestion_window / 2).
- If the timeout handler finds that the transmit queue has not moved
since the previous timeout, it drops the link back to slow start
and forces a probe containing the last sent sequence number to the
sent to the peer, so that this can discover the stale situation.
This change does in reality have effect only on unicast ethernet
transport, as we have seen that there is no room whatsoever for
increasing the window max size for the UDP bearer.
For now, we also choose to keep the limits for the broadcast link
unchanged and equal.
This algorithm seems to give a 50-100% throughput improvement for
messages larger than MTU.
Suggested-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 07:52:46 +08:00
|
|
|
l->min_win = min_win;
|
|
|
|
l->ssthresh = max_win;
|
|
|
|
l->max_win = max_win;
|
|
|
|
l->window = min_win;
|
|
|
|
l->backlog[TIPC_LOW_IMPORTANCE].limit = min_win * 2;
|
|
|
|
l->backlog[TIPC_MEDIUM_IMPORTANCE].limit = min_win * 4;
|
|
|
|
l->backlog[TIPC_HIGH_IMPORTANCE].limit = min_win * 6;
|
|
|
|
l->backlog[TIPC_CRITICAL_IMPORTANCE].limit = min_win * 8;
|
tipc: introduce starvation free send algorithm
Currently, we only use a single counter; the length of the backlog
queue, to determine whether a message should be accepted to the queue
or not. Each time a message is being sent, the queue length is compared
to a threshold value for the message's importance priority. If the queue
length is beyond this threshold, the message is rejected. This algorithm
implies a risk of starvation of low importance senders during very high
load, because it may take a long time before the backlog queue has
decreased enough to accept a lower level message.
We now eliminate this risk by introducing a counter for each importance
priority. When a message is sent, we check only the queue level for that
particular message's priority. If that is ok, the message can be added
to the backlog, irrespective of the queue level for other priorities.
This way, each level is guaranteed a certain portion of the total
bandwidth, and any risk of starvation is eliminated.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Reviewed-by: Erik Hugne <erik.hugne@ericsson.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-03-26 00:07:24 +08:00
|
|
|
l->backlog[TIPC_SYSTEM_IMPORTANCE].limit = max_bulk;
|
2006-01-03 02:04:38 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2021-01-14 16:04:48 +08:00
|
|
|
* tipc_link_reset_stats - reset link statistics
|
2015-11-20 03:30:47 +08:00
|
|
|
* @l: pointer to link
|
2006-01-03 02:04:38 +08:00
|
|
|
*/
|
2015-11-20 03:30:46 +08:00
|
|
|
void tipc_link_reset_stats(struct tipc_link *l)
|
2006-01-03 02:04:38 +08:00
|
|
|
{
|
2015-11-20 03:30:46 +08:00
|
|
|
memset(&l->stats, 0, sizeof(l->stats));
|
2006-01-03 02:04:38 +08:00
|
|
|
}
|
|
|
|
|
2015-07-17 04:54:30 +08:00
|
|
|
static void link_print(struct tipc_link *l, const char *str)
|
2006-01-03 02:04:38 +08:00
|
|
|
{
|
2015-07-17 04:54:30 +08:00
|
|
|
struct sk_buff *hskb = skb_peek(&l->transmq);
|
tipc: make struct tipc_link generic to support broadcast
Realizing that unicast is just a special case of broadcast, we also see
that we can go in the other direction, i.e., that modest changes to the
current unicast link can make it generic enough to support broadcast.
The following changes are introduced here:
- A new counter ("ackers") in struct tipc_link, to indicate how many
peers need to ack a packet before it can be released.
- A corresponding counter in the skb user area, to keep track of how
many peers a are left to ack before a buffer can be released.
- A new counter ("acked"), to keep persistent track of how far a peer
has acked at the moment, i.e., where in the transmission queue to
start updating buffers when the next ack arrives. This is to avoid
double acknowledgements from a peer, with inadvertent relase of
packets as a result.
- A more generic tipc_link_retrans() function, where retransmit starts
from a given sequence number, instead of the first packet in the
transmision queue. This is to minimize the number of retransmitted
packets on the broadcast media.
When the new functionality is taken into use in the next commits,
we expect it to have minimal effect on unicast mode performance.
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-10-22 20:51:38 +08:00
|
|
|
u16 head = hskb ? msg_seqno(buf_msg(hskb)) : l->snd_nxt - 1;
|
2015-07-17 04:54:30 +08:00
|
|
|
u16 tail = l->snd_nxt - 1;
|
tipc: decouple the relationship between bearer and link
Currently on both paths of message transmission and reception, the
read lock of tipc_net_lock must be held before bearer is accessed,
while the write lock of tipc_net_lock has to be taken before bearer
is configured. Although it can ensure that bearer is always valid on
the two data paths, link and bearer is closely bound together.
So as the part of effort of removing tipc_net_lock, the locking
policy of bearer protection will be adjusted as below: on the two
data paths, RCU is used, and on the configuration path of bearer,
RTNL lock is applied.
Now RCU just covers the path of message reception. To make it possible
to protect the path of message transmission with RCU, link should not
use its stored bearer pointer to access bearer, but it should use the
bearer identity of its attached bearer as index to get bearer instance
from bearer_list array, which can help us decouple the relationship
between bearer and link. As a result, bearer on the path of message
transmission can be safely protected by RCU when we access bearer_list
array within RCU lock protection.
Signed-off-by: Ying Xue <ying.xue@windriver.com>
Reviewed-by: Jon Maloy <jon.maloy@ericsson.com>
Reviewed-by: Erik Hugne <erik.hugne@ericsson.com>
Tested-by: Erik Hugne <erik.hugne@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-04-21 10:55:46 +08:00
|
|
|
|
2015-07-31 06:24:21 +08:00
|
|
|
pr_info("%s Link <%s> state %x\n", str, l->name, l->state);
|
2015-07-17 04:54:30 +08:00
|
|
|
pr_info("XMTQ: %u [%u-%u], BKLGQ: %u, SNDNX: %u, RCVNX: %u\n",
|
|
|
|
skb_queue_len(&l->transmq), head, tail,
|
|
|
|
skb_queue_len(&l->backlogq), l->snd_nxt, l->rcv_nxt);
|
2006-01-03 02:04:38 +08:00
|
|
|
}
|
2014-11-20 17:29:07 +08:00
|
|
|
|
|
|
|
/* Parse and validate nested (link) properties valid for media, bearer and link
|
|
|
|
*/
|
|
|
|
int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[])
|
|
|
|
{
|
|
|
|
int err;
|
|
|
|
|
netlink: make validation more configurable for future strictness
We currently have two levels of strict validation:
1) liberal (default)
- undefined (type >= max) & NLA_UNSPEC attributes accepted
- attribute length >= expected accepted
- garbage at end of message accepted
2) strict (opt-in)
- NLA_UNSPEC attributes accepted
- attribute length >= expected accepted
Split out parsing strictness into four different options:
* TRAILING - check that there's no trailing data after parsing
attributes (in message or nested)
* MAXTYPE - reject attrs > max known type
* UNSPEC - reject attributes with NLA_UNSPEC policy entries
* STRICT_ATTRS - strictly validate attribute size
The default for future things should be *everything*.
The current *_strict() is a combination of TRAILING and MAXTYPE,
and is renamed to _deprecated_strict().
The current regular parsing has none of this, and is renamed to
*_parse_deprecated().
Additionally it allows us to selectively set one of the new flags
even on old policies. Notably, the UNSPEC flag could be useful in
this case, since it can be arranged (by filling in the policy) to
not be an incompatible userspace ABI change, but would then going
forward prevent forgetting attribute entries. Similar can apply
to the POLICY flag.
We end up with the following renames:
* nla_parse -> nla_parse_deprecated
* nla_parse_strict -> nla_parse_deprecated_strict
* nlmsg_parse -> nlmsg_parse_deprecated
* nlmsg_parse_strict -> nlmsg_parse_deprecated_strict
* nla_parse_nested -> nla_parse_nested_deprecated
* nla_validate_nested -> nla_validate_nested_deprecated
Using spatch, of course:
@@
expression TB, MAX, HEAD, LEN, POL, EXT;
@@
-nla_parse(TB, MAX, HEAD, LEN, POL, EXT)
+nla_parse_deprecated(TB, MAX, HEAD, LEN, POL, EXT)
@@
expression NLH, HDRLEN, TB, MAX, POL, EXT;
@@
-nlmsg_parse(NLH, HDRLEN, TB, MAX, POL, EXT)
+nlmsg_parse_deprecated(NLH, HDRLEN, TB, MAX, POL, EXT)
@@
expression NLH, HDRLEN, TB, MAX, POL, EXT;
@@
-nlmsg_parse_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
+nlmsg_parse_deprecated_strict(NLH, HDRLEN, TB, MAX, POL, EXT)
@@
expression TB, MAX, NLA, POL, EXT;
@@
-nla_parse_nested(TB, MAX, NLA, POL, EXT)
+nla_parse_nested_deprecated(TB, MAX, NLA, POL, EXT)
@@
expression START, MAX, POL, EXT;
@@
-nla_validate_nested(START, MAX, POL, EXT)
+nla_validate_nested_deprecated(START, MAX, POL, EXT)
@@
expression NLH, HDRLEN, MAX, POL, EXT;
@@
-nlmsg_validate(NLH, HDRLEN, MAX, POL, EXT)
+nlmsg_validate_deprecated(NLH, HDRLEN, MAX, POL, EXT)
For this patch, don't actually add the strict, non-renamed versions
yet so that it breaks compile if I get it wrong.
Also, while at it, make nla_validate and nla_parse go down to a
common __nla_validate_parse() function to avoid code duplication.
Ultimately, this allows us to have very strict validation for every
new caller of nla_parse()/nlmsg_parse() etc as re-introduced in the
next patch, while existing things will continue to work as is.
In effect then, this adds fully strict validation for any new command.
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-26 20:07:28 +08:00
|
|
|
err = nla_parse_nested_deprecated(props, TIPC_NLA_PROP_MAX, prop,
|
|
|
|
tipc_nl_prop_policy, NULL);
|
2014-11-20 17:29:07 +08:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
if (props[TIPC_NLA_PROP_PRIO]) {
|
|
|
|
u32 prio;
|
|
|
|
|
|
|
|
prio = nla_get_u32(props[TIPC_NLA_PROP_PRIO]);
|
|
|
|
if (prio > TIPC_MAX_LINK_PRI)
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (props[TIPC_NLA_PROP_TOL]) {
|
|
|
|
u32 tol;
|
|
|
|
|
|
|
|
tol = nla_get_u32(props[TIPC_NLA_PROP_TOL]);
|
|
|
|
if ((tol < TIPC_MIN_LINK_TOL) || (tol > TIPC_MAX_LINK_TOL))
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (props[TIPC_NLA_PROP_WIN]) {
|
tipc: introduce variable window congestion control
We introduce a simple variable window congestion control for links.
The algorithm is inspired by the Reno algorithm, covering both 'slow
start', 'congestion avoidance', and 'fast recovery' modes.
- We introduce hard lower and upper window limits per link, still
different and configurable per bearer type.
- We introduce a 'slow start theshold' variable, initially set to
the maximum window size.
- We let a link start at the minimum congestion window, i.e. in slow
start mode, and then let is grow rapidly (+1 per rceived ACK) until
it reaches the slow start threshold and enters congestion avoidance
mode.
- In congestion avoidance mode we increment the congestion window for
each window-size number of acked packets, up to a possible maximum
equal to the configured maximum window.
- For each non-duplicate NACK received, we drop back to fast recovery
mode, by setting the both the slow start threshold to and the
congestion window to (current_congestion_window / 2).
- If the timeout handler finds that the transmit queue has not moved
since the previous timeout, it drops the link back to slow start
and forces a probe containing the last sent sequence number to the
sent to the peer, so that this can discover the stale situation.
This change does in reality have effect only on unicast ethernet
transport, as we have seen that there is no room whatsoever for
increasing the window max size for the UDP bearer.
For now, we also choose to keep the limits for the broadcast link
unchanged and equal.
This algorithm seems to give a 50-100% throughput improvement for
messages larger than MTU.
Suggested-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 07:52:46 +08:00
|
|
|
u32 max_win;
|
2014-11-20 17:29:07 +08:00
|
|
|
|
tipc: introduce variable window congestion control
We introduce a simple variable window congestion control for links.
The algorithm is inspired by the Reno algorithm, covering both 'slow
start', 'congestion avoidance', and 'fast recovery' modes.
- We introduce hard lower and upper window limits per link, still
different and configurable per bearer type.
- We introduce a 'slow start theshold' variable, initially set to
the maximum window size.
- We let a link start at the minimum congestion window, i.e. in slow
start mode, and then let is grow rapidly (+1 per rceived ACK) until
it reaches the slow start threshold and enters congestion avoidance
mode.
- In congestion avoidance mode we increment the congestion window for
each window-size number of acked packets, up to a possible maximum
equal to the configured maximum window.
- For each non-duplicate NACK received, we drop back to fast recovery
mode, by setting the both the slow start threshold to and the
congestion window to (current_congestion_window / 2).
- If the timeout handler finds that the transmit queue has not moved
since the previous timeout, it drops the link back to slow start
and forces a probe containing the last sent sequence number to the
sent to the peer, so that this can discover the stale situation.
This change does in reality have effect only on unicast ethernet
transport, as we have seen that there is no room whatsoever for
increasing the window max size for the UDP bearer.
For now, we also choose to keep the limits for the broadcast link
unchanged and equal.
This algorithm seems to give a 50-100% throughput improvement for
messages larger than MTU.
Suggested-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 07:52:46 +08:00
|
|
|
max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]);
|
|
|
|
if (max_win < TIPC_DEF_LINK_WIN || max_win > TIPC_MAX_LINK_WIN)
|
2014-11-20 17:29:07 +08:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2014-11-20 17:29:12 +08:00
|
|
|
|
2014-11-24 18:10:29 +08:00
|
|
|
static int __tipc_nl_add_stats(struct sk_buff *skb, struct tipc_stats *s)
|
2014-11-20 17:29:12 +08:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
struct nlattr *stats;
|
|
|
|
|
|
|
|
struct nla_map {
|
|
|
|
u32 key;
|
|
|
|
u32 val;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct nla_map map[] = {
|
2016-11-25 23:35:02 +08:00
|
|
|
{TIPC_NLA_STATS_RX_INFO, 0},
|
2014-11-20 17:29:12 +08:00
|
|
|
{TIPC_NLA_STATS_RX_FRAGMENTS, s->recv_fragments},
|
|
|
|
{TIPC_NLA_STATS_RX_FRAGMENTED, s->recv_fragmented},
|
|
|
|
{TIPC_NLA_STATS_RX_BUNDLES, s->recv_bundles},
|
|
|
|
{TIPC_NLA_STATS_RX_BUNDLED, s->recv_bundled},
|
2016-11-25 23:35:02 +08:00
|
|
|
{TIPC_NLA_STATS_TX_INFO, 0},
|
2014-11-20 17:29:12 +08:00
|
|
|
{TIPC_NLA_STATS_TX_FRAGMENTS, s->sent_fragments},
|
|
|
|
{TIPC_NLA_STATS_TX_FRAGMENTED, s->sent_fragmented},
|
|
|
|
{TIPC_NLA_STATS_TX_BUNDLES, s->sent_bundles},
|
|
|
|
{TIPC_NLA_STATS_TX_BUNDLED, s->sent_bundled},
|
|
|
|
{TIPC_NLA_STATS_MSG_PROF_TOT, (s->msg_length_counts) ?
|
|
|
|
s->msg_length_counts : 1},
|
|
|
|
{TIPC_NLA_STATS_MSG_LEN_CNT, s->msg_length_counts},
|
|
|
|
{TIPC_NLA_STATS_MSG_LEN_TOT, s->msg_lengths_total},
|
|
|
|
{TIPC_NLA_STATS_MSG_LEN_P0, s->msg_length_profile[0]},
|
|
|
|
{TIPC_NLA_STATS_MSG_LEN_P1, s->msg_length_profile[1]},
|
|
|
|
{TIPC_NLA_STATS_MSG_LEN_P2, s->msg_length_profile[2]},
|
|
|
|
{TIPC_NLA_STATS_MSG_LEN_P3, s->msg_length_profile[3]},
|
|
|
|
{TIPC_NLA_STATS_MSG_LEN_P4, s->msg_length_profile[4]},
|
|
|
|
{TIPC_NLA_STATS_MSG_LEN_P5, s->msg_length_profile[5]},
|
|
|
|
{TIPC_NLA_STATS_MSG_LEN_P6, s->msg_length_profile[6]},
|
|
|
|
{TIPC_NLA_STATS_RX_STATES, s->recv_states},
|
|
|
|
{TIPC_NLA_STATS_RX_PROBES, s->recv_probes},
|
|
|
|
{TIPC_NLA_STATS_RX_NACKS, s->recv_nacks},
|
|
|
|
{TIPC_NLA_STATS_RX_DEFERRED, s->deferred_recv},
|
|
|
|
{TIPC_NLA_STATS_TX_STATES, s->sent_states},
|
|
|
|
{TIPC_NLA_STATS_TX_PROBES, s->sent_probes},
|
|
|
|
{TIPC_NLA_STATS_TX_NACKS, s->sent_nacks},
|
|
|
|
{TIPC_NLA_STATS_TX_ACKS, s->sent_acks},
|
|
|
|
{TIPC_NLA_STATS_RETRANSMITTED, s->retransmitted},
|
|
|
|
{TIPC_NLA_STATS_DUPLICATES, s->duplicates},
|
|
|
|
{TIPC_NLA_STATS_LINK_CONGS, s->link_congs},
|
|
|
|
{TIPC_NLA_STATS_MAX_QUEUE, s->max_queue_sz},
|
|
|
|
{TIPC_NLA_STATS_AVG_QUEUE, s->queue_sz_counts ?
|
|
|
|
(s->accu_queue_sz / s->queue_sz_counts) : 0}
|
|
|
|
};
|
|
|
|
|
2019-04-26 17:13:06 +08:00
|
|
|
stats = nla_nest_start_noflag(skb, TIPC_NLA_LINK_STATS);
|
2014-11-20 17:29:12 +08:00
|
|
|
if (!stats)
|
|
|
|
return -EMSGSIZE;
|
|
|
|
|
|
|
|
for (i = 0; i < ARRAY_SIZE(map); i++)
|
|
|
|
if (nla_put_u32(skb, map[i].key, map[i].val))
|
|
|
|
goto msg_full;
|
|
|
|
|
|
|
|
nla_nest_end(skb, stats);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
msg_full:
|
|
|
|
nla_nest_cancel(skb, stats);
|
|
|
|
|
|
|
|
return -EMSGSIZE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Caller should hold appropriate locks to protect the link */
|
2015-11-20 03:30:45 +08:00
|
|
|
int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg,
|
|
|
|
struct tipc_link *link, int nlflags)
|
2014-11-20 17:29:12 +08:00
|
|
|
{
|
2018-03-23 03:42:49 +08:00
|
|
|
u32 self = tipc_own_addr(net);
|
2014-11-20 17:29:12 +08:00
|
|
|
struct nlattr *attrs;
|
|
|
|
struct nlattr *prop;
|
2018-03-23 03:42:49 +08:00
|
|
|
void *hdr;
|
|
|
|
int err;
|
2014-11-20 17:29:12 +08:00
|
|
|
|
2015-02-09 16:50:03 +08:00
|
|
|
hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
|
2015-04-29 00:33:50 +08:00
|
|
|
nlflags, TIPC_NL_LINK_GET);
|
2014-11-20 17:29:12 +08:00
|
|
|
if (!hdr)
|
|
|
|
return -EMSGSIZE;
|
|
|
|
|
2019-04-26 17:13:06 +08:00
|
|
|
attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_LINK);
|
2014-11-20 17:29:12 +08:00
|
|
|
if (!attrs)
|
|
|
|
goto msg_full;
|
|
|
|
|
|
|
|
if (nla_put_string(msg->skb, TIPC_NLA_LINK_NAME, link->name))
|
|
|
|
goto attr_msg_full;
|
2018-03-23 03:42:49 +08:00
|
|
|
if (nla_put_u32(msg->skb, TIPC_NLA_LINK_DEST, tipc_cluster_mask(self)))
|
2014-11-20 17:29:12 +08:00
|
|
|
goto attr_msg_full;
|
tipc: simplify link mtu negotiation
When a link is being established, the two endpoints advertise their
respective interface MTU in the transmitted RESET and ACTIVATE messages.
If there is any difference, the lower of the two MTUs will be selected
for use by both endpoints.
However, as a remnant of earlier attempts to introduce TIPC level
routing. there also exists an MTU discovery mechanism. If an intermediate
node has a lower MTU than the two endpoints, they will discover this
through a bisectional approach, and finally adopt this MTU for common use.
Since there is no TIPC level routing, and probably never will be,
this mechanism doesn't make any sense, and only serves to make the
link level protocol unecessarily complex.
In this commit, we eliminate the MTU discovery algorithm,and fall back
to the simple MTU advertising approach. This change is fully backwards
compatible.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-04-02 21:33:02 +08:00
|
|
|
if (nla_put_u32(msg->skb, TIPC_NLA_LINK_MTU, link->mtu))
|
2014-11-20 17:29:12 +08:00
|
|
|
goto attr_msg_full;
|
2016-11-25 23:35:02 +08:00
|
|
|
if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, link->stats.recv_pkts))
|
2014-11-20 17:29:12 +08:00
|
|
|
goto attr_msg_full;
|
2016-11-25 23:35:02 +08:00
|
|
|
if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, link->stats.sent_pkts))
|
2014-11-20 17:29:12 +08:00
|
|
|
goto attr_msg_full;
|
|
|
|
|
|
|
|
if (tipc_link_is_up(link))
|
|
|
|
if (nla_put_flag(msg->skb, TIPC_NLA_LINK_UP))
|
|
|
|
goto attr_msg_full;
|
2015-10-22 20:51:46 +08:00
|
|
|
if (link->active)
|
2014-11-20 17:29:12 +08:00
|
|
|
if (nla_put_flag(msg->skb, TIPC_NLA_LINK_ACTIVE))
|
|
|
|
goto attr_msg_full;
|
|
|
|
|
2019-04-26 17:13:06 +08:00
|
|
|
prop = nla_nest_start_noflag(msg->skb, TIPC_NLA_LINK_PROP);
|
2014-11-20 17:29:12 +08:00
|
|
|
if (!prop)
|
|
|
|
goto attr_msg_full;
|
|
|
|
if (nla_put_u32(msg->skb, TIPC_NLA_PROP_PRIO, link->priority))
|
|
|
|
goto prop_msg_full;
|
|
|
|
if (nla_put_u32(msg->skb, TIPC_NLA_PROP_TOL, link->tolerance))
|
|
|
|
goto prop_msg_full;
|
|
|
|
if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN,
|
tipc: introduce starvation free send algorithm
Currently, we only use a single counter; the length of the backlog
queue, to determine whether a message should be accepted to the queue
or not. Each time a message is being sent, the queue length is compared
to a threshold value for the message's importance priority. If the queue
length is beyond this threshold, the message is rejected. This algorithm
implies a risk of starvation of low importance senders during very high
load, because it may take a long time before the backlog queue has
decreased enough to accept a lower level message.
We now eliminate this risk by introducing a counter for each importance
priority. When a message is sent, we check only the queue level for that
particular message's priority. If that is ok, the message can be added
to the backlog, irrespective of the queue level for other priorities.
This way, each level is guaranteed a certain portion of the total
bandwidth, and any risk of starvation is eliminated.
Reviewed-by: Ying Xue <ying.xue@windriver.com>
Reviewed-by: Erik Hugne <erik.hugne@ericsson.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-03-26 00:07:24 +08:00
|
|
|
link->window))
|
2014-11-20 17:29:12 +08:00
|
|
|
goto prop_msg_full;
|
|
|
|
if (nla_put_u32(msg->skb, TIPC_NLA_PROP_PRIO, link->priority))
|
|
|
|
goto prop_msg_full;
|
|
|
|
nla_nest_end(msg->skb, prop);
|
|
|
|
|
|
|
|
err = __tipc_nl_add_stats(msg->skb, &link->stats);
|
|
|
|
if (err)
|
|
|
|
goto attr_msg_full;
|
|
|
|
|
|
|
|
nla_nest_end(msg->skb, attrs);
|
|
|
|
genlmsg_end(msg->skb, hdr);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
prop_msg_full:
|
|
|
|
nla_nest_cancel(msg->skb, prop);
|
|
|
|
attr_msg_full:
|
|
|
|
nla_nest_cancel(msg->skb, attrs);
|
|
|
|
msg_full:
|
|
|
|
genlmsg_cancel(msg->skb, hdr);
|
|
|
|
|
|
|
|
return -EMSGSIZE;
|
|
|
|
}
|
2015-11-20 03:30:46 +08:00
|
|
|
|
|
|
|
static int __tipc_nl_add_bc_link_stat(struct sk_buff *skb,
|
|
|
|
struct tipc_stats *stats)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
struct nlattr *nest;
|
|
|
|
|
|
|
|
struct nla_map {
|
|
|
|
__u32 key;
|
|
|
|
__u32 val;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct nla_map map[] = {
|
2016-11-25 23:35:02 +08:00
|
|
|
{TIPC_NLA_STATS_RX_INFO, stats->recv_pkts},
|
2015-11-20 03:30:46 +08:00
|
|
|
{TIPC_NLA_STATS_RX_FRAGMENTS, stats->recv_fragments},
|
|
|
|
{TIPC_NLA_STATS_RX_FRAGMENTED, stats->recv_fragmented},
|
|
|
|
{TIPC_NLA_STATS_RX_BUNDLES, stats->recv_bundles},
|
|
|
|
{TIPC_NLA_STATS_RX_BUNDLED, stats->recv_bundled},
|
2016-11-25 23:35:02 +08:00
|
|
|
{TIPC_NLA_STATS_TX_INFO, stats->sent_pkts},
|
2015-11-20 03:30:46 +08:00
|
|
|
{TIPC_NLA_STATS_TX_FRAGMENTS, stats->sent_fragments},
|
|
|
|
{TIPC_NLA_STATS_TX_FRAGMENTED, stats->sent_fragmented},
|
|
|
|
{TIPC_NLA_STATS_TX_BUNDLES, stats->sent_bundles},
|
|
|
|
{TIPC_NLA_STATS_TX_BUNDLED, stats->sent_bundled},
|
|
|
|
{TIPC_NLA_STATS_RX_NACKS, stats->recv_nacks},
|
|
|
|
{TIPC_NLA_STATS_RX_DEFERRED, stats->deferred_recv},
|
|
|
|
{TIPC_NLA_STATS_TX_NACKS, stats->sent_nacks},
|
|
|
|
{TIPC_NLA_STATS_TX_ACKS, stats->sent_acks},
|
|
|
|
{TIPC_NLA_STATS_RETRANSMITTED, stats->retransmitted},
|
|
|
|
{TIPC_NLA_STATS_DUPLICATES, stats->duplicates},
|
|
|
|
{TIPC_NLA_STATS_LINK_CONGS, stats->link_congs},
|
|
|
|
{TIPC_NLA_STATS_MAX_QUEUE, stats->max_queue_sz},
|
|
|
|
{TIPC_NLA_STATS_AVG_QUEUE, stats->queue_sz_counts ?
|
|
|
|
(stats->accu_queue_sz / stats->queue_sz_counts) : 0}
|
|
|
|
};
|
|
|
|
|
2019-04-26 17:13:06 +08:00
|
|
|
nest = nla_nest_start_noflag(skb, TIPC_NLA_LINK_STATS);
|
2015-11-20 03:30:46 +08:00
|
|
|
if (!nest)
|
|
|
|
return -EMSGSIZE;
|
|
|
|
|
|
|
|
for (i = 0; i < ARRAY_SIZE(map); i++)
|
|
|
|
if (nla_put_u32(skb, map[i].key, map[i].val))
|
|
|
|
goto msg_full;
|
|
|
|
|
|
|
|
nla_nest_end(skb, nest);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
msg_full:
|
|
|
|
nla_nest_cancel(skb, nest);
|
|
|
|
|
|
|
|
return -EMSGSIZE;
|
|
|
|
}
|
|
|
|
|
2020-05-26 17:38:37 +08:00
|
|
|
int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg,
|
|
|
|
struct tipc_link *bcl)
|
2015-11-20 03:30:46 +08:00
|
|
|
{
|
|
|
|
int err;
|
|
|
|
void *hdr;
|
|
|
|
struct nlattr *attrs;
|
|
|
|
struct nlattr *prop;
|
tipc: update a binding service via broadcast
Currently, updating binding table (add service binding to
name table/withdraw a service binding) is being sent over replicast.
However, if we are scaling up clusters to > 100 nodes/containers this
method is less affection because of looping through nodes in a cluster one
by one.
It is worth to use broadcast to update a binding service. This way, the
binding table can be updated on all peer nodes in one shot.
Broadcast is used when all peer nodes, as indicated by a new capability
flag TIPC_NAMED_BCAST, support reception of this message type.
Four problems need to be considered when introducing this feature.
1) When establishing a link to a new peer node we still update this by a
unicast 'bulk' update. This may lead to race conditions, where a later
broadcast publication/withdrawal bypass the 'bulk', resulting in
disordered publications, or even that a withdrawal may arrive before the
corresponding publication. We solve this by adding an 'is_last_bulk' bit
in the last bulk messages so that it can be distinguished from all other
messages. Only when this message has arrived do we open up for reception
of broadcast publications/withdrawals.
2) When a first legacy node is added to the cluster all distribution
will switch over to use the legacy 'replicast' method, while the
opposite happens when the last legacy node leaves the cluster. This
entails another risk of message disordering that has to be handled. We
solve this by adding a sequence number to the broadcast/replicast
messages, so that disordering can be discovered and corrected. Note
however that we don't need to consider potential message loss or
duplication at this protocol level.
3) Bulk messages don't contain any sequence numbers, and will always
arrive in order. Hence we must exempt those from the sequence number
control and deliver them unconditionally. We solve this by adding a new
'is_bulk' bit in those messages so that they can be recognized.
4) Legacy messages, which don't contain any new bits or sequence
numbers, but neither can arrive out of order, also need to be exempt
from the initial synchronization and sequence number check, and
delivered unconditionally. Therefore, we add another 'is_not_legacy' bit
to all new messages so that those can be distinguished from legacy
messages and the latter delivered directly.
v1->v2:
- fix warning issue reported by kbuild test robot <lkp@intel.com>
- add santiy check to drop the publication message with a sequence
number that is lower than the agreed synch point
Signed-off-by: kernel test robot <lkp@intel.com>
Signed-off-by: Hoang Huu Le <hoang.h.le@dektech.com.au>
Acked-by: Jon Maloy <jmaloy@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-06-17 14:56:05 +08:00
|
|
|
u32 bc_mode = tipc_bcast_get_mode(net);
|
tipc: support broadcast/replicast configurable for bc-link
Currently, a multicast stream uses either broadcast or replicast as
transmission method, based on the ratio between number of actual
destinations nodes and cluster size.
However, when an L2 interface (e.g., VXLAN) provides pseudo
broadcast support, this becomes very inefficient, as it blindly
replicates multicast packets to all cluster/subnet nodes,
irrespective of whether they host actual target sockets or not.
The TIPC multicast algorithm is able to distinguish real destination
nodes from other nodes, and hence provides a smarter and more
efficient method for transferring multicast messages than
pseudo broadcast can do.
Because of this, we now make it possible for users to force
the broadcast link to permanently switch to using replicast,
irrespective of which capabilities the bearer provides,
or pretend to provide.
Conversely, we also make it possible to force the broadcast link
to always use true broadcast. While maybe less useful in
deployed systems, this may at least be useful for testing the
broadcast algorithm in small clusters.
We retain the current AUTOSELECT ability, i.e., to let the broadcast link
automatically select which algorithm to use, and to switch back and forth
between broadcast and replicast as the ratio between destination
node number and cluster size changes. This remains the default method.
Furthermore, we make it possible to configure the threshold ratio for
such switches. The default ratio is now set to 10%, down from 25% in the
earlier implementation.
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Hoang Le <hoang.h.le@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-03-19 19:49:48 +08:00
|
|
|
u32 bc_ratio = tipc_bcast_get_broadcast_ratio(net);
|
2015-11-20 03:30:46 +08:00
|
|
|
|
|
|
|
if (!bcl)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
tipc_bcast_lock(net);
|
|
|
|
|
|
|
|
hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
|
|
|
|
NLM_F_MULTI, TIPC_NL_LINK_GET);
|
2016-02-18 00:47:35 +08:00
|
|
|
if (!hdr) {
|
|
|
|
tipc_bcast_unlock(net);
|
2015-11-20 03:30:46 +08:00
|
|
|
return -EMSGSIZE;
|
2016-02-18 00:47:35 +08:00
|
|
|
}
|
2015-11-20 03:30:46 +08:00
|
|
|
|
2019-04-26 17:13:06 +08:00
|
|
|
attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_LINK);
|
2015-11-20 03:30:46 +08:00
|
|
|
if (!attrs)
|
|
|
|
goto msg_full;
|
|
|
|
|
|
|
|
/* The broadcast link is always up */
|
|
|
|
if (nla_put_flag(msg->skb, TIPC_NLA_LINK_UP))
|
|
|
|
goto attr_msg_full;
|
|
|
|
|
|
|
|
if (nla_put_flag(msg->skb, TIPC_NLA_LINK_BROADCAST))
|
|
|
|
goto attr_msg_full;
|
|
|
|
if (nla_put_string(msg->skb, TIPC_NLA_LINK_NAME, bcl->name))
|
|
|
|
goto attr_msg_full;
|
2016-11-25 23:35:02 +08:00
|
|
|
if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, 0))
|
2015-11-20 03:30:46 +08:00
|
|
|
goto attr_msg_full;
|
2016-11-25 23:35:02 +08:00
|
|
|
if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, 0))
|
2015-11-20 03:30:46 +08:00
|
|
|
goto attr_msg_full;
|
|
|
|
|
2019-04-26 17:13:06 +08:00
|
|
|
prop = nla_nest_start_noflag(msg->skb, TIPC_NLA_LINK_PROP);
|
2015-11-20 03:30:46 +08:00
|
|
|
if (!prop)
|
|
|
|
goto attr_msg_full;
|
tipc: introduce variable window congestion control
We introduce a simple variable window congestion control for links.
The algorithm is inspired by the Reno algorithm, covering both 'slow
start', 'congestion avoidance', and 'fast recovery' modes.
- We introduce hard lower and upper window limits per link, still
different and configurable per bearer type.
- We introduce a 'slow start theshold' variable, initially set to
the maximum window size.
- We let a link start at the minimum congestion window, i.e. in slow
start mode, and then let is grow rapidly (+1 per rceived ACK) until
it reaches the slow start threshold and enters congestion avoidance
mode.
- In congestion avoidance mode we increment the congestion window for
each window-size number of acked packets, up to a possible maximum
equal to the configured maximum window.
- For each non-duplicate NACK received, we drop back to fast recovery
mode, by setting the both the slow start threshold to and the
congestion window to (current_congestion_window / 2).
- If the timeout handler finds that the transmit queue has not moved
since the previous timeout, it drops the link back to slow start
and forces a probe containing the last sent sequence number to the
sent to the peer, so that this can discover the stale situation.
This change does in reality have effect only on unicast ethernet
transport, as we have seen that there is no room whatsoever for
increasing the window max size for the UDP bearer.
For now, we also choose to keep the limits for the broadcast link
unchanged and equal.
This algorithm seems to give a 50-100% throughput improvement for
messages larger than MTU.
Suggested-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 07:52:46 +08:00
|
|
|
if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bcl->max_win))
|
2015-11-20 03:30:46 +08:00
|
|
|
goto prop_msg_full;
|
tipc: support broadcast/replicast configurable for bc-link
Currently, a multicast stream uses either broadcast or replicast as
transmission method, based on the ratio between number of actual
destinations nodes and cluster size.
However, when an L2 interface (e.g., VXLAN) provides pseudo
broadcast support, this becomes very inefficient, as it blindly
replicates multicast packets to all cluster/subnet nodes,
irrespective of whether they host actual target sockets or not.
The TIPC multicast algorithm is able to distinguish real destination
nodes from other nodes, and hence provides a smarter and more
efficient method for transferring multicast messages than
pseudo broadcast can do.
Because of this, we now make it possible for users to force
the broadcast link to permanently switch to using replicast,
irrespective of which capabilities the bearer provides,
or pretend to provide.
Conversely, we also make it possible to force the broadcast link
to always use true broadcast. While maybe less useful in
deployed systems, this may at least be useful for testing the
broadcast algorithm in small clusters.
We retain the current AUTOSELECT ability, i.e., to let the broadcast link
automatically select which algorithm to use, and to switch back and forth
between broadcast and replicast as the ratio between destination
node number and cluster size changes. This remains the default method.
Furthermore, we make it possible to configure the threshold ratio for
such switches. The default ratio is now set to 10%, down from 25% in the
earlier implementation.
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Hoang Le <hoang.h.le@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2019-03-19 19:49:48 +08:00
|
|
|
if (nla_put_u32(msg->skb, TIPC_NLA_PROP_BROADCAST, bc_mode))
|
|
|
|
goto prop_msg_full;
|
|
|
|
if (bc_mode & BCLINK_MODE_SEL)
|
|
|
|
if (nla_put_u32(msg->skb, TIPC_NLA_PROP_BROADCAST_RATIO,
|
|
|
|
bc_ratio))
|
|
|
|
goto prop_msg_full;
|
2015-11-20 03:30:46 +08:00
|
|
|
nla_nest_end(msg->skb, prop);
|
|
|
|
|
|
|
|
err = __tipc_nl_add_bc_link_stat(msg->skb, &bcl->stats);
|
|
|
|
if (err)
|
|
|
|
goto attr_msg_full;
|
|
|
|
|
|
|
|
tipc_bcast_unlock(net);
|
|
|
|
nla_nest_end(msg->skb, attrs);
|
|
|
|
genlmsg_end(msg->skb, hdr);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
prop_msg_full:
|
|
|
|
nla_nest_cancel(msg->skb, prop);
|
|
|
|
attr_msg_full:
|
|
|
|
nla_nest_cancel(msg->skb, attrs);
|
|
|
|
msg_full:
|
|
|
|
tipc_bcast_unlock(net);
|
|
|
|
genlmsg_cancel(msg->skb, hdr);
|
|
|
|
|
|
|
|
return -EMSGSIZE;
|
|
|
|
}
|
|
|
|
|
2016-02-01 15:19:56 +08:00
|
|
|
void tipc_link_set_tolerance(struct tipc_link *l, u32 tol,
|
|
|
|
struct sk_buff_head *xmitq)
|
2015-11-20 03:30:46 +08:00
|
|
|
{
|
|
|
|
l->tolerance = tol;
|
2018-10-10 23:34:01 +08:00
|
|
|
if (l->bc_rcvlink)
|
|
|
|
l->bc_rcvlink->tolerance = tol;
|
2018-02-14 20:34:39 +08:00
|
|
|
if (link_is_up(l))
|
|
|
|
tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, tol, 0, xmitq);
|
2015-11-20 03:30:46 +08:00
|
|
|
}
|
|
|
|
|
2016-02-01 15:19:56 +08:00
|
|
|
void tipc_link_set_prio(struct tipc_link *l, u32 prio,
|
|
|
|
struct sk_buff_head *xmitq)
|
2015-11-20 03:30:46 +08:00
|
|
|
{
|
|
|
|
l->priority = prio;
|
tipc: improve link resiliency when rps is activated
Currently, the TIPC RPS dissector is based only on the incoming packets'
source node address, hence steering all traffic from a node to the same
core. We have seen that this makes the links vulnerable to starvation
and unnecessary resets when we turn down the link tolerance to very low
values.
To reduce the risk of this happening, we exempt probe and probe replies
packets from the convergence to one core per source node. Instead, we do
the opposite, - we try to diverge those packets across as many cores as
possible, by randomizing the flow selector key.
To make such packets identifiable to the dissector, we add a new
'is_keepalive' bit to word 0 of the LINK_PROTOCOL header. This bit is
set both for PROBE and PROBE_REPLY messages, and only for those.
It should be noted that these packets are not part of any flow anyway,
and only constitute a minuscule fraction of all packets sent across a
link. Hence, there is no risk that this will affect overall performance.
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-11-08 16:59:26 +08:00
|
|
|
tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, prio, xmitq);
|
2015-11-20 03:30:46 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
void tipc_link_set_abort_limit(struct tipc_link *l, u32 limit)
|
|
|
|
{
|
|
|
|
l->abort_limit = limit;
|
|
|
|
}
|
tipc: enable tracepoints in tipc
As for the sake of debugging/tracing, the commit enables tracepoints in
TIPC along with some general trace_events as shown below. It also
defines some 'tipc_*_dump()' functions that allow to dump TIPC object
data whenever needed, that is, for general debug purposes, ie. not just
for the trace_events.
The following trace_events are now available:
- trace_tipc_skb_dump(): allows to trace and dump TIPC msg & skb data,
e.g. message type, user, droppable, skb truesize, cloned skb, etc.
- trace_tipc_list_dump(): allows to trace and dump any TIPC buffers or
queues, e.g. TIPC link transmq, socket receive queue, etc.
- trace_tipc_sk_dump(): allows to trace and dump TIPC socket data, e.g.
sk state, sk type, connection type, rmem_alloc, socket queues, etc.
- trace_tipc_link_dump(): allows to trace and dump TIPC link data, e.g.
link state, silent_intv_cnt, gap, bc_gap, link queues, etc.
- trace_tipc_node_dump(): allows to trace and dump TIPC node data, e.g.
node state, active links, capabilities, link entries, etc.
How to use:
Put the trace functions at any places where we want to dump TIPC data
or events.
Note:
a) The dump functions will generate raw data only, that is, to offload
the trace event's processing, it can require a tool or script to parse
the data but this should be simple.
b) The trace_tipc_*_dump() should be reserved for a failure cases only
(e.g. the retransmission failure case) or where we do not expect to
happen too often, then we can consider enabling these events by default
since they will almost not take any effects under normal conditions,
but once the rare condition or failure occurs, we get the dumped data
fully for post-analysis.
For other trace purposes, we can reuse these trace classes as template
but different events.
c) A trace_event is only effective when we enable it. To enable the
TIPC trace_events, echo 1 to 'enable' files in the events/tipc/
directory in the 'debugfs' file system. Normally, they are located at:
/sys/kernel/debug/tracing/events/tipc/
For example:
To enable the tipc_link_dump event:
echo 1 > /sys/kernel/debug/tracing/events/tipc/tipc_link_dump/enable
To enable all the TIPC trace_events:
echo 1 > /sys/kernel/debug/tracing/events/tipc/enable
To collect the trace data:
cat trace
or
cat trace_pipe > /trace.out &
To disable all the TIPC trace_events:
echo 0 > /sys/kernel/debug/tracing/events/tipc/enable
To clear the trace buffer:
echo > trace
d) Like the other trace_events, the feature like 'filter' or 'trigger'
is also usable for the tipc trace_events.
For more details, have a look at:
Documentation/trace/ftrace.txt
MAINTAINERS | add two new files 'trace.h' & 'trace.c' in tipc
Acked-by: Ying Xue <ying.xue@windriver.com>
Tested-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-12-19 10:17:56 +08:00
|
|
|
|
|
|
|
/**
|
|
|
|
* tipc_link_dump - dump TIPC link data
|
|
|
|
* @l: tipc link to be dumped
|
|
|
|
* @dqueues: bitmask to decide if any link queue to be dumped?
|
|
|
|
* - TIPC_DUMP_NONE: don't dump link queues
|
|
|
|
* - TIPC_DUMP_TRANSMQ: dump link transmq queue
|
|
|
|
* - TIPC_DUMP_BACKLOGQ: dump link backlog queue
|
|
|
|
* - TIPC_DUMP_DEFERDQ: dump link deferd queue
|
|
|
|
* - TIPC_DUMP_INPUTQ: dump link input queue
|
|
|
|
* - TIPC_DUMP_WAKEUP: dump link wakeup queue
|
|
|
|
* - TIPC_DUMP_ALL: dump all the link queues above
|
|
|
|
* @buf: returned buffer of dump data in format
|
|
|
|
*/
|
|
|
|
int tipc_link_dump(struct tipc_link *l, u16 dqueues, char *buf)
|
|
|
|
{
|
|
|
|
int i = 0;
|
|
|
|
size_t sz = (dqueues) ? LINK_LMAX : LINK_LMIN;
|
|
|
|
struct sk_buff_head *list;
|
|
|
|
struct sk_buff *hskb, *tskb;
|
|
|
|
u32 len;
|
|
|
|
|
|
|
|
if (!l) {
|
|
|
|
i += scnprintf(buf, sz, "link data: (null)\n");
|
|
|
|
return i;
|
|
|
|
}
|
|
|
|
|
|
|
|
i += scnprintf(buf, sz, "link data: %x", l->addr);
|
|
|
|
i += scnprintf(buf + i, sz - i, " %x", l->state);
|
|
|
|
i += scnprintf(buf + i, sz - i, " %u", l->in_session);
|
|
|
|
i += scnprintf(buf + i, sz - i, " %u", l->session);
|
|
|
|
i += scnprintf(buf + i, sz - i, " %u", l->peer_session);
|
|
|
|
i += scnprintf(buf + i, sz - i, " %u", l->snd_nxt);
|
|
|
|
i += scnprintf(buf + i, sz - i, " %u", l->rcv_nxt);
|
|
|
|
i += scnprintf(buf + i, sz - i, " %u", l->snd_nxt_state);
|
|
|
|
i += scnprintf(buf + i, sz - i, " %u", l->rcv_nxt_state);
|
|
|
|
i += scnprintf(buf + i, sz - i, " %x", l->peer_caps);
|
|
|
|
i += scnprintf(buf + i, sz - i, " %u", l->silent_intv_cnt);
|
|
|
|
i += scnprintf(buf + i, sz - i, " %u", l->rst_cnt);
|
2019-08-15 11:24:08 +08:00
|
|
|
i += scnprintf(buf + i, sz - i, " %u", 0);
|
2019-06-25 23:36:43 +08:00
|
|
|
i += scnprintf(buf + i, sz - i, " %u", 0);
|
tipc: enable tracepoints in tipc
As for the sake of debugging/tracing, the commit enables tracepoints in
TIPC along with some general trace_events as shown below. It also
defines some 'tipc_*_dump()' functions that allow to dump TIPC object
data whenever needed, that is, for general debug purposes, ie. not just
for the trace_events.
The following trace_events are now available:
- trace_tipc_skb_dump(): allows to trace and dump TIPC msg & skb data,
e.g. message type, user, droppable, skb truesize, cloned skb, etc.
- trace_tipc_list_dump(): allows to trace and dump any TIPC buffers or
queues, e.g. TIPC link transmq, socket receive queue, etc.
- trace_tipc_sk_dump(): allows to trace and dump TIPC socket data, e.g.
sk state, sk type, connection type, rmem_alloc, socket queues, etc.
- trace_tipc_link_dump(): allows to trace and dump TIPC link data, e.g.
link state, silent_intv_cnt, gap, bc_gap, link queues, etc.
- trace_tipc_node_dump(): allows to trace and dump TIPC node data, e.g.
node state, active links, capabilities, link entries, etc.
How to use:
Put the trace functions at any places where we want to dump TIPC data
or events.
Note:
a) The dump functions will generate raw data only, that is, to offload
the trace event's processing, it can require a tool or script to parse
the data but this should be simple.
b) The trace_tipc_*_dump() should be reserved for a failure cases only
(e.g. the retransmission failure case) or where we do not expect to
happen too often, then we can consider enabling these events by default
since they will almost not take any effects under normal conditions,
but once the rare condition or failure occurs, we get the dumped data
fully for post-analysis.
For other trace purposes, we can reuse these trace classes as template
but different events.
c) A trace_event is only effective when we enable it. To enable the
TIPC trace_events, echo 1 to 'enable' files in the events/tipc/
directory in the 'debugfs' file system. Normally, they are located at:
/sys/kernel/debug/tracing/events/tipc/
For example:
To enable the tipc_link_dump event:
echo 1 > /sys/kernel/debug/tracing/events/tipc/tipc_link_dump/enable
To enable all the TIPC trace_events:
echo 1 > /sys/kernel/debug/tracing/events/tipc/enable
To collect the trace data:
cat trace
or
cat trace_pipe > /trace.out &
To disable all the TIPC trace_events:
echo 0 > /sys/kernel/debug/tracing/events/tipc/enable
To clear the trace buffer:
echo > trace
d) Like the other trace_events, the feature like 'filter' or 'trigger'
is also usable for the tipc trace_events.
For more details, have a look at:
Documentation/trace/ftrace.txt
MAINTAINERS | add two new files 'trace.h' & 'trace.c' in tipc
Acked-by: Ying Xue <ying.xue@windriver.com>
Tested-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-12-19 10:17:56 +08:00
|
|
|
i += scnprintf(buf + i, sz - i, " %u", l->acked);
|
|
|
|
|
|
|
|
list = &l->transmq;
|
|
|
|
len = skb_queue_len(list);
|
|
|
|
hskb = skb_peek(list);
|
|
|
|
tskb = skb_peek_tail(list);
|
|
|
|
i += scnprintf(buf + i, sz - i, " | %u %u %u", len,
|
|
|
|
(hskb) ? msg_seqno(buf_msg(hskb)) : 0,
|
|
|
|
(tskb) ? msg_seqno(buf_msg(tskb)) : 0);
|
|
|
|
|
|
|
|
list = &l->deferdq;
|
|
|
|
len = skb_queue_len(list);
|
|
|
|
hskb = skb_peek(list);
|
|
|
|
tskb = skb_peek_tail(list);
|
|
|
|
i += scnprintf(buf + i, sz - i, " | %u %u %u", len,
|
|
|
|
(hskb) ? msg_seqno(buf_msg(hskb)) : 0,
|
|
|
|
(tskb) ? msg_seqno(buf_msg(tskb)) : 0);
|
|
|
|
|
|
|
|
list = &l->backlogq;
|
|
|
|
len = skb_queue_len(list);
|
|
|
|
hskb = skb_peek(list);
|
|
|
|
tskb = skb_peek_tail(list);
|
|
|
|
i += scnprintf(buf + i, sz - i, " | %u %u %u", len,
|
|
|
|
(hskb) ? msg_seqno(buf_msg(hskb)) : 0,
|
|
|
|
(tskb) ? msg_seqno(buf_msg(tskb)) : 0);
|
|
|
|
|
|
|
|
list = l->inputq;
|
|
|
|
len = skb_queue_len(list);
|
|
|
|
hskb = skb_peek(list);
|
|
|
|
tskb = skb_peek_tail(list);
|
|
|
|
i += scnprintf(buf + i, sz - i, " | %u %u %u\n", len,
|
|
|
|
(hskb) ? msg_seqno(buf_msg(hskb)) : 0,
|
|
|
|
(tskb) ? msg_seqno(buf_msg(tskb)) : 0);
|
|
|
|
|
|
|
|
if (dqueues & TIPC_DUMP_TRANSMQ) {
|
|
|
|
i += scnprintf(buf + i, sz - i, "transmq: ");
|
|
|
|
i += tipc_list_dump(&l->transmq, false, buf + i);
|
|
|
|
}
|
|
|
|
if (dqueues & TIPC_DUMP_BACKLOGQ) {
|
|
|
|
i += scnprintf(buf + i, sz - i,
|
|
|
|
"backlogq: <%u %u %u %u %u>, ",
|
|
|
|
l->backlog[TIPC_LOW_IMPORTANCE].len,
|
|
|
|
l->backlog[TIPC_MEDIUM_IMPORTANCE].len,
|
|
|
|
l->backlog[TIPC_HIGH_IMPORTANCE].len,
|
|
|
|
l->backlog[TIPC_CRITICAL_IMPORTANCE].len,
|
|
|
|
l->backlog[TIPC_SYSTEM_IMPORTANCE].len);
|
|
|
|
i += tipc_list_dump(&l->backlogq, false, buf + i);
|
|
|
|
}
|
|
|
|
if (dqueues & TIPC_DUMP_DEFERDQ) {
|
|
|
|
i += scnprintf(buf + i, sz - i, "deferdq: ");
|
|
|
|
i += tipc_list_dump(&l->deferdq, false, buf + i);
|
|
|
|
}
|
|
|
|
if (dqueues & TIPC_DUMP_INPUTQ) {
|
|
|
|
i += scnprintf(buf + i, sz - i, "inputq: ");
|
|
|
|
i += tipc_list_dump(l->inputq, false, buf + i);
|
|
|
|
}
|
|
|
|
if (dqueues & TIPC_DUMP_WAKEUP) {
|
|
|
|
i += scnprintf(buf + i, sz - i, "wakeup: ");
|
|
|
|
i += tipc_list_dump(&l->wakeupq, false, buf + i);
|
|
|
|
}
|
|
|
|
|
|
|
|
return i;
|
|
|
|
}
|