2005-04-17 06:20:36 +08:00
|
|
|
/*
|
2005-08-12 23:51:49 +08:00
|
|
|
* inet_diag.c Module for monitoring INET transport protocols sockets.
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
|
|
|
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU General Public License
|
|
|
|
* as published by the Free Software Foundation; either version
|
|
|
|
* 2 of the License, or (at your option) any later version.
|
|
|
|
*/
|
|
|
|
|
2007-08-29 06:50:33 +08:00
|
|
|
#include <linux/kernel.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/types.h>
|
|
|
|
#include <linux/fcntl.h>
|
|
|
|
#include <linux/random.h>
|
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the following.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and try to put the new include such that its order conforms
to its surrounding. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
wildly available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build test were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 16:04:11 +08:00
|
|
|
#include <linux/slab.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/cache.h>
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/time.h>
|
|
|
|
|
|
|
|
#include <net/icmp.h>
|
|
|
|
#include <net/tcp.h>
|
|
|
|
#include <net/ipv6.h>
|
|
|
|
#include <net/inet_common.h>
|
2005-08-12 20:19:38 +08:00
|
|
|
#include <net/inet_connection_sock.h>
|
|
|
|
#include <net/inet_hashtables.h>
|
|
|
|
#include <net/inet_timewait_sock.h>
|
|
|
|
#include <net/inet6_hashtables.h>
|
2007-03-26 14:06:12 +08:00
|
|
|
#include <net/netlink.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#include <linux/inet.h>
|
|
|
|
#include <linux/stddef.h>
|
|
|
|
|
2005-08-12 23:56:38 +08:00
|
|
|
#include <linux/inet_diag.h>
|
2011-12-06 15:58:03 +08:00
|
|
|
#include <linux/sock_diag.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-08-12 20:27:49 +08:00
|
|
|
/*
 * Table of registered diag handlers, indexed by protocol number
 * (see inet_diag_lock_handler(), which looks entries up by 'proto').
 * A NULL slot means no handler is currently registered for that protocol.
 */
static const struct inet_diag_handler **inet_diag_table;
|
|
|
|
|
2005-08-12 23:51:49 +08:00
|
|
|
struct inet_diag_entry {
|
2006-09-28 09:44:30 +08:00
|
|
|
__be32 *saddr;
|
|
|
|
__be32 *daddr;
|
2005-04-17 06:20:36 +08:00
|
|
|
u16 sport;
|
|
|
|
u16 dport;
|
|
|
|
u16 family;
|
|
|
|
u16 userlocks;
|
2012-12-09 03:43:21 +08:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
|
|
struct in6_addr saddr_storage; /* for IPv4-mapped-IPv6 addresses */
|
|
|
|
struct in6_addr daddr_storage; /* for IPv4-mapped-IPv6 addresses */
|
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
2007-12-03 12:51:25 +08:00
|
|
|
/*
 * Taken by inet_diag_lock_handler() and dropped by
 * inet_diag_unlock_handler(); held while a handler from inet_diag_table
 * is in use.
 */
static DEFINE_MUTEX(inet_diag_table_mutex);
|
|
|
|
|
2011-12-06 16:05:24 +08:00
|
|
|
static const struct inet_diag_handler *inet_diag_lock_handler(int proto)
|
2007-12-03 12:51:25 +08:00
|
|
|
{
|
2011-12-06 16:05:24 +08:00
|
|
|
if (!inet_diag_table[proto])
|
2011-12-15 10:43:27 +08:00
|
|
|
request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
|
|
|
|
NETLINK_SOCK_DIAG, AF_INET, proto);
|
2007-12-03 12:51:25 +08:00
|
|
|
|
|
|
|
mutex_lock(&inet_diag_table_mutex);
|
2011-12-06 16:05:24 +08:00
|
|
|
if (!inet_diag_table[proto])
|
2007-12-03 12:51:25 +08:00
|
|
|
return ERR_PTR(-ENOENT);
|
|
|
|
|
2011-12-06 16:05:24 +08:00
|
|
|
return inet_diag_table[proto];
|
2007-12-03 12:51:25 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void inet_diag_unlock_handler(
|
|
|
|
const struct inet_diag_handler *handler)
|
|
|
|
{
|
|
|
|
mutex_unlock(&inet_diag_table_mutex);
|
|
|
|
}
|
|
|
|
|
2011-12-09 14:23:00 +08:00
|
|
|
/*
 * inet_sk_diag_fill - append one inet_diag_msg describing @sk to @skb.
 *
 * @sk:          socket to report; must not be in TCP_TIME_WAIT (BUG_ON).
 * @icsk:        connection-sock view of @sk, or NULL for sockets without
 *               one; with NULL only the handler's idiag_get_info() is run
 *               and the timer/congestion sections are skipped.
 * @skb:         reply buffer the netlink message is appended to.
 * @req:         request; idiag_ext selects the optional attributes and
 *               sdiag_protocol selects the handler.
 * @user_ns:     namespace used to translate the socket owner's uid.
 * @portid, @seq, @nlmsg_flags: netlink header values for the new message.
 * @unlh:        request header; its nlmsg_type is reused for the reply.
 *
 * Returns 0 on success, or -EMSGSIZE when the header or any attribute
 * does not fit, in which case the partially built message is cancelled.
 *
 * The message payload is not pre-zeroed (hence the explicit memset of
 * idiag_src/idiag_dst below), so every field of struct inet_diag_msg that
 * is not unconditionally assigned later must be initialised here.
 */
int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
		      struct sk_buff *skb, struct inet_diag_req_v2 *req,
		      struct user_namespace *user_ns,
		      u32 portid, u32 seq, u16 nlmsg_flags,
		      const struct nlmsghdr *unlh)
{
	const struct inet_sock *inet = inet_sk(sk);
	struct inet_diag_msg *r;
	struct nlmsghdr *nlh;
	struct nlattr *attr;
	void *info = NULL;
	const struct inet_diag_handler *handler;
	int ext = req->idiag_ext;

	handler = inet_diag_table[req->sdiag_protocol];
	BUG_ON(handler == NULL);

	nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
			nlmsg_flags);
	if (!nlh)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	BUG_ON(sk->sk_state == TCP_TIME_WAIT);

	r->idiag_family = sk->sk_family;
	r->idiag_state = sk->sk_state;
	r->idiag_timer = 0;
	r->idiag_retrans = 0;
	/*
	 * Must be initialised here: on the icsk == NULL path below nothing
	 * else in this function writes idiag_expires, and leaving it unset
	 * would copy stale kernel memory to user space.
	 *
	 * NOTE(review): idiag_rqueue/idiag_wqueue are not written in this
	 * function either; presumably handler->idiag_get_info() fills them
	 * for every protocol - confirm.
	 */
	r->idiag_expires = 0;

	r->id.idiag_if = sk->sk_bound_dev_if;
	sock_diag_save_cookie(sk, r->id.idiag_cookie);

	r->id.idiag_sport = inet->inet_sport;
	r->id.idiag_dport = inet->inet_dport;

	/* Clear all address words; IPv4 only fills element 0 of each. */
	memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
	memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));

	r->id.idiag_src[0] = inet->inet_rcv_saddr;
	r->id.idiag_dst[0] = inet->inet_daddr;

	if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown))
		goto errout;

	/* IPv6 dual-stack sockets use inet->tos for IPv4 connections,
	 * hence this needs to be included regardless of socket family.
	 */
	if (ext & (1 << (INET_DIAG_TOS - 1)))
		if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0)
			goto errout;

#if IS_ENABLED(CONFIG_IPV6)
	if (r->idiag_family == AF_INET6) {
		/* Overwrite the IPv4 words with the full IPv6 addresses. */
		*(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr;
		*(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr;

		if (ext & (1 << (INET_DIAG_TCLASS - 1)))
			if (nla_put_u8(skb, INET_DIAG_TCLASS,
				       inet6_sk(sk)->tclass) < 0)
				goto errout;
	}
#endif

	r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
	r->idiag_inode = sock_i_ino(sk);

	if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
		struct inet_diag_meminfo minfo = {
			.idiag_rmem = sk_rmem_alloc_get(sk),
			.idiag_wmem = sk->sk_wmem_queued,
			.idiag_fmem = sk->sk_forward_alloc,
			.idiag_tmem = sk_wmem_alloc_get(sk),
		};

		if (nla_put(skb, INET_DIAG_MEMINFO, sizeof(minfo), &minfo) < 0)
			goto errout;
	}

	if (ext & (1 << (INET_DIAG_SKMEMINFO - 1)))
		if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
			goto errout;

	/* No connection-sock state: let the handler fill in what it can. */
	if (icsk == NULL) {
		handler->idiag_get_info(sk, r, NULL);
		goto out;
	}

/* Time left until the absolute jiffies value 'tmo', in milliseconds. */
#define EXPIRES_IN_MS(tmo)  DIV_ROUND_UP(((tmo) - jiffies) * 1000, HZ)

	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
		r->idiag_timer = 1;
		r->idiag_retrans = icsk->icsk_retransmits;
		r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
		r->idiag_timer = 4;
		r->idiag_retrans = icsk->icsk_probes_out;
		r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
	} else if (timer_pending(&sk->sk_timer)) {
		r->idiag_timer = 2;
		r->idiag_retrans = icsk->icsk_probes_out;
		r->idiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires);
	} else {
		r->idiag_timer = 0;
		r->idiag_expires = 0;
	}
#undef EXPIRES_IN_MS

	/* Reserve room for tcp_info now; the handler fills it in below. */
	if (ext & (1 << (INET_DIAG_INFO - 1))) {
		attr = nla_reserve(skb, INET_DIAG_INFO,
				   sizeof(struct tcp_info));
		if (!attr)
			goto errout;

		info = nla_data(attr);
	}

	if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops)
		if (nla_put_string(skb, INET_DIAG_CONG,
				   icsk->icsk_ca_ops->name) < 0)
			goto errout;

	handler->idiag_get_info(sk, r, info);

	if (sk->sk_state < TCP_TIME_WAIT &&
	    icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info)
		icsk->icsk_ca_ops->get_info(sk, ext, skb);

out:
	nlmsg_end(skb, nlh);
	return 0;

errout:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
|
|
|
|
|
|
|
|
/*
 * Fill a diag message for a socket that has connection-sock state:
 * derives the inet_connection_sock from @sk and forwards everything to
 * inet_sk_diag_fill().  Returns whatever inet_sk_diag_fill() returns.
 */
static int inet_csk_diag_fill(struct sock *sk,
			      struct sk_buff *skb, struct inet_diag_req_v2 *req,
			      struct user_namespace *user_ns,
			      u32 portid, u32 seq, u16 nlmsg_flags,
			      const struct nlmsghdr *unlh)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	return inet_sk_diag_fill(sk, icsk, skb, req, user_ns,
				 portid, seq, nlmsg_flags, unlh);
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-01-10 06:56:38 +08:00
|
|
|
/*
 * Append one inet_diag_msg describing the timewait socket @tw to @skb.
 *
 * @tw must be in TCP_TIME_WAIT (BUG_ON otherwise).  @req is accepted for
 * signature symmetry with the other fill helpers but is not read here.
 * No optional attributes are emitted; only the fixed inet_diag_msg.
 *
 * Returns 0 on success or -EMSGSIZE when the message does not fit.
 */
static int inet_twsk_diag_fill(struct inet_timewait_sock *tw,
			       struct sk_buff *skb, struct inet_diag_req_v2 *req,
			       u32 portid, u32 seq, u16 nlmsg_flags,
			       const struct nlmsghdr *unlh)
{
	struct nlmsghdr *hdr;
	struct inet_diag_msg *msg;
	s32 remaining;

	hdr = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*msg),
			nlmsg_flags);
	if (hdr == NULL)
		return -EMSGSIZE;

	msg = nlmsg_data(hdr);
	BUG_ON(tw->tw_state != TCP_TIME_WAIT);

	/* Jiffies left until tw_ttd, clamped at zero once it has passed. */
	remaining = tw->tw_ttd - inet_tw_time_stamp();
	if (remaining < 0)
		remaining = 0;

	msg->idiag_family = tw->tw_family;
	msg->idiag_retrans = 0;

	msg->id.idiag_if = tw->tw_bound_dev_if;
	sock_diag_save_cookie(tw, msg->id.idiag_cookie);

	msg->id.idiag_sport = tw->tw_sport;
	msg->id.idiag_dport = tw->tw_dport;

	/* Zero every address word first; IPv4 only uses element 0. */
	memset(&msg->id.idiag_src, 0, sizeof(msg->id.idiag_src));
	memset(&msg->id.idiag_dst, 0, sizeof(msg->id.idiag_dst));

	msg->id.idiag_src[0] = tw->tw_rcv_saddr;
	msg->id.idiag_dst[0] = tw->tw_daddr;

	/* The reported state is the timewait substate; timer type is 3. */
	msg->idiag_state = tw->tw_substate;
	msg->idiag_timer = 3;
	msg->idiag_expires = jiffies_to_msecs(remaining);

	/* Queue sizes, uid and inode are always reported as zero here. */
	msg->idiag_rqueue = 0;
	msg->idiag_wqueue = 0;
	msg->idiag_uid = 0;
	msg->idiag_inode = 0;

#if IS_ENABLED(CONFIG_IPV6)
	if (tw->tw_family == AF_INET6) {
		/* Replace the IPv4 words with the full IPv6 addresses. */
		*(struct in6_addr *)msg->id.idiag_src = tw->tw_v6_rcv_saddr;
		*(struct in6_addr *)msg->id.idiag_dst = tw->tw_v6_daddr;
	}
#endif

	nlmsg_end(skb, hdr);
	return 0;
}
|
|
|
|
|
2006-01-10 06:56:56 +08:00
|
|
|
/*
 * Dispatch to the right fill helper for @sk: timewait sockets go through
 * inet_twsk_diag_fill() (which takes no @user_ns), everything else
 * through inet_csk_diag_fill().  Returns the chosen helper's result.
 */
static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
			struct inet_diag_req_v2 *r,
			struct user_namespace *user_ns,
			u32 portid, u32 seq, u16 nlmsg_flags,
			const struct nlmsghdr *unlh)
{
	if (sk->sk_state != TCP_TIME_WAIT)
		return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq,
					  nlmsg_flags, unlh);

	return inet_twsk_diag_fill(inet_twsk(sk), skb, r, portid, seq,
				   nlmsg_flags, unlh);
}
|
|
|
|
|
2011-12-09 14:23:18 +08:00
|
|
|
int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_skb,
|
2012-01-11 06:36:35 +08:00
|
|
|
const struct nlmsghdr *nlh, struct inet_diag_req_v2 *req)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
int err;
|
|
|
|
struct sock *sk;
|
|
|
|
struct sk_buff *rep;
|
2012-07-16 12:28:49 +08:00
|
|
|
struct net *net = sock_net(in_skb->sk);
|
2007-12-03 12:51:25 +08:00
|
|
|
|
|
|
|
err = -EINVAL;
|
2011-12-06 15:58:39 +08:00
|
|
|
if (req->sdiag_family == AF_INET) {
|
2012-07-16 12:28:49 +08:00
|
|
|
sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0],
|
2005-08-12 23:51:49 +08:00
|
|
|
req->id.idiag_dport, req->id.idiag_src[0],
|
|
|
|
req->id.idiag_sport, req->id.idiag_if);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2011-12-10 17:48:31 +08:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
2011-12-06 15:58:39 +08:00
|
|
|
else if (req->sdiag_family == AF_INET6) {
|
2012-07-16 12:28:49 +08:00
|
|
|
sk = inet6_lookup(net, hashinfo,
|
2005-08-12 23:51:49 +08:00
|
|
|
(struct in6_addr *)req->id.idiag_dst,
|
|
|
|
req->id.idiag_dport,
|
|
|
|
(struct in6_addr *)req->id.idiag_src,
|
|
|
|
req->id.idiag_sport,
|
|
|
|
req->id.idiag_if);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
else {
|
2011-12-09 14:22:10 +08:00
|
|
|
goto out_nosk;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2007-12-03 12:51:25 +08:00
|
|
|
err = -ENOENT;
|
2005-04-17 06:20:36 +08:00
|
|
|
if (sk == NULL)
|
2011-12-09 14:22:10 +08:00
|
|
|
goto out_nosk;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-12-15 10:43:44 +08:00
|
|
|
err = sock_diag_check_cookie(sk, req->id.idiag_cookie);
|
2011-12-09 14:21:53 +08:00
|
|
|
if (err)
|
2005-04-17 06:20:36 +08:00
|
|
|
goto out;
|
|
|
|
|
2012-06-27 07:36:12 +08:00
|
|
|
rep = nlmsg_new(sizeof(struct inet_diag_msg) +
|
|
|
|
sizeof(struct inet_diag_meminfo) +
|
|
|
|
sizeof(struct tcp_info) + 64, GFP_KERNEL);
|
|
|
|
if (!rep) {
|
|
|
|
err = -ENOMEM;
|
2005-04-17 06:20:36 +08:00
|
|
|
goto out;
|
2012-06-27 07:36:12 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-12-06 15:59:32 +08:00
|
|
|
err = sk_diag_fill(sk, rep, req,
|
2013-04-17 14:46:57 +08:00
|
|
|
sk_user_ns(NETLINK_CB(in_skb).sk),
|
2012-09-08 04:12:54 +08:00
|
|
|
NETLINK_CB(in_skb).portid,
|
2007-02-01 15:16:40 +08:00
|
|
|
nlh->nlmsg_seq, 0, nlh);
|
|
|
|
if (err < 0) {
|
|
|
|
WARN_ON(err == -EMSGSIZE);
|
2012-06-27 07:36:12 +08:00
|
|
|
nlmsg_free(rep);
|
2007-02-01 15:16:40 +08:00
|
|
|
goto out;
|
|
|
|
}
|
2012-09-08 04:12:54 +08:00
|
|
|
err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
|
2005-08-12 23:51:49 +08:00
|
|
|
MSG_DONTWAIT);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (err > 0)
|
|
|
|
err = 0;
|
|
|
|
|
|
|
|
out:
|
2013-10-11 23:54:49 +08:00
|
|
|
if (sk)
|
|
|
|
sock_gen_put(sk);
|
|
|
|
|
2011-12-09 14:22:10 +08:00
|
|
|
out_nosk:
|
|
|
|
return err;
|
|
|
|
}
|
2011-12-09 14:23:18 +08:00
|
|
|
EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk);
|
2011-12-09 14:22:10 +08:00
|
|
|
|
|
|
|
static int inet_diag_get_exact(struct sk_buff *in_skb,
|
|
|
|
const struct nlmsghdr *nlh,
|
2012-01-11 06:36:35 +08:00
|
|
|
struct inet_diag_req_v2 *req)
|
2011-12-09 14:22:10 +08:00
|
|
|
{
|
|
|
|
const struct inet_diag_handler *handler;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
handler = inet_diag_lock_handler(req->sdiag_protocol);
|
|
|
|
if (IS_ERR(handler))
|
|
|
|
err = PTR_ERR(handler);
|
|
|
|
else
|
2011-12-09 14:23:18 +08:00
|
|
|
err = handler->dump_one(in_skb, nlh, req);
|
2007-12-03 12:51:25 +08:00
|
|
|
inet_diag_unlock_handler(handler);
|
2011-12-09 14:22:10 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2006-09-28 09:44:30 +08:00
|
|
|
/*
 * Compare the leading @bits bits of the two big-endian word arrays @a1
 * and @a2.
 *
 * Whole 32-bit words are compared with memcmp(); a trailing partial word
 * is compared under a mask covering its top (bits % 32) bits.
 *
 * Returns 1 when the prefixes are equal, 0 otherwise.
 */
static int bitstring_match(const __be32 *a1, const __be32 *a2, int bits)
{
	int full_words = bits >> 5;
	int tail_bits = bits & 0x1f;
	__be32 tail_mask;

	if (full_words && memcmp(a1, a2, full_words << 2))
		return 0;

	/* Prefix ends on a word boundary: nothing left to compare. */
	if (!tail_bits)
		return 1;

	tail_mask = htonl((0xffffffff) << (32 - tail_bits));

	return !((a1[full_words] ^ a2[full_words]) & tail_mask);
}
|
|
|
|
|
|
|
|
|
2011-12-09 14:21:34 +08:00
|
|
|
static int inet_diag_bc_run(const struct nlattr *_bc,
|
|
|
|
const struct inet_diag_entry *entry)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2011-12-09 14:21:34 +08:00
|
|
|
const void *bc = nla_data(_bc);
|
|
|
|
int len = nla_len(_bc);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
while (len > 0) {
|
|
|
|
int yes = 1;
|
2005-08-12 23:51:49 +08:00
|
|
|
const struct inet_diag_bc_op *op = bc;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
switch (op->code) {
|
2005-08-12 23:51:49 +08:00
|
|
|
case INET_DIAG_BC_NOP:
|
2005-04-17 06:20:36 +08:00
|
|
|
break;
|
2005-08-12 23:51:49 +08:00
|
|
|
case INET_DIAG_BC_JMP:
|
2005-04-17 06:20:36 +08:00
|
|
|
yes = 0;
|
|
|
|
break;
|
2005-08-12 23:51:49 +08:00
|
|
|
case INET_DIAG_BC_S_GE:
|
2005-04-17 06:20:36 +08:00
|
|
|
yes = entry->sport >= op[1].no;
|
|
|
|
break;
|
2005-08-12 23:51:49 +08:00
|
|
|
case INET_DIAG_BC_S_LE:
|
2010-01-20 06:12:20 +08:00
|
|
|
yes = entry->sport <= op[1].no;
|
2005-04-17 06:20:36 +08:00
|
|
|
break;
|
2005-08-12 23:51:49 +08:00
|
|
|
case INET_DIAG_BC_D_GE:
|
2005-04-17 06:20:36 +08:00
|
|
|
yes = entry->dport >= op[1].no;
|
|
|
|
break;
|
2005-08-12 23:51:49 +08:00
|
|
|
case INET_DIAG_BC_D_LE:
|
2005-04-17 06:20:36 +08:00
|
|
|
yes = entry->dport <= op[1].no;
|
|
|
|
break;
|
2005-08-12 23:51:49 +08:00
|
|
|
case INET_DIAG_BC_AUTO:
|
2005-04-17 06:20:36 +08:00
|
|
|
yes = !(entry->userlocks & SOCK_BINDPORT_LOCK);
|
|
|
|
break;
|
2005-08-12 23:51:49 +08:00
|
|
|
case INET_DIAG_BC_S_COND:
|
2005-08-12 23:56:38 +08:00
|
|
|
case INET_DIAG_BC_D_COND: {
|
|
|
|
struct inet_diag_hostcond *cond;
|
2006-09-28 09:44:30 +08:00
|
|
|
__be32 *addr;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-08-12 23:56:38 +08:00
|
|
|
cond = (struct inet_diag_hostcond *)(op + 1);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (cond->port != -1 &&
|
2005-08-12 23:51:49 +08:00
|
|
|
cond->port != (op->code == INET_DIAG_BC_S_COND ?
|
2005-04-17 06:20:36 +08:00
|
|
|
entry->sport : entry->dport)) {
|
|
|
|
yes = 0;
|
|
|
|
break;
|
|
|
|
}
|
2006-01-10 06:56:19 +08:00
|
|
|
|
2005-08-12 23:51:49 +08:00
|
|
|
if (op->code == INET_DIAG_BC_S_COND)
|
2005-04-17 06:20:36 +08:00
|
|
|
addr = entry->saddr;
|
|
|
|
else
|
|
|
|
addr = entry->daddr;
|
|
|
|
|
2012-12-09 03:43:23 +08:00
|
|
|
if (cond->family != AF_UNSPEC &&
|
|
|
|
cond->family != entry->family) {
|
|
|
|
if (entry->family == AF_INET6 &&
|
|
|
|
cond->family == AF_INET) {
|
|
|
|
if (addr[0] == 0 && addr[1] == 0 &&
|
|
|
|
addr[2] == htonl(0xffff) &&
|
|
|
|
bitstring_match(addr + 3,
|
|
|
|
cond->addr,
|
|
|
|
cond->prefix_len))
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
yes = 0;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (cond->prefix_len == 0)
|
|
|
|
break;
|
2006-01-10 06:56:19 +08:00
|
|
|
if (bitstring_match(addr, cond->addr,
|
|
|
|
cond->prefix_len))
|
2005-04-17 06:20:36 +08:00
|
|
|
break;
|
|
|
|
yes = 0;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2006-01-10 06:56:19 +08:00
|
|
|
if (yes) {
|
2005-04-17 06:20:36 +08:00
|
|
|
len -= op->yes;
|
|
|
|
bc += op->yes;
|
|
|
|
} else {
|
|
|
|
len -= op->no;
|
|
|
|
bc += op->no;
|
|
|
|
}
|
|
|
|
}
|
2010-09-23 04:43:57 +08:00
|
|
|
return len == 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2011-12-09 14:22:44 +08:00
|
|
|
int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)
|
|
|
|
{
|
|
|
|
struct inet_diag_entry entry;
|
|
|
|
struct inet_sock *inet = inet_sk(sk);
|
|
|
|
|
|
|
|
if (bc == NULL)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
entry.family = sk->sk_family;
|
2011-12-10 17:48:31 +08:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
2011-12-09 14:22:44 +08:00
|
|
|
if (entry.family == AF_INET6) {
|
|
|
|
|
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now is time to do the same for IPv6, because it permits us to have fast
lookups for all kind of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-04 06:42:29 +08:00
|
|
|
entry.saddr = sk->sk_v6_rcv_saddr.s6_addr32;
|
|
|
|
entry.daddr = sk->sk_v6_daddr.s6_addr32;
|
2011-12-09 14:22:44 +08:00
|
|
|
} else
|
|
|
|
#endif
|
|
|
|
{
|
|
|
|
entry.saddr = &inet->inet_rcv_saddr;
|
|
|
|
entry.daddr = &inet->inet_daddr;
|
|
|
|
}
|
|
|
|
entry.sport = inet->inet_num;
|
|
|
|
entry.dport = ntohs(inet->inet_dport);
|
|
|
|
entry.userlocks = sk->sk_userlocks;
|
|
|
|
|
|
|
|
return inet_diag_bc_run(bc, &entry);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(inet_diag_bc_sk);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
 * Check that the position with @cc bytes remaining is an instruction
 * boundary of the program @bc of @len bytes.
 *
 * The program is walked along its "yes" offsets only.  Returns 1 when the
 * walk reaches a point with exactly @cc bytes left, 0 when it steps past
 * that point or meets a "yes" offset that is below 4 or not a multiple
 * of 4.
 */
static int valid_cc(const void *bc, int len, int cc)
{
	while (len >= 0) {
		const struct inet_diag_bc_op *op = bc;
		unsigned char step;

		/* Either we are on the target or we have already passed it. */
		if (cc >= len)
			return cc == len;

		step = op->yes;
		if (step < 4 || (step & 3))
			return 0;

		len -= step;
		bc += step;
	}
	return 0;
}
|
|
|
|
|
2012-12-09 03:43:22 +08:00
|
|
|
/*
 * Validate the inet_diag_hostcond that follows instruction @op.
 *
 * @len is the number of bytes left in the program starting at @op.
 * *@min_len is grown first by the size of the hostcond header and then by
 * the address size implied by cond->family (0 for AF_UNSPEC); each addition
 * is applied before the matching length check, so it is visible to the
 * caller even when this function returns false.
 *
 * Returns false when the program is too short, the family is not
 * AF_UNSPEC/AF_INET/AF_INET6, or the prefix length exceeds the address
 * width in bits; true otherwise.
 */
static bool valid_hostcond(const struct inet_diag_bc_op *op, int len,
			   int *min_len)
{
	const struct inet_diag_hostcond *cond;
	int addr_len;

	/* The fixed-size header must fit before it may be read. */
	*min_len += sizeof(struct inet_diag_hostcond);
	if (len < *min_len)
		return false;
	cond = (const struct inet_diag_hostcond *)(op + 1);

	/* Address bytes expected after the header, by family. */
	if (cond->family == AF_UNSPEC)
		addr_len = 0;
	else if (cond->family == AF_INET)
		addr_len = sizeof(struct in_addr);
	else if (cond->family == AF_INET6)
		addr_len = sizeof(struct in6_addr);
	else
		return false;

	*min_len += addr_len;
	if (len < *min_len)
		return false;

	/* prefix_len is in bits, addr_len in bytes. */
	return cond->prefix_len <= 8 * addr_len;
}
|
|
|
|
|
2012-12-09 19:09:54 +08:00
|
|
|
/*
 * Validate a port comparison instruction.
 *
 * The port operand is carried in a second inet_diag_bc_op placed right
 * after @op, so *@min_len is grown by one op (even on failure) and the
 * remaining program length @len must cover it.  @op itself is not read.
 */
static inline bool valid_port_comparison(const struct inet_diag_bc_op *op,
					 int len, int *min_len)
{
	*min_len += sizeof(struct inet_diag_bc_op);

	return len >= *min_len;
}
|
|
|
|
|
2005-08-12 23:51:49 +08:00
|
|
|
static int inet_diag_bc_audit(const void *bytecode, int bytecode_len)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2011-06-18 04:25:39 +08:00
|
|
|
const void *bc = bytecode;
|
2005-04-17 06:20:36 +08:00
|
|
|
int len = bytecode_len;
|
|
|
|
|
|
|
|
while (len > 0) {
|
2011-06-18 04:25:39 +08:00
|
|
|
const struct inet_diag_bc_op *op = bc;
|
2012-12-09 03:43:22 +08:00
|
|
|
int min_len = sizeof(struct inet_diag_bc_op);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);
|
|
|
|
switch (op->code) {
|
2005-08-12 23:51:49 +08:00
|
|
|
case INET_DIAG_BC_S_COND:
|
|
|
|
case INET_DIAG_BC_D_COND:
|
2012-12-09 03:43:22 +08:00
|
|
|
if (!valid_hostcond(bc, len, &min_len))
|
|
|
|
return -EINVAL;
|
2012-12-09 19:09:54 +08:00
|
|
|
break;
|
2005-08-12 23:51:49 +08:00
|
|
|
case INET_DIAG_BC_S_GE:
|
|
|
|
case INET_DIAG_BC_S_LE:
|
|
|
|
case INET_DIAG_BC_D_GE:
|
|
|
|
case INET_DIAG_BC_D_LE:
|
2012-12-09 19:09:54 +08:00
|
|
|
if (!valid_port_comparison(bc, len, &min_len))
|
2005-04-17 06:20:36 +08:00
|
|
|
return -EINVAL;
|
|
|
|
break;
|
2012-12-09 19:09:54 +08:00
|
|
|
case INET_DIAG_BC_AUTO:
|
|
|
|
case INET_DIAG_BC_JMP:
|
2005-08-12 23:51:49 +08:00
|
|
|
case INET_DIAG_BC_NOP:
|
2005-04-17 06:20:36 +08:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
2012-12-09 19:09:54 +08:00
|
|
|
|
|
|
|
if (op->code != INET_DIAG_BC_NOP) {
|
|
|
|
if (op->no < min_len || op->no > len + 4 || op->no & 3)
|
|
|
|
return -EINVAL;
|
|
|
|
if (op->no < len &&
|
|
|
|
!valid_cc(bytecode, bytecode_len, len - op->no))
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2012-12-09 03:43:22 +08:00
|
|
|
if (op->yes < min_len || op->yes > len + 4 || op->yes & 3)
|
2011-06-18 04:25:39 +08:00
|
|
|
return -EINVAL;
|
2006-01-10 06:56:19 +08:00
|
|
|
bc += op->yes;
|
2005-04-17 06:20:36 +08:00
|
|
|
len -= op->yes;
|
|
|
|
}
|
|
|
|
return len == 0 ? 0 : -EINVAL;
|
|
|
|
}
|
|
|
|
|
2006-01-10 06:56:56 +08:00
|
|
|
static int inet_csk_diag_dump(struct sock *sk,
|
|
|
|
struct sk_buff *skb,
|
2011-12-06 15:57:26 +08:00
|
|
|
struct netlink_callback *cb,
|
2012-01-11 06:36:35 +08:00
|
|
|
struct inet_diag_req_v2 *r,
|
2011-12-06 15:57:26 +08:00
|
|
|
const struct nlattr *bc)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2011-12-09 14:22:44 +08:00
|
|
|
if (!inet_diag_bc_sk(bc, sk))
|
|
|
|
return 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-12-06 15:59:32 +08:00
|
|
|
return inet_csk_diag_fill(sk, skb, r,
|
2013-04-17 14:46:57 +08:00
|
|
|
sk_user_ns(NETLINK_CB(cb->skb).sk),
|
2012-09-08 04:12:54 +08:00
|
|
|
NETLINK_CB(cb->skb).portid,
|
2006-01-10 06:56:56 +08:00
|
|
|
cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 15:22:02 +08:00
|
|
|
static int inet_twsk_diag_dump(struct sock *sk,
|
2006-01-10 06:56:38 +08:00
|
|
|
struct sk_buff *skb,
|
2011-12-06 15:57:26 +08:00
|
|
|
struct netlink_callback *cb,
|
2012-01-11 06:36:35 +08:00
|
|
|
struct inet_diag_req_v2 *r,
|
2011-12-06 15:57:26 +08:00
|
|
|
const struct nlattr *bc)
|
2006-01-10 06:56:38 +08:00
|
|
|
{
|
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 15:22:02 +08:00
|
|
|
struct inet_timewait_sock *tw = inet_twsk(sk);
|
|
|
|
|
2011-12-06 15:57:26 +08:00
|
|
|
if (bc != NULL) {
|
2006-01-10 06:56:38 +08:00
|
|
|
struct inet_diag_entry entry;
|
|
|
|
|
|
|
|
entry.family = tw->tw_family;
|
2011-12-10 17:48:31 +08:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
2006-01-10 06:56:38 +08:00
|
|
|
if (tw->tw_family == AF_INET6) {
|
ipv6: make lookups simpler and faster
TCP listener refactoring, part 4 :
To speed up inet lookups, we moved IPv4 addresses from inet to struct
sock_common
Now is time to do the same for IPv6, because it permits us to have fast
lookups for all kind of sockets, including upcoming SYN_RECV.
Getting IPv6 addresses in TCP lookups currently requires two extra cache
lines, plus a dereference (and memory stall).
inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6
This patch is way bigger than its IPv4 counter part, because for IPv4,
we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6,
it's not doable easily.
inet6_sk(sk)->daddr becomes sk->sk_v6_daddr
inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr
And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr
at the same offset.
We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic
macro.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-04 06:42:29 +08:00
|
|
|
entry.saddr = tw->tw_v6_rcv_saddr.s6_addr32;
|
|
|
|
entry.daddr = tw->tw_v6_daddr.s6_addr32;
|
2006-01-10 06:56:38 +08:00
|
|
|
} else
|
|
|
|
#endif
|
|
|
|
{
|
|
|
|
entry.saddr = &tw->tw_rcv_saddr;
|
|
|
|
entry.daddr = &tw->tw_daddr;
|
|
|
|
}
|
|
|
|
entry.sport = tw->tw_num;
|
|
|
|
entry.dport = ntohs(tw->tw_dport);
|
2007-02-09 22:24:47 +08:00
|
|
|
entry.userlocks = 0;
|
2006-01-10 06:56:38 +08:00
|
|
|
|
2011-12-09 14:21:34 +08:00
|
|
|
if (!inet_diag_bc_run(bc, &entry))
|
2006-01-10 06:56:38 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2011-12-06 15:59:32 +08:00
|
|
|
return inet_twsk_diag_fill(tw, skb, r,
|
2012-09-08 04:12:54 +08:00
|
|
|
NETLINK_CB(cb->skb).portid,
|
2006-01-10 06:56:38 +08:00
|
|
|
cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh);
|
|
|
|
}
|
|
|
|
|
2012-12-09 03:43:21 +08:00
|
|
|
/* Get the IPv4, IPv6, or IPv4-mapped-IPv6 local and remote addresses
|
|
|
|
* from a request_sock. For IPv4-mapped-IPv6 we must map IPv4 to IPv6.
|
|
|
|
*/
|
|
|
|
static inline void inet_diag_req_addrs(const struct sock *sk,
|
|
|
|
const struct request_sock *req,
|
|
|
|
struct inet_diag_entry *entry)
|
|
|
|
{
|
|
|
|
struct inet_request_sock *ireq = inet_rsk(req);
|
|
|
|
|
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
|
|
if (sk->sk_family == AF_INET6) {
|
|
|
|
if (req->rsk_ops->family == AF_INET6) {
|
2013-10-10 06:21:29 +08:00
|
|
|
entry->saddr = ireq->ir_v6_loc_addr.s6_addr32;
|
|
|
|
entry->daddr = ireq->ir_v6_rmt_addr.s6_addr32;
|
2012-12-09 03:43:21 +08:00
|
|
|
} else if (req->rsk_ops->family == AF_INET) {
|
2013-10-10 06:21:29 +08:00
|
|
|
ipv6_addr_set_v4mapped(ireq->ir_loc_addr,
|
2012-12-09 03:43:21 +08:00
|
|
|
&entry->saddr_storage);
|
2013-10-10 06:21:29 +08:00
|
|
|
ipv6_addr_set_v4mapped(ireq->ir_rmt_addr,
|
2012-12-09 03:43:21 +08:00
|
|
|
&entry->daddr_storage);
|
|
|
|
entry->saddr = entry->saddr_storage.s6_addr32;
|
|
|
|
entry->daddr = entry->daddr_storage.s6_addr32;
|
|
|
|
}
|
|
|
|
} else
|
|
|
|
#endif
|
|
|
|
{
|
2013-10-10 06:21:29 +08:00
|
|
|
entry->saddr = &ireq->ir_loc_addr;
|
|
|
|
entry->daddr = &ireq->ir_rmt_addr;
|
2012-12-09 03:43:21 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-08-12 23:51:49 +08:00
|
|
|
/*
 * Append one multipart diag message describing request sock @req (a
 * pending connection on listener @sk) to @skb.
 *
 * The message is reported in state TCP_SYN_RECV with idiag_timer set to 1,
 * zero queue sizes and inode 0.  idiag_expires is the time left until
 * req->expires in milliseconds, clamped at zero.
 *
 * Returns 0 on success, -EMSGSIZE when nlmsg_put() finds no room in @skb.
 */
static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
			      struct request_sock *req,
			      struct user_namespace *user_ns,
			      u32 portid, u32 seq,
			      const struct nlmsghdr *unlh)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct inet_sock *inet = inet_sk(sk);
	struct inet_diag_msg *msg;
	struct nlmsghdr *nlh;
	long remaining;

	nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*msg),
			NLM_F_MULTI);
	if (!nlh)
		return -EMSGSIZE;
	msg = nlmsg_data(nlh);

	/* Fixed header fields. */
	msg->idiag_family = sk->sk_family;
	msg->idiag_state = TCP_SYN_RECV;
	msg->idiag_timer = 1;
	msg->idiag_retrans = req->num_retrans;

	/* Socket identity: interface, cookie, ports. */
	msg->id.idiag_if = sk->sk_bound_dev_if;
	sock_diag_save_cookie(req, msg->id.idiag_cookie);

	remaining = req->expires - jiffies;
	if (remaining < 0)
		remaining = 0;

	msg->id.idiag_sport = inet->inet_sport;
	msg->id.idiag_dport = ireq->ir_rmt_port;

	/*
	 * Clear both address arrays in full before storing the IPv4
	 * addresses in word 0, so the remaining words are never sent
	 * uninitialized.
	 */
	memset(&msg->id.idiag_src, 0, sizeof(msg->id.idiag_src));
	memset(&msg->id.idiag_dst, 0, sizeof(msg->id.idiag_dst));
	msg->id.idiag_src[0] = ireq->ir_loc_addr;
	msg->id.idiag_dst[0] = ireq->ir_rmt_addr;

	msg->idiag_expires = jiffies_to_msecs(remaining);
	msg->idiag_rqueue = 0;
	msg->idiag_wqueue = 0;
	msg->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
	msg->idiag_inode = 0;

#if IS_ENABLED(CONFIG_IPV6)
	if (msg->idiag_family == AF_INET6) {
		struct inet_diag_entry addrs;

		/* Replace the IPv4 words with the full 128-bit addresses. */
		inet_diag_req_addrs(sk, req, &addrs);
		memcpy(msg->id.idiag_src, addrs.saddr, sizeof(struct in6_addr));
		memcpy(msg->id.idiag_dst, addrs.daddr, sizeof(struct in6_addr));
	}
#endif

	nlmsg_end(skb, nlh);
	return 0;
}
|
|
|
|
|
2005-08-12 23:51:49 +08:00
|
|
|
static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
|
2011-12-06 15:57:26 +08:00
|
|
|
struct netlink_callback *cb,
|
2012-01-11 06:36:35 +08:00
|
|
|
struct inet_diag_req_v2 *r,
|
2011-12-06 15:57:26 +08:00
|
|
|
const struct nlattr *bc)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2005-08-12 23:51:49 +08:00
|
|
|
struct inet_diag_entry entry;
|
2005-08-10 11:10:42 +08:00
|
|
|
struct inet_connection_sock *icsk = inet_csk(sk);
|
2005-06-19 13:48:55 +08:00
|
|
|
struct listen_sock *lopt;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct inet_sock *inet = inet_sk(sk);
|
|
|
|
int j, s_j;
|
|
|
|
int reqnum, s_reqnum;
|
|
|
|
int err = 0;
|
|
|
|
|
|
|
|
s_j = cb->args[3];
|
|
|
|
s_reqnum = cb->args[4];
|
|
|
|
|
|
|
|
if (s_j > 0)
|
|
|
|
s_j--;
|
|
|
|
|
|
|
|
entry.family = sk->sk_family;
|
|
|
|
|
2005-08-10 11:10:42 +08:00
|
|
|
read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-08-10 11:10:42 +08:00
|
|
|
lopt = icsk->icsk_accept_queue.listen_opt;
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!lopt || !lopt->qlen)
|
|
|
|
goto out;
|
|
|
|
|
2011-12-06 15:57:26 +08:00
|
|
|
if (bc != NULL) {
|
2009-10-15 14:30:45 +08:00
|
|
|
entry.sport = inet->inet_num;
|
2005-04-17 06:20:36 +08:00
|
|
|
entry.userlocks = sk->sk_userlocks;
|
|
|
|
}
|
|
|
|
|
2005-08-10 16:54:28 +08:00
|
|
|
for (j = s_j; j < lopt->nr_table_entries; j++) {
|
2005-06-19 13:47:21 +08:00
|
|
|
struct request_sock *req, *head = lopt->syn_table[j];
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
reqnum = 0;
|
|
|
|
for (req = head; req; reqnum++, req = req->dl_next) {
|
[NET] Generalise TCP's struct open_request minisock infrastructure
Kept this first changeset minimal, without changing existing names to
ease peer review.
Basicaly tcp_openreq_alloc now receives the or_calltable, that in turn
has two new members:
->slab, that replaces tcp_openreq_cachep
->obj_size, to inform the size of the openreq descendant for
a specific protocol
The protocol specific fields in struct open_request were moved to a
class hierarchy, with the things that are common to all connection
oriented PF_INET protocols in struct inet_request_sock, the TCP ones
in tcp_request_sock, that is an inet_request_sock, that is an
open_request.
I.e. this uses the same approach used for the struct sock class
hierarchy, with sk_prot indicating if the protocol wants to use the
open_request infrastructure by filling in sk_prot->rsk_prot with an
or_calltable.
Results? Performance is improved and TCP v4 now uses only 64 bytes per
open request minisock, down from 96 without this patch :-)
Next changeset will rename some of the structs, fields and functions
mentioned above, struct or_calltable is way unclear, better name it
struct request_sock_ops, s/struct open_request/struct request_sock/g,
etc.
Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2005-06-19 13:46:52 +08:00
|
|
|
struct inet_request_sock *ireq = inet_rsk(req);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
if (reqnum < s_reqnum)
|
|
|
|
continue;
|
2013-10-10 06:21:29 +08:00
|
|
|
if (r->id.idiag_dport != ireq->ir_rmt_port &&
|
2005-08-12 23:51:49 +08:00
|
|
|
r->id.idiag_dport)
|
2005-04-17 06:20:36 +08:00
|
|
|
continue;
|
|
|
|
|
|
|
|
if (bc) {
|
2012-12-09 03:43:21 +08:00
|
|
|
inet_diag_req_addrs(sk, req, &entry);
|
2013-10-10 06:21:29 +08:00
|
|
|
entry.dport = ntohs(ireq->ir_rmt_port);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-12-09 14:21:34 +08:00
|
|
|
if (!inet_diag_bc_run(bc, &entry))
|
2005-04-17 06:20:36 +08:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2005-08-12 23:51:49 +08:00
|
|
|
err = inet_diag_fill_req(skb, sk, req,
|
2013-04-17 14:46:57 +08:00
|
|
|
sk_user_ns(NETLINK_CB(cb->skb).sk),
|
2012-09-08 04:12:54 +08:00
|
|
|
NETLINK_CB(cb->skb).portid,
|
2005-08-10 16:54:28 +08:00
|
|
|
cb->nlh->nlmsg_seq, cb->nlh);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (err < 0) {
|
|
|
|
cb->args[3] = j + 1;
|
|
|
|
cb->args[4] = reqnum;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
s_reqnum = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
2005-08-10 11:10:42 +08:00
|
|
|
read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2011-12-09 14:23:18 +08:00
|
|
|
void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
|
2012-01-11 06:36:35 +08:00
|
|
|
struct netlink_callback *cb, struct inet_diag_req_v2 *r, struct nlattr *bc)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
int i, num;
|
|
|
|
int s_i, s_num;
|
2012-07-16 12:28:49 +08:00
|
|
|
struct net *net = sock_net(skb->sk);
|
2006-01-10 06:56:19 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
s_i = cb->args[1];
|
|
|
|
s_num = num = cb->args[2];
|
2005-08-12 20:27:49 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
if (cb->args[0] == 0) {
|
2005-08-12 23:51:49 +08:00
|
|
|
if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV)))
|
2005-04-17 06:20:36 +08:00
|
|
|
goto skip_listen_ht;
|
2005-08-10 16:54:28 +08:00
|
|
|
|
2005-08-10 10:59:44 +08:00
|
|
|
for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
|
2005-04-17 06:20:36 +08:00
|
|
|
struct sock *sk;
|
2008-11-24 09:22:55 +08:00
|
|
|
struct hlist_nulls_node *node;
|
2008-11-20 16:40:07 +08:00
|
|
|
struct inet_listen_hashbucket *ilb;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
num = 0;
|
2008-11-20 16:40:07 +08:00
|
|
|
ilb = &hashinfo->listening_hash[i];
|
|
|
|
spin_lock_bh(&ilb->lock);
|
2008-11-24 09:22:55 +08:00
|
|
|
sk_nulls_for_each(sk, node, &ilb->head) {
|
2005-04-17 06:20:36 +08:00
|
|
|
struct inet_sock *inet = inet_sk(sk);
|
|
|
|
|
2012-07-16 12:28:49 +08:00
|
|
|
if (!net_eq(sock_net(sk), net))
|
|
|
|
continue;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
if (num < s_num) {
|
|
|
|
num++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2011-12-06 15:59:15 +08:00
|
|
|
if (r->sdiag_family != AF_UNSPEC &&
|
|
|
|
sk->sk_family != r->sdiag_family)
|
|
|
|
goto next_listen;
|
|
|
|
|
2009-10-15 14:30:45 +08:00
|
|
|
if (r->id.idiag_sport != inet->inet_sport &&
|
2005-08-12 23:51:49 +08:00
|
|
|
r->id.idiag_sport)
|
2005-04-17 06:20:36 +08:00
|
|
|
goto next_listen;
|
|
|
|
|
2005-08-12 23:51:49 +08:00
|
|
|
if (!(r->idiag_states & TCPF_LISTEN) ||
|
|
|
|
r->id.idiag_dport ||
|
2005-04-17 06:20:36 +08:00
|
|
|
cb->args[3] > 0)
|
|
|
|
goto syn_recv;
|
|
|
|
|
2011-12-06 15:58:58 +08:00
|
|
|
if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
|
2008-11-20 16:40:07 +08:00
|
|
|
spin_unlock_bh(&ilb->lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
|
|
|
|
syn_recv:
|
2005-08-12 23:51:49 +08:00
|
|
|
if (!(r->idiag_states & TCPF_SYN_RECV))
|
2005-04-17 06:20:36 +08:00
|
|
|
goto next_listen;
|
|
|
|
|
2011-12-06 15:58:58 +08:00
|
|
|
if (inet_diag_dump_reqs(skb, sk, cb, r, bc) < 0) {
|
2008-11-20 16:40:07 +08:00
|
|
|
spin_unlock_bh(&ilb->lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
|
|
|
|
next_listen:
|
|
|
|
cb->args[3] = 0;
|
|
|
|
cb->args[4] = 0;
|
|
|
|
++num;
|
|
|
|
}
|
2008-11-20 16:40:07 +08:00
|
|
|
spin_unlock_bh(&ilb->lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
s_num = 0;
|
|
|
|
cb->args[3] = 0;
|
|
|
|
cb->args[4] = 0;
|
|
|
|
}
|
|
|
|
skip_listen_ht:
|
|
|
|
cb->args[0] = 1;
|
|
|
|
s_i = num = s_num = 0;
|
|
|
|
}
|
|
|
|
|
2005-08-12 23:51:49 +08:00
|
|
|
if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV)))
|
2011-12-09 14:22:26 +08:00
|
|
|
goto out;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-10-09 08:16:19 +08:00
|
|
|
for (i = s_i; i <= hashinfo->ehash_mask; i++) {
|
2005-08-10 16:54:28 +08:00
|
|
|
struct inet_ehash_bucket *head = &hashinfo->ehash[i];
|
2008-11-22 08:39:19 +08:00
|
|
|
spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
|
2005-04-17 06:20:36 +08:00
|
|
|
struct sock *sk;
|
2008-11-17 11:40:17 +08:00
|
|
|
struct hlist_nulls_node *node;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-08-28 16:09:54 +08:00
|
|
|
num = 0;
|
|
|
|
|
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 15:22:02 +08:00
|
|
|
if (hlist_nulls_empty(&head->chain))
|
2008-08-28 16:09:54 +08:00
|
|
|
continue;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
if (i > s_i)
|
|
|
|
s_num = 0;
|
|
|
|
|
2008-11-22 08:39:19 +08:00
|
|
|
spin_lock_bh(lock);
|
2008-11-17 11:40:17 +08:00
|
|
|
sk_nulls_for_each(sk, node, &head->chain) {
|
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 15:22:02 +08:00
|
|
|
int res;
|
2014-01-11 04:34:45 +08:00
|
|
|
int state;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2012-07-16 12:28:49 +08:00
|
|
|
if (!net_eq(sock_net(sk), net))
|
|
|
|
continue;
|
2005-04-17 06:20:36 +08:00
|
|
|
if (num < s_num)
|
|
|
|
goto next_normal;
|
2014-01-11 04:34:45 +08:00
|
|
|
state = (sk->sk_state == TCP_TIME_WAIT) ?
|
|
|
|
inet_twsk(sk)->tw_substate : sk->sk_state;
|
|
|
|
if (!(r->idiag_states & (1 << state)))
|
2005-04-17 06:20:36 +08:00
|
|
|
goto next_normal;
|
2011-12-06 15:59:15 +08:00
|
|
|
if (r->sdiag_family != AF_UNSPEC &&
|
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 15:22:02 +08:00
|
|
|
sk->sk_family != r->sdiag_family)
|
2011-12-06 15:59:15 +08:00
|
|
|
goto next_normal;
|
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 15:22:02 +08:00
|
|
|
if (r->id.idiag_sport != htons(sk->sk_num) &&
|
2005-08-12 23:51:49 +08:00
|
|
|
r->id.idiag_sport)
|
2005-04-17 06:20:36 +08:00
|
|
|
goto next_normal;
|
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 15:22:02 +08:00
|
|
|
if (r->id.idiag_dport != sk->sk_dport &&
|
2006-01-10 06:56:19 +08:00
|
|
|
r->id.idiag_dport)
|
2005-04-17 06:20:36 +08:00
|
|
|
goto next_normal;
|
tcp/dccp: remove twchain
TCP listener refactoring, part 3 :
Our goal is to hash SYN_RECV sockets into main ehash for fast lookup,
and parallel SYN processing.
Current inet_ehash_bucket contains two chains, one for ESTABLISH (and
friend states) sockets, another for TIME_WAIT sockets only.
As the hash table is sized to get at most one socket per bucket, it
makes little sense to have separate twchain, as it makes the lookup
slightly more complicated, and doubles hash table memory usage.
If we make sure all socket types have the lookup keys at the same
offsets, we can use a generic and faster lookup. It turns out TIME_WAIT
and ESTABLISHED sockets already have common lookup fields for IPv4.
[ INET_TW_MATCH() is no longer needed ]
I'll provide a follow-up to factorize IPv6 lookup as well, to remove
INET6_TW_MATCH()
This way, SYN_RECV pseudo sockets will be supported the same.
A new sock_gen_put() helper is added, doing either a sock_put() or
inet_twsk_put() [ and will support SYN_RECV later ].
Note this helper should only be called in real slow path, when rcu
lookup found a socket that was moved to another identity (freed/reused
immediately), but could eventually be used in other contexts, like
sock_edemux()
Before patch :
dmesg | grep "TCP established"
TCP established hash table entries: 524288 (order: 11, 8388608 bytes)
After patch :
TCP established hash table entries: 524288 (order: 10, 4194304 bytes)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-03 15:22:02 +08:00
|
|
|
if (sk->sk_state == TCP_TIME_WAIT)
|
|
|
|
res = inet_twsk_diag_dump(sk, skb, cb, r, bc);
|
|
|
|
else
|
|
|
|
res = inet_csk_diag_dump(sk, skb, cb, r, bc);
|
|
|
|
if (res < 0) {
|
2008-11-22 08:39:19 +08:00
|
|
|
spin_unlock_bh(lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
next_normal:
|
|
|
|
++num;
|
|
|
|
}
|
|
|
|
|
2008-11-22 08:39:19 +08:00
|
|
|
spin_unlock_bh(lock);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
done:
|
|
|
|
cb->args[1] = i;
|
|
|
|
cb->args[2] = num;
|
2011-12-09 14:22:26 +08:00
|
|
|
out:
|
|
|
|
;
|
|
|
|
}
|
2011-12-09 14:23:18 +08:00
|
|
|
EXPORT_SYMBOL_GPL(inet_diag_dump_icsk);
|
2011-12-09 14:22:26 +08:00
|
|
|
|
|
|
|
static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
|
2012-01-11 06:36:35 +08:00
|
|
|
struct inet_diag_req_v2 *r, struct nlattr *bc)
|
2011-12-09 14:22:26 +08:00
|
|
|
{
|
|
|
|
const struct inet_diag_handler *handler;
|
2012-11-03 17:30:34 +08:00
|
|
|
int err = 0;
|
2011-12-09 14:22:26 +08:00
|
|
|
|
|
|
|
handler = inet_diag_lock_handler(r->sdiag_protocol);
|
|
|
|
if (!IS_ERR(handler))
|
2011-12-09 14:23:18 +08:00
|
|
|
handler->dump(skb, cb, r, bc);
|
2012-11-03 17:30:34 +08:00
|
|
|
else
|
|
|
|
err = PTR_ERR(handler);
|
2007-12-03 12:51:25 +08:00
|
|
|
inet_diag_unlock_handler(handler);
|
2011-12-09 14:22:26 +08:00
|
|
|
|
2012-11-03 17:30:34 +08:00
|
|
|
return err ? : skb->len;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2011-12-06 15:58:58 +08:00
|
|
|
static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
|
|
|
|
{
|
|
|
|
struct nlattr *bc = NULL;
|
2012-01-11 06:36:35 +08:00
|
|
|
int hdrlen = sizeof(struct inet_diag_req_v2);
|
2011-12-06 15:58:58 +08:00
|
|
|
|
|
|
|
if (nlmsg_attrlen(cb->nlh, hdrlen))
|
|
|
|
bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
|
|
|
|
|
2012-06-27 12:28:54 +08:00
|
|
|
return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc);
|
2011-12-06 15:58:58 +08:00
|
|
|
}
|
|
|
|
|
2011-12-06 15:59:32 +08:00
|
|
|
static inline int inet_diag_type2proto(int type)
|
|
|
|
{
|
|
|
|
switch (type) {
|
|
|
|
case TCPDIAG_GETSOCK:
|
|
|
|
return IPPROTO_TCP;
|
|
|
|
case DCCPDIAG_GETSOCK:
|
|
|
|
return IPPROTO_DCCP;
|
|
|
|
default:
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-12-06 15:58:58 +08:00
|
|
|
static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb)
|
|
|
|
{
|
2012-06-27 12:28:54 +08:00
|
|
|
struct inet_diag_req *rc = nlmsg_data(cb->nlh);
|
2012-01-11 06:36:35 +08:00
|
|
|
struct inet_diag_req_v2 req;
|
2011-12-06 15:58:58 +08:00
|
|
|
struct nlattr *bc = NULL;
|
2012-01-11 06:37:26 +08:00
|
|
|
int hdrlen = sizeof(struct inet_diag_req);
|
2011-12-06 15:58:58 +08:00
|
|
|
|
2011-12-06 15:59:15 +08:00
|
|
|
req.sdiag_family = AF_UNSPEC; /* compatibility */
|
2011-12-06 15:58:58 +08:00
|
|
|
req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type);
|
|
|
|
req.idiag_ext = rc->idiag_ext;
|
|
|
|
req.idiag_states = rc->idiag_states;
|
|
|
|
req.id = rc->id;
|
|
|
|
|
|
|
|
if (nlmsg_attrlen(cb->nlh, hdrlen))
|
|
|
|
bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
|
|
|
|
|
|
|
|
return __inet_diag_dump(skb, cb, &req, bc);
|
|
|
|
}
|
|
|
|
|
2011-12-06 15:58:39 +08:00
|
|
|
static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
|
|
|
|
const struct nlmsghdr *nlh)
|
|
|
|
{
|
2012-06-27 12:28:54 +08:00
|
|
|
struct inet_diag_req *rc = nlmsg_data(nlh);
|
2012-01-11 06:36:35 +08:00
|
|
|
struct inet_diag_req_v2 req;
|
2011-12-06 15:58:39 +08:00
|
|
|
|
|
|
|
req.sdiag_family = rc->idiag_family;
|
|
|
|
req.sdiag_protocol = inet_diag_type2proto(nlh->nlmsg_type);
|
|
|
|
req.idiag_ext = rc->idiag_ext;
|
|
|
|
req.idiag_states = rc->idiag_states;
|
|
|
|
req.id = rc->id;
|
|
|
|
|
|
|
|
return inet_diag_get_exact(in_skb, nlh, &req);
|
|
|
|
}
|
|
|
|
|
2011-12-06 15:57:06 +08:00
|
|
|
static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2012-01-11 06:37:26 +08:00
|
|
|
int hdrlen = sizeof(struct inet_diag_req);
|
2012-07-16 12:28:49 +08:00
|
|
|
struct net *net = sock_net(skb->sk);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-03-23 14:30:35 +08:00
|
|
|
if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX ||
|
|
|
|
nlmsg_len(nlh) < hdrlen)
|
|
|
|
return -EINVAL;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-01-19 04:40:38 +08:00
|
|
|
if (nlh->nlmsg_flags & NLM_F_DUMP) {
|
2007-03-23 14:30:35 +08:00
|
|
|
if (nlmsg_attrlen(nlh, hdrlen)) {
|
|
|
|
struct nlattr *attr;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-03-23 14:30:35 +08:00
|
|
|
attr = nlmsg_find_attr(nlh, hdrlen,
|
|
|
|
INET_DIAG_REQ_BYTECODE);
|
|
|
|
if (attr == NULL ||
|
|
|
|
nla_len(attr) < sizeof(struct inet_diag_bc_op) ||
|
|
|
|
inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
2012-02-24 22:30:15 +08:00
|
|
|
{
|
|
|
|
struct netlink_dump_control c = {
|
|
|
|
.dump = inet_diag_dump_compat,
|
|
|
|
};
|
2012-07-16 12:28:49 +08:00
|
|
|
return netlink_dump_start(net->diag_nlsk, skb, nlh, &c);
|
2012-02-24 22:30:15 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2007-03-23 14:30:35 +08:00
|
|
|
|
2011-12-06 15:58:39 +08:00
|
|
|
return inet_diag_get_exact_compat(skb, nlh);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2011-12-06 15:58:03 +08:00
|
|
|
static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
|
|
|
|
{
|
2012-01-11 06:36:35 +08:00
|
|
|
int hdrlen = sizeof(struct inet_diag_req_v2);
|
2012-07-16 12:28:49 +08:00
|
|
|
struct net *net = sock_net(skb->sk);
|
2011-12-06 15:58:03 +08:00
|
|
|
|
|
|
|
if (nlmsg_len(h) < hdrlen)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
if (h->nlmsg_flags & NLM_F_DUMP) {
|
2011-12-06 15:58:58 +08:00
|
|
|
if (nlmsg_attrlen(h, hdrlen)) {
|
|
|
|
struct nlattr *attr;
|
|
|
|
attr = nlmsg_find_attr(h, hdrlen,
|
|
|
|
INET_DIAG_REQ_BYTECODE);
|
|
|
|
if (attr == NULL ||
|
|
|
|
nla_len(attr) < sizeof(struct inet_diag_bc_op) ||
|
|
|
|
inet_diag_bc_audit(nla_data(attr), nla_len(attr)))
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
2012-02-24 22:30:15 +08:00
|
|
|
{
|
|
|
|
struct netlink_dump_control c = {
|
|
|
|
.dump = inet_diag_dump,
|
|
|
|
};
|
2012-07-16 12:28:49 +08:00
|
|
|
return netlink_dump_start(net->diag_nlsk, skb, h, &c);
|
2012-02-24 22:30:15 +08:00
|
|
|
}
|
2011-12-06 15:58:03 +08:00
|
|
|
}
|
|
|
|
|
2012-06-27 12:28:54 +08:00
|
|
|
return inet_diag_get_exact(skb, h, nlmsg_data(h));
|
2011-12-06 15:58:03 +08:00
|
|
|
}
|
|
|
|
|
2012-04-25 02:21:07 +08:00
|
|
|
static const struct sock_diag_handler inet_diag_handler = {
|
2011-12-06 15:58:03 +08:00
|
|
|
.family = AF_INET,
|
|
|
|
.dump = inet_diag_handler_dump,
|
|
|
|
};
|
|
|
|
|
2012-04-25 02:21:07 +08:00
|
|
|
static const struct sock_diag_handler inet6_diag_handler = {
|
2011-12-06 15:58:03 +08:00
|
|
|
.family = AF_INET6,
|
|
|
|
.dump = inet_diag_handler_dump,
|
|
|
|
};
|
|
|
|
|
2005-08-12 20:27:49 +08:00
|
|
|
int inet_diag_register(const struct inet_diag_handler *h)
|
|
|
|
{
|
|
|
|
const __u16 type = h->idiag_type;
|
|
|
|
int err = -EINVAL;
|
|
|
|
|
2011-12-06 16:05:24 +08:00
|
|
|
if (type >= IPPROTO_MAX)
|
2005-08-12 20:27:49 +08:00
|
|
|
goto out;
|
|
|
|
|
2007-12-03 12:51:25 +08:00
|
|
|
mutex_lock(&inet_diag_table_mutex);
|
2005-08-12 20:27:49 +08:00
|
|
|
err = -EEXIST;
|
|
|
|
if (inet_diag_table[type] == NULL) {
|
|
|
|
inet_diag_table[type] = h;
|
|
|
|
err = 0;
|
|
|
|
}
|
2007-12-03 12:51:25 +08:00
|
|
|
mutex_unlock(&inet_diag_table_mutex);
|
2005-08-12 20:27:49 +08:00
|
|
|
out:
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(inet_diag_register);
|
|
|
|
|
|
|
|
void inet_diag_unregister(const struct inet_diag_handler *h)
|
|
|
|
{
|
|
|
|
const __u16 type = h->idiag_type;
|
|
|
|
|
2011-12-06 16:05:24 +08:00
|
|
|
if (type >= IPPROTO_MAX)
|
2005-08-12 20:27:49 +08:00
|
|
|
return;
|
|
|
|
|
2007-12-03 12:51:25 +08:00
|
|
|
mutex_lock(&inet_diag_table_mutex);
|
2005-08-12 20:27:49 +08:00
|
|
|
inet_diag_table[type] = NULL;
|
2007-12-03 12:51:25 +08:00
|
|
|
mutex_unlock(&inet_diag_table_mutex);
|
2005-08-12 20:27:49 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(inet_diag_unregister);
|
|
|
|
|
2005-08-12 23:51:49 +08:00
|
|
|
/*
 * Module init: allocate the zeroed per-protocol handler table, register the
 * AF_INET and AF_INET6 sock_diag handlers, then hook up the legacy message
 * receiver.
 *
 * Returns 0 on success, -ENOMEM if the table cannot be allocated, or the
 * error from sock_diag_register(); on failure everything set up so far is
 * undone in reverse order.
 */
static int __init inet_diag_init(void)
{
	const int inet_diag_table_size = (IPPROTO_MAX *
					  sizeof(struct inet_diag_handler *));
	int err;

	inet_diag_table = kzalloc(inet_diag_table_size, GFP_KERNEL);
	if (!inet_diag_table)
		return -ENOMEM;

	err = sock_diag_register(&inet_diag_handler);
	if (err)
		goto free_table;

	err = sock_diag_register(&inet6_diag_handler);
	if (err)
		goto unregister_inet;

	sock_diag_register_inet_compat(inet_diag_rcv_msg_compat);
	return err;

unregister_inet:
	sock_diag_unregister(&inet_diag_handler);
free_table:
	kfree(inet_diag_table);
	return err;
}
|
|
|
|
|
2005-08-12 23:51:49 +08:00
|
|
|
/*
 * Module exit: drop both sock_diag handlers and the legacy receiver, then
 * release the handler table.
 */
static void __exit inet_diag_exit(void)
{
	/* Handlers go first, IPv6 before IPv4. */
	sock_diag_unregister(&inet6_diag_handler);
	sock_diag_unregister(&inet_diag_handler);

	/* Then the legacy (compat) message hook. */
	sock_diag_unregister_inet_compat(inet_diag_rcv_msg_compat);

	/* The table is freed last, once everything above is unregistered. */
	kfree(inet_diag_table);
}
|
|
|
|
|
2005-08-12 23:51:49 +08:00
|
|
|
module_init(inet_diag_init);
|
|
|
|
module_exit(inet_diag_exit);
|
2005-04-17 06:20:36 +08:00
|
|
|
MODULE_LICENSE("GPL");
|
2011-12-15 10:43:27 +08:00
|
|
|
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2 /* AF_INET */);
|
|
|
|
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10 /* AF_INET6 */);
|