linux/net/ipv4/igmp.c

// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Linux NET3: Internet Group Management Protocol [IGMP]
*
* This code implements the IGMP protocol as defined in RFC1112. There has
* been a further revision of this protocol since which is now supported.
*
* If you have trouble with this module, be careful which gcc you have
* used; the older version didn't come out right using gcc 2.5.8, and
* the newer one seems to fall out with gcc 2.6.2.
*
* Authors:
* Alan Cox <alan@lxorguk.ukuu.org.uk>
*
* Fixes:
*
* Alan Cox : Added lots of __inline__ to optimise
* the memory usage of all the tiny little
* functions.
* Alan Cox : Dumped the header building experiment.
* Alan Cox : Minor tweaks ready for multicast routing
* and extended IGMP protocol.
* Alan Cox : Removed a load of inline directives. Gcc 2.5.8
* writes utterly bogus code otherwise (sigh)
* fixed IGMP loopback to behave in the manner
* desired by mrouted, fixed the fact it has been
* broken since 1.3.6 and cleaned up a few minor
* points.
*
* Chih-Jen Chang : Tried to revise IGMP to Version 2
* Tsu-Sheng Tsao E-mail: chihjenc@scf.usc.edu and tsusheng@scf.usc.edu
* The enhancements are mainly based on Steve Deering's
* ipmulti-3.5 source code.
* Chih-Jen Chang : Added the igmp_get_mrouter_info and
* Tsu-Sheng Tsao igmp_set_mrouter_info to keep track of
* the mrouted version on that device.
* Chih-Jen Chang : Added the max_resp_time parameter to
* Tsu-Sheng Tsao igmp_heard_query(). Using this parameter
* to identify the multicast router version
* and do what the IGMP version 2 specified.
* Chih-Jen Chang : Added a timer to revert to IGMP V2 router
* Tsu-Sheng Tsao if the specified time expired.
* Alan Cox : Stop IGMP from 0.0.0.0 being accepted.
* Alan Cox : Use GFP_ATOMIC in the right places.
* Christian Daudt : igmp timer wasn't set for local group
* memberships but was being deleted,
* which caused a "del_timer() called
* from %p with timer not initialized\n"
* message (960131).
* Christian Daudt : removed del_timer from
* igmp_timer_expire function (960205).
* Christian Daudt : igmp_heard_report now only calls
* igmp_timer_expire if tm->running is
* true (960216).
* Malcolm Beattie : ttl comparison wrong in igmp_rcv made
* igmp_heard_query never trigger. Expiry
* miscalculation fixed in igmp_heard_query
* and random() made to return unsigned to
* prevent negative expiry times.
* Alexey Kuznetsov: Wrong group leaving behaviour, backport
* fix from pending 2.1.x patches.
* Alan Cox: Forget to enable FDDI support earlier.
* Alexey Kuznetsov: Fixed leaving groups on device down.
* Alexey Kuznetsov: Accordance to igmp-v2-06 draft.
* David L Stevens: IGMPv3 support, with help from
* Vinay Kulkarni
*/
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/if_arp.h>
#include <linux/rtnetlink.h>
#include <linux/times.h>
#include <linux/pkt_sched.h>
#include <linux/byteorder/generic.h>
#include <net/net_namespace.h>
#include <net/arp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <net/inet_common.h>
#include <linux/netfilter_ipv4.h>
#ifdef CONFIG_IP_MROUTE
#include <linux/mroute.h>
#endif
#ifdef CONFIG_PROC_FS
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#endif
#ifdef CONFIG_IP_MULTICAST
/* Parameter names and values are taken from igmp-v2-06 draft */
#define IGMP_QUERY_INTERVAL (125*HZ)
#define IGMP_QUERY_RESPONSE_INTERVAL (10*HZ)
#define IGMP_INITIAL_REPORT_DELAY (1)
/* IGMP_INITIAL_REPORT_DELAY is not from the IGMP specs!
* The IGMP specs require membership to be reported immediately after
* joining a group, but we delay the first report by a
* small interval. It seems more natural and still does not
* contradict the specs provided this delay is small enough.
*/
#define IGMP_V1_SEEN(in_dev) \
(IPV4_DEVCONF_ALL(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1 || \
IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \
((in_dev)->mr_v1_seen && \
time_before(jiffies, (in_dev)->mr_v1_seen)))
#define IGMP_V2_SEEN(in_dev) \
(IPV4_DEVCONF_ALL(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2 || \
IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2 || \
((in_dev)->mr_v2_seen && \
time_before(jiffies, (in_dev)->mr_v2_seen)))
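/* Return the unsolicited report interval in jiffies, using the
* per-device IGMPv2 or IGMPv3 value (configured in milliseconds)
* depending on which querier version has been seen on the device.
*/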
static int unsolicited_report_interval(struct in_device *in_dev)
{
int interval_ms, interval_jiffies;
if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
interval_ms = IN_DEV_CONF_GET(
in_dev,
IGMPV2_UNSOLICITED_REPORT_INTERVAL);
else /* v3 */
interval_ms = IN_DEV_CONF_GET(
in_dev,
IGMPV3_UNSOLICITED_REPORT_INTERVAL);
interval_jiffies = msecs_to_jiffies(interval_ms);
/* _timer functions can't handle a delay of 0 jiffies so ensure
* we always return a positive value.
*/
if (interval_jiffies <= 0)
interval_jiffies = 1;
return interval_jiffies;
}
static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im,
gfp_t gfp);
static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im);
static void igmpv3_clear_delrec(struct in_device *in_dev);
static int sf_setstate(struct ip_mc_list *pmc);
static void sf_markstate(struct ip_mc_list *pmc);
#endif
static void ip_mc_clear_src(struct ip_mc_list *pmc);
static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
int sfcount, __be32 *psfsrc, int delta);
static void ip_ma_put(struct ip_mc_list *im)
{
if (refcount_dec_and_test(&im->refcnt)) {
in_dev_put(im->interface);
kfree_rcu(im, rcu);
}
}
#define for_each_pmc_rcu(in_dev, pmc) \
for (pmc = rcu_dereference(in_dev->mc_list); \
pmc != NULL; \
pmc = rcu_dereference(pmc->next_rcu))
#define for_each_pmc_rtnl(in_dev, pmc) \
for (pmc = rtnl_dereference(in_dev->mc_list); \
pmc != NULL; \
pmc = rtnl_dereference(pmc->next_rcu))
static void ip_sf_list_clear_all(struct ip_sf_list *psf)
{
struct ip_sf_list *next;
while (psf) {
next = psf->sf_next;
kfree(psf);
psf = next;
}
}
#ifdef CONFIG_IP_MULTICAST
/*
* Timer management
*/
static void igmp_stop_timer(struct ip_mc_list *im)
{
spin_lock_bh(&im->lock);
if (del_timer(&im->timer))
refcount_dec(&im->refcnt);
im->tm_running = 0;
im->reporter = 0;
im->unsolicit_count = 0;
spin_unlock_bh(&im->lock);
}
/* Must be called with im->lock held. */
static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
{
int tv = get_random_u32_below(max_delay);
im->tm_running = 1;
if (!mod_timer(&im->timer, jiffies+tv+2))
refcount_inc(&im->refcnt);
}
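/* Schedule the response to a general query after a random delay in
* [0, mr_maxdelay).  The timer is only rearmed if no response is
* pending yet or the new expiry is earlier than the pending one.
*/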
static void igmp_gq_start_timer(struct in_device *in_dev)
{
int tv = get_random_u32_below(in_dev->mr_maxdelay);
unsigned long exp = jiffies + tv + 2;
if (in_dev->mr_gq_running &&
time_after_eq(exp, (in_dev->mr_gq_timer).expires))
return;
in_dev->mr_gq_running = 1;
if (!mod_timer(&in_dev->mr_gq_timer, exp))
in_dev_hold(in_dev);
}
static void igmp_ifc_start_timer(struct in_device *in_dev, int delay)
{
int tv = get_random_u32_below(delay);
if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2))
in_dev_hold(in_dev);
}
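/* Rearm the per-group report timer in response to a query.  A pending
* timer that already expires within max_delay is kept as-is; otherwise
* a fresh random delay bounded by max_delay is chosen.
*/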
static void igmp_mod_timer(struct ip_mc_list *im, int max_delay)
{
spin_lock_bh(&im->lock);
im->unsolicit_count = 0;
if (del_timer(&im->timer)) {
if ((long)(im->timer.expires-jiffies) < max_delay) {
add_timer(&im->timer);
im->tm_running = 1;
spin_unlock_bh(&im->lock);
return;
}
refcount_dec(&im->refcnt);
}
igmp_start_timer(im, max_delay);
spin_unlock_bh(&im->lock);
}
/*
* Send an IGMP report.
*/
#define IGMP_SIZE (sizeof(struct igmphdr)+sizeof(struct iphdr)+4)
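/* Decide whether source filter entry @psf of group @pmc should be
* listed in a group record of the given @type.  @gdeleted and @sdeleted
* indicate that the group or the source comes from the tomb (deleted)
* lists rather than the active ones.
*/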
static int is_in(struct ip_mc_list *pmc, struct ip_sf_list *psf, int type,
int gdeleted, int sdeleted)
{
switch (type) {
case IGMPV3_MODE_IS_INCLUDE:
case IGMPV3_MODE_IS_EXCLUDE:
if (gdeleted || sdeleted)
return 0;
if (!(pmc->gsquery && !psf->sf_gsresp)) {
if (pmc->sfmode == MCAST_INCLUDE)
return 1;
/* don't include if this source is excluded
* in all filters
*/
if (psf->sf_count[MCAST_INCLUDE])
return type == IGMPV3_MODE_IS_INCLUDE;
return pmc->sfcount[MCAST_EXCLUDE] ==
psf->sf_count[MCAST_EXCLUDE];
}
return 0;
case IGMPV3_CHANGE_TO_INCLUDE:
if (gdeleted || sdeleted)
return 0;
return psf->sf_count[MCAST_INCLUDE] != 0;
case IGMPV3_CHANGE_TO_EXCLUDE:
if (gdeleted || sdeleted)
return 0;
if (pmc->sfcount[MCAST_EXCLUDE] == 0 ||
psf->sf_count[MCAST_INCLUDE])
return 0;
return pmc->sfcount[MCAST_EXCLUDE] ==
psf->sf_count[MCAST_EXCLUDE];
case IGMPV3_ALLOW_NEW_SOURCES:
if (gdeleted || !psf->sf_crcount)
return 0;
return (pmc->sfmode == MCAST_INCLUDE) ^ sdeleted;
case IGMPV3_BLOCK_OLD_SOURCES:
if (pmc->sfmode == MCAST_INCLUDE)
return gdeleted || (psf->sf_crcount && sdeleted);
return psf->sf_crcount && !gdeleted && !sdeleted;
}
return 0;
}
static int
igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)
{
struct ip_sf_list *psf;
int scount = 0;
for (psf = pmc->sources; psf; psf = psf->sf_next) {
if (!is_in(pmc, psf, type, gdeleted, sdeleted))
continue;
scount++;
}
return scount;
}
/* source address selection per RFC 3376 section 4.2.13 */
static __be32 igmpv3_get_srcaddr(struct net_device *dev,
const struct flowi4 *fl4)
{
struct in_device *in_dev = __in_dev_get_rcu(dev);
const struct in_ifaddr *ifa;
if (!in_dev)
return htonl(INADDR_ANY);
in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (fl4->saddr == ifa->ifa_local)
return fl4->saddr;
}
return htonl(INADDR_ANY);
}
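/* Allocate and prime an skb for an IGMPv3 report: route it to the
* all-IGMPv3-routers group (224.0.0.22), build the IP header with
* TTL 1, TOS 0xc0 and a Router Alert option, and reserve tailroom so
* the group records fit within the MTU.  The allocation size is halved
* on failure, down to a floor of 256 bytes.
*/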
static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu)
{
struct sk_buff *skb;
struct rtable *rt;
struct iphdr *pip;
struct igmpv3_report *pig;
struct net *net = dev_net(dev);
struct flowi4 fl4;
int hlen = LL_RESERVED_SPACE(dev);
int tlen = dev->needed_tailroom;
unsigned int size;
size = min(mtu, IP_MAX_MTU);
while (1) {
skb = alloc_skb(size + hlen + tlen,
GFP_ATOMIC | __GFP_NOWARN);
if (skb)
break;
size >>= 1;
if (size < 256)
return NULL;
}
skb->priority = TC_PRIO_CONTROL;
rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0,
0, 0,
IPPROTO_IGMP, 0, dev->ifindex);
if (IS_ERR(rt)) {
kfree_skb(skb);
return NULL;
}
skb_dst_set(skb, &rt->dst);
skb->dev = dev;
skb_reserve(skb, hlen);
skb_tailroom_reserve(skb, mtu, tlen);
skb_reset_network_header(skb);
pip = ip_hdr(skb);
skb_put(skb, sizeof(struct iphdr) + 4);
pip->version = 4;
pip->ihl = (sizeof(struct iphdr)+4)>>2;
pip->tos = 0xc0;
pip->frag_off = htons(IP_DF);
pip->ttl = 1;
pip->daddr = fl4.daddr;
rcu_read_lock();
pip->saddr = igmpv3_get_srcaddr(dev, &fl4);
rcu_read_unlock();
pip->protocol = IPPROTO_IGMP;
pip->tot_len = 0; /* filled in later */
ip_select_ident(net, skb, NULL);
((u8 *)&pip[1])[0] = IPOPT_RA;
((u8 *)&pip[1])[1] = 4;
((u8 *)&pip[1])[2] = 0;
((u8 *)&pip[1])[3] = 0;
skb->transport_header = skb->network_header + sizeof(struct iphdr) + 4;
skb_put(skb, sizeof(*pig));
pig = igmpv3_report_hdr(skb);
pig->type = IGMPV3_HOST_MEMBERSHIP_REPORT;
pig->resv1 = 0;
pig->csum = 0;
pig->resv2 = 0;
pig->ngrec = 0;
return skb;
}
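/* Checksum the assembled report and hand it to ip_local_out(). */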
static int igmpv3_sendpack(struct sk_buff *skb)
{
struct igmphdr *pig = igmp_hdr(skb);
const int igmplen = skb_tail_pointer(skb) - skb_transport_header(skb);
pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen);
return ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb);
}
static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
{
return sizeof(struct igmpv3_grec) + 4*igmp_scount(pmc, type, gdel, sdel);
}
static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
int type, struct igmpv3_grec **ppgr, unsigned int mtu)
{
struct net_device *dev = pmc->interface->dev;
struct igmpv3_report *pih;
struct igmpv3_grec *pgr;
if (!skb) {
skb = igmpv3_newpack(dev, mtu);
if (!skb)
return NULL;
}
pgr = skb_put(skb, sizeof(struct igmpv3_grec));
pgr->grec_type = type;
pgr->grec_auxwords = 0;
pgr->grec_nsrcs = 0;
pgr->grec_mca = pmc->multiaddr;
pih = igmpv3_report_hdr(skb);
pih->ngrec = htons(ntohs(pih->ngrec)+1);
*ppgr = pgr;
return skb;
}
#define AVAILABLE(skb) ((skb) ? skb_availroom(skb) : 0)
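/* Append a group record of @type for @pmc to @skb, allocating a fresh
* packet when necessary and adding every source that is_in() selects.
* When the packet runs out of room the record is either truncated or
* continued in a new packet, and the full one is transmitted.
*/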
static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
int type, int gdeleted, int sdeleted)
{
struct net_device *dev = pmc->interface->dev;
struct net *net = dev_net(dev);
struct igmpv3_report *pih;
struct igmpv3_grec *pgr = NULL;
struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list;
int scount, stotal, first, isquery, truncate;
unsigned int mtu;
if (pmc->multiaddr == IGMP_ALL_HOSTS)
return skb;
if (ipv4_is_local_multicast(pmc->multiaddr) &&
!READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
return skb;
mtu = READ_ONCE(dev->mtu);
if (mtu < IPV4_MIN_MTU)
return skb;
isquery = type == IGMPV3_MODE_IS_INCLUDE ||
type == IGMPV3_MODE_IS_EXCLUDE;
truncate = type == IGMPV3_MODE_IS_EXCLUDE ||
type == IGMPV3_CHANGE_TO_EXCLUDE;
stotal = scount = 0;
psf_list = sdeleted ? &pmc->tomb : &pmc->sources;
if (!*psf_list)
goto empty_source;
pih = skb ? igmpv3_report_hdr(skb) : NULL;
/* EX and TO_EX get a fresh packet, if needed */
if (truncate) {
if (pih && pih->ngrec &&
AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) {
if (skb)
igmpv3_sendpack(skb);
skb = igmpv3_newpack(dev, mtu);
}
}
first = 1;
psf_prev = NULL;
for (psf = *psf_list; psf; psf = psf_next) {
__be32 *psrc;
psf_next = psf->sf_next;
if (!is_in(pmc, psf, type, gdeleted, sdeleted)) {
psf_prev = psf;
continue;
}
/* Based on RFC3376 5.1. Should not send source-list change
* records when there is a filter mode change.
*/
if (((gdeleted && pmc->sfmode == MCAST_EXCLUDE) ||
(!gdeleted && pmc->crcount)) &&
(type == IGMPV3_ALLOW_NEW_SOURCES ||
type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount)
goto decrease_sf_crcount;
/* clear marks on query responses */
if (isquery)
psf->sf_gsresp = 0;
if (AVAILABLE(skb) < sizeof(__be32) +
first*sizeof(struct igmpv3_grec)) {
if (truncate && !first)
break; /* truncate these */
if (pgr)
pgr->grec_nsrcs = htons(scount);
if (skb)
igmpv3_sendpack(skb);
skb = igmpv3_newpack(dev, mtu);
first = 1;
scount = 0;
}
if (first) {
skb = add_grhead(skb, pmc, type, &pgr, mtu);
first = 0;
}
if (!skb)
return NULL;
psrc = skb_put(skb, sizeof(__be32));
*psrc = psf->sf_inaddr;
scount++; stotal++;
if ((type == IGMPV3_ALLOW_NEW_SOURCES ||
type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) {
decrease_sf_crcount:
psf->sf_crcount--;
if ((sdeleted || gdeleted) && psf->sf_crcount == 0) {
if (psf_prev)
psf_prev->sf_next = psf->sf_next;
else
*psf_list = psf->sf_next;
kfree(psf);
continue;
}
}
psf_prev = psf;
}
empty_source:
if (!stotal) {
if (type == IGMPV3_ALLOW_NEW_SOURCES ||
type == IGMPV3_BLOCK_OLD_SOURCES)
return skb;
if (pmc->crcount || isquery) {
/* make sure we have room for group header */
if (skb && AVAILABLE(skb) < sizeof(struct igmpv3_grec)) {
igmpv3_sendpack(skb);
skb = NULL; /* add_grhead will get a new one */
}
skb = add_grhead(skb, pmc, type, &pgr, mtu);
}
}
if (pgr)
pgr->grec_nsrcs = htons(scount);
if (isquery)
pmc->gsquery = 0; /* clear query state on report */
return skb;
}
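/* Send "current state" records (MODE_IS_INCLUDE/MODE_IS_EXCLUDE) in
* response to a query, either for the single group @pmc or, when @pmc
* is NULL, for every group joined on @in_dev.
*/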
static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
{
struct sk_buff *skb = NULL;
struct net *net = dev_net(in_dev->dev);
int type;
if (!pmc) {
rcu_read_lock();
for_each_pmc_rcu(in_dev, pmc) {
if (pmc->multiaddr == IGMP_ALL_HOSTS)
continue;
if (ipv4_is_local_multicast(pmc->multiaddr) &&
!READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
continue;
spin_lock_bh(&pmc->lock);
if (pmc->sfcount[MCAST_EXCLUDE])
type = IGMPV3_MODE_IS_EXCLUDE;
else
type = IGMPV3_MODE_IS_INCLUDE;
skb = add_grec(skb, pmc, type, 0, 0);
spin_unlock_bh(&pmc->lock);
}
rcu_read_unlock();
} else {
spin_lock_bh(&pmc->lock);
if (pmc->sfcount[MCAST_EXCLUDE])
type = IGMPV3_MODE_IS_EXCLUDE;
else
type = IGMPV3_MODE_IS_INCLUDE;
skb = add_grec(skb, pmc, type, 0, 0);
spin_unlock_bh(&pmc->lock);
}
if (!skb)
return 0;
return igmpv3_sendpack(skb);
}
/*
* remove zero-count source records from a source filter list
*/
static void igmpv3_clear_zeros(struct ip_sf_list **ppsf)
{
struct ip_sf_list *psf_prev, *psf_next, *psf;
psf_prev = NULL;
for (psf = *ppsf; psf; psf = psf_next) {
psf_next = psf->sf_next;
if (psf->sf_crcount == 0) {
if (psf_prev)
psf_prev->sf_next = psf->sf_next;
else
*ppsf = psf->sf_next;
kfree(psf);
} else
psf_prev = psf;
}
}
static void kfree_pmc(struct ip_mc_list *pmc)
{
ip_sf_list_clear_all(pmc->sources);
ip_sf_list_clear_all(pmc->tomb);
kfree(pmc);
}
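/* Build and send the pending "state change" report: BLOCK/ALLOW and
* CHANGE_TO_* records for groups on the tomb list and for groups whose
* filter mode or source lists changed, decrementing the per-record
* retransmit counters as the records are emitted.
*/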
static void igmpv3_send_cr(struct in_device *in_dev)
{
struct ip_mc_list *pmc, *pmc_prev, *pmc_next;
struct sk_buff *skb = NULL;
int type, dtype;
rcu_read_lock();
spin_lock_bh(&in_dev->mc_tomb_lock);
/* deleted MCA's */
pmc_prev = NULL;
for (pmc = in_dev->mc_tomb; pmc; pmc = pmc_next) {
pmc_next = pmc->next;
if (pmc->sfmode == MCAST_INCLUDE) {
type = IGMPV3_BLOCK_OLD_SOURCES;
dtype = IGMPV3_BLOCK_OLD_SOURCES;
skb = add_grec(skb, pmc, type, 1, 0);
skb = add_grec(skb, pmc, dtype, 1, 1);
}
if (pmc->crcount) {
if (pmc->sfmode == MCAST_EXCLUDE) {
type = IGMPV3_CHANGE_TO_INCLUDE;
skb = add_grec(skb, pmc, type, 1, 0);
}
pmc->crcount--;
if (pmc->crcount == 0) {
igmpv3_clear_zeros(&pmc->tomb);
igmpv3_clear_zeros(&pmc->sources);
}
}
if (pmc->crcount == 0 && !pmc->tomb && !pmc->sources) {
if (pmc_prev)
pmc_prev->next = pmc_next;
else
in_dev->mc_tomb = pmc_next;
in_dev_put(pmc->interface);
kfree_pmc(pmc);
} else
pmc_prev = pmc;
}
spin_unlock_bh(&in_dev->mc_tomb_lock);
/* change recs */
for_each_pmc_rcu(in_dev, pmc) {
spin_lock_bh(&pmc->lock);
if (pmc->sfcount[MCAST_EXCLUDE]) {
type = IGMPV3_BLOCK_OLD_SOURCES;
dtype = IGMPV3_ALLOW_NEW_SOURCES;
} else {
type = IGMPV3_ALLOW_NEW_SOURCES;
dtype = IGMPV3_BLOCK_OLD_SOURCES;
}
skb = add_grec(skb, pmc, type, 0, 0);
skb = add_grec(skb, pmc, dtype, 0, 1); /* deleted sources */
/* filter mode changes */
if (pmc->crcount) {
if (pmc->sfmode == MCAST_EXCLUDE)
type = IGMPV3_CHANGE_TO_EXCLUDE;
else
type = IGMPV3_CHANGE_TO_INCLUDE;
skb = add_grec(skb, pmc, type, 0, 0);
pmc->crcount--;
}
spin_unlock_bh(&pmc->lock);
}
rcu_read_unlock();
if (!skb)
return;
(void) igmpv3_sendpack(skb);
}
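/* Send a single IGMPv1/v2 report or leave message for @pmc;
 * IGMPv3 membership reports are delegated to igmpv3_send_report().
 */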
static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
int type)
{
struct sk_buff *skb;
struct iphdr *iph;
struct igmphdr *ih;
struct rtable *rt;
struct net_device *dev = in_dev->dev;
struct net *net = dev_net(dev);
__be32 group = pmc ? pmc->multiaddr : 0;
struct flowi4 fl4;
__be32 dst;
int hlen, tlen;
if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)
return igmpv3_send_report(in_dev, pmc);
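	/* Reports for link-local groups (224.0.0.0/24) may be suppressed
	 * via the igmp_link_local_mcast_reports sysctl.
	 */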
if (ipv4_is_local_multicast(group) &&
!READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
return 0;
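	/* Leave messages go to the all-routers group (224.0.0.2);
	 * everything else is sent to the group being reported.
	 */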
if (type == IGMP_HOST_LEAVE_MESSAGE)
dst = IGMP_ALL_ROUTER;
else
dst = group;
rt = ip_route_output_ports(net, &fl4, NULL, dst, 0,
0, 0,
IPPROTO_IGMP, 0, dev->ifindex);
if (IS_ERR(rt))
return -1;
hlen = LL_RESERVED_SPACE(dev);
tlen = dev->needed_tailroom;
skb = alloc_skb(IGMP_SIZE + hlen + tlen, GFP_ATOMIC);
if (!skb) {
ip_rt_put(rt);
return -1;
}
skb->priority = TC_PRIO_CONTROL;
skb_dst_set(skb, &rt->dst);
skb_reserve(skb, hlen);
skb_reset_network_header(skb);
iph = ip_hdr(skb);
skb_put(skb, sizeof(struct iphdr) + 4);
iph->version = 4;
iph->ihl = (sizeof(struct iphdr)+4)>>2;
iph->tos = 0xc0;
iph->frag_off = htons(IP_DF);
iph->ttl = 1;
iph->daddr = dst;
iph->saddr = fl4.saddr;
iph->protocol = IPPROTO_IGMP;
ip_select_ident(net, skb, NULL);
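	/* Append the 4-byte Router Alert option (RFC 2113) carried by IGMP
	 * messages: option type, length 4, and two zero octets.
	 */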
((u8 *)&iph[1])[0] = IPOPT_RA;
((u8 *)&iph[1])[1] = 4;
((u8 *)&iph[1])[2] = 0;
((u8 *)&iph[1])[3] = 0;
ih = skb_put(skb, sizeof(struct igmphdr));
ih->type = type;
ih->code = 0;
ih->csum = 0;
ih->group = group;
ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr));
return ip_local_out(net, skb->sk, skb);
}
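/* General query response timer expired: clear the running flag, send the
 * pending IGMPv3 current-state report and drop the timer's device reference.
 */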
static void igmp_gq_timer_expire(struct timer_list *t)
{
struct in_device *in_dev = from_timer(in_dev, t, mr_gq_timer);
in_dev->mr_gq_running = 0;
igmpv3_send_report(in_dev, NULL);
in_dev_put(in_dev);
}
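/* Interface change report timer: emit queued state-change records and,
 * while retransmissions remain, decrement mr_ifc_count and re-arm the timer.
 */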
static void igmp_ifc_timer_expire(struct timer_list *t)
{
struct in_device *in_dev = from_timer(in_dev, t, mr_ifc_timer);
u32 mr_ifc_count;
igmpv3_send_cr(in_dev);
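	/* mr_ifc_count can be updated concurrently (e.g. from igmp_ifc_event()),
	 * so decrement it with a READ_ONCE()/cmpxchg() retry loop rather than
	 * a plain read-modify-write.
	 */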
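	/*
	 * mr_ifc_count is also written from process context (e.g. by
	 * igmp_ifc_event()), so decrement it with a cmpxchg() retry loop
	 * instead of a plain store to avoid losing a concurrent update.
	 */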
restart:
mr_ifc_count = READ_ONCE(in_dev->mr_ifc_count);
if (mr_ifc_count) {
if (cmpxchg(&in_dev->mr_ifc_count,
mr_ifc_count,
mr_ifc_count - 1) != mr_ifc_count)
goto restart;
igmp_ifc_start_timer(in_dev,
unsolicited_report_interval(in_dev));
}
in_dev_put(in_dev);
}
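
/* Restart the interface-change report process: reload mr_ifc_count with
 * the robustness value and arm the interface-change timer, unless an
 * IGMPv1/v2 querier is present (v3 change reports are not sent then).
 */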
static void igmp_ifc_event(struct in_device *in_dev)
{
struct net *net = dev_net(in_dev->dev);
if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
return;
WRITE_ONCE(in_dev->mr_ifc_count, in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv));
igmp_ifc_start_timer(in_dev, 1);
}
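
/* Per-group report timer expiry: send a membership report using the
 * highest IGMP version allowed on the interface and re-arm the timer
 * while unsolicited reports are still pending.
 */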
static void igmp_timer_expire(struct timer_list *t)
{
struct ip_mc_list *im = from_timer(im, t, timer);
struct in_device *in_dev = im->interface;
spin_lock(&im->lock);
im->tm_running = 0;
if (im->unsolicit_count && --im->unsolicit_count)
igmp_start_timer(im, unsolicited_report_interval(in_dev));
im->reporter = 1;
spin_unlock(&im->lock);
if (IGMP_V1_SEEN(in_dev))
igmp_send_report(in_dev, im, IGMP_HOST_MEMBERSHIP_REPORT);
else if (IGMP_V2_SEEN(in_dev))
igmp_send_report(in_dev, im, IGMPV2_HOST_MEMBERSHIP_REPORT);
else
igmp_send_report(in_dev, im, IGMPV3_HOST_MEMBERSHIP_REPORT);
ip_ma_put(im);
}
/* mark EXCLUDE-mode sources */
static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
{
struct ip_sf_list *psf;
int i, scount;
scount = 0;
for (psf = pmc->sources; psf; psf = psf->sf_next) {
if (scount == nsrcs)
break;
for (i = 0; i < nsrcs; i++) {
/* skip inactive filters */
if (psf->sf_count[MCAST_INCLUDE] ||
pmc->sfcount[MCAST_EXCLUDE] !=
psf->sf_count[MCAST_EXCLUDE])
break;
if (srcs[i] == psf->sf_inaddr) {
scount++;
break;
}
}
}
pmc->gsquery = 0;
if (scount == nsrcs) /* all sources excluded */
return 0;
return 1;
}
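
/* Mark the sources listed in a group-and-source-specific query; returns
 * non-zero if any of the listed sources requires a response.
 */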
static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
{
struct ip_sf_list *psf;
int i, scount;
if (pmc->sfmode == MCAST_EXCLUDE)
return igmp_xmarksources(pmc, nsrcs, srcs);
/* mark INCLUDE-mode sources */
scount = 0;
for (psf = pmc->sources; psf; psf = psf->sf_next) {
if (scount == nsrcs)
break;
for (i = 0; i < nsrcs; i++)
if (srcs[i] == psf->sf_inaddr) {
psf->sf_gsresp = 1;
scount++;
break;
}
}
if (!scount) {
pmc->gsquery = 0;
return 0;
}
pmc->gsquery = 1;
return 1;
}
/* return true if packet was dropped */
static bool igmp_heard_report(struct in_device *in_dev, __be32 group)
{
struct ip_mc_list *im;
struct net *net = dev_net(in_dev->dev);
/* Timers are only set for non-local groups */
if (group == IGMP_ALL_HOSTS)
return false;
if (ipv4_is_local_multicast(group) &&
!READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
return false;
rcu_read_lock();
for_each_pmc_rcu(in_dev, im) {
if (im->multiaddr == group) {
igmp_stop_timer(im);
break;
}
}
rcu_read_unlock();
return false;
}
/* return true if packet was dropped */
static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
int len)
{
struct igmphdr *ih = igmp_hdr(skb);
struct igmpv3_query *ih3 = igmpv3_query_hdr(skb);
struct ip_mc_list *im;
__be32 group = ih->group;
int max_delay;
int mark = 0;
struct net *net = dev_net(in_dev->dev);
if (len == 8) {
if (ih->code == 0) {
/* Alas, old v1 router presents here. */
max_delay = IGMP_QUERY_RESPONSE_INTERVAL;
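			/* RFC 3376, 8.12. Older Version Querier Present
			 * Timeout: (Robustness Variable * Query Interval) +
			 * Query Response Interval.
			 */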
in_dev->mr_v1_seen = jiffies +
(in_dev->mr_qrv * in_dev->mr_qi) +
in_dev->mr_qri;
group = 0;
} else {
/* v2 router present */
max_delay = ih->code*(HZ/IGMP_TIMER_SCALE);
in_dev->mr_v2_seen = jiffies +
(in_dev->mr_qrv * in_dev->mr_qi) +
in_dev->mr_qri;
}
/* cancel the interface change timer */
WRITE_ONCE(in_dev->mr_ifc_count, 0);
if (del_timer(&in_dev->mr_ifc_timer))
__in_dev_put(in_dev);
/* clear deleted report items */
igmpv3_clear_delrec(in_dev);
} else if (len < 12) {
return true; /* ignore bogus packet; freed by caller */
} else if (IGMP_V1_SEEN(in_dev)) {
/* This is a v3 query with v1 queriers present */
max_delay = IGMP_QUERY_RESPONSE_INTERVAL;
group = 0;
} else if (IGMP_V2_SEEN(in_dev)) {
/* this is a v3 query with v2 queriers present;
* Interpretation of the max_delay code is problematic here.
* A real v2 host would use ih_code directly, while v3 has a
* different encoding. We use the v3 encoding as more likely
* to be intended in a v3 query.
*/
max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
if (!max_delay)
max_delay = 1; /* can't mod w/ 0 */
} else { /* v3 */
if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
return true;
ih3 = igmpv3_query_hdr(skb);
if (ih3->nsrcs) {
if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)
+ ntohs(ih3->nsrcs)*sizeof(__be32)))
return true;
ih3 = igmpv3_query_hdr(skb);
}
max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
if (!max_delay)
max_delay = 1; /* can't mod w/ 0 */
in_dev->mr_maxdelay = max_delay;
/* RFC3376, 4.1.6. QRV and 4.1.7. QQIC, when the most recently
* received value was zero, use the default or statically
* configured value.
*/
in_dev->mr_qrv = ih3->qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv);
in_dev->mr_qi = IGMPV3_QQIC(ih3->qqic)*HZ ?: IGMP_QUERY_INTERVAL;
/* RFC3376, 8.3. Query Response Interval:
* The number of seconds represented by the [Query Response
* Interval] must be less than the [Query Interval].
*/
if (in_dev->mr_qri >= in_dev->mr_qi)
in_dev->mr_qri = (in_dev->mr_qi/HZ - 1)*HZ;
if (!group) { /* general query */
if (ih3->nsrcs)
return true; /* no sources allowed */
igmp_gq_start_timer(in_dev);
return false;
}
/* mark sources to include, if group & source-specific */
mark = ih3->nsrcs != 0;
}
/*
* - Start the timers in all of our membership records
* that the query applies to for the interface on
* which the query arrived excl. those that belong
* to a "local" group (224.0.0.X)
* - For timers already running check if they need to
* be reset.
* - Use the igmp->igmp_code field as the maximum
* delay possible
*/
rcu_read_lock();
for_each_pmc_rcu(in_dev, im) {
int changed;
if (group && group != im->multiaddr)
continue;
if (im->multiaddr == IGMP_ALL_HOSTS)
continue;
if (ipv4_is_local_multicast(im->multiaddr) &&
!READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
continue;
spin_lock_bh(&im->lock);
if (im->tm_running)
im->gsquery = im->gsquery && mark;
else
im->gsquery = mark;
changed = !im->gsquery ||
igmp_marksources(im, ntohs(ih3->nsrcs), ih3->srcs);
spin_unlock_bh(&im->lock);
if (changed)
igmp_mod_timer(im, max_delay);
}
rcu_read_unlock();
return false;
}
/* called in rcu_read_lock() section */
int igmp_rcv(struct sk_buff *skb)
{
/* This basically follows the spec line by line -- see RFC1112 */
struct igmphdr *ih;
struct net_device *dev = skb->dev;
struct in_device *in_dev;
int len = skb->len;
bool dropped = true;
if (netif_is_l3_master(dev)) {
dev = dev_get_by_index_rcu(dev_net(dev), IPCB(skb)->iif);
if (!dev)
goto drop;
}
in_dev = __in_dev_get_rcu(dev);
if (!in_dev)
goto drop;
if (!pskb_may_pull(skb, sizeof(struct igmphdr)))
goto drop;
if (skb_checksum_simple_validate(skb))
goto drop;
ih = igmp_hdr(skb);
switch (ih->type) {
case IGMP_HOST_MEMBERSHIP_QUERY:
dropped = igmp_heard_query(in_dev, skb, len);
break;
case IGMP_HOST_MEMBERSHIP_REPORT:
case IGMPV2_HOST_MEMBERSHIP_REPORT:
/* Is it our report looped back? */
if (rt_is_output_route(skb_rtable(skb)))
break;
/* don't rely on MC router hearing unicast reports */
if (skb->pkt_type == PACKET_MULTICAST ||
skb->pkt_type == PACKET_BROADCAST)
dropped = igmp_heard_report(in_dev, ih->group);
break;
case IGMP_PIM:
#ifdef CONFIG_IP_PIMSM_V1
return pim_rcv_v1(skb);
#endif
case IGMPV3_HOST_MEMBERSHIP_REPORT:
case IGMP_DVMRP:
case IGMP_TRACE:
case IGMP_HOST_LEAVE_MESSAGE:
case IGMP_MTRACE:
case IGMP_MTRACE_RESP:
break;
default:
break;
}
drop:
if (dropped)
kfree_skb(skb);
else
consume_skb(skb);
return 0;
}
#endif
/*
* Add a filter to a device
*/
static void ip_mc_filter_add(struct in_device *in_dev, __be32 addr)
{
char buf[MAX_ADDR_LEN];
struct net_device *dev = in_dev->dev;
/* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG.
We will get multicast token leakage, when IFF_MULTICAST
is changed. This check should be done in ndo_set_rx_mode
routine. Something sort of:
if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; }
--ANK
*/
if (arp_mc_map(addr, buf, dev, 0) == 0)
dev_mc_add(dev, buf);
}
/*
* Remove a filter from a device
*/
static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr)
{
char buf[MAX_ADDR_LEN];
struct net_device *dev = in_dev->dev;
if (arp_mc_map(addr, buf, dev, 0) == 0)
dev_mc_del(dev, buf);
}
#ifdef CONFIG_IP_MULTICAST
/*
* deleted ip_mc_list manipulation
*/
static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im,
gfp_t gfp)
{
struct ip_mc_list *pmc;
struct net *net = dev_net(in_dev->dev);
/* this is an "ip_mc_list" for convenience; only the fields below
* are actually used. In particular, the refcnt and users are not
* used for management of the delete list. Using the same structure
* for deleted items allows change reports to use common code with
* non-deleted or query-response MCA's.
*/
pmc = kzalloc(sizeof(*pmc), gfp);
if (!pmc)
return;
spin_lock_init(&pmc->lock);
spin_lock_bh(&im->lock);
pmc->interface = im->interface;
in_dev_hold(in_dev);
pmc->multiaddr = im->multiaddr;
pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv);
pmc->sfmode = im->sfmode;
if (pmc->sfmode == MCAST_INCLUDE) {
struct ip_sf_list *psf;
pmc->tomb = im->tomb;
pmc->sources = im->sources;
im->tomb = im->sources = NULL;
for (psf = pmc->sources; psf; psf = psf->sf_next)
psf->sf_crcount = pmc->crcount;
}
spin_unlock_bh(&im->lock);
spin_lock_bh(&in_dev->mc_tomb_lock);
pmc->next = in_dev->mc_tomb;
in_dev->mc_tomb = pmc;
spin_unlock_bh(&in_dev->mc_tomb_lock);
}
/*
* restore ip_mc_list deleted records
*/
static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im)
{
struct ip_mc_list *pmc, *pmc_prev;
struct ip_sf_list *psf;
struct net *net = dev_net(in_dev->dev);
__be32 multiaddr = im->multiaddr;
spin_lock_bh(&in_dev->mc_tomb_lock);
pmc_prev = NULL;
for (pmc = in_dev->mc_tomb; pmc; pmc = pmc->next) {
if (pmc->multiaddr == multiaddr)
break;
pmc_prev = pmc;
}
if (pmc) {
if (pmc_prev)
pmc_prev->next = pmc->next;
else
in_dev->mc_tomb = pmc->next;
}
spin_unlock_bh(&in_dev->mc_tomb_lock);
spin_lock_bh(&im->lock);
if (pmc) {
im->interface = pmc->interface;
if (im->sfmode == MCAST_INCLUDE) {
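			/* swap() rather than overwrite, so that any source
			 * lists still attached to 'im' move to 'pmc' and get
			 * released by kfree_pmc() below.
			 */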
swap(im->tomb, pmc->tomb);
swap(im->sources, pmc->sources);
for (psf = im->sources; psf; psf = psf->sf_next)
psf->sf_crcount = in_dev->mr_qrv ?:
READ_ONCE(net->ipv4.sysctl_igmp_qrv);
} else {
im->crcount = in_dev->mr_qrv ?:
READ_ONCE(net->ipv4.sysctl_igmp_qrv);
}
in_dev_put(pmc->interface);
kfree_pmc(pmc);
}
spin_unlock_bh(&im->lock);
}
/*
* flush ip_mc_list deleted records
*/
static void igmpv3_clear_delrec(struct in_device *in_dev)
{
struct ip_mc_list *pmc, *nextpmc;
spin_lock_bh(&in_dev->mc_tomb_lock);
pmc = in_dev->mc_tomb;
in_dev->mc_tomb = NULL;
spin_unlock_bh(&in_dev->mc_tomb_lock);
for (; pmc; pmc = nextpmc) {
nextpmc = pmc->next;
ip_mc_clear_src(pmc);
in_dev_put(pmc->interface);
kfree_pmc(pmc);
}
/* clear dead sources, too */
rcu_read_lock();
for_each_pmc_rcu(in_dev, pmc) {
struct ip_sf_list *psf;
spin_lock_bh(&pmc->lock);
psf = pmc->tomb;
pmc->tomb = NULL;
spin_unlock_bh(&pmc->lock);
ip_sf_list_clear_all(psf);
}
rcu_read_unlock();
}
#endif
static void __igmp_group_dropped(struct ip_mc_list *im, gfp_t gfp)
{
struct in_device *in_dev = im->interface;
#ifdef CONFIG_IP_MULTICAST
struct net *net = dev_net(in_dev->dev);
int reporter;
#endif
if (im->loaded) {
im->loaded = 0;
ip_mc_filter_del(in_dev, im->multiaddr);
}
#ifdef CONFIG_IP_MULTICAST
if (im->multiaddr == IGMP_ALL_HOSTS)
return;
if (ipv4_is_local_multicast(im->multiaddr) &&
!READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
return;
reporter = im->reporter;
igmp_stop_timer(im);
if (!in_dev->dead) {
if (IGMP_V1_SEEN(in_dev))
return;
if (IGMP_V2_SEEN(in_dev)) {
if (reporter)
igmp_send_report(in_dev, im, IGMP_HOST_LEAVE_MESSAGE);
return;
}
/* IGMPv3 */
igmpv3_add_delrec(in_dev, im, gfp);
igmp_ifc_event(in_dev);
}
#endif
}
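/* igmp_group_dropped() below is the process-context convenience wrapper;
* __igmp_group_dropped() takes an explicit gfp_t so that callers which
* cannot sleep (e.g. the bridge multicast snooping toggle path) can request
* an atomic allocation for the IGMPv3 tomb record in igmpv3_add_delrec().
*/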
static void igmp_group_dropped(struct ip_mc_list *im)
{
__igmp_group_dropped(im, GFP_KERNEL);
}
static void igmp_group_added(struct ip_mc_list *im)
{
struct in_device *in_dev = im->interface;
#ifdef CONFIG_IP_MULTICAST
struct net *net = dev_net(in_dev->dev);
#endif
if (im->loaded == 0) {
im->loaded = 1;
ip_mc_filter_add(in_dev, im->multiaddr);
}
#ifdef CONFIG_IP_MULTICAST
if (im->multiaddr == IGMP_ALL_HOSTS)
return;
if (ipv4_is_local_multicast(im->multiaddr) &&
!READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
return;
if (in_dev->dead)
return;
im->unsolicit_count = READ_ONCE(net->ipv4.sysctl_igmp_qrv);
if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) {
spin_lock_bh(&im->lock);
igmp_start_timer(im, IGMP_INITIAL_REPORT_DELAY);
spin_unlock_bh(&im->lock);
return;
}
/* else, v3 */
/* Based on RFC3376 5.1, for a newly added INCLUDE (SSM) group we should
* not send a filter-mode change record, as the transition is from
* IN() to IN(A).
*/
if (im->sfmode == MCAST_EXCLUDE)
im->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv);
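/* Illustrative note: an ASM join ends up in EXCLUDE mode, so the crcount
* armed above makes igmp_ifc_event() send a TO_EX() record; an SSM
* JOIN_SOURCE_GROUP join stays in INCLUDE mode and is instead reported
* as ALLOW(A) by the later source-list change (see ip_mc_add_src()).
*/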
igmp_ifc_event(in_dev);
#endif
}
/*
* Multicast list managers
*/
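/* in_dev->mc_list is an RCU protected, singly linked list of ip_mc_list
* entries. Once an interface tracks four or more groups, a lazily
* allocated hash of 1 << MC_HASH_SZ_LOG buckets (keyed on the group
* address) shadows the list so lookups need not walk every entry.
*/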
static u32 ip_mc_hash(const struct ip_mc_list *im)
{
return hash_32((__force u32)im->multiaddr, MC_HASH_SZ_LOG);
}
static void ip_mc_hash_add(struct in_device *in_dev,
struct ip_mc_list *im)
{
struct ip_mc_list __rcu **mc_hash;
u32 hash;
mc_hash = rtnl_dereference(in_dev->mc_hash);
if (mc_hash) {
hash = ip_mc_hash(im);
im->next_hash = mc_hash[hash];
rcu_assign_pointer(mc_hash[hash], im);
return;
}
/* do not use a hash table for a small number of items */
if (in_dev->mc_count < 4)
return;
mc_hash = kzalloc(sizeof(struct ip_mc_list *) << MC_HASH_SZ_LOG,
GFP_KERNEL);
if (!mc_hash)
return;
for_each_pmc_rtnl(in_dev, im) {
hash = ip_mc_hash(im);
im->next_hash = mc_hash[hash];
RCU_INIT_POINTER(mc_hash[hash], im);
}
rcu_assign_pointer(in_dev->mc_hash, mc_hash);
}
static void ip_mc_hash_remove(struct in_device *in_dev,
struct ip_mc_list *im)
{
struct ip_mc_list __rcu **mc_hash = rtnl_dereference(in_dev->mc_hash);
struct ip_mc_list *aux;
if (!mc_hash)
return;
mc_hash += ip_mc_hash(im);
while ((aux = rtnl_dereference(*mc_hash)) != im)
mc_hash = &aux->next_hash;
*mc_hash = im->next_hash;
}
/*
* A socket has joined a multicast group on device dev.
*/
static void ____ip_mc_inc_group(struct in_device *in_dev, __be32 addr,
unsigned int mode, gfp_t gfp)
{
struct ip_mc_list *im;
ASSERT_RTNL();
for_each_pmc_rtnl(in_dev, im) {
if (im->multiaddr == addr) {
im->users++;
ip_mc_add_src(in_dev, &addr, mode, 0, NULL, 0);
goto out;
}
}
im = kzalloc(sizeof(*im), gfp);
if (!im)
goto out;
im->users = 1;
im->interface = in_dev;
in_dev_hold(in_dev);
im->multiaddr = addr;
/* initial mode is (EX, empty) */
im->sfmode = mode;
im->sfcount[mode] = 1;
refcount_set(&im->refcnt, 1);
spin_lock_init(&im->lock);
#ifdef CONFIG_IP_MULTICAST
timer_setup(&im->timer, igmp_timer_expire, 0);
#endif
im->next_rcu = in_dev->mc_list;
in_dev->mc_count++;
rcu_assign_pointer(in_dev->mc_list, im);
ip_mc_hash_add(in_dev, im);
#ifdef CONFIG_IP_MULTICAST
igmpv3_del_delrec(in_dev, im);
#endif
igmp_group_added(im);
if (!in_dev->dead)
ip_rt_multicast_event(in_dev);
out:
return;
}
void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr, gfp_t gfp)
{
____ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE, gfp);
}
EXPORT_SYMBOL(__ip_mc_inc_group);
void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
{
__ip_mc_inc_group(in_dev, addr, GFP_KERNEL);
}
EXPORT_SYMBOL(ip_mc_inc_group);
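/* Usage sketch (illustrative only, not part of this file; the group address
* and calling context are assumptions): device-level memberships are
* reference counted per group and manipulated under RTNL, roughly:
*
*	__be32 group = htonl(0xef010101);	/+ 239.1.1.1, arbitrary example +/
*
*	ASSERT_RTNL();
*	ip_mc_inc_group(in_dev, group);			/+ join, may sleep (GFP_KERNEL) +/
*	...
*	__ip_mc_dec_group(in_dev, group, GFP_ATOMIC);	/+ leave without sleeping allocations +/
*/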
static int ip_mc_check_iphdr(struct sk_buff *skb)
{
const struct iphdr *iph;
unsigned int len;
unsigned int offset = skb_network_offset(skb) + sizeof(*iph);
if (!pskb_may_pull(skb, offset))
return -EINVAL;
iph = ip_hdr(skb);
if (iph->version != 4 || ip_hdrlen(skb) < sizeof(*iph))
return -EINVAL;
offset += ip_hdrlen(skb) - sizeof(*iph);
if (!pskb_may_pull(skb, offset))
return -EINVAL;
iph = ip_hdr(skb);
if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
return -EINVAL;
len = skb_network_offset(skb) + ntohs(iph->tot_len);
if (skb->len < len || len < offset)
return -EINVAL;
skb_set_transport_header(skb, offset);
return 0;
}
static int ip_mc_check_igmp_reportv3(struct sk_buff *skb)
{
unsigned int len = skb_transport_offset(skb);
len += sizeof(struct igmpv3_report);
return ip_mc_may_pull(skb, len) ? 0 : -EINVAL;
}
static int ip_mc_check_igmp_query(struct sk_buff *skb)
{
unsigned int transport_len = ip_transport_len(skb);
unsigned int len;
/* IGMPv{1,2}? */
if (transport_len != sizeof(struct igmphdr)) {
/* or IGMPv3? */
if (transport_len < sizeof(struct igmpv3_query))
return -EINVAL;
len = skb_transport_offset(skb) + sizeof(struct igmpv3_query);
if (!ip_mc_may_pull(skb, len))
return -EINVAL;
}
/* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer
* all-systems destination address (224.0.0.1) for general queries
*/
if (!igmp_hdr(skb)->group &&
ip_hdr(skb)->daddr != htonl(INADDR_ALLHOSTS_GROUP))
return -EINVAL;
return 0;
}
static int ip_mc_check_igmp_msg(struct sk_buff *skb)
{
switch (igmp_hdr(skb)->type) {
case IGMP_HOST_LEAVE_MESSAGE:
case IGMP_HOST_MEMBERSHIP_REPORT:
case IGMPV2_HOST_MEMBERSHIP_REPORT:
return 0;
case IGMPV3_HOST_MEMBERSHIP_REPORT:
return ip_mc_check_igmp_reportv3(skb);
case IGMP_HOST_MEMBERSHIP_QUERY:
return ip_mc_check_igmp_query(skb);
default:
return -ENOMSG;
}
}
static __sum16 ip_mc_validate_checksum(struct sk_buff *skb)
{
return skb_checksum_simple_validate(skb);
}
static int ip_mc_check_igmp_csum(struct sk_buff *skb)
{
unsigned int len = skb_transport_offset(skb) + sizeof(struct igmphdr);
unsigned int transport_len = ip_transport_len(skb);
struct sk_buff *skb_chk;
if (!ip_mc_may_pull(skb, len))
return -EINVAL;
skb_chk = skb_checksum_trimmed(skb, transport_len,
ip_mc_validate_checksum);
if (!skb_chk)
return -EINVAL;
if (skb_chk != skb)
kfree_skb(skb_chk);
return 0;
}
/**
* ip_mc_check_igmp - checks whether this is a sane IGMP packet
* @skb: the skb to validate
*
* Checks whether an IPv4 packet is a valid IGMP packet. If so, sets the
* skb transport header accordingly and returns zero.
*
* -EINVAL: A broken packet was detected, i.e. it violates some internet
* standard
* -ENOMSG: IP header validation succeeded but it is not an IGMP packet.
* -ENOMEM: A memory allocation failure happened.
*
* Caller needs to set the skb network header and free any returned skb if it
* differs from the provided skb.
*/
int ip_mc_check_igmp(struct sk_buff *skb)
{
int ret = ip_mc_check_iphdr(skb);
if (ret < 0)
return ret;
if (ip_hdr(skb)->protocol != IPPROTO_IGMP)
return -ENOMSG;
ret = ip_mc_check_igmp_csum(skb);
if (ret < 0)
return ret;
return ip_mc_check_igmp_msg(skb);
}
EXPORT_SYMBOL(ip_mc_check_igmp);
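/* Usage sketch (illustrative only, not part of this file): a snooping
* caller is expected to point the network header at the IP header first
* and then treat -ENOMSG as "valid IP, just not IGMP", roughly:
*
*	skb_reset_network_header(skb);	/+ assumes the IP header is at skb->data +/
*	err = ip_mc_check_igmp(skb);
*	if (err == -ENOMSG)
*		return 0;		/+ some other IPv4 protocol, ignore +/
*	if (err < 0)
*		return err;		/+ malformed packet +/
*	/+ transport header now points at the IGMP message +/
*/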
/*
* Resend IGMP JOIN report; used by netdev notifier.
*/
static void ip_mc_rejoin_groups(struct in_device *in_dev)
{
#ifdef CONFIG_IP_MULTICAST
struct ip_mc_list *im;
int type;
struct net *net = dev_net(in_dev->dev);
ASSERT_RTNL();
for_each_pmc_rtnl(in_dev, im) {
if (im->multiaddr == IGMP_ALL_HOSTS)
continue;
if (ipv4_is_local_multicast(im->multiaddr) &&
!READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
continue;
/* a failover is happening and switches
* must be notified immediately
*/
if (IGMP_V1_SEEN(in_dev))
type = IGMP_HOST_MEMBERSHIP_REPORT;
else if (IGMP_V2_SEEN(in_dev))
type = IGMPV2_HOST_MEMBERSHIP_REPORT;
else
type = IGMPV3_HOST_MEMBERSHIP_REPORT;
igmp_send_report(in_dev, im, type);
}
#endif
}
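/*
 * Note on the loop above: reports for link-local groups (224.0.0.0/24) are
 * skipped when the per-namespace igmp_link_local_mcast_reports sysctl is 0;
 * the default is 1, which keeps the historical behaviour of reporting them.
 * For example:
 *
 *	echo 0 > /proc/sys/net/ipv4/igmp_link_local_mcast_reports
 *
 * suppresses membership reports for the reserved 224.0.0.x range.
 */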
/*
* A socket has left a multicast group on device dev
*/
void __ip_mc_dec_group(struct in_device *in_dev, __be32 addr, gfp_t gfp)
{
struct ip_mc_list *i;
struct ip_mc_list __rcu **ip;
ASSERT_RTNL();
for (ip = &in_dev->mc_list;
(i = rtnl_dereference(*ip)) != NULL;
ip = &i->next_rcu) {
if (i->multiaddr == addr) {
if (--i->users == 0) {
ip_mc_hash_remove(in_dev, i);
*ip = i->next_rcu;
in_dev->mc_count--;
__igmp_group_dropped(i, gfp);
ip_mc_clear_src(i);
if (!in_dev->dead)
ip_rt_multicast_event(in_dev);
ip_ma_put(i);
return;
}
break;
}
}
}
EXPORT_SYMBOL(__ip_mc_dec_group);
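/*
 * Usage sketch (illustrative only, not a call site in this file): callers
 * that may run in atomic context pass an atomic-safe gfp_t here, while
 * process-context callers are assumed to use the ip_mc_dec_group() wrapper
 * from <linux/igmp.h>, which supplies GFP_KERNEL. The helper below is
 * hypothetical:
 *
 *	static void example_leave_all_hosts(struct in_device *in_dev,
 *					    bool atomic_ctx)
 *	{
 *		ASSERT_RTNL();
 *		if (atomic_ctx)
 *			__ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS, GFP_ATOMIC);
 *		else
 *			ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS);
 *	}
 */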
/* Device changing type */
void ip_mc_unmap(struct in_device *in_dev)
{
struct ip_mc_list *pmc;
ASSERT_RTNL();
for_each_pmc_rtnl(in_dev, pmc)
igmp_group_dropped(pmc);
}
void ip_mc_remap(struct in_device *in_dev)
{
struct ip_mc_list *pmc;
ASSERT_RTNL();
for_each_pmc_rtnl(in_dev, pmc) {
#ifdef CONFIG_IP_MULTICAST
igmpv3_del_delrec(in_dev, pmc);
#endif
igmp_group_added(pmc);
}
}
/* Device going down */
void ip_mc_down(struct in_device *in_dev)
{
struct ip_mc_list *pmc;
ASSERT_RTNL();
for_each_pmc_rtnl(in_dev, pmc)
igmp_group_dropped(pmc);
#ifdef CONFIG_IP_MULTICAST
WRITE_ONCE(in_dev->mr_ifc_count, 0);
if (del_timer(&in_dev->mr_ifc_timer))
__in_dev_put(in_dev);
in_dev->mr_gq_running = 0;
if (del_timer(&in_dev->mr_gq_timer))
__in_dev_put(in_dev);
#endif
ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS);
}
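/* ip_mc_reset() below (re)initialises the per-interface query parameters:
 * query interval, query response interval, and the robustness variable,
 * the last of which is seeded from the per-namespace igmp_qrv sysctl.
 * It compiles to a no-op when CONFIG_IP_MULTICAST is not set.
 */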
#ifdef CONFIG_IP_MULTICAST
static void ip_mc_reset(struct in_device *in_dev)
{
struct net *net = dev_net(in_dev->dev);
in_dev->mr_qi = IGMP_QUERY_INTERVAL;
in_dev->mr_qri = IGMP_QUERY_RESPONSE_INTERVAL;
in_dev->mr_qrv = READ_ONCE(net->ipv4.sysctl_igmp_qrv);
}
#else
static void ip_mc_reset(struct in_device *in_dev)
{
}
#endif
void ip_mc_init_dev(struct in_device *in_dev)
{
ASSERT_RTNL();
#ifdef CONFIG_IP_MULTICAST
timer_setup(&in_dev->mr_gq_timer, igmp_gq_timer_expire, 0);
timer_setup(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, 0);
#endif
ip_mc_reset(in_dev);
spin_lock_init(&in_dev->mc_tomb_lock);
}
/* Device going up */
void ip_mc_up(struct in_device *in_dev)
{
struct ip_mc_list *pmc;
ASSERT_RTNL();
ip_mc_reset(in_dev);
ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
for_each_pmc_rtnl(in_dev, pmc) {
#ifdef CONFIG_IP_MULTICAST
igmpv3_del_delrec(in_dev, pmc);
#endif
igmp_group_added(pmc);
}
}
/*
* Device is about to be destroyed: clean up.
*/
void ip_mc_destroy_dev(struct in_device *in_dev)
{
struct ip_mc_list *i;
ASSERT_RTNL();
/* Deactivate timers */
ip_mc_down(in_dev);
#ifdef CONFIG_IP_MULTICAST
igmpv3_clear_delrec(in_dev);
#endif
while ((i = rtnl_dereference(in_dev->mc_list)) != NULL) {
in_dev->mc_list = i->next_rcu;
in_dev->mc_count--;
ip_mc_clear_src(i);
ip_ma_put(i);
}
}
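/*
 * Lifecycle sketch for the helpers above (the actual call sites live in the
 * inet device code, not in this file, so treat this as an assumption about
 * intended ordering rather than a guarantee):
 *
 *	ip_mc_init_dev()    - once, when the in_device is created
 *	ip_mc_up()          - device up: join IGMP_ALL_HOSTS again and replay
 *	                      pending delete records for existing memberships
 *	ip_mc_down()        - device down: drop groups and stop the timers
 *	ip_mc_destroy_dev() - final teardown: release every remaining entry
 */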
/* RTNL is locked */
static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
{
struct net_device *dev = NULL;
struct in_device *idev = NULL;
if (imr->imr_ifindex) {
idev = inetdev_by_index(net, imr->imr_ifindex);
return idev;
}
if (imr->imr_address.s_addr) {
dev = __ip_dev_find(net, imr->imr_address.s_addr, false);
if (!dev)
return NULL;
}
if (!dev) {
struct rtable *rt = ip_route_output(net,
imr->imr_multiaddr.s_addr,
0, 0, 0);
if (!IS_ERR(rt)) {
dev = rt->dst.dev;
ip_rt_put(rt);
}
}
if (dev) {
imr->imr_ifindex = dev->ifindex;
idev = __in_dev_get_rtnl(dev);
}
return idev;
}
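/*
 * ip_mc_find_dev() resolves the interface in this order: an explicit
 * imr_ifindex wins, then a configured imr_address, and finally a route
 * lookup on the multicast destination; for the latter two paths the chosen
 * device's ifindex is written back into imr_ifindex. A minimal
 * (hypothetical) request that relies on the routing fallback would be:
 *
 *	struct ip_mreqn imr = {
 *		.imr_multiaddr.s_addr = htonl(0xe00000fb),	/* 224.0.0.251 */
 *	};
 */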
/*
* Join a socket to a group
*/
static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
__be32 *psfsrc)
{
struct ip_sf_list *psf, *psf_prev;
int rv = 0;
psf_prev = NULL;
for (psf = pmc->sources; psf; psf = psf->sf_next) {
if (psf->sf_inaddr == *psfsrc)
break;
psf_prev = psf;
}
if (!psf || psf->sf_count[sfmode] == 0) {
/* source filter not found, or count wrong => bug */
return -ESRCH;
}
psf->sf_count[sfmode]--;
if (psf->sf_count[sfmode] == 0) {
ip_rt_multicast_event(pmc->interface);
}
if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) {
#ifdef CONFIG_IP_MULTICAST
struct in_device *in_dev = pmc->interface;
struct net *net = dev_net(in_dev->dev);
#endif
/* no more filters for this source */
if (psf_prev)
psf_prev->sf_next = psf->sf_next;
else
pmc->sources = psf->sf_next;
#ifdef CONFIG_IP_MULTICAST
if (psf->sf_oldin &&
!IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) {
psf->sf_crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv);
psf->sf_next = pmc->tomb;
pmc->tomb = psf;
rv = 1;
} else
#endif
kfree(psf);
}
return rv;
}
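/*
 * Return values of ip_mc_del1_src(), as implemented above: -ESRCH if the
 * source was never added with this filter mode, 1 if the source became
 * fully unused and was parked on pmc->tomb (so an IGMPv3 change record
 * still has to be sent for it), and 0 otherwise.
 */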
#ifndef CONFIG_IP_MULTICAST
#define igmp_ifc_event(x) do { } while (0)
#endif
static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
int sfcount, __be32 *psfsrc, int delta)
{
struct ip_mc_list *pmc;
int changerec = 0;
int i, err;
if (!in_dev)
return -ENODEV;
rcu_read_lock();
for_each_pmc_rcu(in_dev, pmc) {
if (*pmca == pmc->multiaddr)
break;
}
if (!pmc) {
/* MCA not found?? bug */
rcu_read_unlock();
return -ESRCH;
}
spin_lock_bh(&pmc->lock);
rcu_read_unlock();
#ifdef CONFIG_IP_MULTICAST
sf_markstate(pmc);
#endif
if (!delta) {
err = -EINVAL;
if (!pmc->sfcount[sfmode])
goto out_unlock;
pmc->sfcount[sfmode]--;
}
err = 0;
for (i = 0; i < sfcount; i++) {
int rv = ip_mc_del1_src(pmc, sfmode, &psfsrc[i]);
changerec |= rv > 0;
if (!err && rv < 0)
err = rv;
}
if (pmc->sfmode == MCAST_EXCLUDE &&
pmc->sfcount[MCAST_EXCLUDE] == 0 &&
pmc->sfcount[MCAST_INCLUDE]) {
#ifdef CONFIG_IP_MULTICAST
struct ip_sf_list *psf;
struct net *net = dev_net(in_dev->dev);
#endif
/* filter mode change */
pmc->sfmode = MCAST_INCLUDE;
#ifdef CONFIG_IP_MULTICAST
pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv);
WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount);
for (psf = pmc->sources; psf; psf = psf->sf_next)
psf->sf_crcount = 0;
igmp_ifc_event(pmc->interface);
} else if (sf_setstate(pmc) || changerec) {
igmp_ifc_event(pmc->interface);
#endif
}
out_unlock:
spin_unlock_bh(&pmc->lock);
return err;
}
/*
* Add multicast single-source filter to the interface list
*/
static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode,
__be32 *psfsrc)
{
struct ip_sf_list *psf, *psf_prev;
psf_prev = NULL;
for (psf = pmc->sources; psf; psf = psf->sf_next) {
if (psf->sf_inaddr == *psfsrc)
break;
psf_prev = psf;
}
if (!psf) {
psf = kzalloc(sizeof(*psf), GFP_ATOMIC);
if (!psf)
return -ENOBUFS;
psf->sf_inaddr = *psfsrc;
if (psf_prev) {
psf_prev->sf_next = psf;
} else
pmc->sources = psf;
}
psf->sf_count[sfmode]++;
if (psf->sf_count[sfmode] == 1) {
ip_rt_multicast_event(pmc->interface);
}
return 0;
}
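/*
 * sf_markstate()/sf_setstate() below work as a pair: the first snapshots,
 * per source, whether it was part of the forwarded set under the current
 * filter mode (sf_oldin); after the counters have changed, the second
 * compares the new state against that snapshot, arms sf_crcount for sources
 * that became active and records (or refreshes) "delete" entries on
 * pmc->tomb for sources that became inactive. It returns the number of
 * sources whose state changed.
 */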
#ifdef CONFIG_IP_MULTICAST
static void sf_markstate(struct ip_mc_list *pmc)
{
struct ip_sf_list *psf;
int mca_xcount = pmc->sfcount[MCAST_EXCLUDE];
for (psf = pmc->sources; psf; psf = psf->sf_next)
if (pmc->sfcount[MCAST_EXCLUDE]) {
psf->sf_oldin = mca_xcount ==
psf->sf_count[MCAST_EXCLUDE] &&
!psf->sf_count[MCAST_INCLUDE];
} else
psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0;
}
static int sf_setstate(struct ip_mc_list *pmc)
{
struct ip_sf_list *psf, *dpsf;
int mca_xcount = pmc->sfcount[MCAST_EXCLUDE];
int qrv = pmc->interface->mr_qrv;
int new_in, rv;
rv = 0;
for (psf = pmc->sources; psf; psf = psf->sf_next) {
if (pmc->sfcount[MCAST_EXCLUDE]) {
new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] &&
!psf->sf_count[MCAST_INCLUDE];
} else
new_in = psf->sf_count[MCAST_INCLUDE] != 0;
if (new_in) {
if (!psf->sf_oldin) {
struct ip_sf_list *prev = NULL;
for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next) {
if (dpsf->sf_inaddr == psf->sf_inaddr)
break;
prev = dpsf;
}
if (dpsf) {
if (prev)
prev->sf_next = dpsf->sf_next;
else
pmc->tomb = dpsf->sf_next;
kfree(dpsf);
}
psf->sf_crcount = qrv;
rv++;
}
} else if (psf->sf_oldin) {
psf->sf_crcount = 0;
/*
* add or update "delete" records if an active filter
* is now inactive
*/
for (dpsf = pmc->tomb; dpsf; dpsf = dpsf->sf_next)
if (dpsf->sf_inaddr == psf->sf_inaddr)
break;
if (!dpsf) {
dpsf = kmalloc(sizeof(*dpsf), GFP_ATOMIC);
if (!dpsf)
continue;
*dpsf = *psf;
/* pmc->lock held by callers */
dpsf->sf_next = pmc->tomb;
pmc->tomb = dpsf;
}
dpsf->sf_crcount = qrv;
rv++;
}
}
return rv;
}
#endif
/*
* Add multicast source filter list to the interface list
*/
static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
int sfcount, __be32 *psfsrc, int delta)
{
struct ip_mc_list *pmc;
int isexclude;
int i, err;
if (!in_dev)
return -ENODEV;
rcu_read_lock();
for_each_pmc_rcu(in_dev, pmc) {
if (*pmca == pmc->multiaddr)
break;
}
if (!pmc) {
/* MCA not found?? bug */
rcu_read_unlock();
return -ESRCH;
}
spin_lock_bh(&pmc->lock);
rcu_read_unlock();
#ifdef CONFIG_IP_MULTICAST
sf_markstate(pmc);
#endif
isexclude = pmc->sfmode == MCAST_EXCLUDE;
if (!delta)
pmc->sfcount[sfmode]++;
err = 0;
for (i = 0; i < sfcount; i++) {
err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i]);
if (err)
break;
}
if (err) {
int j;
if (!delta)
pmc->sfcount[sfmode]--;
for (j = 0; j < i; j++)
(void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]);
} else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) {
#ifdef CONFIG_IP_MULTICAST
struct ip_sf_list *psf;
struct net *net = dev_net(pmc->interface->dev);
in_dev = pmc->interface;
#endif
/* filter mode change */
if (pmc->sfcount[MCAST_EXCLUDE])
pmc->sfmode = MCAST_EXCLUDE;
else if (pmc->sfcount[MCAST_INCLUDE])
pmc->sfmode = MCAST_INCLUDE;
#ifdef CONFIG_IP_MULTICAST
/* else no filters; keep old mode for reports */
pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv);
WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount);
for (psf = pmc->sources; psf; psf = psf->sf_next)
psf->sf_crcount = 0;
igmp_ifc_event(in_dev);
} else if (sf_setstate(pmc)) {
igmp_ifc_event(in_dev);
#endif
}
spin_unlock_bh(&pmc->lock);
return err;
}
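/*
 * Note on the 'delta' argument to ip_mc_add_src()/ip_mc_del_src() above:
 * a zero value means the call also accounts for a change in the group's
 * filter-mode reference count, so pmc->sfcount[sfmode] is adjusted as well;
 * a non-zero value only touches the per-source counters. On partial failure
 * ip_mc_add_src() removes the sources it had already added and undoes the
 * sfcount adjustment.
 */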
static void ip_mc_clear_src(struct ip_mc_list *pmc)
{
struct ip_sf_list *tomb, *sources;
spin_lock_bh(&pmc->lock);
tomb = pmc->tomb;
pmc->tomb = NULL;
sources = pmc->sources;
pmc->sources = NULL;
pmc->sfmode = MCAST_EXCLUDE;
pmc->sfcount[MCAST_INCLUDE] = 0;
pmc->sfcount[MCAST_EXCLUDE] = 1;
spin_unlock_bh(&pmc->lock);
ip_sf_list_clear_all(tomb);
ip_sf_list_clear_all(sources);
}
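/*
 * ip_mc_clear_src() above detaches both source lists under pmc->lock and
 * resets the group to its default state (EXCLUDE mode with an empty source
 * list, i.e. "accept from everyone"), then frees the detached lists outside
 * the lock.
 */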
/* Join a multicast group
*/
static int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr,
unsigned int mode)
{
__be32 addr = imr->imr_multiaddr.s_addr;
struct ip_mc_socklist *iml, *i;
struct in_device *in_dev;
struct inet_sock *inet = inet_sk(sk);
struct net *net = sock_net(sk);
int ifindex;
int count = 0;
int err;
ASSERT_RTNL();
if (!ipv4_is_multicast(addr))
return -EINVAL;
in_dev = ip_mc_find_dev(net, imr);
if (!in_dev) {
err = -ENODEV;
goto done;
}
err = -EADDRINUSE;
ifindex = imr->imr_ifindex;
for_each_pmc_rtnl(inet, i) {
if (i->multi.imr_multiaddr.s_addr == addr &&
i->multi.imr_ifindex == ifindex)
goto done;
count++;
}
err = -ENOBUFS;
if (count >= READ_ONCE(net->ipv4.sysctl_igmp_max_memberships))
goto done;
iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
if (!iml)
goto done;
memcpy(&iml->multi, imr, sizeof(*imr));
iml->next_rcu = inet->mc_list;
iml->sflist = NULL;
iml->sfmode = mode;
rcu_assign_pointer(inet->mc_list, iml);
____ip_mc_inc_group(in_dev, addr, mode, GFP_KERNEL);
err = 0;
done:
return err;
}
/* Join ASM (Any-Source Multicast) group
*/
int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
{
return __ip_mc_join_group(sk, imr, MCAST_EXCLUDE);
}
EXPORT_SYMBOL(ip_mc_join_group);
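/*
 * Illustrative only, not part of the kernel build: the common userspace
 * path into ip_mc_join_group() is an IP_ADD_MEMBERSHIP setsockopt on an
 * AF_INET datagram socket.  A minimal sketch, assuming glibc exposes
 * struct ip_mreqn via <netinet/in.h> and that an interface named "eth0"
 * exists (both are assumptions of the example, not of this file):
 *
 *	#include <arpa/inet.h>
 *	#include <net/if.h>
 *	#include <netinet/in.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	int join_example(void)
 *	{
 *		struct ip_mreqn mreq;
 *		int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *		if (fd < 0)
 *			return -1;
 *		memset(&mreq, 0, sizeof(mreq));
 *		inet_pton(AF_INET, "239.1.1.1", &mreq.imr_multiaddr);
 *		mreq.imr_ifindex = if_nametoindex("eth0");
 *		return setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP,
 *				  &mreq, sizeof(mreq));
 *	}
 *
 * A second identical join on the same socket fails with EADDRINUSE, and
 * joins beyond the igmp_max_memberships sysctl fail with ENOBUFS, matching
 * the checks in __ip_mc_join_group() above.
 */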
/* Join SSM (Source-Specific Multicast) group
*/
int ip_mc_join_group_ssm(struct sock *sk, struct ip_mreqn *imr,
unsigned int mode)
{
return __ip_mc_join_group(sk, imr, mode);
}
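/*
 * Illustrative only, not part of the kernel build: the RFC 3678 style
 * userspace join expected to reach ip_mc_join_group_ssm() is
 * MCAST_JOIN_SOURCE_GROUP; that the ip_sockglue.c handler passes
 * MCAST_INCLUDE here is stated as an assumption of this sketch, not
 * restated by this file.  Uses an address in the SSM range (232/8), the
 * documentation source address 198.51.100.7, and an assumed "eth0":
 *
 *	#include <arpa/inet.h>
 *	#include <net/if.h>
 *	#include <netinet/in.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	int ssm_join_example(int fd)
 *	{
 *		struct group_source_req gsr;
 *		struct sockaddr_in *grp = (struct sockaddr_in *)&gsr.gsr_group;
 *		struct sockaddr_in *src = (struct sockaddr_in *)&gsr.gsr_source;
 *
 *		memset(&gsr, 0, sizeof(gsr));
 *		gsr.gsr_interface = if_nametoindex("eth0");
 *		grp->sin_family = AF_INET;
 *		inet_pton(AF_INET, "232.1.1.1", &grp->sin_addr);
 *		src->sin_family = AF_INET;
 *		inet_pton(AF_INET, "198.51.100.7", &src->sin_addr);
 *		return setsockopt(fd, IPPROTO_IP, MCAST_JOIN_SOURCE_GROUP,
 *				  &gsr, sizeof(gsr));
 *	}
 *
 * Starting the group in INCLUDE mode means the first IGMPv3 report can be
 * an ALLOW(S) record rather than TO_IN(S), per RFC 3376 section 5.1.
 */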
static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
struct in_device *in_dev)
{
struct ip_sf_socklist *psf = rtnl_dereference(iml->sflist);
int err;
if (!psf) {
/* any-source empty exclude case */
return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
iml->sfmode, 0, NULL, 0);
}
err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
iml->sfmode, psf->sl_count, psf->sl_addr, 0);
RCU_INIT_POINTER(iml->sflist, NULL);
/* decrease mem now to avoid the memleak warning */
atomic_sub(struct_size(psf, sl_addr, psf->sl_max), &sk->sk_omem_alloc);
kfree_rcu(psf, rcu);
return err;
}
int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
{
struct inet_sock *inet = inet_sk(sk);
struct ip_mc_socklist *iml;
struct ip_mc_socklist __rcu **imlp;
struct in_device *in_dev;
struct net *net = sock_net(sk);
__be32 group = imr->imr_multiaddr.s_addr;
u32 ifindex;
int ret = -EADDRNOTAVAIL;
ASSERT_RTNL();
in_dev = ip_mc_find_dev(net, imr);
if (!imr->imr_ifindex && !imr->imr_address.s_addr && !in_dev) {
ret = -ENODEV;
goto out;
}
ifindex = imr->imr_ifindex;
for (imlp = &inet->mc_list;
(iml = rtnl_dereference(*imlp)) != NULL;
imlp = &iml->next_rcu) {
if (iml->multi.imr_multiaddr.s_addr != group)
continue;
if (ifindex) {
if (iml->multi.imr_ifindex != ifindex)
continue;
} else if (imr->imr_address.s_addr && imr->imr_address.s_addr !=
iml->multi.imr_address.s_addr)
continue;
(void) ip_mc_leave_src(sk, iml, in_dev);
*imlp = iml->next_rcu;
if (in_dev)
ip_mc_dec_group(in_dev, group);
/* decrease mem now to avoid the memleak warning */
atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
kfree_rcu(iml, rcu);
return 0;
}
out:
return ret;
}
EXPORT_SYMBOL(ip_mc_leave_group);
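/*
 * Illustrative only, not part of the kernel build: the matching userspace
 * leave is IP_DROP_MEMBERSHIP with the same group and either the interface
 * index or the interface address used for the join.  A minimal sketch,
 * reusing the assumptions of the join example above:
 *
 *	struct ip_mreqn mreq;
 *
 *	memset(&mreq, 0, sizeof(mreq));
 *	inet_pton(AF_INET, "239.1.1.1", &mreq.imr_multiaddr);
 *	mreq.imr_ifindex = if_nametoindex("eth0");
 *	setsockopt(fd, IPPROTO_IP, IP_DROP_MEMBERSHIP, &mreq, sizeof(mreq));
 *
 * If no interface can be resolved and neither imr_ifindex nor imr_address
 * was supplied, ip_mc_leave_group() returns -ENODEV before scanning
 * inet->mc_list; an unmatched group yields -EADDRNOTAVAIL.
 */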
int ip_mc_source(int add, int omode, struct sock *sk, struct
ip_mreq_source *mreqs, int ifindex)
{
int err;
struct ip_mreqn imr;
__be32 addr = mreqs->imr_multiaddr;
struct ip_mc_socklist *pmc;
struct in_device *in_dev = NULL;
struct inet_sock *inet = inet_sk(sk);
struct ip_sf_socklist *psl;
struct net *net = sock_net(sk);
int leavegroup = 0;
int i, j, rv;
if (!ipv4_is_multicast(addr))
return -EINVAL;
ASSERT_RTNL();
imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr;
imr.imr_address.s_addr = mreqs->imr_interface;
imr.imr_ifindex = ifindex;
in_dev = ip_mc_find_dev(net, &imr);
if (!in_dev) {
err = -ENODEV;
goto done;
}
err = -EADDRNOTAVAIL;
for_each_pmc_rtnl(inet, pmc) {
if ((pmc->multi.imr_multiaddr.s_addr ==
imr.imr_multiaddr.s_addr) &&
(pmc->multi.imr_ifindex == imr.imr_ifindex))
break;
}
if (!pmc) { /* must have a prior join */
err = -EINVAL;
goto done;
}
/* if a source filter was set, must be the same mode as before */
if (pmc->sflist) {
if (pmc->sfmode != omode) {
err = -EINVAL;
goto done;
}
} else if (pmc->sfmode != omode) {
/* allow mode switches for empty-set filters */
ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0);
ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, pmc->sfmode, 0,
NULL, 0);
pmc->sfmode = omode;
}
psl = rtnl_dereference(pmc->sflist);
if (!add) {
if (!psl)
goto done; /* err = -EADDRNOTAVAIL */
rv = !0;
for (i = 0; i < psl->sl_count; i++) {
rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
sizeof(__be32));
if (rv == 0)
break;
}
if (rv) /* source not found */
goto done; /* err = -EADDRNOTAVAIL */
/* special case - (INCLUDE, empty) == LEAVE_GROUP */
if (psl->sl_count == 1 && omode == MCAST_INCLUDE) {
leavegroup = 1;
goto done;
}
/* update the interface filter */
ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
&mreqs->imr_sourceaddr, 1);
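/* close the hole left by the removed source address */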
for (j = i+1; j < psl->sl_count; j++)
psl->sl_addr[j-1] = psl->sl_addr[j];
psl->sl_count--;
err = 0;
goto done;
}
/* else, add a new source to the filter */
if (psl && psl->sl_count >= READ_ONCE(net->ipv4.sysctl_igmp_max_msf)) {
err = -ENOBUFS;
goto done;
}
if (!psl || psl->sl_count == psl->sl_max) {
struct ip_sf_socklist *newpsl;
int count = IP_SFBLOCK;
if (psl)
count += psl->sl_max;
newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, count),
GFP_KERNEL);
if (!newpsl) {
err = -ENOBUFS;
goto done;
}
newpsl->sl_max = count;
newpsl->sl_count = count - IP_SFBLOCK;
if (psl) {
for (i = 0; i < psl->sl_count; i++)
newpsl->sl_addr[i] = psl->sl_addr[i];
/* decrease mem now to avoid the memleak warning */
atomic_sub(struct_size(psl, sl_addr, psl->sl_max),
&sk->sk_omem_alloc);
}
rcu_assign_pointer(pmc->sflist, newpsl);
if (psl)
kfree_rcu(psl, rcu);
psl = newpsl;
}
rv = 1; /* > 0 for insert logic below if sl_count is 0 */
for (i = 0; i < psl->sl_count; i++) {
rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
sizeof(__be32));
if (rv == 0)
break;
}
if (rv == 0) /* adding an address already in the list is an error */
goto done;
for (j = psl->sl_count-1; j >= i; j--)
psl->sl_addr[j+1] = psl->sl_addr[j];
psl->sl_addr[i] = mreqs->imr_sourceaddr;
psl->sl_count++;
err = 0;
/* update the interface list */
ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
&mreqs->imr_sourceaddr, 1);
done:
if (leavegroup)
err = ip_mc_leave_group(sk, &imr);
return err;
}
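/*
 * Illustrative only, not part of the kernel build: ip_mc_source() is the
 * backend for the per-source setsockopt options that take a struct
 * ip_mreq_source (IP_ADD_SOURCE_MEMBERSHIP, IP_DROP_SOURCE_MEMBERSHIP,
 * IP_BLOCK_SOURCE, IP_UNBLOCK_SOURCE); the exact add/omode mapping done in
 * ip_sockglue.c is an assumption of this sketch, not restated by this file.
 * Blocking one sender on an already joined ASM group might look like:
 *
 *	struct ip_mreq_source mrs;
 *
 *	memset(&mrs, 0, sizeof(mrs));
 *	inet_pton(AF_INET, "239.1.1.1", &mrs.imr_multiaddr);
 *	inet_pton(AF_INET, "192.0.2.1", &mrs.imr_interface);
 *	inet_pton(AF_INET, "198.51.100.7", &mrs.imr_sourceaddr);
 *	setsockopt(fd, IPPROTO_IP, IP_BLOCK_SOURCE, &mrs, sizeof(mrs));
 *
 * Note the special case handled above: removing the last source of an
 * INCLUDE-mode filter is treated as "(INCLUDE, empty) == LEAVE_GROUP",
 * so ip_mc_leave_group() is invoked from the done: label.
 */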
int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
{
int err = 0;
struct ip_mreqn imr;
__be32 addr = msf->imsf_multiaddr;
struct ip_mc_socklist *pmc;
struct in_device *in_dev;
struct inet_sock *inet = inet_sk(sk);
struct ip_sf_socklist *newpsl, *psl;
struct net *net = sock_net(sk);
int leavegroup = 0;
if (!ipv4_is_multicast(addr))
return -EINVAL;
if (msf->imsf_fmode != MCAST_INCLUDE &&
msf->imsf_fmode != MCAST_EXCLUDE)
return -EINVAL;
ASSERT_RTNL();
imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
imr.imr_address.s_addr = msf->imsf_interface;
imr.imr_ifindex = ifindex;
in_dev = ip_mc_find_dev(net, &imr);
if (!in_dev) {
err = -ENODEV;
goto done;
}
/* special case - (INCLUDE, empty) == LEAVE_GROUP */
if (msf->imsf_fmode == MCAST_INCLUDE && msf->imsf_numsrc == 0) {
leavegroup = 1;
goto done;
}
for_each_pmc_rtnl(inet, pmc) {
if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
pmc->multi.imr_ifindex == imr.imr_ifindex)
break;
}
if (!pmc) { /* must have a prior join */
err = -EINVAL;
goto done;
}
if (msf->imsf_numsrc) {
newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr,
msf->imsf_numsrc),
GFP_KERNEL);
if (!newpsl) {
err = -ENOBUFS;
goto done;
}
newpsl->sl_max = newpsl->sl_count = msf->imsf_numsrc;
net/ipv4: Replace one-element array with flexible-array member — Kernel code should declare dynamically sized trailing elements as flexible array members rather than the older one-element or zero-length array styles. An anonymous union with two anonymous structs keeps the userspace layout of struct ip_msfilter unchanged: the legacy view still ends in imsf_slist[1], while the new view ends in the flexible member imsf_slist_flex[]. The code is refactored to use the struct_size() and flex_array_size() helpers, which helps the ongoing efforts to enable -Warray-bounds globally and to tighten the FORTIFY_SOURCE checks on memcpy(). Links: https://github.com/KSPP/linux/issues/79, https://github.com/KSPP/linux/issues/109. Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-01 01:08:30 +08:00
memcpy(newpsl->sl_addr, msf->imsf_slist_flex,
flex_array_size(msf, imsf_slist_flex, msf->imsf_numsrc));
err = ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
msf->imsf_fmode, newpsl->sl_count, newpsl->sl_addr, 0);
if (err) {
sock_kfree_s(sk, newpsl,
struct_size(newpsl, sl_addr,
newpsl->sl_max));
goto done;
}
} else {
newpsl = NULL;
(void) ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
msf->imsf_fmode, 0, NULL, 0);
}
psl = rtnl_dereference(pmc->sflist);
if (psl) {
(void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
psl->sl_count, psl->sl_addr, 0);
/* decrease mem now to avoid the memleak warning */
atomic_sub(struct_size(psl, sl_addr, psl->sl_max),
&sk->sk_omem_alloc);
net: igmp: respect RCU rules in ip_mc_source() and ip_mc_msfilter() — syzbot reported a use-after-free in ip_mc_sf_allow(). Whenever an RCU-protected list replaces an object, the pointer to the new object must be published _before_ the old object is passed to kfree_rcu() or call_rcu(). Because kfree_rcu(ptr, rcu) only recently learned to ignore NULL pointers (commit 12edff045bc6, "rcu: Make kfree_rcu() ignore NULL pointers"), the conditional "if (psl) kfree_rcu(psl, rcu);" is kept so stable backports do not miss this detail; net/ipv6/mcast.c has similar issues, addressed in a separate patch. The KASAN report showed a 4-byte read of freed kmalloc-64 memory in ip_mc_sf_allow(), allocated by sock_kmalloc() from ip_mc_source() and freed through kfree_rcu() queued by ip_mc_msfilter(). Fixes: c85bb41e9318 ("igmp: fix ip_mc_sf_allow race [v5]") Signed-off-by: Eric Dumazet <edumazet@google.com> Reported-by: syzbot <syzkaller@googlegroups.com> Cc: Flavio Leitner <fbl@sysclose.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-04-29 23:42:57 +08:00
} else {
(void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
0, NULL, 0);
}
rcu_assign_pointer(pmc->sflist, newpsl);
if (psl)
kfree_rcu(psl, rcu);
pmc->sfmode = msf->imsf_fmode;
err = 0;
done:
if (leavegroup)
err = ip_mc_leave_group(sk, &imr);
return err;
}
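The rcu_assign_pointer()/kfree_rcu() sequence above follows the rule spelled out in the annotation: publish the replacement first, then queue the old object for freeing, so a reader inside an RCU section can never pick up a pointer to memory that is about to be reused. A minimal kernel-style sketch of that ordering, using a hypothetical struct foo and a plain spinlock in place of this file's ip_sf_socklist and RTNL:

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

/* Hypothetical RCU-managed object; only the embedded rcu_head matters here. */
struct foo {
        int value;
        struct rcu_head rcu;
};

static struct foo __rcu *current_foo;
static DEFINE_SPINLOCK(foo_update_lock);        /* serializes writers; RTNL plays this role above */

static void foo_replace(struct foo *newp)
{
        struct foo *old;

        spin_lock(&foo_update_lock);
        old = rcu_dereference_protected(current_foo,
                                        lockdep_is_held(&foo_update_lock));
        /* 1. Publish the new object first ... */
        rcu_assign_pointer(current_foo, newp);
        spin_unlock(&foo_update_lock);

        /* 2. ... then queue the old one. Readers still inside an RCU
         * read-side section may be using it, so it is only freed after
         * a grace period.
         */
        if (old)
                kfree_rcu(old, rcu);
}

If the old object were handed to kfree_rcu() while still published, a reader entering its RCU section afterwards could fetch that stale pointer and touch freed memory once the grace period expired — which is exactly the ip_mc_sf_allow() use-after-free described in the annotation.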
int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
sockptr_t optval, sockptr_t optlen)
{
int err, len, count, copycount, msf_size;
struct ip_mreqn imr;
__be32 addr = msf->imsf_multiaddr;
struct ip_mc_socklist *pmc;
struct in_device *in_dev;
struct inet_sock *inet = inet_sk(sk);
struct ip_sf_socklist *psl;
struct net *net = sock_net(sk);
ASSERT_RTNL();
if (!ipv4_is_multicast(addr))
return -EINVAL;
imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
imr.imr_address.s_addr = msf->imsf_interface;
imr.imr_ifindex = 0;
in_dev = ip_mc_find_dev(net, &imr);
if (!in_dev) {
err = -ENODEV;
goto done;
}
err = -EADDRNOTAVAIL;
for_each_pmc_rtnl(inet, pmc) {
if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
pmc->multi.imr_ifindex == imr.imr_ifindex)
break;
}
if (!pmc) /* must have a prior join */
goto done;
msf->imsf_fmode = pmc->sfmode;
psl = rtnl_dereference(pmc->sflist);
if (!psl) {
count = 0;
} else {
count = psl->sl_count;
}
copycount = count < msf->imsf_numsrc ? count : msf->imsf_numsrc;
len = flex_array_size(psl, sl_addr, copycount);
msf->imsf_numsrc = count;
msf_size = IP_MSFILTER_SIZE(copycount);
if (copy_to_sockptr(optlen, &msf_size, sizeof(int)) ||
copy_to_sockptr(optval, msf, IP_MSFILTER_SIZE(0))) {
return -EFAULT;
}
if (len &&
copy_to_sockptr_offset(optval,
offsetof(struct ip_msfilter, imsf_slist_flex),
psl->sl_addr, len))
return -EFAULT;
return 0;
done:
return err;
}
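Both ip_mc_msfilter() and ip_mc_msfget() size their source lists with the struct_size() and flex_array_size() helpers mentioned in the flexible-array annotation above. A self-contained userspace sketch of what those helpers compute, using a made-up struct rather than ip_sf_socklist (the real kernel macros in <linux/overflow.h> additionally saturate on arithmetic overflow):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical stand-in for ip_sf_socklist: a header plus a flexible array. */
struct src_list {
        unsigned int max;
        unsigned int count;
        uint32_t addr[];                /* flexible array member */
};

/* What struct_size(p, member, n) evaluates to: header plus n trailing elements. */
#define STRUCT_SIZE(p, member, n)     (sizeof(*(p)) + (n) * sizeof(*(p)->member))
/* What flex_array_size(p, member, n) evaluates to: just the n trailing elements. */
#define FLEX_ARRAY_SIZE(p, member, n) ((n) * sizeof(*(p)->member))

int main(void)
{
        uint32_t sources[3] = { 0x0a000001, 0x0a000002, 0x0a000003 };
        unsigned int n = 3;
        struct src_list *psl;

        /* One allocation covers the header and the trailing array. */
        psl = malloc(STRUCT_SIZE(psl, addr, n));
        if (!psl)
                return 1;
        psl->max = psl->count = n;
        memcpy(psl->addr, sources, FLEX_ARRAY_SIZE(psl, addr, n));

        printf("header+array: %zu bytes, array alone: %zu bytes\n",
               STRUCT_SIZE(psl, addr, n), FLEX_ARRAY_SIZE(psl, addr, n));
        free(psl);
        return 0;
}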
int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
sockptr_t optval, size_t ss_offset)
{
int i, count, copycount;
struct sockaddr_in *psin;
__be32 addr;
struct ip_mc_socklist *pmc;
struct inet_sock *inet = inet_sk(sk);
struct ip_sf_socklist *psl;
ASSERT_RTNL();
psin = (struct sockaddr_in *)&gsf->gf_group;
if (psin->sin_family != AF_INET)
return -EINVAL;
addr = psin->sin_addr.s_addr;
if (!ipv4_is_multicast(addr))
return -EINVAL;
for_each_pmc_rtnl(inet, pmc) {
if (pmc->multi.imr_multiaddr.s_addr == addr &&
pmc->multi.imr_ifindex == gsf->gf_interface)
break;
}
if (!pmc) /* must have a prior join */
return -EADDRNOTAVAIL;
gsf->gf_fmode = pmc->sfmode;
psl = rtnl_dereference(pmc->sflist);
count = psl ? psl->sl_count : 0;
copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
gsf->gf_numsrc = count;
for (i = 0; i < copycount; i++) {
struct sockaddr_storage ss;
psin = (struct sockaddr_in *)&ss;
memset(&ss, 0, sizeof(ss));
psin->sin_family = AF_INET;
psin->sin_addr.s_addr = psl->sl_addr[i];
if (copy_to_sockptr_offset(optval, ss_offset,
&ss, sizeof(ss)))
return -EFAULT;
ss_offset += sizeof(ss);
}
return 0;
}
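ip_mc_msfget() and ip_mc_gsfget() serve the getsockopt() side of the per-socket source-filter API (IP_MSFILTER and MCAST_MSFILTER); from userspace these options are usually reached through glibc's setsourcefilter(3)/getsourcefilter(3) wrappers. A rough userspace sketch under that assumption — the interface name, group, and source address are placeholders:

#define _GNU_SOURCE
#include <arpa/inet.h>
#include <net/if.h>
#include <netinet/in.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

int main(void)
{
        int fd = socket(AF_INET, SOCK_DGRAM, 0);
        unsigned int ifindex = if_nametoindex("eth0");  /* placeholder interface */
        struct sockaddr_in group = { .sin_family = AF_INET };
        struct sockaddr_storage src;
        struct sockaddr_in *s = (struct sockaddr_in *)&src;
        struct group_req greq;
        uint32_t fmode, numsrc = 1;

        inet_pton(AF_INET, "232.1.1.1", &group.sin_addr);

        /* Join the group first: the kernel code requires a prior membership. */
        memset(&greq, 0, sizeof(greq));
        greq.gr_interface = ifindex;
        memcpy(&greq.gr_group, &group, sizeof(group));
        if (setsockopt(fd, IPPROTO_IP, MCAST_JOIN_GROUP, &greq, sizeof(greq)) < 0)
                perror("MCAST_JOIN_GROUP");

        /* Restrict the group to a single allowed source (INCLUDE mode). */
        memset(&src, 0, sizeof(src));
        s->sin_family = AF_INET;
        inet_pton(AF_INET, "192.0.2.10", &s->sin_addr);
        if (setsourcefilter(fd, ifindex, (struct sockaddr *)&group, sizeof(group),
                            MCAST_INCLUDE, 1, &src) < 0)
                perror("setsourcefilter");

        /* Read the filter back; this ends up in the get path shown above. */
        if (getsourcefilter(fd, ifindex, (struct sockaddr *)&group, sizeof(group),
                            &fmode, &numsrc, &src) == 0)
                printf("fmode=%u numsrc=%u\n", fmode, numsrc);
        return 0;
}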
/*
* check if a multicast source filter allows delivery for a given <src,dst,intf>
*/
int ip_mc_sf_allow(const struct sock *sk, __be32 loc_addr, __be32 rmt_addr,
int dif, int sdif)
{
const struct inet_sock *inet = inet_sk(sk);
struct ip_mc_socklist *pmc;
struct ip_sf_socklist *psl;
int i;
int ret;
ret = 1;
if (!ipv4_is_multicast(loc_addr))
goto out;
rcu_read_lock();
for_each_pmc_rcu(inet, pmc) {
if (pmc->multi.imr_multiaddr.s_addr == loc_addr &&
(pmc->multi.imr_ifindex == dif ||
(sdif && pmc->multi.imr_ifindex == sdif)))
break;
}
ret = inet_test_bit(MC_ALL, sk);
if (!pmc)
goto unlock;
psl = rcu_dereference(pmc->sflist);
ret = (pmc->sfmode == MCAST_EXCLUDE);
if (!psl)
goto unlock;
for (i = 0; i < psl->sl_count; i++) {
if (psl->sl_addr[i] == rmt_addr)
break;
}
ret = 0;
if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count)
goto unlock;
if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count)
goto unlock;
ret = 1;
unlock:
rcu_read_unlock();
out:
return ret;
}
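Stripped of the socket lookup, the RCU list walking, and the IP_MULTICAST_ALL handling, the INCLUDE/EXCLUDE decision made by ip_mc_sf_allow() reduces to a small predicate. A standalone model of that logic, with hypothetical types rather than the kernel structures:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

enum fmode { MODE_INCLUDE, MODE_EXCLUDE };

/* Hypothetical per-group filter: a mode plus a list of source addresses. */
struct filter {
        enum fmode mode;
        size_t count;
        const uint32_t *sources;
};

/* Mirror of the decision above: with no source list a packet is accepted only
 * in EXCLUDE mode; otherwise INCLUDE accepts listed sources and EXCLUDE
 * accepts everything except listed sources.
 */
static bool source_allowed(const struct filter *f, uint32_t src)
{
        size_t i;

        if (!f->sources || !f->count)
                return f->mode == MODE_EXCLUDE;
        for (i = 0; i < f->count; i++)
                if (f->sources[i] == src)
                        break;
        if (f->mode == MODE_INCLUDE)
                return i < f->count;    /* must be listed */
        return i >= f->count;           /* must not be listed */
}

int main(void)
{
        const uint32_t srcs[] = { 0xc0000201 };         /* 192.0.2.1 */
        struct filter f = { MODE_INCLUDE, 1, srcs };

        printf("%d %d\n", source_allowed(&f, 0xc0000201),
               source_allowed(&f, 0xc0000202));
        return 0;
}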
/*
* A socket is closing.
*/
void ip_mc_drop_socket(struct sock *sk)
{
struct inet_sock *inet = inet_sk(sk);
struct ip_mc_socklist *iml;
struct net *net = sock_net(sk);
if (!inet->mc_list)
return;
rtnl_lock();
while ((iml = rtnl_dereference(inet->mc_list)) != NULL) {
struct in_device *in_dev;
inet->mc_list = iml->next_rcu;
in_dev = inetdev_by_index(net, iml->multi.imr_ifindex);
(void) ip_mc_leave_src(sk, iml, in_dev);
if (in_dev)
ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);
/* decrease mem now to avoid the memleak warning */
atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
kfree_rcu(iml, rcu);
}
rtnl_unlock();
}
/* called with rcu_read_lock() */
int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u8 proto)
{
struct ip_mc_list *im;
struct ip_mc_list __rcu **mc_hash;
struct ip_sf_list *psf;
int rv = 0;
mc_hash = rcu_dereference(in_dev->mc_hash);
if (mc_hash) {
u32 hash = hash_32((__force u32)mc_addr, MC_HASH_SZ_LOG);
for (im = rcu_dereference(mc_hash[hash]);
im != NULL;
im = rcu_dereference(im->next_hash)) {
if (im->multiaddr == mc_addr)
break;
}
} else {
for_each_pmc_rcu(in_dev, im) {
if (im->multiaddr == mc_addr)
break;
}
}
if (im && proto == IPPROTO_IGMP) {
rv = 1;
} else if (im) {
if (src_addr) {
igmp: Add ip_mc_list lock in ip_check_mc_rcu — A fuzzing run hit a KASAN use-after-free in ip_check_mc_rcu(), reached from the UDP sendmsg path via ip_route_output_key_hash_rcu()/__mkroute_output(). ip_mc_del_src() frees the ip_sf_list of a pmc under pmc->lock, but the walk over im->sources in ip_check_mc_rcu() was not taking that lock; take im->lock around the source-list lookup. Signed-off-by: Liu Jian <liujian56@huawei.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-07-16 12:06:17 +08:00
spin_lock_bh(&im->lock);
for (psf = im->sources; psf; psf = psf->sf_next) {
if (psf->sf_inaddr == src_addr)
break;
}
if (psf)
rv = psf->sf_count[MCAST_INCLUDE] ||
psf->sf_count[MCAST_EXCLUDE] !=
im->sfcount[MCAST_EXCLUDE];
else
rv = im->sfcount[MCAST_EXCLUDE] != 0;
spin_unlock_bh(&im->lock);
} else
rv = 1; /* unspecified source; tentatively allow */
}
return rv;
}
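/*
 * /proc/net/igmp and /proc/net/mcfilter: seq_file iterators over each
 * device's multicast group list and the per-group source filter lists.
 */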
#if defined(CONFIG_PROC_FS)
struct igmp_mc_iter_state {
struct seq_net_private p;
struct net_device *dev;
struct in_device *in_dev;
};
#define igmp_mc_seq_private(seq) ((struct igmp_mc_iter_state *)(seq)->private)
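/* Walk this namespace's devices under RCU and return the first group on
 * the first device with a non-empty mc_list, caching the device and
 * in_device in the iterator state.
 */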
static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
{
struct net *net = seq_file_net(seq);
struct ip_mc_list *im = NULL;
struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
state->in_dev = NULL;
for_each_netdev_rcu(net, state->dev) {
struct in_device *in_dev;
in_dev = __in_dev_get_rcu(state->dev);
if (!in_dev)
continue;
im = rcu_dereference(in_dev->mc_list);
if (im) {
state->in_dev = in_dev;
break;
}
}
return im;
}
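/* Advance to the next group on the current device, moving to the next
 * device's mc_list once the current list is exhausted.
 */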
static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im)
{
struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
im = rcu_dereference(im->next_rcu);
while (!im) {
state->dev = next_net_device_rcu(state->dev);
if (!state->dev) {
state->in_dev = NULL;
break;
}
state->in_dev = __in_dev_get_rcu(state->dev);
if (!state->in_dev)
continue;
im = rcu_dereference(state->in_dev->mc_list);
}
return im;
}
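/* Position the iterator at entry @pos, counting from the first group. */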
static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos)
{
struct ip_mc_list *im = igmp_mc_get_first(seq);
if (im)
while (pos && (im = igmp_mc_get_next(seq, im)) != NULL)
--pos;
return pos ? NULL : im;
}
static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(rcu)
{
rcu_read_lock();
return *pos ? igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}
static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct ip_mc_list *im;
if (v == SEQ_START_TOKEN)
im = igmp_mc_get_first(seq);
else
im = igmp_mc_get_next(seq, v);
++*pos;
return im;
}
static void igmp_mc_seq_stop(struct seq_file *seq, void *v)
__releases(rcu)
{
struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
state->in_dev = NULL;
state->dev = NULL;
rcu_read_unlock();
}
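/* Print a per-device header line for the first group on each device, then
 * one line per group: address, user count, report timer state and whether
 * this host sent the last report for the group.
 */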
static int igmp_mc_seq_show(struct seq_file *seq, void *v)
{
if (v == SEQ_START_TOKEN)
seq_puts(seq,
"Idx\tDevice : Count Querier\tGroup Users Timer\tReporter\n");
else {
struct ip_mc_list *im = v;
struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
char *querier;
long delta;
#ifdef CONFIG_IP_MULTICAST
querier = IGMP_V1_SEEN(state->in_dev) ? "V1" :
IGMP_V2_SEEN(state->in_dev) ? "V2" :
"V3";
#else
querier = "NONE";
#endif
if (rcu_access_pointer(state->in_dev->mc_list) == im) {
seq_printf(seq, "%d\t%-10s: %5d %7s\n",
state->dev->ifindex, state->dev->name, state->in_dev->mc_count, querier);
}
delta = im->timer.expires - jiffies;
seq_printf(seq,
"\t\t\t\t%08X %5d %d:%08lX\t\t%d\n",
im->multiaddr, im->users,
im->tm_running,
im->tm_running ? jiffies_delta_to_clock_t(delta) : 0,
im->reporter);
}
return 0;
}
static const struct seq_operations igmp_mc_seq_ops = {
.start = igmp_mc_seq_start,
.next = igmp_mc_seq_next,
.stop = igmp_mc_seq_stop,
.show = igmp_mc_seq_show,
};
struct igmp_mcf_iter_state {
struct seq_net_private p;
struct net_device *dev;
struct in_device *idev;
struct ip_mc_list *im;
};
#define igmp_mcf_seq_private(seq) ((struct igmp_mcf_iter_state *)(seq)->private)
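/* Find the first source filter entry across all devices and groups; on
 * success the owning group's lock is held and is released later by
 * igmp_mcf_get_next() or igmp_mcf_seq_stop().
 */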
static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
{
struct net *net = seq_file_net(seq);
struct ip_sf_list *psf = NULL;
struct ip_mc_list *im = NULL;
struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
state->idev = NULL;
state->im = NULL;
for_each_netdev_rcu(net, state->dev) {
struct in_device *idev;
idev = __in_dev_get_rcu(state->dev);
if (unlikely(!idev))
continue;
im = rcu_dereference(idev->mc_list);
if (likely(im)) {
spin_lock_bh(&im->lock);
psf = im->sources;
if (likely(psf)) {
state->im = im;
state->idev = idev;
break;
}
spin_unlock_bh(&im->lock);
}
}
return psf;
}
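/* Advance to the next source; when a group's list is exhausted, drop its
 * lock and take the lock of the next group that has sources.
 */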
static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_list *psf)
{
struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
psf = psf->sf_next;
while (!psf) {
spin_unlock_bh(&state->im->lock);
state->im = state->im->next;
while (!state->im) {
state->dev = next_net_device_rcu(state->dev);
if (!state->dev) {
state->idev = NULL;
goto out;
}
state->idev = __in_dev_get_rcu(state->dev);
if (!state->idev)
continue;
state->im = rcu_dereference(state->idev->mc_list);
}
if (!state->im)
break;
spin_lock_bh(&state->im->lock);
psf = state->im->sources;
}
out:
return psf;
}
static struct ip_sf_list *igmp_mcf_get_idx(struct seq_file *seq, loff_t pos)
{
struct ip_sf_list *psf = igmp_mcf_get_first(seq);
if (psf)
while (pos && (psf = igmp_mcf_get_next(seq, psf)) != NULL)
--pos;
return pos ? NULL : psf;
}
static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(rcu)
{
rcu_read_lock();
return *pos ? igmp_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}
static void *igmp_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct ip_sf_list *psf;
if (v == SEQ_START_TOKEN)
psf = igmp_mcf_get_first(seq);
else
psf = igmp_mcf_get_next(seq, v);
++*pos;
return psf;
}
static void igmp_mcf_seq_stop(struct seq_file *seq, void *v)
__releases(rcu)
{
struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
if (likely(state->im)) {
spin_unlock_bh(&state->im->lock);
state->im = NULL;
}
state->idev = NULL;
state->dev = NULL;
rcu_read_unlock();
}
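/* One line per (device, group, source) triple with the INCLUDE and
 * EXCLUDE reference counts for that source.
 */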
static int igmp_mcf_seq_show(struct seq_file *seq, void *v)
{
struct ip_sf_list *psf = v;
struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
if (v == SEQ_START_TOKEN) {
seq_puts(seq, "Idx Device MCA SRC INC EXC\n");
} else {
seq_printf(seq,
"%3d %6.6s 0x%08x "
"0x%08x %6lu %6lu\n",
state->dev->ifindex, state->dev->name,
ntohl(state->im->multiaddr),
ntohl(psf->sf_inaddr),
psf->sf_count[MCAST_INCLUDE],
psf->sf_count[MCAST_EXCLUDE]);
}
return 0;
}
static const struct seq_operations igmp_mcf_seq_ops = {
.start = igmp_mcf_seq_start,
.next = igmp_mcf_seq_next,
.stop = igmp_mcf_seq_stop,
.show = igmp_mcf_seq_show,
};
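/* Per-namespace setup: create the two proc entries and the IGMP autojoin
 * control socket (net->ipv4.mc_autojoin_sk).
 */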
static int __net_init igmp_net_init(struct net *net)
{
struct proc_dir_entry *pde;
int err;
pde = proc_create_net("igmp", 0444, net->proc_net, &igmp_mc_seq_ops,
sizeof(struct igmp_mc_iter_state));
if (!pde)
goto out_igmp;
pde = proc_create_net("mcfilter", 0444, net->proc_net,
&igmp_mcf_seq_ops, sizeof(struct igmp_mcf_iter_state));
if (!pde)
goto out_mcfilter;
err = inet_ctl_sock_create(&net->ipv4.mc_autojoin_sk, AF_INET,
SOCK_DGRAM, 0, net);
if (err < 0) {
pr_err("Failed to initialize the IGMP autojoin socket (err %d)\n",
err);
goto out_sock;
}
return 0;
out_sock:
remove_proc_entry("mcfilter", net->proc_net);
out_mcfilter:
remove_proc_entry("igmp", net->proc_net);
out_igmp:
return -ENOMEM;
}
static void __net_exit igmp_net_exit(struct net *net)
{
remove_proc_entry("mcfilter", net->proc_net);
remove_proc_entry("igmp", net->proc_net);
inet_ctl_sock_destroy(net->ipv4.mc_autojoin_sk);
}
static struct pernet_operations igmp_net_ops = {
.init = igmp_net_init,
.exit = igmp_net_exit,
};
#endif
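/* Rejoin all groups on a device when a driver asks for it via the
 * NETDEV_RESEND_IGMP notifier event.
 */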
static int igmp_netdev_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct in_device *in_dev;
switch (event) {
case NETDEV_RESEND_IGMP:
in_dev = __in_dev_get_rtnl(dev);
if (in_dev)
ip_mc_rejoin_groups(in_dev);
break;
default:
break;
}
return NOTIFY_DONE;
}
static struct notifier_block igmp_notifier = {
.notifier_call = igmp_netdev_event,
};
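/* Boot-time init: register the per-netns proc files (under CONFIG_PROC_FS)
 * and the netdevice notifier, unwinding the pernet subsys on failure.
 */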
int __init igmp_mc_init(void)
{
#if defined(CONFIG_PROC_FS)
int err;
err = register_pernet_subsys(&igmp_net_ops);
if (err)
return err;
err = register_netdevice_notifier(&igmp_notifier);
if (err)
goto reg_notif_fail;
return 0;
reg_notif_fail:
unregister_pernet_subsys(&igmp_net_ops);
return err;
#else
return register_netdevice_notifier(&igmp_notifier);
#endif
}