linux/net/ipv4/igmp.c

3048 lines
72 KiB
C
Raw Normal View History

/*
* Linux NET3: Internet Group Management Protocol [IGMP]
*
* This code implements the IGMP protocol as defined in RFC1112. There has
* been a further revision of this protocol since which is now supported.
*
* If you have trouble with this module be careful what gcc you have used,
* the older version didn't come out right using gcc 2.5.8, the newer one
* seems to fall out with gcc 2.6.2.
*
* Authors:
* Alan Cox <alan@lxorguk.ukuu.org.uk>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Fixes:
*
* Alan Cox : Added lots of __inline__ to optimise
* the memory usage of all the tiny little
* functions.
* Alan Cox : Dumped the header building experiment.
* Alan Cox : Minor tweaks ready for multicast routing
* and extended IGMP protocol.
* Alan Cox : Removed a load of inline directives. Gcc 2.5.8
* writes utterly bogus code otherwise (sigh)
* fixed IGMP loopback to behave in the manner
* desired by mrouted, fixed the fact it has been
* broken since 1.3.6 and cleaned up a few minor
* points.
*
* Chih-Jen Chang : Tried to revise IGMP to Version 2
* Tsu-Sheng Tsao E-mail: chihjenc@scf.usc.edu and tsusheng@scf.usc.edu
* The enhancements are mainly based on Steve Deering's
* ipmulti-3.5 source code.
* Chih-Jen Chang : Added the igmp_get_mrouter_info and
* Tsu-Sheng Tsao igmp_set_mrouter_info to keep track of
* the mrouted version on that device.
* Chih-Jen Chang : Added the max_resp_time parameter to
* Tsu-Sheng Tsao igmp_heard_query(). Using this parameter
* to identify the multicast router version
* and do what the IGMP version 2 specified.
* Chih-Jen Chang : Added a timer to revert to IGMP V2 router
* Tsu-Sheng Tsao if the specified time expired.
* Alan Cox : Stop IGMP from 0.0.0.0 being accepted.
* Alan Cox : Use GFP_ATOMIC in the right places.
* Christian Daudt : igmp timer wasn't set for local group
* memberships but was being deleted,
* which caused a "del_timer() called
* from %p with timer not initialized\n"
* message (960131).
* Christian Daudt : removed del_timer from
* igmp_timer_expire function (960205).
* Christian Daudt : igmp_heard_report now only calls
* igmp_timer_expire if tm->running is
* true (960216).
* Malcolm Beattie : ttl comparison wrong in igmp_rcv made
* igmp_heard_query never trigger. Expiry
* miscalculation fixed in igmp_heard_query
* and random() made to return unsigned to
* prevent negative expiry times.
* Alexey Kuznetsov: Wrong group leaving behaviour, backport
* fix from pending 2.1.x patches.
* Alan Cox: Forget to enable FDDI support earlier.
* Alexey Kuznetsov: Fixed leaving groups on device down.
* Alexey Kuznetsov: Accordance to igmp-v2-06 draft.
* David L Stevens: IGMPv3 support, with help from
* Vinay Kulkarni
*/
#include <linux/module.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. 
This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 16:04:11 +08:00
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/if_arp.h>
#include <linux/rtnetlink.h>
#include <linux/times.h>
#include <linux/pkt_sched.h>
#include <net/net_namespace.h>
#include <net/arp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <net/inet_common.h>
#include <linux/netfilter_ipv4.h>
#ifdef CONFIG_IP_MROUTE
#include <linux/mroute.h>
#endif
#ifdef CONFIG_PROC_FS
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#endif
#ifdef CONFIG_IP_MULTICAST
/* Parameter names and values are taken from igmp-v2-06 draft */
/* Values written as N*HZ are N seconds expressed in jiffies. */
#define IGMP_V1_ROUTER_PRESENT_TIMEOUT (400*HZ)
#define IGMP_V2_ROUTER_PRESENT_TIMEOUT (400*HZ)
#define IGMP_V2_UNSOLICITED_REPORT_INTERVAL (10*HZ)
#define IGMP_V3_UNSOLICITED_REPORT_INTERVAL (1*HZ)
#define IGMP_QUERY_RESPONSE_INTERVAL (10*HZ)
#define IGMP_QUERY_ROBUSTNESS_VARIABLE 2
/* NOTE(review): presumably a delay in jiffies (not seconds) -- confirm against its users. */
#define IGMP_INITIAL_REPORT_DELAY (1)
/* IGMP_INITIAL_REPORT_DELAY is not from the IGMP specs!
* The IGMP specs require membership to be reported immediately after
* joining a group, but we delay the first report by a
* small interval. This seems more natural and still does not
* contradict the specs, provided the delay is small enough.
*/
/*
 * IGMP_V1_SEEN - should @in_dev currently behave as an IGMPv1 host?
 *
 * True when IGMP version 1 is forced, either through the namespace-wide
 * "all" setting or through the per-device setting, or when the device's
 * mr_v1_seen deadline is set (non-zero) and jiffies has not reached it yet.
 *
 * Every use of @in_dev is parenthesised so that an expression argument
 * (e.g. "base + i") expands correctly.
 */
#define IGMP_V1_SEEN(in_dev) \
	(IPV4_DEVCONF_ALL(dev_net((in_dev)->dev), FORCE_IGMP_VERSION) == 1 || \
	 IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \
	 ((in_dev)->mr_v1_seen && \
	  time_before(jiffies, (in_dev)->mr_v1_seen)))
/*
 * IGMP_V2_SEEN - should @in_dev currently behave as an IGMPv2 host?
 *
 * True when IGMP version 2 is forced, either through the namespace-wide
 * "all" setting or through the per-device setting, or when the device's
 * mr_v2_seen deadline is set (non-zero) and jiffies has not reached it yet.
 *
 * Every use of @in_dev is parenthesised so that an expression argument
 * (e.g. "base + i") expands correctly.
 */
#define IGMP_V2_SEEN(in_dev) \
	(IPV4_DEVCONF_ALL(dev_net((in_dev)->dev), FORCE_IGMP_VERSION) == 2 || \
	 IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2 || \
	 ((in_dev)->mr_v2_seen && \
	  time_before(jiffies, (in_dev)->mr_v2_seen)))
static int unsolicited_report_interval(struct in_device *in_dev)
{
net: igmp: Allow user-space configuration of igmp unsolicited report interval Adds the new procfs knobs: /proc/sys/net/ipv4/conf/*/igmpv2_unsolicited_report_interval /proc/sys/net/ipv4/conf/*/igmpv3_unsolicited_report_interval Which will allow userspace configuration of the IGMP unsolicited report interval (see below) in milliseconds. The defaults are 10000ms for IGMPv2 and 1000ms for IGMPv3 in accordance with RFC2236 and RFC3376. Background: If an IGMP join packet is lost you will not receive data sent to the multicast group so if no data arrives from that multicast group in a period of time after the IGMP join a second IGMP join will be sent. The delay between joins is the "IGMP Unsolicited Report Interval". Prior to this patch this value was hard coded in the kernel to 10s for IGMPv2 and 1s for IGMPv3. 10s is unsuitable for some use-cases, such as IPTV as it can cause channel change to be slow in the presence of packet loss. This patch allows the value to be overridden from userspace for both IGMPv2 and IGMPv3 such that it can be tuned accoding to the network. Tested with Wireshark and a simple program to join a (non-existent) multicast group. The distribution of timings for the second join differ based upon setting the procfs knobs. igmpvX_unsolicited_report_interval is intended to follow the pattern established by force_igmp_version, and while a procfs entry has been added a corresponding sysctl knob has not as it is my understanding that sysctl is deprecated[1]. [1]: http://lwn.net/Articles/247243/ Signed-off-by: William Manley <william.manley@youview.com> Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org> Acked-by: Benjamin LaHaise <bcrl@kvack.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-08-07 02:03:15 +08:00
int interval_ms, interval_jiffies;
if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
net: igmp: Allow user-space configuration of igmp unsolicited report interval Adds the new procfs knobs: /proc/sys/net/ipv4/conf/*/igmpv2_unsolicited_report_interval /proc/sys/net/ipv4/conf/*/igmpv3_unsolicited_report_interval Which will allow userspace configuration of the IGMP unsolicited report interval (see below) in milliseconds. The defaults are 10000ms for IGMPv2 and 1000ms for IGMPv3 in accordance with RFC2236 and RFC3376. Background: If an IGMP join packet is lost you will not receive data sent to the multicast group so if no data arrives from that multicast group in a period of time after the IGMP join a second IGMP join will be sent. The delay between joins is the "IGMP Unsolicited Report Interval". Prior to this patch this value was hard coded in the kernel to 10s for IGMPv2 and 1s for IGMPv3. 10s is unsuitable for some use-cases, such as IPTV as it can cause channel change to be slow in the presence of packet loss. This patch allows the value to be overridden from userspace for both IGMPv2 and IGMPv3 such that it can be tuned accoding to the network. Tested with Wireshark and a simple program to join a (non-existent) multicast group. The distribution of timings for the second join differ based upon setting the procfs knobs. igmpvX_unsolicited_report_interval is intended to follow the pattern established by force_igmp_version, and while a procfs entry has been added a corresponding sysctl knob has not as it is my understanding that sysctl is deprecated[1]. [1]: http://lwn.net/Articles/247243/ Signed-off-by: William Manley <william.manley@youview.com> Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org> Acked-by: Benjamin LaHaise <bcrl@kvack.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-08-07 02:03:15 +08:00
interval_ms = IN_DEV_CONF_GET(
in_dev,
IGMPV2_UNSOLICITED_REPORT_INTERVAL);
else /* v3 */
net: igmp: Allow user-space configuration of igmp unsolicited report interval Adds the new procfs knobs: /proc/sys/net/ipv4/conf/*/igmpv2_unsolicited_report_interval /proc/sys/net/ipv4/conf/*/igmpv3_unsolicited_report_interval Which will allow userspace configuration of the IGMP unsolicited report interval (see below) in milliseconds. The defaults are 10000ms for IGMPv2 and 1000ms for IGMPv3 in accordance with RFC2236 and RFC3376. Background: If an IGMP join packet is lost you will not receive data sent to the multicast group so if no data arrives from that multicast group in a period of time after the IGMP join a second IGMP join will be sent. The delay between joins is the "IGMP Unsolicited Report Interval". Prior to this patch this value was hard coded in the kernel to 10s for IGMPv2 and 1s for IGMPv3. 10s is unsuitable for some use-cases, such as IPTV as it can cause channel change to be slow in the presence of packet loss. This patch allows the value to be overridden from userspace for both IGMPv2 and IGMPv3 such that it can be tuned accoding to the network. Tested with Wireshark and a simple program to join a (non-existent) multicast group. The distribution of timings for the second join differ based upon setting the procfs knobs. igmpvX_unsolicited_report_interval is intended to follow the pattern established by force_igmp_version, and while a procfs entry has been added a corresponding sysctl knob has not as it is my understanding that sysctl is deprecated[1]. [1]: http://lwn.net/Articles/247243/ Signed-off-by: William Manley <william.manley@youview.com> Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org> Acked-by: Benjamin LaHaise <bcrl@kvack.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-08-07 02:03:15 +08:00
interval_ms = IN_DEV_CONF_GET(
in_dev,
IGMPV3_UNSOLICITED_REPORT_INTERVAL);
interval_jiffies = msecs_to_jiffies(interval_ms);
/* _timer functions can't handle a delay of 0 jiffies so ensure
* we always return a positive value.
*/
if (interval_jiffies <= 0)
interval_jiffies = 1;
return interval_jiffies;
}
/* Forward declarations that are only visible when CONFIG_IP_MULTICAST is set. */
static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im);
static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im);
static void igmpv3_clear_delrec(struct in_device *in_dev);
static int sf_setstate(struct ip_mc_list *pmc);
static void sf_markstate(struct ip_mc_list *pmc);
#endif /* CONFIG_IP_MULTICAST */
/* Forward declarations that do not depend on CONFIG_IP_MULTICAST. */
static void ip_mc_clear_src(struct ip_mc_list *pmc);
static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
int sfcount, __be32 *psfsrc, int delta);
/*
 * Drop one reference on @im.  When the last reference goes away, release
 * the reference the entry holds on its interface and free the entry via
 * kfree_rcu().
 */
static void ip_ma_put(struct ip_mc_list *im)
{
	/* Somebody else still holds the entry: nothing more to do. */
	if (!refcount_dec_and_test(&im->refcnt))
		return;

	in_dev_put(im->interface);
	kfree_rcu(im, rcu);
}
/*
 * Walk the multicast list of @in_dev, using rcu_dereference() for every
 * pointer load.  @pmc is the iteration cursor and is NULL once the loop
 * finishes.  Both macro arguments are parenthesised so that expression
 * arguments expand correctly.
 */
#define for_each_pmc_rcu(in_dev, pmc)				\
	for ((pmc) = rcu_dereference((in_dev)->mc_list);	\
	     (pmc) != NULL;					\
	     (pmc) = rcu_dereference((pmc)->next_rcu))
/*
 * Walk the multicast list of @in_dev, using rtnl_dereference() for every
 * pointer load.  @pmc is the iteration cursor and is NULL once the loop
 * finishes.  Both macro arguments are parenthesised so that expression
 * arguments expand correctly.
 */
#define for_each_pmc_rtnl(in_dev, pmc)				\
	for ((pmc) = rtnl_dereference((in_dev)->mc_list);	\
	     (pmc) != NULL;					\
	     (pmc) = rtnl_dereference((pmc)->next_rcu))
#ifdef CONFIG_IP_MULTICAST
/*
* Timer management
*/
/*
 * Cancel the report timer of @im and reset its reporting state
 * (tm_running, reporter, unsolicit_count), all under im->lock.
 */
static void igmp_stop_timer(struct ip_mc_list *im)
{
	spin_lock_bh(&im->lock);

	/* A timer that was still pending held a reference; give it back. */
	if (del_timer(&im->timer) != 0)
		refcount_dec(&im->refcnt);

	im->unsolicit_count = 0;
	im->reporter = 0;
	im->tm_running = 0;

	spin_unlock_bh(&im->lock);
}
/*
 * Arm the report timer of @im at a random point within @max_delay jiffies
 * (plus a fixed 2 jiffies) and mark the timer as running.
 *
 * The caller must hold im->lock.
 */
static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
{
	int jitter = prandom_u32() % max_delay;
	int was_pending;

	im->tm_running = 1;

	was_pending = mod_timer(&im->timer, jiffies + jitter + 2);
	/* A timer that was not pending before now needs its own reference. */
	if (!was_pending)
		refcount_inc(&im->refcnt);
}
static void igmp_gq_start_timer(struct in_device *in_dev)
{
int tv = prandom_u32() % in_dev->mr_maxdelay;
igmp: Make igmp group member RFC 3376 compliant 5.2. Action on Reception of a Query When a system receives a Query, it does not respond immediately. Instead, it delays its response by a random amount of time, bounded by the Max Resp Time value derived from the Max Resp Code in the received Query message. A system may receive a variety of Queries on different interfaces and of different kinds (e.g., General Queries, Group-Specific Queries, and Group-and-Source-Specific Queries), each of which may require its own delayed response. Before scheduling a response to a Query, the system must first consider previously scheduled pending responses and in many cases schedule a combined response. Therefore, the system must be able to maintain the following state: o A timer per interface for scheduling responses to General Queries. o A per-group and interface timer for scheduling responses to Group- Specific and Group-and-Source-Specific Queries. o A per-group and interface list of sources to be reported in the response to a Group-and-Source-Specific Query. When a new Query with the Router-Alert option arrives on an interface, provided the system has state to report, a delay for a response is randomly selected in the range (0, [Max Resp Time]) where Max Resp Time is derived from Max Resp Code in the received Query message. The following rules are then used to determine if a Report needs to be scheduled and the type of Report to schedule. The rules are considered in order and only the first matching rule is applied. 1. If there is a pending response to a previous General Query scheduled sooner than the selected delay, no additional response needs to be scheduled. 2. If the received Query is a General Query, the interface timer is used to schedule a response to the General Query after the selected delay. Any previously pending response to a General Query is canceled. 
--8<-- Currently the timer is rearmed with new random expiration time for every incoming query regardless of possibly already pending report. Which is not aligned with the above RFE. It also might happen that higher rate of incoming queries can postpone the report after the expiration time of the first query causing group membership loss. Now the per interface general query timer is rearmed only when there is no pending report already scheduled on that interface or the newly selected expiration time is before the already pending scheduled report. Signed-off-by: Michal Tesar <mtesar@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-01-02 21:38:36 +08:00
unsigned long exp = jiffies + tv + 2;
if (in_dev->mr_gq_running &&
time_after_eq(exp, (in_dev->mr_gq_timer).expires))
return;
in_dev->mr_gq_running = 1;
igmp: Make igmp group member RFC 3376 compliant 5.2. Action on Reception of a Query When a system receives a Query, it does not respond immediately. Instead, it delays its response by a random amount of time, bounded by the Max Resp Time value derived from the Max Resp Code in the received Query message. A system may receive a variety of Queries on different interfaces and of different kinds (e.g., General Queries, Group-Specific Queries, and Group-and-Source-Specific Queries), each of which may require its own delayed response. Before scheduling a response to a Query, the system must first consider previously scheduled pending responses and in many cases schedule a combined response. Therefore, the system must be able to maintain the following state: o A timer per interface for scheduling responses to General Queries. o A per-group and interface timer for scheduling responses to Group- Specific and Group-and-Source-Specific Queries. o A per-group and interface list of sources to be reported in the response to a Group-and-Source-Specific Query. When a new Query with the Router-Alert option arrives on an interface, provided the system has state to report, a delay for a response is randomly selected in the range (0, [Max Resp Time]) where Max Resp Time is derived from Max Resp Code in the received Query message. The following rules are then used to determine if a Report needs to be scheduled and the type of Report to schedule. The rules are considered in order and only the first matching rule is applied. 1. If there is a pending response to a previous General Query scheduled sooner than the selected delay, no additional response needs to be scheduled. 2. If the received Query is a General Query, the interface timer is used to schedule a response to the General Query after the selected delay. Any previously pending response to a General Query is canceled. 
--8<-- Currently the timer is rearmed with new random expiration time for every incoming query regardless of possibly already pending report. Which is not aligned with the above RFE. It also might happen that higher rate of incoming queries can postpone the report after the expiration time of the first query causing group membership loss. Now the per interface general query timer is rearmed only when there is no pending report already scheduled on that interface or the newly selected expiration time is before the already pending scheduled report. Signed-off-by: Michal Tesar <mtesar@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-01-02 21:38:36 +08:00
if (!mod_timer(&in_dev->mr_gq_timer, exp))
in_dev_hold(in_dev);
}
/*
 * Arm the mr_ifc_timer of @in_dev at a random point within @delay jiffies
 * (plus a fixed 2 jiffies).  A reference on @in_dev is taken when the
 * timer was not already pending.
 */
static void igmp_ifc_start_timer(struct in_device *in_dev, int delay)
{
	int jitter = prandom_u32() % delay;
	unsigned long when = jiffies + jitter + 2;

	/* Already pending: only the expiry changed, no new reference. */
	if (mod_timer(&in_dev->mr_ifc_timer, when))
		return;

	in_dev_hold(in_dev);
}
/*
 * Make sure the report timer of @im fires within @max_delay jiffies.
 *
 * unsolicit_count is reset.  If a timer is pending and already due in less
 * than @max_delay, it is put back unchanged; otherwise a fresh random
 * delay is chosen through igmp_start_timer().  Runs under im->lock.
 */
static void igmp_mod_timer(struct ip_mc_list *im, int max_delay)
{
	spin_lock_bh(&im->lock);
	im->unsolicit_count = 0;

	if (del_timer(&im->timer)) {
		long remaining = (long)(im->timer.expires - jiffies);

		if (remaining < max_delay) {
			/* Existing deadline is soon enough: restore it as is. */
			add_timer(&im->timer);
			im->tm_running = 1;
			goto unlock;
		}
		/* Drop the old timer's reference before re-arming below. */
		refcount_dec(&im->refcnt);
	}

	igmp_start_timer(im, max_delay);

unlock:
	spin_unlock_bh(&im->lock);
}
/*
* Send an IGMP report.
*/
/* Size of an IGMP header plus an IP header plus 4 extra bytes.
 * NOTE(review): the extra 4 bytes presumably cover an IP option (e.g.
 * Router Alert) -- confirm against the code that builds the packet.
 */
#define IGMP_SIZE (sizeof(struct igmphdr)+sizeof(struct iphdr)+4)
/*
 * Decide whether source @psf of group @pmc belongs in a record of the
 * given IGMPv3 record @type.
 *
 * @gdeleted and @sdeleted flag that the group, respectively the source,
 * is being treated as deleted.  Returns non-zero when the source should
 * be included in the record, 0 otherwise (including for unknown types).
 */
static int is_in(struct ip_mc_list *pmc, struct ip_sf_list *psf, int type,
	int gdeleted, int sdeleted)
{
	switch (type) {
	case IGMPV3_MODE_IS_INCLUDE:
	case IGMPV3_MODE_IS_EXCLUDE:
		if (gdeleted || sdeleted)
			return 0;
		/* Answering a group-and-source query: only flagged sources. */
		if (pmc->gsquery && !psf->sf_gsresp)
			return 0;
		if (pmc->sfmode == MCAST_INCLUDE)
			return 1;
		/* don't include if this source is excluded
		 * in all filters
		 */
		if (psf->sf_count[MCAST_INCLUDE])
			return type == IGMPV3_MODE_IS_INCLUDE;
		return pmc->sfcount[MCAST_EXCLUDE] ==
			psf->sf_count[MCAST_EXCLUDE];

	case IGMPV3_CHANGE_TO_INCLUDE:
		if (gdeleted || sdeleted)
			return 0;
		return psf->sf_count[MCAST_INCLUDE] != 0;

	case IGMPV3_CHANGE_TO_EXCLUDE:
		if (gdeleted || sdeleted ||
		    pmc->sfcount[MCAST_EXCLUDE] == 0 ||
		    psf->sf_count[MCAST_INCLUDE])
			return 0;
		return pmc->sfcount[MCAST_EXCLUDE] ==
			psf->sf_count[MCAST_EXCLUDE];

	case IGMPV3_ALLOW_NEW_SOURCES:
		if (gdeleted || !psf->sf_crcount)
			return 0;
		return (pmc->sfmode == MCAST_INCLUDE) ^ sdeleted;

	case IGMPV3_BLOCK_OLD_SOURCES:
		if (pmc->sfmode == MCAST_INCLUDE)
			return gdeleted || (psf->sf_crcount && sdeleted);
		return psf->sf_crcount && !gdeleted && !sdeleted;

	default:
		break;
	}

	return 0;
}
/*
 * Count the sources of @pmc that is_in() selects for a record of the
 * given @type, with the given @gdeleted / @sdeleted flags.
 */
static int
igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)
{
	struct ip_sf_list *src = pmc->sources;
	int matched = 0;

	while (src) {
		if (is_in(pmc, src, type, gdeleted, sdeleted))
			matched++;
		src = src->sf_next;
	}

	return matched;
}
ipv6: mld: fix add_grhead skb_over_panic for devs with large MTUs It has been reported that generating an MLD listener report on devices with large MTUs (e.g. 9000) and a high number of IPv6 addresses can trigger a skb_over_panic(): skbuff: skb_over_panic: text:ffffffff80612a5d len:3776 put:20 head:ffff88046d751000 data:ffff88046d751010 tail:0xed0 end:0xec0 dev:port1 ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:100! invalid opcode: 0000 [#1] SMP Modules linked in: ixgbe(O) CPU: 3 PID: 0 Comm: swapper/3 Tainted: G O 3.14.23+ #4 [...] Call Trace: <IRQ> [<ffffffff80578226>] ? skb_put+0x3a/0x3b [<ffffffff80612a5d>] ? add_grhead+0x45/0x8e [<ffffffff80612e3a>] ? add_grec+0x394/0x3d4 [<ffffffff80613222>] ? mld_ifc_timer_expire+0x195/0x20d [<ffffffff8061308d>] ? mld_dad_timer_expire+0x45/0x45 [<ffffffff80255b5d>] ? call_timer_fn.isra.29+0x12/0x68 [<ffffffff80255d16>] ? run_timer_softirq+0x163/0x182 [<ffffffff80250e6f>] ? __do_softirq+0xe0/0x21d [<ffffffff8025112b>] ? irq_exit+0x4e/0xd3 [<ffffffff802214bb>] ? smp_apic_timer_interrupt+0x3b/0x46 [<ffffffff8063f10a>] ? apic_timer_interrupt+0x6a/0x70 mld_newpack() skb allocations are usually requested with dev->mtu in size, since commit 72e09ad107e7 ("ipv6: avoid high order allocations") we have changed the limit in order to be less likely to fail. However, in MLD/IGMP code, we have some rather ugly AVAILABLE(skb) macros, which determine if we may end up doing an skb_put() for adding another record. To avoid possible fragmentation, we check the skb's tailroom as skb->dev->mtu - skb->len, which is a wrong assumption as the actual max allocation size can be much smaller. The IGMP case doesn't have this issue as commit 57e1ab6eaddc ("igmp: refine skb allocations") stores the allocation size in the cb[]. Set a reserved_tailroom to make it fit into the MTU and use skb_availroom() helper instead. This also allows to get rid of igmp_skb_size(). 
Reported-by: Wei Liu <lw1a2.jing@gmail.com> Fixes: 72e09ad107e7 ("ipv6: avoid high order allocations") Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Cc: Eric Dumazet <edumazet@google.com> Cc: Hannes Frederic Sowa <hannes@stressinduktion.org> Cc: David L Stevens <david.stevens@oracle.com> Acked-by: Eric Dumazet <edumazet@google.com> Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-11-06 03:27:38 +08:00
static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu)
{
struct sk_buff *skb;
struct rtable *rt;
struct iphdr *pip;
struct igmpv3_report *pig;
netns: Fix crash by making igmp per namespace This patch makes the multicast socket to be per namespace. When a network namespace is created, other than the init_net and a multicast packet is received, the kernel goes to a hang or a kernel panic. How to reproduce ? * create a child network namespace * create a pair virtual device veth * ip link add type veth * move one side to the pair network device to the child namespace * ip link set netns <childpid> dev veth1 * ping -I veth0 224.0.0.1 The bug appears because the function ip_mc_init_dev does not initialize the different multicast fields as it exits because it is not the init_net. BUG: soft lockup - CPU#0 stuck for 61s! [avahi-daemon:2695] Modules linked in: irq event stamp: 50350 hardirqs last enabled at (50349): [<c03ee949>] _spin_unlock_irqrestore+0x34/0x39 hardirqs last disabled at (50350): [<c03ec639>] schedule+0x9f/0x5ff softirqs last enabled at (45712): [<c0374d4b>] ip_setsockopt+0x8e7/0x909 softirqs last disabled at (45710): [<c03ee682>] _spin_lock_bh+0x8/0x27 Pid: 2695, comm: avahi-daemon Not tainted (2.6.27-rc2-00029-g0872073 #3) EIP: 0060:[<c03ee47c>] EFLAGS: 00000297 CPU: 0 EIP is at __read_lock_failed+0x8/0x10 EAX: c4f38810 EBX: c4f38810 ECX: 00000000 EDX: c04cc22e ESI: fb0000e0 EDI: 00000011 EBP: 0f02000a ESP: c4e3faa0 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 CR0: 8005003b CR2: 44618a40 CR3: 04e37000 CR4: 000006d0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 [<c02311f8>] ? _raw_read_lock+0x23/0x25 [<c0390666>] ? ip_check_mc+0x1c/0x83 [<c036d478>] ? ip_route_input+0x229/0xe92 [<c022e2e4>] ? trace_hardirqs_on_thunk+0xc/0x10 [<c0104c9c>] ? do_IRQ+0x69/0x7d [<c0102e64>] ? restore_nocheck_notrace+0x0/0xe [<c036fdba>] ? ip_rcv+0x227/0x505 [<c0358764>] ? netif_receive_skb+0xfe/0x2b3 [<c03588d2>] ? netif_receive_skb+0x26c/0x2b3 [<c035af31>] ? process_backlog+0x73/0xbd [<c035a8cd>] ? net_rx_action+0xc1/0x1ae [<c01218a8>] ? __do_softirq+0x7b/0xef [<c0121953>] ? 
do_softirq+0x37/0x4d [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c0122037>] ? local_bh_enable+0x96/0xab [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c012181e>] ? _local_bh_enable+0x79/0x88 [<c035fcb8>] ? neigh_resolve_output+0x20f/0x239 [<c0373118>] ? ip_finish_output+0x1df/0x209 [<c0373364>] ? ip_dev_loopback_xmit+0x62/0x66 [<c0371db5>] ? ip_local_out+0x15/0x17 [<c0372013>] ? ip_push_pending_frames+0x25c/0x2bb [<c03891b8>] ? udp_push_pending_frames+0x2bb/0x30e [<c038a189>] ? udp_sendmsg+0x413/0x51d [<c038a1a9>] ? udp_sendmsg+0x433/0x51d [<c038f927>] ? inet_sendmsg+0x35/0x3f [<c034f092>] ? sock_sendmsg+0xb8/0xd1 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c022e6de>] ? copy_from_user+0x32/0x5e [<c022e6de>] ? copy_from_user+0x32/0x5e [<c034f238>] ? sys_sendmsg+0x18d/0x1f0 [<c0175e90>] ? pipe_write+0x3cb/0x3d7 [<c0170347>] ? do_sync_write+0xbe/0x105 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c03503b2>] ? sys_socketcall+0x176/0x1b0 [<c01085ea>] ? syscall_trace_enter+0x6c/0x7b [<c0102e1a>] ? syscall_call+0x7/0xb Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-08-14 07:15:57 +08:00
struct net *net = dev_net(dev);
struct flowi4 fl4;
int hlen = LL_RESERVED_SPACE(dev);
int tlen = dev->needed_tailroom;
ipv6: mld: fix add_grhead skb_over_panic for devs with large MTUs It has been reported that generating an MLD listener report on devices with large MTUs (e.g. 9000) and a high number of IPv6 addresses can trigger a skb_over_panic(): skbuff: skb_over_panic: text:ffffffff80612a5d len:3776 put:20 head:ffff88046d751000 data:ffff88046d751010 tail:0xed0 end:0xec0 dev:port1 ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:100! invalid opcode: 0000 [#1] SMP Modules linked in: ixgbe(O) CPU: 3 PID: 0 Comm: swapper/3 Tainted: G O 3.14.23+ #4 [...] Call Trace: <IRQ> [<ffffffff80578226>] ? skb_put+0x3a/0x3b [<ffffffff80612a5d>] ? add_grhead+0x45/0x8e [<ffffffff80612e3a>] ? add_grec+0x394/0x3d4 [<ffffffff80613222>] ? mld_ifc_timer_expire+0x195/0x20d [<ffffffff8061308d>] ? mld_dad_timer_expire+0x45/0x45 [<ffffffff80255b5d>] ? call_timer_fn.isra.29+0x12/0x68 [<ffffffff80255d16>] ? run_timer_softirq+0x163/0x182 [<ffffffff80250e6f>] ? __do_softirq+0xe0/0x21d [<ffffffff8025112b>] ? irq_exit+0x4e/0xd3 [<ffffffff802214bb>] ? smp_apic_timer_interrupt+0x3b/0x46 [<ffffffff8063f10a>] ? apic_timer_interrupt+0x6a/0x70 mld_newpack() skb allocations are usually requested with dev->mtu in size, since commit 72e09ad107e7 ("ipv6: avoid high order allocations") we have changed the limit in order to be less likely to fail. However, in MLD/IGMP code, we have some rather ugly AVAILABLE(skb) macros, which determine if we may end up doing an skb_put() for adding another record. To avoid possible fragmentation, we check the skb's tailroom as skb->dev->mtu - skb->len, which is a wrong assumption as the actual max allocation size can be much smaller. The IGMP case doesn't have this issue as commit 57e1ab6eaddc ("igmp: refine skb allocations") stores the allocation size in the cb[]. Set a reserved_tailroom to make it fit into the MTU and use skb_availroom() helper instead. This also allows to get rid of igmp_skb_size(). 
Reported-by: Wei Liu <lw1a2.jing@gmail.com> Fixes: 72e09ad107e7 ("ipv6: avoid high order allocations") Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Cc: Eric Dumazet <edumazet@google.com> Cc: Hannes Frederic Sowa <hannes@stressinduktion.org> Cc: David L Stevens <david.stevens@oracle.com> Acked-by: Eric Dumazet <edumazet@google.com> Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-11-06 03:27:38 +08:00
unsigned int size = mtu;
while (1) {
skb = alloc_skb(size + hlen + tlen,
GFP_ATOMIC | __GFP_NOWARN);
if (skb)
break;
size >>= 1;
if (size < 256)
return NULL;
}
skb->priority = TC_PRIO_CONTROL;
rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0,
0, 0,
IPPROTO_IGMP, 0, dev->ifindex);
if (IS_ERR(rt)) {
kfree_skb(skb);
return NULL;
}
skb_dst_set(skb, &rt->dst);
skb->dev = dev;
skb_reserve(skb, hlen);
mld, igmp: Fix reserved tailroom calculation The current reserved_tailroom calculation fails to take hlen and tlen into account. skb: [__hlen__|__data____________|__tlen___|__extra__] ^ ^ head skb_end_offset In this representation, hlen + data + tlen is the size passed to alloc_skb. "extra" is the extra space made available in __alloc_skb because of rounding up by kmalloc. We can reorder the representation like so: [__hlen__|__data____________|__extra__|__tlen___] ^ ^ head skb_end_offset The maximum space available for ip headers and payload without fragmentation is min(mtu, data + extra). Therefore, reserved_tailroom = data + extra + tlen - min(mtu, data + extra) = skb_end_offset - hlen - min(mtu, skb_end_offset - hlen - tlen) = skb_tailroom - min(mtu, skb_tailroom - tlen) ; after skb_reserve(hlen) Compare the second line to the current expression: reserved_tailroom = skb_end_offset - min(mtu, skb_end_offset) and we can see that hlen and tlen are not taken into account. The min() in the third line can be expanded into: if mtu < skb_tailroom - tlen: reserved_tailroom = skb_tailroom - mtu else: reserved_tailroom = tlen Depending on hlen, tlen, mtu and the number of multicast address records, the current code may output skbs that have less tailroom than dev->needed_tailroom or it may output more skbs than needed because not all space available is used. Fixes: 4c672e4b ("ipv6: mld: fix add_grhead skb_over_panic for devs with large MTUs") Signed-off-by: Benjamin Poirier <bpoirier@suse.com> Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org> Acked-by: Daniel Borkmann <daniel@iogearbox.net> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-03-01 07:03:33 +08:00
skb_tailroom_reserve(skb, mtu, tlen);
skb_reset_network_header(skb);
pip = ip_hdr(skb);
skb_put(skb, sizeof(struct iphdr) + 4);
pip->version = 4;
pip->ihl = (sizeof(struct iphdr)+4)>>2;
pip->tos = 0xc0;
pip->frag_off = htons(IP_DF);
pip->ttl = 1;
pip->daddr = fl4.daddr;
pip->saddr = fl4.saddr;
pip->protocol = IPPROTO_IGMP;
pip->tot_len = 0; /* filled in later */
ip_select_ident(net, skb, NULL);
((u8 *)&pip[1])[0] = IPOPT_RA;
((u8 *)&pip[1])[1] = 4;
((u8 *)&pip[1])[2] = 0;
((u8 *)&pip[1])[3] = 0;
skb->transport_header = skb->network_header + sizeof(struct iphdr) + 4;
skb_put(skb, sizeof(*pig));
pig = igmpv3_report_hdr(skb);
pig->type = IGMPV3_HOST_MEMBERSHIP_REPORT;
pig->resv1 = 0;
pig->csum = 0;
pig->resv2 = 0;
pig->ngrec = 0;
return skb;
}
/*
 * Finish an IGMPv3 report and pass it to ip_local_out().
 *
 * The checksum is computed over the bytes between the transport header and
 * the current tail of @skb and stored in the IGMP header's csum field.
 * Returns the value returned by ip_local_out().
 */
static int igmpv3_sendpack(struct sk_buff *skb)
{
	struct igmphdr *hdr = igmp_hdr(skb);
	const int payload_len = skb_tail_pointer(skb) - skb_transport_header(skb);
	struct net *net = dev_net(skb_dst(skb)->dev);

	hdr->csum = ip_compute_csum(igmp_hdr(skb), payload_len);

	return ip_local_out(net, skb->sk, skb);
}
/*
 * Size in bytes of a group record for @pmc: one struct igmpv3_grec plus
 * four bytes for every source counted by igmp_scount() for the given
 * record @type and deleted-group/deleted-source flags.
 */
static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
{
	int nsrcs = igmp_scount(pmc, type, gdel, sdel);

	return sizeof(struct igmpv3_grec) + 4 * nsrcs;
}
/*
 * Append an empty group record header for @pmc to the report in @skb.
 *
 * When @skb is NULL a new report packet is requested from igmpv3_newpack()
 * using the device MTU.  The new record starts with zero sources and zero
 * auxiliary words, and the report's record counter (ngrec) is incremented.
 *
 * On success *@ppgr points at the new record and the (possibly new) skb is
 * returned.  If no packet could be obtained, NULL is returned and *@ppgr is
 * left untouched.
 */
static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
	int type, struct igmpv3_grec **ppgr)
{
	struct net_device *dev = pmc->interface->dev;
	struct igmpv3_report *report;
	struct igmpv3_grec *rec;

	if (!skb) {
		skb = igmpv3_newpack(dev, dev->mtu);
		if (!skb)
			return NULL;
	}

	rec = skb_put(skb, sizeof(struct igmpv3_grec));
	rec->grec_type = type;
	rec->grec_auxwords = 0;
	rec->grec_nsrcs = 0;
	rec->grec_mca = pmc->multiaddr;

	/* ngrec is kept in network byte order inside the packet */
	report = igmpv3_report_hdr(skb);
	report->ngrec = htons(ntohs(report->ngrec) + 1);

	*ppgr = rec;
	return skb;
}
ipv6: mld: fix add_grhead skb_over_panic for devs with large MTUs It has been reported that generating an MLD listener report on devices with large MTUs (e.g. 9000) and a high number of IPv6 addresses can trigger a skb_over_panic(): skbuff: skb_over_panic: text:ffffffff80612a5d len:3776 put:20 head:ffff88046d751000 data:ffff88046d751010 tail:0xed0 end:0xec0 dev:port1 ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:100! invalid opcode: 0000 [#1] SMP Modules linked in: ixgbe(O) CPU: 3 PID: 0 Comm: swapper/3 Tainted: G O 3.14.23+ #4 [...] Call Trace: <IRQ> [<ffffffff80578226>] ? skb_put+0x3a/0x3b [<ffffffff80612a5d>] ? add_grhead+0x45/0x8e [<ffffffff80612e3a>] ? add_grec+0x394/0x3d4 [<ffffffff80613222>] ? mld_ifc_timer_expire+0x195/0x20d [<ffffffff8061308d>] ? mld_dad_timer_expire+0x45/0x45 [<ffffffff80255b5d>] ? call_timer_fn.isra.29+0x12/0x68 [<ffffffff80255d16>] ? run_timer_softirq+0x163/0x182 [<ffffffff80250e6f>] ? __do_softirq+0xe0/0x21d [<ffffffff8025112b>] ? irq_exit+0x4e/0xd3 [<ffffffff802214bb>] ? smp_apic_timer_interrupt+0x3b/0x46 [<ffffffff8063f10a>] ? apic_timer_interrupt+0x6a/0x70 mld_newpack() skb allocations are usually requested with dev->mtu in size, since commit 72e09ad107e7 ("ipv6: avoid high order allocations") we have changed the limit in order to be less likely to fail. However, in MLD/IGMP code, we have some rather ugly AVAILABLE(skb) macros, which determine if we may end up doing an skb_put() for adding another record. To avoid possible fragmentation, we check the skb's tailroom as skb->dev->mtu - skb->len, which is a wrong assumption as the actual max allocation size can be much smaller. The IGMP case doesn't have this issue as commit 57e1ab6eaddc ("igmp: refine skb allocations") stores the allocation size in the cb[]. Set a reserved_tailroom to make it fit into the MTU and use skb_availroom() helper instead. This also allows to get rid of igmp_skb_size(). 
Reported-by: Wei Liu <lw1a2.jing@gmail.com> Fixes: 72e09ad107e7 ("ipv6: avoid high order allocations") Signed-off-by: Daniel Borkmann <dborkman@redhat.com> Cc: Eric Dumazet <edumazet@google.com> Cc: Hannes Frederic Sowa <hannes@stressinduktion.org> Cc: David L Stevens <david.stevens@oracle.com> Acked-by: Eric Dumazet <edumazet@google.com> Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-11-06 03:27:38 +08:00
/* Room reported by skb_availroom() for @skb, or 0 when @skb is NULL. */
#define AVAILABLE(skb) ((skb) ? skb_availroom(skb) : 0)
static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
int type, int gdeleted, int sdeleted)
{
struct net_device *dev = pmc->interface->dev;
struct net *net = dev_net(dev);
struct igmpv3_report *pih;
struct igmpv3_grec *pgr = NULL;
struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list;
int scount, stotal, first, isquery, truncate;
if (pmc->multiaddr == IGMP_ALL_HOSTS)
return skb;
if (ipv4_is_local_multicast(pmc->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports)
IGMP: Inhibit reports for local multicast groups The range of addresses between 224.0.0.0 and 224.0.0.255 inclusive, is reserved for the use of routing protocols and other low-level topology discovery or maintenance protocols, such as gateway discovery and group membership reporting. Multicast routers should not forward any multicast datagram with destination addresses in this range, regardless of its TTL. Currently, IGMP reports are generated for this reserved range of addresses even though a router will ignore this information since it has no purpose. However, the presence of reserved group addresses in an IGMP membership report uses up network bandwidth and can also obscure addresses of interest when inspecting membership reports using packet inspection or debug messages. Although the RFCs for the various version of IGMP (e.g.RFC 3376 for v3) do not specify that the reserved addresses be excluded from membership reports, it should do no harm in doing so. In particular there should be no adverse effect in any IGMP snooping functionality since 224.0.0.x is specifically excluded as per RFC 4541 (IGMP and MLD Snooping Switches Considerations) section 2.1.2. Data Forwarding Rules: 2) Packets with a destination IP (DIP) address in the 224.0.0.X range which are not IGMP must be forwarded on all ports. IGMP reports for local multicast groups can now be optionally inhibited by means of a system control variable (by setting the value to zero) e.g.: echo 0 > /proc/sys/net/ipv4/igmp_link_local_mcast_reports To retain backwards compatibility the previous behaviour is retained by default on system boot or reverted by setting the value back to non-zero e.g.: echo 1 > /proc/sys/net/ipv4/igmp_link_local_mcast_reports Signed-off-by: Philip Downey <pdowney@brocade.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-08-27 23:46:26 +08:00
return skb;
isquery = type == IGMPV3_MODE_IS_INCLUDE ||
type == IGMPV3_MODE_IS_EXCLUDE;
truncate = type == IGMPV3_MODE_IS_EXCLUDE ||
type == IGMPV3_CHANGE_TO_EXCLUDE;
stotal = scount = 0;
psf_list = sdeleted ? &pmc->tomb : &pmc->sources;
if (!*psf_list)
goto empty_source;
pih = skb ? igmpv3_report_hdr(skb) : NULL;
/* EX and TO_EX get a fresh packet, if needed */
if (truncate) {
if (pih && pih->ngrec &&
AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) {
if (skb)
igmpv3_sendpack(skb);
skb = igmpv3_newpack(dev, dev->mtu);
}
}
first = 1;
psf_prev = NULL;
for (psf = *psf_list; psf; psf = psf_next) {
__be32 *psrc;
psf_next = psf->sf_next;
if (!is_in(pmc, psf, type, gdeleted, sdeleted)) {
psf_prev = psf;
continue;
}
/* Based on RFC3376 5.1. Should not send source-list change
* records when there is a filter mode change.
*/
if (((gdeleted && pmc->sfmode == MCAST_EXCLUDE) ||
(!gdeleted && pmc->crcount)) &&
(type == IGMPV3_ALLOW_NEW_SOURCES ||
type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount)
goto decrease_sf_crcount;
/* clear marks on query responses */
if (isquery)
psf->sf_gsresp = 0;
if (AVAILABLE(skb) < sizeof(__be32) +
first*sizeof(struct igmpv3_grec)) {
if (truncate && !first)
break; /* truncate these */
if (pgr)
pgr->grec_nsrcs = htons(scount);
if (skb)
igmpv3_sendpack(skb);
skb = igmpv3_newpack(dev, dev->mtu);
first = 1;
scount = 0;
}
if (first) {
skb = add_grhead(skb, pmc, type, &pgr);
first = 0;
}
if (!skb)
return NULL;
psrc = skb_put(skb, sizeof(__be32));
*psrc = psf->sf_inaddr;
scount++; stotal++;
if ((type == IGMPV3_ALLOW_NEW_SOURCES ||
type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) {
decrease_sf_crcount:
psf->sf_crcount--;
if ((sdeleted || gdeleted) && psf->sf_crcount == 0) {
if (psf_prev)
psf_prev->sf_next = psf->sf_next;
else
*psf_list = psf->sf_next;
kfree(psf);
continue;
}
}
psf_prev = psf;
}
empty_source:
if (!stotal) {
if (type == IGMPV3_ALLOW_NEW_SOURCES ||
type == IGMPV3_BLOCK_OLD_SOURCES)
return skb;
if (pmc->crcount || isquery) {
/* make sure we have room for group header */
if (skb && AVAILABLE(skb) < sizeof(struct igmpv3_grec)) {
igmpv3_sendpack(skb);
skb = NULL; /* add_grhead will get a new one */
}
skb = add_grhead(skb, pmc, type, &pgr);
}
}
if (pgr)
pgr->grec_nsrcs = htons(scount);
if (isquery)
pmc->gsquery = 0; /* clear query state on report */
return skb;
}
static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
{
struct sk_buff *skb = NULL;
struct net *net = dev_net(in_dev->dev);
int type;
if (!pmc) {
rcu_read_lock();
for_each_pmc_rcu(in_dev, pmc) {
if (pmc->multiaddr == IGMP_ALL_HOSTS)
continue;
IGMP: Inhibit reports for local multicast groups The range of addresses between 224.0.0.0 and 224.0.0.255 inclusive, is reserved for the use of routing protocols and other low-level topology discovery or maintenance protocols, such as gateway discovery and group membership reporting. Multicast routers should not forward any multicast datagram with destination addresses in this range, regardless of its TTL. Currently, IGMP reports are generated for this reserved range of addresses even though a router will ignore this information since it has no purpose. However, the presence of reserved group addresses in an IGMP membership report uses up network bandwidth and can also obscure addresses of interest when inspecting membership reports using packet inspection or debug messages. Although the RFCs for the various version of IGMP (e.g.RFC 3376 for v3) do not specify that the reserved addresses be excluded from membership reports, it should do no harm in doing so. In particular there should be no adverse effect in any IGMP snooping functionality since 224.0.0.x is specifically excluded as per RFC 4541 (IGMP and MLD Snooping Switches Considerations) section 2.1.2. Data Forwarding Rules: 2) Packets with a destination IP (DIP) address in the 224.0.0.X range which are not IGMP must be forwarded on all ports. IGMP reports for local multicast groups can now be optionally inhibited by means of a system control variable (by setting the value to zero) e.g.: echo 0 > /proc/sys/net/ipv4/igmp_link_local_mcast_reports To retain backwards compatibility the previous behaviour is retained by default on system boot or reverted by setting the value back to non-zero e.g.: echo 1 > /proc/sys/net/ipv4/igmp_link_local_mcast_reports Signed-off-by: Philip Downey <pdowney@brocade.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-08-27 23:46:26 +08:00
if (ipv4_is_local_multicast(pmc->multiaddr) &&
!net->ipv4.sysctl_igmp_llm_reports)
IGMP: Inhibit reports for local multicast groups The range of addresses between 224.0.0.0 and 224.0.0.255 inclusive, is reserved for the use of routing protocols and other low-level topology discovery or maintenance protocols, such as gateway discovery and group membership reporting. Multicast routers should not forward any multicast datagram with destination addresses in this range, regardless of its TTL. Currently, IGMP reports are generated for this reserved range of addresses even though a router will ignore this information since it has no purpose. However, the presence of reserved group addresses in an IGMP membership report uses up network bandwidth and can also obscure addresses of interest when inspecting membership reports using packet inspection or debug messages. Although the RFCs for the various version of IGMP (e.g.RFC 3376 for v3) do not specify that the reserved addresses be excluded from membership reports, it should do no harm in doing so. In particular there should be no adverse effect in any IGMP snooping functionality since 224.0.0.x is specifically excluded as per RFC 4541 (IGMP and MLD Snooping Switches Considerations) section 2.1.2. Data Forwarding Rules: 2) Packets with a destination IP (DIP) address in the 224.0.0.X range which are not IGMP must be forwarded on all ports. IGMP reports for local multicast groups can now be optionally inhibited by means of a system control variable (by setting the value to zero) e.g.: echo 0 > /proc/sys/net/ipv4/igmp_link_local_mcast_reports To retain backwards compatibility the previous behaviour is retained by default on system boot or reverted by setting the value back to non-zero e.g.: echo 1 > /proc/sys/net/ipv4/igmp_link_local_mcast_reports Signed-off-by: Philip Downey <pdowney@brocade.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-08-27 23:46:26 +08:00
continue;
spin_lock_bh(&pmc->lock);
if (pmc->sfcount[MCAST_EXCLUDE])
type = IGMPV3_MODE_IS_EXCLUDE;
else
type = IGMPV3_MODE_IS_INCLUDE;
skb = add_grec(skb, pmc, type, 0, 0);
spin_unlock_bh(&pmc->lock);
}
rcu_read_unlock();
} else {
spin_lock_bh(&pmc->lock);
if (pmc->sfcount[MCAST_EXCLUDE])
type = IGMPV3_MODE_IS_EXCLUDE;
else
type = IGMPV3_MODE_IS_INCLUDE;
skb = add_grec(skb, pmc, type, 0, 0);
spin_unlock_bh(&pmc->lock);
}
if (!skb)
return 0;
return igmpv3_sendpack(skb);
}
/*
 * Remove and kfree() every entry of the source filter list *@ppsf whose
 * sf_crcount is zero; entries with a non-zero count stay linked in order.
 */
static void igmpv3_clear_zeros(struct ip_sf_list **ppsf)
{
	struct ip_sf_list **link = ppsf;	/* slot that points at 'cur' */
	struct ip_sf_list *cur;

	while ((cur = *link) != NULL) {
		if (cur->sf_crcount == 0) {
			*link = cur->sf_next;
			kfree(cur);
		} else {
			link = &cur->sf_next;
		}
	}
}
/*
 * Build and send the IGMPv3 change report for @in_dev.
 *
 * Pass 1 (under mc_tomb_lock) walks the list of deleted groups
 * (in_dev->mc_tomb), adds their records, decrements their crcount and frees
 * entries that have no pending reports and no remaining sources.
 *
 * Pass 2 walks the device's current groups under each group's lock and adds
 * source-list change records plus, while crcount is non-zero, a filter-mode
 * change record.
 *
 * The whole function runs inside rcu_read_lock(); the last partially filled
 * packet, if any, is sent at the end.
 */
static void igmpv3_send_cr(struct in_device *in_dev)
{
	struct ip_mc_list *pmc, *prev, *next;
	struct sk_buff *skb = NULL;
	int type, dtype;

	rcu_read_lock();
	spin_lock_bh(&in_dev->mc_tomb_lock);

	/* deleted MCA's */
	prev = NULL;
	for (pmc = in_dev->mc_tomb; pmc; pmc = next) {
		next = pmc->next;

		if (pmc->sfmode == MCAST_INCLUDE) {
			skb = add_grec(skb, pmc, IGMPV3_BLOCK_OLD_SOURCES, 1, 0);
			skb = add_grec(skb, pmc, IGMPV3_BLOCK_OLD_SOURCES, 1, 1);
		}

		if (pmc->crcount) {
			if (pmc->sfmode == MCAST_EXCLUDE)
				skb = add_grec(skb, pmc,
					       IGMPV3_CHANGE_TO_INCLUDE, 1, 0);
			pmc->crcount--;
			if (pmc->crcount == 0) {
				igmpv3_clear_zeros(&pmc->tomb);
				igmpv3_clear_zeros(&pmc->sources);
			}
		}

		/* keep the entry while anything is still pending on it */
		if (pmc->crcount || pmc->tomb || pmc->sources) {
			prev = pmc;
			continue;
		}

		if (prev)
			prev->next = next;
		else
			in_dev->mc_tomb = next;
		in_dev_put(pmc->interface);
		kfree(pmc);
	}
	spin_unlock_bh(&in_dev->mc_tomb_lock);

	/* change recs */
	for_each_pmc_rcu(in_dev, pmc) {
		bool exclude;

		spin_lock_bh(&pmc->lock);

		exclude = pmc->sfcount[MCAST_EXCLUDE] != 0;
		type = exclude ? IGMPV3_BLOCK_OLD_SOURCES
			       : IGMPV3_ALLOW_NEW_SOURCES;
		dtype = exclude ? IGMPV3_ALLOW_NEW_SOURCES
				: IGMPV3_BLOCK_OLD_SOURCES;

		skb = add_grec(skb, pmc, type, 0, 0);
		skb = add_grec(skb, pmc, dtype, 0, 1);	/* deleted sources */

		/* filter mode changes */
		if (pmc->crcount) {
			type = (pmc->sfmode == MCAST_EXCLUDE) ?
				IGMPV3_CHANGE_TO_EXCLUDE :
				IGMPV3_CHANGE_TO_INCLUDE;
			skb = add_grec(skb, pmc, type, 0, 0);
			pmc->crcount--;
		}

		spin_unlock_bh(&pmc->lock);
	}
	rcu_read_unlock();

	if (skb)
		(void) igmpv3_sendpack(skb);
}
static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
int type)
{
struct sk_buff *skb;
struct iphdr *iph;
struct igmphdr *ih;
struct rtable *rt;
struct net_device *dev = in_dev->dev;
netns: Fix crash by making igmp per namespace This patch makes the multicast socket to be per namespace. When a network namespace is created, other than the init_net and a multicast packet is received, the kernel goes to a hang or a kernel panic. How to reproduce ? * create a child network namespace * create a pair virtual device veth * ip link add type veth * move one side to the pair network device to the child namespace * ip link set netns <childpid> dev veth1 * ping -I veth0 224.0.0.1 The bug appears because the function ip_mc_init_dev does not initialize the different multicast fields as it exits because it is not the init_net. BUG: soft lockup - CPU#0 stuck for 61s! [avahi-daemon:2695] Modules linked in: irq event stamp: 50350 hardirqs last enabled at (50349): [<c03ee949>] _spin_unlock_irqrestore+0x34/0x39 hardirqs last disabled at (50350): [<c03ec639>] schedule+0x9f/0x5ff softirqs last enabled at (45712): [<c0374d4b>] ip_setsockopt+0x8e7/0x909 softirqs last disabled at (45710): [<c03ee682>] _spin_lock_bh+0x8/0x27 Pid: 2695, comm: avahi-daemon Not tainted (2.6.27-rc2-00029-g0872073 #3) EIP: 0060:[<c03ee47c>] EFLAGS: 00000297 CPU: 0 EIP is at __read_lock_failed+0x8/0x10 EAX: c4f38810 EBX: c4f38810 ECX: 00000000 EDX: c04cc22e ESI: fb0000e0 EDI: 00000011 EBP: 0f02000a ESP: c4e3faa0 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 CR0: 8005003b CR2: 44618a40 CR3: 04e37000 CR4: 000006d0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 [<c02311f8>] ? _raw_read_lock+0x23/0x25 [<c0390666>] ? ip_check_mc+0x1c/0x83 [<c036d478>] ? ip_route_input+0x229/0xe92 [<c022e2e4>] ? trace_hardirqs_on_thunk+0xc/0x10 [<c0104c9c>] ? do_IRQ+0x69/0x7d [<c0102e64>] ? restore_nocheck_notrace+0x0/0xe [<c036fdba>] ? ip_rcv+0x227/0x505 [<c0358764>] ? netif_receive_skb+0xfe/0x2b3 [<c03588d2>] ? netif_receive_skb+0x26c/0x2b3 [<c035af31>] ? process_backlog+0x73/0xbd [<c035a8cd>] ? net_rx_action+0xc1/0x1ae [<c01218a8>] ? __do_softirq+0x7b/0xef [<c0121953>] ? 
do_softirq+0x37/0x4d [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c0122037>] ? local_bh_enable+0x96/0xab [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c012181e>] ? _local_bh_enable+0x79/0x88 [<c035fcb8>] ? neigh_resolve_output+0x20f/0x239 [<c0373118>] ? ip_finish_output+0x1df/0x209 [<c0373364>] ? ip_dev_loopback_xmit+0x62/0x66 [<c0371db5>] ? ip_local_out+0x15/0x17 [<c0372013>] ? ip_push_pending_frames+0x25c/0x2bb [<c03891b8>] ? udp_push_pending_frames+0x2bb/0x30e [<c038a189>] ? udp_sendmsg+0x413/0x51d [<c038a1a9>] ? udp_sendmsg+0x433/0x51d [<c038f927>] ? inet_sendmsg+0x35/0x3f [<c034f092>] ? sock_sendmsg+0xb8/0xd1 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c022e6de>] ? copy_from_user+0x32/0x5e [<c022e6de>] ? copy_from_user+0x32/0x5e [<c034f238>] ? sys_sendmsg+0x18d/0x1f0 [<c0175e90>] ? pipe_write+0x3cb/0x3d7 [<c0170347>] ? do_sync_write+0xbe/0x105 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c03503b2>] ? sys_socketcall+0x176/0x1b0 [<c01085ea>] ? syscall_trace_enter+0x6c/0x7b [<c0102e1a>] ? syscall_call+0x7/0xb Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-08-14 07:15:57 +08:00
struct net *net = dev_net(dev);
__be32 group = pmc ? pmc->multiaddr : 0;
struct flowi4 fl4;
__be32 dst;
int hlen, tlen;
if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)
return igmpv3_send_report(in_dev, pmc);
IGMP: Inhibit reports for local multicast groups The range of addresses between 224.0.0.0 and 224.0.0.255 inclusive, is reserved for the use of routing protocols and other low-level topology discovery or maintenance protocols, such as gateway discovery and group membership reporting. Multicast routers should not forward any multicast datagram with destination addresses in this range, regardless of its TTL. Currently, IGMP reports are generated for this reserved range of addresses even though a router will ignore this information since it has no purpose. However, the presence of reserved group addresses in an IGMP membership report uses up network bandwidth and can also obscure addresses of interest when inspecting membership reports using packet inspection or debug messages. Although the RFCs for the various version of IGMP (e.g.RFC 3376 for v3) do not specify that the reserved addresses be excluded from membership reports, it should do no harm in doing so. In particular there should be no adverse effect in any IGMP snooping functionality since 224.0.0.x is specifically excluded as per RFC 4541 (IGMP and MLD Snooping Switches Considerations) section 2.1.2. Data Forwarding Rules: 2) Packets with a destination IP (DIP) address in the 224.0.0.X range which are not IGMP must be forwarded on all ports. IGMP reports for local multicast groups can now be optionally inhibited by means of a system control variable (by setting the value to zero) e.g.: echo 0 > /proc/sys/net/ipv4/igmp_link_local_mcast_reports To retain backwards compatibility the previous behaviour is retained by default on system boot or reverted by setting the value back to non-zero e.g.: echo 1 > /proc/sys/net/ipv4/igmp_link_local_mcast_reports Signed-off-by: Philip Downey <pdowney@brocade.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-08-27 23:46:26 +08:00
if (ipv4_is_local_multicast(group) && !net->ipv4.sysctl_igmp_llm_reports)
IGMP: Inhibit reports for local multicast groups The range of addresses between 224.0.0.0 and 224.0.0.255 inclusive, is reserved for the use of routing protocols and other low-level topology discovery or maintenance protocols, such as gateway discovery and group membership reporting. Multicast routers should not forward any multicast datagram with destination addresses in this range, regardless of its TTL. Currently, IGMP reports are generated for this reserved range of addresses even though a router will ignore this information since it has no purpose. However, the presence of reserved group addresses in an IGMP membership report uses up network bandwidth and can also obscure addresses of interest when inspecting membership reports using packet inspection or debug messages. Although the RFCs for the various version of IGMP (e.g.RFC 3376 for v3) do not specify that the reserved addresses be excluded from membership reports, it should do no harm in doing so. In particular there should be no adverse effect in any IGMP snooping functionality since 224.0.0.x is specifically excluded as per RFC 4541 (IGMP and MLD Snooping Switches Considerations) section 2.1.2. Data Forwarding Rules: 2) Packets with a destination IP (DIP) address in the 224.0.0.X range which are not IGMP must be forwarded on all ports. IGMP reports for local multicast groups can now be optionally inhibited by means of a system control variable (by setting the value to zero) e.g.: echo 0 > /proc/sys/net/ipv4/igmp_link_local_mcast_reports To retain backwards compatibility the previous behaviour is retained by default on system boot or reverted by setting the value back to non-zero e.g.: echo 1 > /proc/sys/net/ipv4/igmp_link_local_mcast_reports Signed-off-by: Philip Downey <pdowney@brocade.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-08-27 23:46:26 +08:00
return 0;
if (type == IGMP_HOST_LEAVE_MESSAGE)
dst = IGMP_ALL_ROUTER;
else
dst = group;
rt = ip_route_output_ports(net, &fl4, NULL, dst, 0,
0, 0,
IPPROTO_IGMP, 0, dev->ifindex);
if (IS_ERR(rt))
return -1;
hlen = LL_RESERVED_SPACE(dev);
tlen = dev->needed_tailroom;
skb = alloc_skb(IGMP_SIZE + hlen + tlen, GFP_ATOMIC);
if (!skb) {
ip_rt_put(rt);
return -1;
}
skb->priority = TC_PRIO_CONTROL;
skb_dst_set(skb, &rt->dst);
skb_reserve(skb, hlen);
skb_reset_network_header(skb);
iph = ip_hdr(skb);
skb_put(skb, sizeof(struct iphdr) + 4);
iph->version = 4;
iph->ihl = (sizeof(struct iphdr)+4)>>2;
iph->tos = 0xc0;
iph->frag_off = htons(IP_DF);
iph->ttl = 1;
iph->daddr = dst;
iph->saddr = fl4.saddr;
iph->protocol = IPPROTO_IGMP;
ip_select_ident(net, skb, NULL);
((u8 *)&iph[1])[0] = IPOPT_RA;
((u8 *)&iph[1])[1] = 4;
((u8 *)&iph[1])[2] = 0;
((u8 *)&iph[1])[3] = 0;
ih = skb_put(skb, sizeof(struct igmphdr));
ih->type = type;
ih->code = 0;
ih->csum = 0;
ih->group = group;
ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr));
return ip_local_out(net, skb->sk, skb);
}
/*
 * Expiry handler; @data is the struct in_device pointer cast to an integer.
 * Clears mr_gq_running and sends an IGMPv3 report with pmc == NULL, i.e.
 * covering the device's groups rather than a single one.
 */
static void igmp_gq_timer_expire(unsigned long data)
{
	struct in_device *idev = (struct in_device *)data;

	idev->mr_gq_running = 0;
	igmpv3_send_report(idev, NULL);

	/* NOTE(review): presumably pairs with a hold taken when the timer was armed */
	in_dev_put(idev);
}
/*
 * Expiry handler; @data is the struct in_device pointer cast to an integer.
 * Sends the pending change report and, while mr_ifc_count is non-zero,
 * decrements it and re-arms the timer with the unsolicited report interval.
 */
static void igmp_ifc_timer_expire(unsigned long data)
{
	struct in_device *idev = (struct in_device *)data;

	igmpv3_send_cr(idev);

	if (idev->mr_ifc_count) {
		idev->mr_ifc_count--;
		igmp_ifc_start_timer(idev, unsolicited_report_interval(idev));
	}

	/* NOTE(review): presumably pairs with a hold taken when the timer was armed */
	in_dev_put(idev);
}
/*
 * Schedule change reports for @in_dev.
 *
 * Does nothing while IGMP_V1_SEEN() or IGMP_V2_SEEN() holds for the device.
 * Otherwise mr_ifc_count is loaded with the device's mr_qrv, falling back to
 * the igmp_qrv sysctl when mr_qrv is zero, and the interface-change timer is
 * started with a delay argument of 1.
 */
static void igmp_ifc_event(struct in_device *in_dev)
{
	struct net *net = dev_net(in_dev->dev);
	int qrv;

	if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
		return;

	qrv = in_dev->mr_qrv;
	if (!qrv)
		qrv = net->ipv4.sysctl_igmp_qrv;

	in_dev->mr_ifc_count = qrv;
	igmp_ifc_start_timer(in_dev, 1);
}
/*
 * Expiry handler for a group's report timer; @data is the struct ip_mc_list
 * pointer cast to an integer.
 *
 * Under the group's lock: clears tm_running, re-arms the timer while
 * unsolicit_count is non-zero (decrementing it), and sets the reporter flag.
 * After dropping the lock a report is sent, its version chosen from
 * IGMP_V1_SEEN() / IGMP_V2_SEEN(), defaulting to IGMPv3.
 */
static void igmp_timer_expire(unsigned long data)
{
	struct ip_mc_list *im = (struct ip_mc_list *)data;
	struct in_device *in_dev = im->interface;
	int type;

	spin_lock(&im->lock);
	im->tm_running = 0;

	if (im->unsolicit_count) {
		im->unsolicit_count--;
		igmp_start_timer(im, unsolicited_report_interval(in_dev));
	}
	im->reporter = 1;
	spin_unlock(&im->lock);

	if (IGMP_V1_SEEN(in_dev))
		type = IGMP_HOST_MEMBERSHIP_REPORT;
	else if (IGMP_V2_SEEN(in_dev))
		type = IGMPV2_HOST_MEMBERSHIP_REPORT;
	else
		type = IGMPV3_HOST_MEMBERSHIP_REPORT;
	igmp_send_report(in_dev, im, type);

	/* NOTE(review): presumably pairs with a reference taken when the timer was armed */
	ip_ma_put(im);
}
/*
 * EXCLUDE-mode counterpart of igmp_marksources().
 *
 * Counts how many entries of the group's source list are active exclude
 * filters (no INCLUDE count, and an EXCLUDE count equal to the group's) and
 * also appear in @srcs[0..@nsrcs).  The walk stops once @nsrcs matches have
 * been found.  pmc->gsquery is always cleared.
 *
 * Returns 0 when the count reached @nsrcs (all queried sources excluded),
 * 1 otherwise.
 */
static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
{
	struct ip_sf_list *psf;
	int matched = 0;
	int i;

	for (psf = pmc->sources; psf && matched != nsrcs; psf = psf->sf_next) {
		/* skip inactive filters */
		if (psf->sf_count[MCAST_INCLUDE] ||
		    pmc->sfcount[MCAST_EXCLUDE] != psf->sf_count[MCAST_EXCLUDE])
			continue;

		for (i = 0; i < nsrcs; i++) {
			if (srcs[i] == psf->sf_inaddr) {
				matched++;
				break;
			}
		}
	}

	pmc->gsquery = 0;

	/* matched == nsrcs: all sources excluded */
	return matched != nsrcs;
}
/*
 * Mark the sources of @pmc that are listed in @srcs[0..@nsrcs).
 *
 * Groups in EXCLUDE mode are handed to igmp_xmarksources().  For INCLUDE
 * mode every source list entry found in @srcs gets sf_gsresp set; the walk
 * stops once @nsrcs entries have been marked.
 *
 * pmc->gsquery is set to 1 if at least one source was marked, else 0, and
 * the same value is returned.
 */
static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
{
	struct ip_sf_list *psf;
	int marked = 0;
	int any;
	int i;

	if (pmc->sfmode == MCAST_EXCLUDE)
		return igmp_xmarksources(pmc, nsrcs, srcs);

	/* mark INCLUDE-mode sources */
	for (psf = pmc->sources; psf && marked != nsrcs; psf = psf->sf_next) {
		for (i = 0; i < nsrcs; i++) {
			if (srcs[i] != psf->sf_inaddr)
				continue;
			psf->sf_gsresp = 1;
			marked++;
			break;
		}
	}

	any = marked ? 1 : 0;
	pmc->gsquery = any;
	return any;
}
/*
 * Handle a membership report heard for @group on @in_dev: the report timer
 * of the matching group entry, if there is one, is stopped.
 *
 * IGMP_ALL_HOSTS is ignored, as are link-local multicast groups while the
 * igmp_llm_reports sysctl is zero.
 *
 * return true if packet was dropped (this function always returns false)
 */
static bool igmp_heard_report(struct in_device *in_dev, __be32 group)
{
	struct net *net = dev_net(in_dev->dev);
	struct ip_mc_list *im;

	/* Timers are only set for non-local groups */
	if (group == IGMP_ALL_HOSTS)
		return false;
	if (ipv4_is_local_multicast(group) &&
	    !net->ipv4.sysctl_igmp_llm_reports)
		return false;

	rcu_read_lock();
	for_each_pmc_rcu(in_dev, im) {
		if (im->multiaddr != group)
			continue;
		igmp_stop_timer(im);
		break;
	}
	rcu_read_unlock();

	return false;
}
/*
 * igmp_heard_query - react to a membership query received on @in_dev
 * @in_dev: interface the query arrived on
 * @skb:    the query packet
 * @len:    IGMP message length as seen by igmp_rcv()
 *
 * The query flavour is derived from @len and the per-interface
 * "older querier seen" state:
 *   - len == 8            IGMPv1 (code == 0) or IGMPv2 query
 *   - 8 < len < 12        bogus, dropped
 *   - len >= 12           IGMPv3 query, handled in v1/v2 compatibility
 *                         mode while IGMP_V1_SEEN()/IGMP_V2_SEEN() hold
 *
 * A v3 general query only arms the interface-wide general query timer.
 * In every other case the per-group timers of the matching groups are
 * (re)started with the maximum delay taken from the query.
 *
 * return true if packet was dropped
 */
static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
	int len)
{
	struct igmphdr		*ih = igmp_hdr(skb);
	struct igmpv3_query	*ih3 = igmpv3_query_hdr(skb);
	struct ip_mc_list	*im;
	__be32			group = ih->group;
	int			max_delay;
	int			mark = 0;
	struct net		*net = dev_net(in_dev->dev);

	if (len == 8) {
		if (ih->code == 0) {
			/* Alas, old v1 router presents here. */

			max_delay = IGMP_QUERY_RESPONSE_INTERVAL;
			in_dev->mr_v1_seen = jiffies +
				IGMP_V1_ROUTER_PRESENT_TIMEOUT;
			group = 0;
		} else {
			/* v2 router present */
			max_delay = ih->code*(HZ/IGMP_TIMER_SCALE);
			in_dev->mr_v2_seen = jiffies +
				IGMP_V2_ROUTER_PRESENT_TIMEOUT;
		}
		/* cancel the interface change timer */
		in_dev->mr_ifc_count = 0;
		if (del_timer(&in_dev->mr_ifc_timer))
			__in_dev_put(in_dev);
		/* clear deleted report items */
		igmpv3_clear_delrec(in_dev);
	} else if (len < 12) {
		return true;	/* ignore bogus packet; freed by caller */
	} else if (IGMP_V1_SEEN(in_dev)) {
		/* This is a v3 query with v1 queriers present */
		max_delay = IGMP_QUERY_RESPONSE_INTERVAL;
		group = 0;
	} else if (IGMP_V2_SEEN(in_dev)) {
		/* this is a v3 query with v2 queriers present;
		 * Interpretation of the max_delay code is problematic here.
		 * A real v2 host would use ih_code directly, while v3 has a
		 * different encoding. We use the v3 encoding as more likely
		 * to be intended in a v3 query.
		 */
		max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
		if (!max_delay)
			max_delay = 1;	/* can't mod w/ 0 */
	} else { /* v3 */
		if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
			return true;

		/* pskb_may_pull() may have moved the data: reload */
		ih3 = igmpv3_query_hdr(skb);
		if (ih3->nsrcs) {
			if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)
					   + ntohs(ih3->nsrcs)*sizeof(__be32)))
				return true;
			ih3 = igmpv3_query_hdr(skb);
		}

		max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
		if (!max_delay)
			max_delay = 1;	/* can't mod w/ 0 */
		in_dev->mr_maxdelay = max_delay;

		/* RFC3376, 4.1.6. QRV: when the most recently received
		 * value was zero, fall back to the default or statically
		 * configured value instead of keeping a stale one.
		 */
		in_dev->mr_qrv = ih3->qrv ?: net->ipv4.sysctl_igmp_qrv;

		if (!group) { /* general query */
			if (ih3->nsrcs)
				return true;	/* no sources allowed */
			igmp_gq_start_timer(in_dev);
			return false;
		}
		/* mark sources to include, if group & source-specific */
		mark = ih3->nsrcs != 0;
	}

	/*
	 * - Start the timers in all of our membership records
	 *   that the query applies to for the interface on
	 *   which the query arrived excl. those that belong
	 *   to a "local" group (224.0.0.X)
	 * - For timers already running check if they need to
	 *   be reset.
	 * - Use the igmp->igmp_code field as the maximum
	 *   delay possible
	 */
	rcu_read_lock();
	for_each_pmc_rcu(in_dev, im) {
		int changed;

		if (group && group != im->multiaddr)
			continue;
		if (im->multiaddr == IGMP_ALL_HOSTS)
			continue;
		if (ipv4_is_local_multicast(im->multiaddr) &&
		    !net->ipv4.sysctl_igmp_llm_reports)
			continue;

		spin_lock_bh(&im->lock);
		if (im->tm_running)
			im->gsquery = im->gsquery && mark;
		else
			im->gsquery = mark;
		/* ih3 is only dereferenced when gsquery is set, which
		 * requires mark != 0, i.e. the v3 branch above was taken.
		 */
		changed = !im->gsquery ||
			igmp_marksources(im, ntohs(ih3->nsrcs), ih3->srcs);
		spin_unlock_bh(&im->lock);
		if (changed)
			igmp_mod_timer(im, max_delay);
	}
	rcu_read_unlock();
	return false;
}
/*
 * igmp_rcv - entry point for received IGMP packets
 *
 * called in rcu_read_lock() section
 *
 * Queries and v1/v2 reports are dispatched to their handlers; every
 * other message type is accepted and ignored. The skb is released here
 * (kfree_skb() when counted as dropped, consume_skb() otherwise), except
 * when it is handed over to pim_rcv_v1(). Always returns 0 on the paths
 * that free the skb.
 */
int igmp_rcv(struct sk_buff *skb)
{
	/* This basically follows the spec line by line -- see RFC1112 */
	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
	bool dropped = true;
	int len = skb->len;
	struct igmphdr *ih;

	if (!in_dev ||
	    !pskb_may_pull(skb, sizeof(struct igmphdr)) ||
	    skb_checksum_simple_validate(skb))
		goto drop;

	ih = igmp_hdr(skb);
	switch (ih->type) {
	case IGMP_HOST_MEMBERSHIP_QUERY:
		dropped = igmp_heard_query(in_dev, skb, len);
		break;
	case IGMP_HOST_MEMBERSHIP_REPORT:
	case IGMPV2_HOST_MEMBERSHIP_REPORT:
		/* Skip our own report looped back, and don't rely on
		 * MC router hearing unicast reports.
		 */
		if (!rt_is_output_route(skb_rtable(skb)) &&
		    (skb->pkt_type == PACKET_MULTICAST ||
		     skb->pkt_type == PACKET_BROADCAST))
			dropped = igmp_heard_report(in_dev, ih->group);
		break;
	case IGMP_PIM:
#ifdef CONFIG_IP_PIMSM_V1
		return pim_rcv_v1(skb);
#endif
		/* without PIMv1 support: ignored like the types below */
	case IGMPV3_HOST_MEMBERSHIP_REPORT:
	case IGMP_DVMRP:
	case IGMP_TRACE:
	case IGMP_HOST_LEAVE_MESSAGE:
	case IGMP_MTRACE:
	case IGMP_MTRACE_RESP:
	default:
		break;
	}

drop:
	if (dropped)
		kfree_skb(skb);
	else
		consume_skb(skb);
	return 0;
}
#endif
/*
 *	Add a filter to a device
 *
 *	Asks arp_mc_map() to fill in a link-layer address for @addr on the
 *	device behind @in_dev and, when that returns 0, registers the
 *	result with dev_mc_add().
 */
static void ip_mc_filter_add(struct in_device *in_dev, __be32 addr)
{
	struct net_device *dev = in_dev->dev;
	char hwaddr[MAX_ADDR_LEN];

	/* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG.
	   We will get multicast token leakage, when IFF_MULTICAST
	   is changed. This check should be done in ndo_set_rx_mode
	   routine. Something sort of:
	   if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; }
	   --ANK
	   */
	if (arp_mc_map(addr, hwaddr, dev, 0) != 0)
		return;

	dev_mc_add(dev, hwaddr);
}
/*
 *	Remove a filter from a device
 *
 *	Counterpart of ip_mc_filter_add(): the address obtained from
 *	arp_mc_map() is handed to dev_mc_del() when the mapping call
 *	returns 0.
 */
static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr)
{
	struct net_device *dev = in_dev->dev;
	char hwaddr[MAX_ADDR_LEN];

	if (arp_mc_map(addr, hwaddr, dev, 0) != 0)
		return;

	dev_mc_del(dev, hwaddr);
}
#ifdef CONFIG_IP_MULTICAST
/*
 * deleted ip_mc_list manipulation
 *
 * igmpv3_add_delrec - remember a dropped group on the per-device tomb list
 * @in_dev: interface the group was joined on
 * @im:     membership record being dropped
 *
 * Allocates a shadow record for @im, copies the fields needed for change
 * reports and pushes it onto in_dev->mc_tomb. For an INCLUDE mode group
 * the source and tomb lists are moved (not copied) from @im to the shadow
 * record. On allocation failure nothing is recorded. A reference on
 * @in_dev is taken for the shadow record.
 */
static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
{
	struct ip_mc_list *pmc;
	struct net *net = dev_net(in_dev->dev);

	/* this is an "ip_mc_list" for convenience; only the fields below
	 * are actually used. In particular, the refcnt and users are not
	 * used for management of the delete list. Using the same structure
	 * for deleted items allows change reports to use common code with
	 * non-deleted or query-response MCA's.
	 */
	pmc = kzalloc(sizeof(*pmc), GFP_KERNEL);
	if (!pmc)
		return;
	/* kzalloc() does not produce a usable spinlock: set it up explicitly */
	spin_lock_init(&pmc->lock);

	spin_lock_bh(&im->lock);
	pmc->interface = im->interface;
	in_dev_hold(in_dev);
	pmc->multiaddr = im->multiaddr;
	pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
	pmc->sfmode = im->sfmode;
	if (pmc->sfmode == MCAST_INCLUDE) {
		struct ip_sf_list *psf;

		/* hand the source lists over to the shadow record */
		pmc->tomb = im->tomb;
		pmc->sources = im->sources;
		im->tomb = im->sources = NULL;
		for (psf = pmc->sources; psf; psf = psf->sf_next)
			psf->sf_crcount = pmc->crcount;
	}
	spin_unlock_bh(&im->lock);

	spin_lock_bh(&in_dev->mc_tomb_lock);
	pmc->next = in_dev->mc_tomb;
	in_dev->mc_tomb = pmc;
	spin_unlock_bh(&in_dev->mc_tomb_lock);
}
/*
 * restore ip_mc_list deleted records
 *
 * Looks for a tomb-list entry with the same group address as @im. If one
 * exists it is unlinked from in_dev->mc_tomb, its state (interface, filter
 * mode and, for INCLUDE mode, the source and tomb lists) is copied back
 * into @im, and the entry is freed together with the device reference it
 * held.
 */
static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im)
{
	struct net *net = dev_net(in_dev->dev);
	__be32 multiaddr = im->multiaddr;
	struct ip_mc_list **link;
	struct ip_mc_list *pmc;
	struct ip_sf_list *psf;

	/* find and unlink the matching shadow record, if any */
	spin_lock_bh(&in_dev->mc_tomb_lock);
	for (link = &in_dev->mc_tomb; (pmc = *link) != NULL;
	     link = &pmc->next) {
		if (pmc->multiaddr == multiaddr) {
			*link = pmc->next;
			break;
		}
	}
	spin_unlock_bh(&in_dev->mc_tomb_lock);

	spin_lock_bh(&im->lock);
	if (pmc) {
		im->interface = pmc->interface;
		im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
		im->sfmode = pmc->sfmode;
		if (pmc->sfmode == MCAST_INCLUDE) {
			im->tomb = pmc->tomb;
			im->sources = pmc->sources;
			psf = im->sources;
			while (psf) {
				psf->sf_crcount = im->crcount;
				psf = psf->sf_next;
			}
		}
		in_dev_put(pmc->interface);
		kfree(pmc);
	}
	spin_unlock_bh(&im->lock);
}
/*
 * flush ip_mc_list deleted records
 *
 * Detaches the whole tomb list from @in_dev under mc_tomb_lock and frees
 * every entry outside the lock, then walks the live groups and frees
 * their per-group tomb source lists as well.
 */
static void igmpv3_clear_delrec(struct in_device *in_dev)
{
	struct ip_mc_list *pmc, *nextpmc;

	spin_lock_bh(&in_dev->mc_tomb_lock);
	pmc = in_dev->mc_tomb;
	in_dev->mc_tomb = NULL;
	spin_unlock_bh(&in_dev->mc_tomb_lock);

	while (pmc) {
		nextpmc = pmc->next;
		ip_mc_clear_src(pmc);
		in_dev_put(pmc->interface);
		kfree(pmc);
		pmc = nextpmc;
	}

	/* clear dead sources, too */
	rcu_read_lock();
	for_each_pmc_rcu(in_dev, pmc) {
		struct ip_sf_list *dead, *victim;

		/* detach under the group lock, free after dropping it */
		spin_lock_bh(&pmc->lock);
		dead = pmc->tomb;
		pmc->tomb = NULL;
		spin_unlock_bh(&pmc->lock);

		while (dead) {
			victim = dead;
			dead = dead->sf_next;
			kfree(victim);
		}
	}
	rcu_read_unlock();
}
#endif
/*
 * igmp_group_dropped - undo the device/protocol side of a group membership
 * @im: membership record being dropped
 *
 * Removes the hardware filter if one was loaded. With CONFIG_IP_MULTICAST
 * it additionally stops the report timer and, unless the device is dead,
 * tells the querier:
 *   - v1 querier seen: nothing is sent
 *   - v2 querier seen: a leave message, only if im->reporter was set
 *   - otherwise (v3):  the group is queued on the tomb list and an
 *                      interface change event is triggered
 * Nothing is signalled for IGMP_ALL_HOSTS, nor for link-local groups
 * while sysctl_igmp_llm_reports is zero.
 */
static void igmp_group_dropped(struct ip_mc_list *im)
{
	struct in_device *in_dev = im->interface;
#ifdef CONFIG_IP_MULTICAST
	struct net *net = dev_net(in_dev->dev);
	int reporter;
#endif

	if (im->loaded) {
		im->loaded = 0;
		ip_mc_filter_del(in_dev, im->multiaddr);
	}

#ifdef CONFIG_IP_MULTICAST
	if (im->multiaddr == IGMP_ALL_HOSTS)
		return;
	if (ipv4_is_local_multicast(im->multiaddr) &&
	    !net->ipv4.sysctl_igmp_llm_reports)
		return;

	/* sample before igmp_stop_timer(), as the original code does */
	reporter = im->reporter;
	igmp_stop_timer(im);

	if (!in_dev->dead) {
		if (IGMP_V1_SEEN(in_dev))
			return;
		if (IGMP_V2_SEEN(in_dev)) {
			if (reporter)
				igmp_send_report(in_dev, im, IGMP_HOST_LEAVE_MESSAGE);
			return;
		}
		/* IGMPv3 */
		igmpv3_add_delrec(in_dev, im);

		igmp_ifc_event(in_dev);
	}
#endif
}
/*
 * igmp_group_added - set up the device/protocol side of a group membership
 * @im: membership record that was added (or is being re-activated)
 *
 * Installs the hardware filter if it is not loaded yet. With
 * CONFIG_IP_MULTICAST it then announces the membership: in v1/v2
 * compatibility mode by starting the report timer with
 * IGMP_INITIAL_REPORT_DELAY, otherwise (v3) by arming the change-report
 * counter and triggering an interface change event.
 * Nothing is announced for IGMP_ALL_HOSTS, for link-local groups while
 * sysctl_igmp_llm_reports is zero, or when the device is dead.
 */
static void igmp_group_added(struct ip_mc_list *im)
{
	struct in_device *in_dev = im->interface;
#ifdef CONFIG_IP_MULTICAST
	struct net *net = dev_net(in_dev->dev);
#endif

	if (im->loaded == 0) {
		im->loaded = 1;
		ip_mc_filter_add(in_dev, im->multiaddr);
	}

#ifdef CONFIG_IP_MULTICAST
	if (im->multiaddr == IGMP_ALL_HOSTS)
		return;
	if (ipv4_is_local_multicast(im->multiaddr) &&
	    !net->ipv4.sysctl_igmp_llm_reports)
		return;

	if (in_dev->dead)
		return;
	if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) {
		spin_lock_bh(&im->lock);
		igmp_start_timer(im, IGMP_INITIAL_REPORT_DELAY);
		spin_unlock_bh(&im->lock);
		return;
	}
	/* else, v3 */

	im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
	igmp_ifc_event(in_dev);
#endif
}
/*
* Multicast list managers
*/
static u32 ip_mc_hash(const struct ip_mc_list *im)
{
return hash_32((__force u32)im->multiaddr, MC_HASH_SZ_LOG);
}
static void ip_mc_hash_add(struct in_device *in_dev,
struct ip_mc_list *im)
{
struct ip_mc_list __rcu **mc_hash;
u32 hash;
mc_hash = rtnl_dereference(in_dev->mc_hash);
if (mc_hash) {
hash = ip_mc_hash(im);
im->next_hash = mc_hash[hash];
rcu_assign_pointer(mc_hash[hash], im);
return;
}
/* do not use a hash table for small number of items */
if (in_dev->mc_count < 4)
return;
mc_hash = kzalloc(sizeof(struct ip_mc_list *) << MC_HASH_SZ_LOG,
GFP_KERNEL);
if (!mc_hash)
return;
for_each_pmc_rtnl(in_dev, im) {
hash = ip_mc_hash(im);
im->next_hash = mc_hash[hash];
RCU_INIT_POINTER(mc_hash[hash], im);
}
rcu_assign_pointer(in_dev->mc_hash, mc_hash);
}
static void ip_mc_hash_remove(struct in_device *in_dev,
struct ip_mc_list *im)
{
struct ip_mc_list __rcu **mc_hash = rtnl_dereference(in_dev->mc_hash);
struct ip_mc_list *aux;
if (!mc_hash)
return;
mc_hash += ip_mc_hash(im);
while ((aux = rtnl_dereference(*mc_hash)) != im)
mc_hash = &aux->next_hash;
*mc_hash = im->next_hash;
}
/*
 *	A socket has joined a multicast group on device dev.
 *
 *	If the group is already on the device list only its user count is
 *	bumped (plus an EXCLUDE-mode source filter reference). Otherwise a
 *	new record is allocated, initialised as (EXCLUDE, empty), linked
 *	into the list and hash, and announced. Allocation failure is
 *	silent. Must be called with RTNL held.
 */
void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
{
	struct ip_mc_list *im;
#ifdef CONFIG_IP_MULTICAST
	struct net *net = dev_net(in_dev->dev);
#endif

	ASSERT_RTNL();

	for_each_pmc_rtnl(in_dev, im) {
		if (im->multiaddr != addr)
			continue;
		im->users++;
		ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0);
		return;
	}

	im = kzalloc(sizeof(*im), GFP_KERNEL);
	if (!im)
		return;

	im->users = 1;
	im->interface = in_dev;
	in_dev_hold(in_dev);
	im->multiaddr = addr;
	/* initial mode is (EX, empty) */
	im->sfmode = MCAST_EXCLUDE;
	im->sfcount[MCAST_EXCLUDE] = 1;
	refcount_set(&im->refcnt, 1);
	spin_lock_init(&im->lock);
#ifdef CONFIG_IP_MULTICAST
	setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im);
	im->unsolicit_count = net->ipv4.sysctl_igmp_qrv;
#endif

	/* publish at the head of the device list, then index it */
	im->next_rcu = in_dev->mc_list;
	in_dev->mc_count++;
	rcu_assign_pointer(in_dev->mc_list, im);

	ip_mc_hash_add(in_dev, im);

#ifdef CONFIG_IP_MULTICAST
	igmpv3_del_delrec(in_dev, im);
#endif
	igmp_group_added(im);
	if (!in_dev->dead)
		ip_rt_multicast_event(in_dev);
}
EXPORT_SYMBOL(ip_mc_inc_group);
static int ip_mc_check_iphdr(struct sk_buff *skb)
{
const struct iphdr *iph;
unsigned int len;
unsigned int offset = skb_network_offset(skb) + sizeof(*iph);
if (!pskb_may_pull(skb, offset))
return -EINVAL;
iph = ip_hdr(skb);
if (iph->version != 4 || ip_hdrlen(skb) < sizeof(*iph))
return -EINVAL;
offset += ip_hdrlen(skb) - sizeof(*iph);
if (!pskb_may_pull(skb, offset))
return -EINVAL;
iph = ip_hdr(skb);
if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
return -EINVAL;
len = skb_network_offset(skb) + ntohs(iph->tot_len);
if (skb->len < len || len < offset)
return -EINVAL;
skb_set_transport_header(skb, offset);
return 0;
}
/* An IGMPv3 report is accepted when a full struct igmpv3_report can be
 * pulled at the transport offset; otherwise -EINVAL.
 */
static int ip_mc_check_igmp_reportv3(struct sk_buff *skb)
{
	unsigned int need = skb_transport_offset(skb);

	need += sizeof(struct igmpv3_report);

	if (!pskb_may_pull(skb, need))
		return -EINVAL;

	return 0;
}
/*
 * Sanity-check an IGMP query.
 *
 * A packet ending exactly after struct igmphdr is treated as IGMPv1/v2;
 * anything longer must hold (and allow pulling) a struct igmpv3_query.
 * A query with a zero group field must be addressed to the all-systems
 * group. Returns 0 if acceptable, -EINVAL otherwise.
 */
static int ip_mc_check_igmp_query(struct sk_buff *skb)
{
	unsigned int len = skb_transport_offset(skb);

	len += sizeof(struct igmphdr);
	if (skb->len < len)
		return -EINVAL;

	/* IGMPv{1,2}? */
	if (skb->len != len) {
		/* or IGMPv3? */
		len += sizeof(struct igmpv3_query) - sizeof(struct igmphdr);
		if (skb->len < len)
			return -EINVAL;
		if (!pskb_may_pull(skb, len))
			return -EINVAL;
	}

	/* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer
	 * all-systems destination addresses (224.0.0.1) for general queries
	 */
	if (igmp_hdr(skb)->group ||
	    ip_hdr(skb)->daddr == htonl(INADDR_ALLHOSTS_GROUP))
		return 0;

	return -EINVAL;
}
/*
 * Dispatch on the IGMP message type.
 *
 * Leave messages and v1/v2 reports need no check beyond the basic header
 * and are accepted as is; v3 reports and queries get their own length
 * checks. Any other type yields -ENOMSG.
 */
static int ip_mc_check_igmp_msg(struct sk_buff *skb)
{
	switch (igmp_hdr(skb)->type) {
	case IGMP_HOST_MEMBERSHIP_QUERY:
		return ip_mc_check_igmp_query(skb);
	case IGMPV3_HOST_MEMBERSHIP_REPORT:
		return ip_mc_check_igmp_reportv3(skb);
	case IGMP_HOST_LEAVE_MESSAGE:
	case IGMP_HOST_MEMBERSHIP_REPORT:
	case IGMPV2_HOST_MEMBERSHIP_REPORT:
		return 0;
	default:
		break;
	}

	return -ENOMSG;
}
/* Thin wrapper around skb_checksum_simple_validate(); it is the checksum
 * callback that __ip_mc_check_igmp() passes to skb_checksum_trimmed().
 * NOTE(review): presumably a wrapper is needed because the validator
 * cannot be passed as a function pointer directly -- confirm.
 */
static inline __sum16 ip_mc_validate_checksum(struct sk_buff *skb)
{
return skb_checksum_simple_validate(skb);
}
/*
 * Validate the IGMP part of an skb whose IPv4 header was already checked.
 *
 * skb_checksum_trimmed() is asked for an skb limited to the IP payload
 * length; the result may be @skb itself or a different skb. On success
 * that skb is handed out through @skb_trimmed when the caller asked for
 * it. In every other case an skb that differs from @skb is freed here.
 *
 * Returns 0 on success, -EINVAL for a broken packet (including a NULL
 * result from skb_checksum_trimmed()), or the error from
 * ip_mc_check_igmp_msg().
 */
static int __ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed)
{
	unsigned int len = skb_transport_offset(skb) + sizeof(struct igmphdr);
	unsigned int transport_len;
	struct sk_buff *skb_chk;
	int ret = -EINVAL;

	transport_len = ntohs(ip_hdr(skb)->tot_len) - ip_hdrlen(skb);

	skb_chk = skb_checksum_trimmed(skb, transport_len,
				       ip_mc_validate_checksum);
	if (!skb_chk)
		return -EINVAL;

	if (pskb_may_pull(skb_chk, len))
		ret = ip_mc_check_igmp_msg(skb_chk);

	if (!ret && skb_trimmed) {
		/* ownership of skb_chk passes to the caller */
		*skb_trimmed = skb_chk;
		return 0;
	}

	/* failure, or success with nobody interested in the trimmed skb */
	if (skb_chk != skb)
		kfree_skb(skb_chk);

	return ret;
}
/**
 * ip_mc_check_igmp - checks whether this is a sane IGMP packet
 * @skb: the skb to validate
 * @skb_trimmed: to store an skb pointer trimmed to IPv4 packet tail (optional)
 *
 * Validates the IPv4 header first, then the IGMP message. On success the
 * skb transport header has been set and zero is returned.
 *
 * -EINVAL: A broken packet was detected, i.e. it violates some internet
 *  standard
 * -ENOMSG: IP header validation succeeded but it is not an IGMP packet
 *  (or it is an IGMP type this check does not handle).
 *
 * NOTE(review): earlier documentation also listed -ENOMEM, but none of the
 * helpers visible in this file return it; a NULL result from
 * skb_checksum_trimmed() surfaces as -EINVAL.
 *
 * @skb_trimmed may be NULL. Otherwise, after an IGMP packet was parsed
 * successfully, it points to an skb which has its tail aligned to the IP
 * packet end. This might either be the originally provided skb or a
 * trimmed, cloned version if the skb frame had data beyond the IP packet.
 * A cloned skb allows us to leave the original skb and its full frame
 * unchanged (which might be desirable for layer 2 frame jugglers).
 *
 * Caller needs to set the skb network header and free any returned skb if it
 * differs from the provided skb.
 */
int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed)
{
	int err;

	err = ip_mc_check_iphdr(skb);
	if (err < 0)
		return err;

	if (ip_hdr(skb)->protocol == IPPROTO_IGMP)
		return __ip_mc_check_igmp(skb, skb_trimmed);

	return -ENOMSG;
}
EXPORT_SYMBOL(ip_mc_check_igmp);
/*
 *	Resend IGMP JOIN report; used by netdev notifier.
 *
 *	Sends an immediate report for every group on @in_dev, using the
 *	report type that matches the querier version currently seen on
 *	the interface. IGMP_ALL_HOSTS is skipped, as are link-local groups
 *	while sysctl_igmp_llm_reports is zero. Compiles to an empty
 *	function without CONFIG_IP_MULTICAST. RTNL must be held.
 */
static void ip_mc_rejoin_groups(struct in_device *in_dev)
{
#ifdef CONFIG_IP_MULTICAST
	struct ip_mc_list *im;
	int type;
	struct net *net = dev_net(in_dev->dev);

	ASSERT_RTNL();

	for_each_pmc_rtnl(in_dev, im) {
		if (im->multiaddr == IGMP_ALL_HOSTS)
			continue;
		if (ipv4_is_local_multicast(im->multiaddr) &&
		    !net->ipv4.sysctl_igmp_llm_reports)
			continue;

		/* a failover is happening and switches
		 * must be notified immediately
		 */
		if (IGMP_V1_SEEN(in_dev))
			type = IGMP_HOST_MEMBERSHIP_REPORT;
		else if (IGMP_V2_SEEN(in_dev))
			type = IGMPV2_HOST_MEMBERSHIP_REPORT;
		else
			type = IGMPV3_HOST_MEMBERSHIP_REPORT;
		igmp_send_report(in_dev, im, type);
	}
#endif
}
/*
 *	A socket has left a multicast group on device dev
 *
 *	Drops one user from the record for @addr. When the last user goes
 *	away the record is unlinked from the hash and the device list, the
 *	membership is torn down, its sources are cleared and the record's
 *	reference is put. An unknown @addr is ignored. RTNL must be held.
 */
void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
{
	struct ip_mc_list __rcu **link;
	struct ip_mc_list *entry;

	ASSERT_RTNL();

	link = &in_dev->mc_list;
	while ((entry = rtnl_dereference(*link)) != NULL) {
		if (entry->multiaddr != addr) {
			link = &entry->next_rcu;
			continue;
		}

		/* still in use by somebody else: nothing more to do */
		if (--entry->users != 0)
			return;

		ip_mc_hash_remove(in_dev, entry);
		*link = entry->next_rcu;
		in_dev->mc_count--;
		igmp_group_dropped(entry);
		ip_mc_clear_src(entry);

		if (!in_dev->dead)
			ip_rt_multicast_event(in_dev);

		ip_ma_put(entry);
		return;
	}
}
EXPORT_SYMBOL(ip_mc_dec_group);
/* Device changing type */
void ip_mc_unmap(struct in_device *in_dev)
{
struct ip_mc_list *pmc;
ASSERT_RTNL();
for_each_pmc_rtnl(in_dev, pmc)
igmp_group_dropped(pmc);
}
/* Re-establish every group on @in_dev: with CONFIG_IP_MULTICAST any
 * matching tomb-list record is restored first, then the group is added
 * again. RTNL must be held.
 */
void ip_mc_remap(struct in_device *in_dev)
{
	struct ip_mc_list *im;

	ASSERT_RTNL();

	for_each_pmc_rtnl(in_dev, im) {
#ifdef CONFIG_IP_MULTICAST
		igmpv3_del_delrec(in_dev, im);
#endif
		igmp_group_added(im);
	}
}
/* Device going down */
void ip_mc_down(struct in_device *in_dev)
{
struct ip_mc_list *pmc;
ASSERT_RTNL();
for_each_pmc_rtnl(in_dev, pmc)
igmp_group_dropped(pmc);
#ifdef CONFIG_IP_MULTICAST
in_dev->mr_ifc_count = 0;
if (del_timer(&in_dev->mr_ifc_timer))
__in_dev_put(in_dev);
in_dev->mr_gq_running = 0;
if (del_timer(&in_dev->mr_gq_timer))
__in_dev_put(in_dev);
#endif
ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS);
}
/* Initialise the multicast state of a new in_device: with
 * CONFIG_IP_MULTICAST the general query and interface change timers are
 * set up and the robustness value is seeded from the namespace sysctl;
 * the tomb-list lock is initialised in all configurations.
 * RTNL must be held.
 */
void ip_mc_init_dev(struct in_device *in_dev)
{
	ASSERT_RTNL();

#ifdef CONFIG_IP_MULTICAST
	setup_timer(&in_dev->mr_gq_timer, igmp_gq_timer_expire,
		    (unsigned long)in_dev);
	setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire,
		    (unsigned long)in_dev);
	in_dev->mr_qrv = dev_net(in_dev->dev)->ipv4.sysctl_igmp_qrv;
#endif

	spin_lock_init(&in_dev->mc_tomb_lock);
}
/* Device going up */

/* Resets the robustness value from the namespace sysctl (with
 * CONFIG_IP_MULTICAST), joins IGMP_ALL_HOSTS and then re-activates every
 * group on the list, restoring tomb-list state first where available.
 * RTNL must be held.
 */
void ip_mc_up(struct in_device *in_dev)
{
	struct ip_mc_list *im;

	ASSERT_RTNL();

#ifdef CONFIG_IP_MULTICAST
	in_dev->mr_qrv = dev_net(in_dev->dev)->ipv4.sysctl_igmp_qrv;
#endif
	ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);

	for_each_pmc_rtnl(in_dev, im) {
#ifdef CONFIG_IP_MULTICAST
		igmpv3_del_delrec(in_dev, im);
#endif
		igmp_group_added(im);
	}
}
/*
 *	Device is about to be destroyed: clean up.
 *
 *	Takes the device down, flushes the tomb list (with
 *	CONFIG_IP_MULTICAST) and then pops every remaining group off the
 *	list, dropping the list's reference on each. RTNL must be held.
 */
void ip_mc_destroy_dev(struct in_device *in_dev)
{
	struct ip_mc_list *im;

	ASSERT_RTNL();

	/* Deactivate timers */
	ip_mc_down(in_dev);
#ifdef CONFIG_IP_MULTICAST
	igmpv3_clear_delrec(in_dev);
#endif

	for (;;) {
		im = rtnl_dereference(in_dev->mc_list);
		if (im == NULL)
			break;

		in_dev->mc_list = im->next_rcu;
		in_dev->mc_count--;
		ip_ma_put(im);
	}
}
/* RTNL is locked */
netns: Fix crash by making igmp per namespace This patch makes the multicast socket to be per namespace. When a network namespace is created, other than the init_net and a multicast packet is received, the kernel goes to a hang or a kernel panic. How to reproduce ? * create a child network namespace * create a pair virtual device veth * ip link add type veth * move one side to the pair network device to the child namespace * ip link set netns <childpid> dev veth1 * ping -I veth0 224.0.0.1 The bug appears because the function ip_mc_init_dev does not initialize the different multicast fields as it exits because it is not the init_net. BUG: soft lockup - CPU#0 stuck for 61s! [avahi-daemon:2695] Modules linked in: irq event stamp: 50350 hardirqs last enabled at (50349): [<c03ee949>] _spin_unlock_irqrestore+0x34/0x39 hardirqs last disabled at (50350): [<c03ec639>] schedule+0x9f/0x5ff softirqs last enabled at (45712): [<c0374d4b>] ip_setsockopt+0x8e7/0x909 softirqs last disabled at (45710): [<c03ee682>] _spin_lock_bh+0x8/0x27 Pid: 2695, comm: avahi-daemon Not tainted (2.6.27-rc2-00029-g0872073 #3) EIP: 0060:[<c03ee47c>] EFLAGS: 00000297 CPU: 0 EIP is at __read_lock_failed+0x8/0x10 EAX: c4f38810 EBX: c4f38810 ECX: 00000000 EDX: c04cc22e ESI: fb0000e0 EDI: 00000011 EBP: 0f02000a ESP: c4e3faa0 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 CR0: 8005003b CR2: 44618a40 CR3: 04e37000 CR4: 000006d0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 [<c02311f8>] ? _raw_read_lock+0x23/0x25 [<c0390666>] ? ip_check_mc+0x1c/0x83 [<c036d478>] ? ip_route_input+0x229/0xe92 [<c022e2e4>] ? trace_hardirqs_on_thunk+0xc/0x10 [<c0104c9c>] ? do_IRQ+0x69/0x7d [<c0102e64>] ? restore_nocheck_notrace+0x0/0xe [<c036fdba>] ? ip_rcv+0x227/0x505 [<c0358764>] ? netif_receive_skb+0xfe/0x2b3 [<c03588d2>] ? netif_receive_skb+0x26c/0x2b3 [<c035af31>] ? process_backlog+0x73/0xbd [<c035a8cd>] ? net_rx_action+0xc1/0x1ae [<c01218a8>] ? __do_softirq+0x7b/0xef [<c0121953>] ? 
do_softirq+0x37/0x4d [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c0122037>] ? local_bh_enable+0x96/0xab [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c012181e>] ? _local_bh_enable+0x79/0x88 [<c035fcb8>] ? neigh_resolve_output+0x20f/0x239 [<c0373118>] ? ip_finish_output+0x1df/0x209 [<c0373364>] ? ip_dev_loopback_xmit+0x62/0x66 [<c0371db5>] ? ip_local_out+0x15/0x17 [<c0372013>] ? ip_push_pending_frames+0x25c/0x2bb [<c03891b8>] ? udp_push_pending_frames+0x2bb/0x30e [<c038a189>] ? udp_sendmsg+0x413/0x51d [<c038a1a9>] ? udp_sendmsg+0x433/0x51d [<c038f927>] ? inet_sendmsg+0x35/0x3f [<c034f092>] ? sock_sendmsg+0xb8/0xd1 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c022e6de>] ? copy_from_user+0x32/0x5e [<c022e6de>] ? copy_from_user+0x32/0x5e [<c034f238>] ? sys_sendmsg+0x18d/0x1f0 [<c0175e90>] ? pipe_write+0x3cb/0x3d7 [<c0170347>] ? do_sync_write+0xbe/0x105 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c03503b2>] ? sys_socketcall+0x176/0x1b0 [<c01085ea>] ? syscall_trace_enter+0x6c/0x7b [<c0102e1a>] ? syscall_call+0x7/0xb Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-08-14 07:15:57 +08:00
static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
{
struct net_device *dev = NULL;
struct in_device *idev = NULL;
if (imr->imr_ifindex) {
netns: Fix crash by making igmp per namespace This patch makes the multicast socket to be per namespace. When a network namespace is created, other than the init_net and a multicast packet is received, the kernel goes to a hang or a kernel panic. How to reproduce ? * create a child network namespace * create a pair virtual device veth * ip link add type veth * move one side to the pair network device to the child namespace * ip link set netns <childpid> dev veth1 * ping -I veth0 224.0.0.1 The bug appears because the function ip_mc_init_dev does not initialize the different multicast fields as it exits because it is not the init_net. BUG: soft lockup - CPU#0 stuck for 61s! [avahi-daemon:2695] Modules linked in: irq event stamp: 50350 hardirqs last enabled at (50349): [<c03ee949>] _spin_unlock_irqrestore+0x34/0x39 hardirqs last disabled at (50350): [<c03ec639>] schedule+0x9f/0x5ff softirqs last enabled at (45712): [<c0374d4b>] ip_setsockopt+0x8e7/0x909 softirqs last disabled at (45710): [<c03ee682>] _spin_lock_bh+0x8/0x27 Pid: 2695, comm: avahi-daemon Not tainted (2.6.27-rc2-00029-g0872073 #3) EIP: 0060:[<c03ee47c>] EFLAGS: 00000297 CPU: 0 EIP is at __read_lock_failed+0x8/0x10 EAX: c4f38810 EBX: c4f38810 ECX: 00000000 EDX: c04cc22e ESI: fb0000e0 EDI: 00000011 EBP: 0f02000a ESP: c4e3faa0 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 CR0: 8005003b CR2: 44618a40 CR3: 04e37000 CR4: 000006d0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 [<c02311f8>] ? _raw_read_lock+0x23/0x25 [<c0390666>] ? ip_check_mc+0x1c/0x83 [<c036d478>] ? ip_route_input+0x229/0xe92 [<c022e2e4>] ? trace_hardirqs_on_thunk+0xc/0x10 [<c0104c9c>] ? do_IRQ+0x69/0x7d [<c0102e64>] ? restore_nocheck_notrace+0x0/0xe [<c036fdba>] ? ip_rcv+0x227/0x505 [<c0358764>] ? netif_receive_skb+0xfe/0x2b3 [<c03588d2>] ? netif_receive_skb+0x26c/0x2b3 [<c035af31>] ? process_backlog+0x73/0xbd [<c035a8cd>] ? net_rx_action+0xc1/0x1ae [<c01218a8>] ? __do_softirq+0x7b/0xef [<c0121953>] ? 
do_softirq+0x37/0x4d [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c0122037>] ? local_bh_enable+0x96/0xab [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c012181e>] ? _local_bh_enable+0x79/0x88 [<c035fcb8>] ? neigh_resolve_output+0x20f/0x239 [<c0373118>] ? ip_finish_output+0x1df/0x209 [<c0373364>] ? ip_dev_loopback_xmit+0x62/0x66 [<c0371db5>] ? ip_local_out+0x15/0x17 [<c0372013>] ? ip_push_pending_frames+0x25c/0x2bb [<c03891b8>] ? udp_push_pending_frames+0x2bb/0x30e [<c038a189>] ? udp_sendmsg+0x413/0x51d [<c038a1a9>] ? udp_sendmsg+0x433/0x51d [<c038f927>] ? inet_sendmsg+0x35/0x3f [<c034f092>] ? sock_sendmsg+0xb8/0xd1 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c022e6de>] ? copy_from_user+0x32/0x5e [<c022e6de>] ? copy_from_user+0x32/0x5e [<c034f238>] ? sys_sendmsg+0x18d/0x1f0 [<c0175e90>] ? pipe_write+0x3cb/0x3d7 [<c0170347>] ? do_sync_write+0xbe/0x105 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c03503b2>] ? sys_socketcall+0x176/0x1b0 [<c01085ea>] ? syscall_trace_enter+0x6c/0x7b [<c0102e1a>] ? syscall_call+0x7/0xb Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-08-14 07:15:57 +08:00
idev = inetdev_by_index(net, imr->imr_ifindex);
return idev;
}
if (imr->imr_address.s_addr) {
dev = __ip_dev_find(net, imr->imr_address.s_addr, false);
if (!dev)
return NULL;
}
if (!dev) {
struct rtable *rt = ip_route_output(net,
imr->imr_multiaddr.s_addr,
0, 0, 0);
if (!IS_ERR(rt)) {
dev = rt->dst.dev;
ip_rt_put(rt);
}
}
if (dev) {
imr->imr_ifindex = dev->ifindex;
idev = __in_dev_get_rtnl(dev);
}
return idev;
}
/*
 * Drop one sfmode reference on a single source filter of a group entry.
 *
 * Returns -ESRCH when *psfsrc is not on pmc->sources or its sf_count[sfmode]
 * is already zero, 1 when the filter was moved to pmc->tomb (so a change
 * record is pending), and 0 otherwise.
 */
static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
	__be32 *psfsrc)
{
	/* 'link' always points at the pointer that references 'entry'. */
	struct ip_sf_list **link = &pmc->sources;
	struct ip_sf_list *entry;
#ifdef CONFIG_IP_MULTICAST
	struct in_device *in_dev = pmc->interface;
#endif

	while ((entry = *link) != NULL && entry->sf_inaddr != *psfsrc)
		link = &entry->sf_next;

	/* source filter not found, or count wrong => bug */
	if (!entry || !entry->sf_count[sfmode])
		return -ESRCH;

	if (--entry->sf_count[sfmode] == 0)
		ip_rt_multicast_event(pmc->interface);

	/* Still referenced in at least one mode: keep the filter. */
	if (entry->sf_count[MCAST_INCLUDE] || entry->sf_count[MCAST_EXCLUDE])
		return 0;

	/* no more filters for this source */
	*link = entry->sf_next;

#ifdef CONFIG_IP_MULTICAST
	/*
	 * A source that was active (sf_oldin) is parked on the tomb list
	 * with a retransmit count instead of being freed, unless a V1 or
	 * V2 querier has been seen on the interface.
	 */
	if (entry->sf_oldin &&
	    !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) {
		entry->sf_crcount = in_dev->mr_qrv ?:
			dev_net(in_dev->dev)->ipv4.sysctl_igmp_qrv;
		entry->sf_next = pmc->tomb;
		pmc->tomb = entry;
		return 1;
	}
#endif
	kfree(entry);
	return 0;
}
/* When CONFIG_IP_MULTICAST is not set, igmp_ifc_event() is a no-op statement. */
#ifndef CONFIG_IP_MULTICAST
#define igmp_ifc_event(x) do { } while (0)
#endif
/*
 * Remove a list of source filters from the interface entry for group *pmca.
 *
 * in_dev:  interface whose multicast list is searched; NULL yields -ENODEV
 * pmca:    group address, compared against pmc->multiaddr
 * sfmode:  index into the sfcount[] / sf_count[] arrays (the code below
 *          indexes the same arrays with MCAST_INCLUDE and MCAST_EXCLUDE)
 * sfcount: number of addresses in psfsrc
 * psfsrc:  source addresses, each passed to ip_mc_del1_src()
 * delta:   when zero, the group-level pmc->sfcount[sfmode] is decremented
 *          as well
 *
 * Returns 0 on success, -ENODEV without a device, -ESRCH when the group is
 * not on the interface list, -EINVAL when !delta and pmc->sfcount[sfmode]
 * is already zero, or the first negative value from ip_mc_del1_src().
 */
static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
int sfcount, __be32 *psfsrc, int delta)
{
struct ip_mc_list *pmc;
int changerec = 0;
int i, err;
if (!in_dev)
return -ENODEV;
/* Locate the group's entry on the interface list under RCU. */
rcu_read_lock();
for_each_pmc_rcu(in_dev, pmc) {
if (*pmca == pmc->multiaddr)
break;
}
if (!pmc) {
/* MCA not found?? bug */
rcu_read_unlock();
return -ESRCH;
}
/* The entry's own lock is taken before the RCU read section is left. */
spin_lock_bh(&pmc->lock);
rcu_read_unlock();
#ifdef CONFIG_IP_MULTICAST
/* Record each source's current state in sf_oldin for sf_setstate() below. */
sf_markstate(pmc);
#endif
if (!delta) {
/* Non-delta call: also drop one group-level count for this mode. */
err = -EINVAL;
if (!pmc->sfcount[sfmode])
goto out_unlock;
pmc->sfcount[sfmode]--;
}
err = 0;
/* Every source is processed even after a failure; only the first error is kept. */
for (i = 0; i < sfcount; i++) {
int rv = ip_mc_del1_src(pmc, sfmode, &psfsrc[i]);
/* rv > 0 means a source was moved to the tomb list. */
changerec |= rv > 0;
if (!err && rv < 0)
err = rv;
}
/* EXCLUDE count reached zero while INCLUDE counts remain: flip the mode. */
if (pmc->sfmode == MCAST_EXCLUDE &&
pmc->sfcount[MCAST_EXCLUDE] == 0 &&
pmc->sfcount[MCAST_INCLUDE]) {
#ifdef CONFIG_IP_MULTICAST
struct ip_sf_list *psf;
struct net *net = dev_net(in_dev->dev);
#endif
/* filter mode change */
pmc->sfmode = MCAST_INCLUDE;
#ifdef CONFIG_IP_MULTICAST
/* Retransmit count: the interface's mr_qrv, or the sysctl value when that is 0. */
pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
in_dev->mr_ifc_count = pmc->crcount;
/* Per-source retransmit counts are cleared on a mode change. */
for (psf = pmc->sources; psf; psf = psf->sf_next)
psf->sf_crcount = 0;
igmp_ifc_event(pmc->interface);
} else if (sf_setstate(pmc) || changerec) {
/* No mode change, but some source changed state or was tombstoned. */
igmp_ifc_event(pmc->interface);
#endif
}
out_unlock:
spin_unlock_bh(&pmc->lock);
return err;
}
/*
 * Take one sfmode reference on a single source filter of a group entry,
 * appending a zeroed filter to the end of pmc->sources when the address
 * is not listed yet.
 *
 * Returns 0 on success or -ENOBUFS when the new filter cannot be allocated.
 */
static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode,
	__be32 *psfsrc)
{
	/* 'link' ends up at the matching node's link, or at the list tail. */
	struct ip_sf_list **link = &pmc->sources;
	struct ip_sf_list *entry;

	while ((entry = *link) != NULL) {
		if (entry->sf_inaddr == *psfsrc)
			break;
		link = &entry->sf_next;
	}

	if (entry == NULL) {
		entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
		if (entry == NULL)
			return -ENOBUFS;
		entry->sf_inaddr = *psfsrc;
		*link = entry;
	}

	/* First reference in this mode: notify routing. */
	if (++entry->sf_count[sfmode] == 1)
		ip_rt_multicast_event(pmc->interface);

	return 0;
}
#ifdef CONFIG_IP_MULTICAST
/*
 * Store in sf_oldin, for every source on pmc->sources, whether the source
 * currently counts as "in":
 *  - with any EXCLUDE count on the group: the source's EXCLUDE count equals
 *    the group's and it has no INCLUDE count;
 *  - otherwise: the source has a non-zero INCLUDE count.
 * sf_setstate() applies the same predicate later to detect transitions.
 */
static void sf_markstate(struct ip_mc_list *pmc)
{
	int mca_xcount = pmc->sfcount[MCAST_EXCLUDE];
	struct ip_sf_list *src = pmc->sources;

	while (src) {
		if (pmc->sfcount[MCAST_EXCLUDE])
			src->sf_oldin = !src->sf_count[MCAST_INCLUDE] &&
				mca_xcount == src->sf_count[MCAST_EXCLUDE];
		else
			src->sf_oldin = src->sf_count[MCAST_INCLUDE] != 0;
		src = src->sf_next;
	}
}
/*
 * Compare every source's present state with the sf_oldin value recorded by
 * sf_markstate() and set up retransmit counts for those that changed.
 *
 *  - inactive -> active: any tombstone with the same address is unlinked
 *    and freed, and the source's sf_crcount is set to the interface's mr_qrv.
 *  - active -> inactive: the source's sf_crcount is cleared and a tombstone
 *    (a copy of the source, pushed on pmc->tomb) is created or reused with
 *    sf_crcount set to mr_qrv. If the copy cannot be allocated the source
 *    is skipped and not counted.
 *
 * Returns the number of sources for which a change was recorded.
 */
static int sf_setstate(struct ip_mc_list *pmc)
{
	int mca_xcount = pmc->sfcount[MCAST_EXCLUDE];
	int qrv = pmc->interface->mr_qrv;
	struct ip_sf_list *src;
	int changed = 0;

	for (src = pmc->sources; src; src = src->sf_next) {
		struct ip_sf_list **link;
		struct ip_sf_list *dead;
		int new_in;

		if (pmc->sfcount[MCAST_EXCLUDE])
			new_in = mca_xcount == src->sf_count[MCAST_EXCLUDE] &&
				!src->sf_count[MCAST_INCLUDE];
		else
			new_in = src->sf_count[MCAST_INCLUDE] != 0;

		/* Same state as before: nothing to record. */
		if (!new_in == !src->sf_oldin)
			continue;

		if (!new_in)
			src->sf_crcount = 0;

		/* Find a tombstone for the same source address, if any. */
		link = &pmc->tomb;
		while ((dead = *link) != NULL &&
		       dead->sf_inaddr != src->sf_inaddr)
			link = &dead->sf_next;

		if (new_in) {
			if (dead) {
				*link = dead->sf_next;
				kfree(dead);
			}
			src->sf_crcount = qrv;
		} else {
			/*
			 * add or update "delete" records if an active filter
			 * is now inactive
			 */
			if (!dead) {
				dead = kmalloc(sizeof(*dead), GFP_ATOMIC);
				if (!dead)
					continue;
				*dead = *src;
				/* pmc->lock held by callers */
				dead->sf_next = pmc->tomb;
				pmc->tomb = dead;
			}
			dead->sf_crcount = qrv;
		}
		changed++;
	}
	return changed;
}
#endif
/*
 * Add multicast source filter list to the interface list
 *
 * in_dev:  interface whose multicast list is searched; NULL yields -ENODEV
 * pmca:    group address, compared against pmc->multiaddr
 * sfmode:  index into the sfcount[] / sf_count[] arrays (the code below
 *          indexes the same arrays with MCAST_INCLUDE and MCAST_EXCLUDE)
 * sfcount: number of addresses in psfsrc
 * psfsrc:  source addresses, each passed to ip_mc_add1_src()
 * delta:   when zero, the group-level pmc->sfcount[sfmode] is incremented
 *          as well
 *
 * Returns 0 on success, -ENODEV without a device, -ESRCH when the group is
 * not on the interface list, or the error from ip_mc_add1_src(); in the
 * error case the changes made by this call are rolled back.
 */
static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
int sfcount, __be32 *psfsrc, int delta)
{
struct ip_mc_list *pmc;
int isexclude;
int i, err;
if (!in_dev)
return -ENODEV;
/* Locate the group's entry on the interface list under RCU. */
rcu_read_lock();
for_each_pmc_rcu(in_dev, pmc) {
if (*pmca == pmc->multiaddr)
break;
}
if (!pmc) {
/* MCA not found?? bug */
rcu_read_unlock();
return -ESRCH;
}
/* The entry's own lock is taken before the RCU read section is left. */
spin_lock_bh(&pmc->lock);
rcu_read_unlock();
#ifdef CONFIG_IP_MULTICAST
/* Record each source's current state in sf_oldin for sf_setstate() below. */
sf_markstate(pmc);
#endif
/* Remember the mode before the update so a change can be detected. */
isexclude = pmc->sfmode == MCAST_EXCLUDE;
if (!delta)
pmc->sfcount[sfmode]++;
err = 0;
/* Stop at the first failure; i then counts the sources already added. */
for (i = 0; i < sfcount; i++) {
err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i]);
if (err)
break;
}
if (err) {
int j;
/* Roll back: undo the group count and the first i source additions. */
if (!delta)
pmc->sfcount[sfmode]--;
for (j = 0; j < i; j++)
(void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]);
} else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) {
#ifdef CONFIG_IP_MULTICAST
struct ip_sf_list *psf;
struct net *net = dev_net(pmc->interface->dev);
in_dev = pmc->interface;
#endif
/* filter mode change */
if (pmc->sfcount[MCAST_EXCLUDE])
pmc->sfmode = MCAST_EXCLUDE;
else if (pmc->sfcount[MCAST_INCLUDE])
pmc->sfmode = MCAST_INCLUDE;
#ifdef CONFIG_IP_MULTICAST
/* else no filters; keep old mode for reports */
/* Retransmit count: the interface's mr_qrv, or the sysctl value when that is 0. */
pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
in_dev->mr_ifc_count = pmc->crcount;
/* Per-source retransmit counts are cleared on a mode change. */
for (psf = pmc->sources; psf; psf = psf->sf_next)
psf->sf_crcount = 0;
igmp_ifc_event(in_dev);
} else if (sf_setstate(pmc)) {
/* No mode change, but at least one source changed state. */
igmp_ifc_event(in_dev);
#endif
}
spin_unlock_bh(&pmc->lock);
return err;
}
/*
 * Discard all source filters of a group entry.
 *
 * Under pmc->lock both filter lists (sources and tomb) are detached and the
 * entry is reset to EXCLUDE mode with sfcount[MCAST_EXCLUDE] = 1 and
 * sfcount[MCAST_INCLUDE] = 0. The detached nodes are freed after the lock
 * has been released.
 */
static void ip_mc_clear_src(struct ip_mc_list *pmc)
{
	struct ip_sf_list *dead, *live, *victim;

	spin_lock_bh(&pmc->lock);
	dead = pmc->tomb;
	pmc->tomb = NULL;
	live = pmc->sources;
	pmc->sources = NULL;
	pmc->sfmode = MCAST_EXCLUDE;
	pmc->sfcount[MCAST_INCLUDE] = 0;
	pmc->sfcount[MCAST_EXCLUDE] = 1;
	spin_unlock_bh(&pmc->lock);

	while (dead) {
		victim = dead;
		dead = dead->sf_next;
		kfree(victim);
	}
	while (live) {
		victim = live;
		live = live->sf_next;
		kfree(victim);
	}
}
/* Join a multicast group
*/
int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
{
__be32 addr = imr->imr_multiaddr.s_addr;
struct ip_mc_socklist *iml, *i;
struct in_device *in_dev;
struct inet_sock *inet = inet_sk(sk);
netns: Fix crash by making igmp per namespace This patch makes the multicast socket to be per namespace. When a network namespace is created, other than the init_net and a multicast packet is received, the kernel goes to a hang or a kernel panic. How to reproduce ? * create a child network namespace * create a pair virtual device veth * ip link add type veth * move one side to the pair network device to the child namespace * ip link set netns <childpid> dev veth1 * ping -I veth0 224.0.0.1 The bug appears because the function ip_mc_init_dev does not initialize the different multicast fields as it exits because it is not the init_net. BUG: soft lockup - CPU#0 stuck for 61s! [avahi-daemon:2695] Modules linked in: irq event stamp: 50350 hardirqs last enabled at (50349): [<c03ee949>] _spin_unlock_irqrestore+0x34/0x39 hardirqs last disabled at (50350): [<c03ec639>] schedule+0x9f/0x5ff softirqs last enabled at (45712): [<c0374d4b>] ip_setsockopt+0x8e7/0x909 softirqs last disabled at (45710): [<c03ee682>] _spin_lock_bh+0x8/0x27 Pid: 2695, comm: avahi-daemon Not tainted (2.6.27-rc2-00029-g0872073 #3) EIP: 0060:[<c03ee47c>] EFLAGS: 00000297 CPU: 0 EIP is at __read_lock_failed+0x8/0x10 EAX: c4f38810 EBX: c4f38810 ECX: 00000000 EDX: c04cc22e ESI: fb0000e0 EDI: 00000011 EBP: 0f02000a ESP: c4e3faa0 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 CR0: 8005003b CR2: 44618a40 CR3: 04e37000 CR4: 000006d0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 [<c02311f8>] ? _raw_read_lock+0x23/0x25 [<c0390666>] ? ip_check_mc+0x1c/0x83 [<c036d478>] ? ip_route_input+0x229/0xe92 [<c022e2e4>] ? trace_hardirqs_on_thunk+0xc/0x10 [<c0104c9c>] ? do_IRQ+0x69/0x7d [<c0102e64>] ? restore_nocheck_notrace+0x0/0xe [<c036fdba>] ? ip_rcv+0x227/0x505 [<c0358764>] ? netif_receive_skb+0xfe/0x2b3 [<c03588d2>] ? netif_receive_skb+0x26c/0x2b3 [<c035af31>] ? process_backlog+0x73/0xbd [<c035a8cd>] ? net_rx_action+0xc1/0x1ae [<c01218a8>] ? __do_softirq+0x7b/0xef [<c0121953>] ? 
do_softirq+0x37/0x4d [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c0122037>] ? local_bh_enable+0x96/0xab [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c012181e>] ? _local_bh_enable+0x79/0x88 [<c035fcb8>] ? neigh_resolve_output+0x20f/0x239 [<c0373118>] ? ip_finish_output+0x1df/0x209 [<c0373364>] ? ip_dev_loopback_xmit+0x62/0x66 [<c0371db5>] ? ip_local_out+0x15/0x17 [<c0372013>] ? ip_push_pending_frames+0x25c/0x2bb [<c03891b8>] ? udp_push_pending_frames+0x2bb/0x30e [<c038a189>] ? udp_sendmsg+0x413/0x51d [<c038a1a9>] ? udp_sendmsg+0x433/0x51d [<c038f927>] ? inet_sendmsg+0x35/0x3f [<c034f092>] ? sock_sendmsg+0xb8/0xd1 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c022e6de>] ? copy_from_user+0x32/0x5e [<c022e6de>] ? copy_from_user+0x32/0x5e [<c034f238>] ? sys_sendmsg+0x18d/0x1f0 [<c0175e90>] ? pipe_write+0x3cb/0x3d7 [<c0170347>] ? do_sync_write+0xbe/0x105 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c03503b2>] ? sys_socketcall+0x176/0x1b0 [<c01085ea>] ? syscall_trace_enter+0x6c/0x7b [<c0102e1a>] ? syscall_call+0x7/0xb Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-08-14 07:15:57 +08:00
struct net *net = sock_net(sk);
[IPV4]: multicast API "join" issues This patch corrects a few problems with the IP_ADD_MEMBERSHIP socket option: 1) The existing code makes an attempt at reference counting joins when using the ip_mreqn/imr_ifindex interface. Joining the same group on the same socket is an error, whatever the API. This leads to unexpected results when mixing ip_mreqn by index with ip_mreqn by address, ip_mreq, or other API's. For example, ip_mreq followed by ip_mreqn of the same group will "work" while the same two reversed will not. Fixed to always return EADDRINUSE on a duplicate join and removed the (now unused) reference count in ip_mc_socklist. 2) The group-search list in ip_mc_join_group() is comparing a full ip_mreqn structure and all of it must match for it to find the group. This doesn't correctly match a group that was joined with ip_mreq or ip_mreqn with an address (with or without an index). It also doesn't match groups that are joined by different addresses on the same interface. All of these are the same multicast group, which is identified by group address and interface index. Fixed the check to correctly match groups so we don't get duplicate group entries on the ip_mc_socklist. 3) The old code allocates a multicast address before searching for duplicates requiring it to free in various error cases. This patch moves the allocate until after the search and igmp_max_memberships check, so never a need to allocate, then free an entry. Signed-off-by: David L Stevens <dlstevens@us.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2005-07-09 08:38:07 +08:00
int ifindex;
int count = 0;
int err;
ASSERT_RTNL();
if (!ipv4_is_multicast(addr))
return -EINVAL;
netns: Fix crash by making igmp per namespace This patch makes the multicast socket to be per namespace. When a network namespace is created, other than the init_net and a multicast packet is received, the kernel goes to a hang or a kernel panic. How to reproduce ? * create a child network namespace * create a pair virtual device veth * ip link add type veth * move one side to the pair network device to the child namespace * ip link set netns <childpid> dev veth1 * ping -I veth0 224.0.0.1 The bug appears because the function ip_mc_init_dev does not initialize the different multicast fields as it exits because it is not the init_net. BUG: soft lockup - CPU#0 stuck for 61s! [avahi-daemon:2695] Modules linked in: irq event stamp: 50350 hardirqs last enabled at (50349): [<c03ee949>] _spin_unlock_irqrestore+0x34/0x39 hardirqs last disabled at (50350): [<c03ec639>] schedule+0x9f/0x5ff softirqs last enabled at (45712): [<c0374d4b>] ip_setsockopt+0x8e7/0x909 softirqs last disabled at (45710): [<c03ee682>] _spin_lock_bh+0x8/0x27 Pid: 2695, comm: avahi-daemon Not tainted (2.6.27-rc2-00029-g0872073 #3) EIP: 0060:[<c03ee47c>] EFLAGS: 00000297 CPU: 0 EIP is at __read_lock_failed+0x8/0x10 EAX: c4f38810 EBX: c4f38810 ECX: 00000000 EDX: c04cc22e ESI: fb0000e0 EDI: 00000011 EBP: 0f02000a ESP: c4e3faa0 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 CR0: 8005003b CR2: 44618a40 CR3: 04e37000 CR4: 000006d0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 [<c02311f8>] ? _raw_read_lock+0x23/0x25 [<c0390666>] ? ip_check_mc+0x1c/0x83 [<c036d478>] ? ip_route_input+0x229/0xe92 [<c022e2e4>] ? trace_hardirqs_on_thunk+0xc/0x10 [<c0104c9c>] ? do_IRQ+0x69/0x7d [<c0102e64>] ? restore_nocheck_notrace+0x0/0xe [<c036fdba>] ? ip_rcv+0x227/0x505 [<c0358764>] ? netif_receive_skb+0xfe/0x2b3 [<c03588d2>] ? netif_receive_skb+0x26c/0x2b3 [<c035af31>] ? process_backlog+0x73/0xbd [<c035a8cd>] ? net_rx_action+0xc1/0x1ae [<c01218a8>] ? __do_softirq+0x7b/0xef [<c0121953>] ? 
do_softirq+0x37/0x4d [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c0122037>] ? local_bh_enable+0x96/0xab [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c012181e>] ? _local_bh_enable+0x79/0x88 [<c035fcb8>] ? neigh_resolve_output+0x20f/0x239 [<c0373118>] ? ip_finish_output+0x1df/0x209 [<c0373364>] ? ip_dev_loopback_xmit+0x62/0x66 [<c0371db5>] ? ip_local_out+0x15/0x17 [<c0372013>] ? ip_push_pending_frames+0x25c/0x2bb [<c03891b8>] ? udp_push_pending_frames+0x2bb/0x30e [<c038a189>] ? udp_sendmsg+0x413/0x51d [<c038a1a9>] ? udp_sendmsg+0x433/0x51d [<c038f927>] ? inet_sendmsg+0x35/0x3f [<c034f092>] ? sock_sendmsg+0xb8/0xd1 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c022e6de>] ? copy_from_user+0x32/0x5e [<c022e6de>] ? copy_from_user+0x32/0x5e [<c034f238>] ? sys_sendmsg+0x18d/0x1f0 [<c0175e90>] ? pipe_write+0x3cb/0x3d7 [<c0170347>] ? do_sync_write+0xbe/0x105 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c03503b2>] ? sys_socketcall+0x176/0x1b0 [<c01085ea>] ? syscall_trace_enter+0x6c/0x7b [<c0102e1a>] ? syscall_call+0x7/0xb Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-08-14 07:15:57 +08:00
in_dev = ip_mc_find_dev(net, imr);
if (!in_dev) {
err = -ENODEV;
goto done;
}
err = -EADDRINUSE;
[IPV4]: multicast API "join" issues This patch corrects a few problems with the IP_ADD_MEMBERSHIP socket option: 1) The existing code makes an attempt at reference counting joins when using the ip_mreqn/imr_ifindex interface. Joining the same group on the same socket is an error, whatever the API. This leads to unexpected results when mixing ip_mreqn by index with ip_mreqn by address, ip_mreq, or other API's. For example, ip_mreq followed by ip_mreqn of the same group will "work" while the same two reversed will not. Fixed to always return EADDRINUSE on a duplicate join and removed the (now unused) reference count in ip_mc_socklist. 2) The group-search list in ip_mc_join_group() is comparing a full ip_mreqn structure and all of it must match for it to find the group. This doesn't correctly match a group that was joined with ip_mreq or ip_mreqn with an address (with or without an index). It also doesn't match groups that are joined by different addresses on the same interface. All of these are the same multicast group, which is identified by group address and interface index. Fixed the check to correctly match groups so we don't get duplicate group entries on the ip_mc_socklist. 3) The old code allocates a multicast address before searching for duplicates requiring it to free in various error cases. This patch moves the allocate until after the search and igmp_max_memberships check, so never a need to allocate, then free an entry. Signed-off-by: David L Stevens <dlstevens@us.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2005-07-09 08:38:07 +08:00
ifindex = imr->imr_ifindex;
for_each_pmc_rtnl(inet, i) {
[IPV4]: multicast API "join" issues This patch corrects a few problems with the IP_ADD_MEMBERSHIP socket option: 1) The existing code makes an attempt at reference counting joins when using the ip_mreqn/imr_ifindex interface. Joining the same group on the same socket is an error, whatever the API. This leads to unexpected results when mixing ip_mreqn by index with ip_mreqn by address, ip_mreq, or other API's. For example, ip_mreq followed by ip_mreqn of the same group will "work" while the same two reversed will not. Fixed to always return EADDRINUSE on a duplicate join and removed the (now unused) reference count in ip_mc_socklist. 2) The group-search list in ip_mc_join_group() is comparing a full ip_mreqn structure and all of it must match for it to find the group. This doesn't correctly match a group that was joined with ip_mreq or ip_mreqn with an address (with or without an index). It also doesn't match groups that are joined by different addresses on the same interface. All of these are the same multicast group, which is identified by group address and interface index. Fixed the check to correctly match groups so we don't get duplicate group entries on the ip_mc_socklist. 3) The old code allocates a multicast address before searching for duplicates requiring it to free in various error cases. This patch moves the allocate until after the search and igmp_max_memberships check, so never a need to allocate, then free an entry. Signed-off-by: David L Stevens <dlstevens@us.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2005-07-09 08:38:07 +08:00
if (i->multi.imr_multiaddr.s_addr == addr &&
i->multi.imr_ifindex == ifindex)
goto done;
count++;
}
err = -ENOBUFS;
if (count >= net->ipv4.sysctl_igmp_max_memberships)
[IPV4]: multicast API "join" issues This patch corrects a few problems with the IP_ADD_MEMBERSHIP socket option: 1) The existing code makes an attempt at reference counting joins when using the ip_mreqn/imr_ifindex interface. Joining the same group on the same socket is an error, whatever the API. This leads to unexpected results when mixing ip_mreqn by index with ip_mreqn by address, ip_mreq, or other API's. For example, ip_mreq followed by ip_mreqn of the same group will "work" while the same two reversed will not. Fixed to always return EADDRINUSE on a duplicate join and removed the (now unused) reference count in ip_mc_socklist. 2) The group-search list in ip_mc_join_group() is comparing a full ip_mreqn structure and all of it must match for it to find the group. This doesn't correctly match a group that was joined with ip_mreq or ip_mreqn with an address (with or without an index). It also doesn't match groups that are joined by different addresses on the same interface. All of these are the same multicast group, which is identified by group address and interface index. Fixed the check to correctly match groups so we don't get duplicate group entries on the ip_mc_socklist. 3) The old code allocates a multicast address before searching for duplicates requiring it to free in various error cases. This patch moves the allocate until after the search and igmp_max_memberships check, so never a need to allocate, then free an entry. Signed-off-by: David L Stevens <dlstevens@us.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2005-07-09 08:38:07 +08:00
goto done;
iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
if (!iml)
goto done;
[IPV4]: multicast API "join" issues This patch corrects a few problems with the IP_ADD_MEMBERSHIP socket option: 1) The existing code makes an attempt at reference counting joins when using the ip_mreqn/imr_ifindex interface. Joining the same group on the same socket is an error, whatever the API. This leads to unexpected results when mixing ip_mreqn by index with ip_mreqn by address, ip_mreq, or other API's. For example, ip_mreq followed by ip_mreqn of the same group will "work" while the same two reversed will not. Fixed to always return EADDRINUSE on a duplicate join and removed the (now unused) reference count in ip_mc_socklist. 2) The group-search list in ip_mc_join_group() is comparing a full ip_mreqn structure and all of it must match for it to find the group. This doesn't correctly match a group that was joined with ip_mreq or ip_mreqn with an address (with or without an index). It also doesn't match groups that are joined by different addresses on the same interface. All of these are the same multicast group, which is identified by group address and interface index. Fixed the check to correctly match groups so we don't get duplicate group entries on the ip_mc_socklist. 3) The old code allocates a multicast address before searching for duplicates requiring it to free in various error cases. This patch moves the allocate until after the search and igmp_max_memberships check, so never a need to allocate, then free an entry. Signed-off-by: David L Stevens <dlstevens@us.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2005-07-09 08:38:07 +08:00
memcpy(&iml->multi, imr, sizeof(*imr));
iml->next_rcu = inet->mc_list;
iml->sflist = NULL;
iml->sfmode = MCAST_EXCLUDE;
rcu_assign_pointer(inet->mc_list, iml);
ip_mc_inc_group(in_dev, addr);
err = 0;
done:
return err;
}
EXPORT_SYMBOL(ip_mc_join_group);
static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
struct in_device *in_dev)
{
struct ip_sf_socklist *psf = rtnl_dereference(iml->sflist);
int err;
if (!psf) {
/* any-source empty exclude case */
return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
iml->sfmode, 0, NULL, 0);
}
err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
iml->sfmode, psf->sl_count, psf->sl_addr, 0);
RCU_INIT_POINTER(iml->sflist, NULL);
/* decrease mem now to avoid the memleak warning */
atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc);
kfree_rcu(psf, rcu);
return err;
}
int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
{
struct inet_sock *inet = inet_sk(sk);
struct ip_mc_socklist *iml;
struct ip_mc_socklist __rcu **imlp;
struct in_device *in_dev;
netns: Fix crash by making igmp per namespace This patch makes the multicast socket to be per namespace. When a network namespace is created, other than the init_net and a multicast packet is received, the kernel goes to a hang or a kernel panic. How to reproduce ? * create a child network namespace * create a pair virtual device veth * ip link add type veth * move one side to the pair network device to the child namespace * ip link set netns <childpid> dev veth1 * ping -I veth0 224.0.0.1 The bug appears because the function ip_mc_init_dev does not initialize the different multicast fields as it exits because it is not the init_net. BUG: soft lockup - CPU#0 stuck for 61s! [avahi-daemon:2695] Modules linked in: irq event stamp: 50350 hardirqs last enabled at (50349): [<c03ee949>] _spin_unlock_irqrestore+0x34/0x39 hardirqs last disabled at (50350): [<c03ec639>] schedule+0x9f/0x5ff softirqs last enabled at (45712): [<c0374d4b>] ip_setsockopt+0x8e7/0x909 softirqs last disabled at (45710): [<c03ee682>] _spin_lock_bh+0x8/0x27 Pid: 2695, comm: avahi-daemon Not tainted (2.6.27-rc2-00029-g0872073 #3) EIP: 0060:[<c03ee47c>] EFLAGS: 00000297 CPU: 0 EIP is at __read_lock_failed+0x8/0x10 EAX: c4f38810 EBX: c4f38810 ECX: 00000000 EDX: c04cc22e ESI: fb0000e0 EDI: 00000011 EBP: 0f02000a ESP: c4e3faa0 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 CR0: 8005003b CR2: 44618a40 CR3: 04e37000 CR4: 000006d0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 [<c02311f8>] ? _raw_read_lock+0x23/0x25 [<c0390666>] ? ip_check_mc+0x1c/0x83 [<c036d478>] ? ip_route_input+0x229/0xe92 [<c022e2e4>] ? trace_hardirqs_on_thunk+0xc/0x10 [<c0104c9c>] ? do_IRQ+0x69/0x7d [<c0102e64>] ? restore_nocheck_notrace+0x0/0xe [<c036fdba>] ? ip_rcv+0x227/0x505 [<c0358764>] ? netif_receive_skb+0xfe/0x2b3 [<c03588d2>] ? netif_receive_skb+0x26c/0x2b3 [<c035af31>] ? process_backlog+0x73/0xbd [<c035a8cd>] ? net_rx_action+0xc1/0x1ae [<c01218a8>] ? __do_softirq+0x7b/0xef [<c0121953>] ? 
do_softirq+0x37/0x4d [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c0122037>] ? local_bh_enable+0x96/0xab [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c012181e>] ? _local_bh_enable+0x79/0x88 [<c035fcb8>] ? neigh_resolve_output+0x20f/0x239 [<c0373118>] ? ip_finish_output+0x1df/0x209 [<c0373364>] ? ip_dev_loopback_xmit+0x62/0x66 [<c0371db5>] ? ip_local_out+0x15/0x17 [<c0372013>] ? ip_push_pending_frames+0x25c/0x2bb [<c03891b8>] ? udp_push_pending_frames+0x2bb/0x30e [<c038a189>] ? udp_sendmsg+0x413/0x51d [<c038a1a9>] ? udp_sendmsg+0x433/0x51d [<c038f927>] ? inet_sendmsg+0x35/0x3f [<c034f092>] ? sock_sendmsg+0xb8/0xd1 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c022e6de>] ? copy_from_user+0x32/0x5e [<c022e6de>] ? copy_from_user+0x32/0x5e [<c034f238>] ? sys_sendmsg+0x18d/0x1f0 [<c0175e90>] ? pipe_write+0x3cb/0x3d7 [<c0170347>] ? do_sync_write+0xbe/0x105 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c03503b2>] ? sys_socketcall+0x176/0x1b0 [<c01085ea>] ? syscall_trace_enter+0x6c/0x7b [<c0102e1a>] ? syscall_call+0x7/0xb Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-08-14 07:15:57 +08:00
struct net *net = sock_net(sk);
__be32 group = imr->imr_multiaddr.s_addr;
u32 ifindex;
int ret = -EADDRNOTAVAIL;
ASSERT_RTNL();
netns: Fix crash by making igmp per namespace This patch makes the multicast socket to be per namespace. When a network namespace is created, other than the init_net and a multicast packet is received, the kernel goes to a hang or a kernel panic. How to reproduce ? * create a child network namespace * create a pair virtual device veth * ip link add type veth * move one side to the pair network device to the child namespace * ip link set netns <childpid> dev veth1 * ping -I veth0 224.0.0.1 The bug appears because the function ip_mc_init_dev does not initialize the different multicast fields as it exits because it is not the init_net. BUG: soft lockup - CPU#0 stuck for 61s! [avahi-daemon:2695] Modules linked in: irq event stamp: 50350 hardirqs last enabled at (50349): [<c03ee949>] _spin_unlock_irqrestore+0x34/0x39 hardirqs last disabled at (50350): [<c03ec639>] schedule+0x9f/0x5ff softirqs last enabled at (45712): [<c0374d4b>] ip_setsockopt+0x8e7/0x909 softirqs last disabled at (45710): [<c03ee682>] _spin_lock_bh+0x8/0x27 Pid: 2695, comm: avahi-daemon Not tainted (2.6.27-rc2-00029-g0872073 #3) EIP: 0060:[<c03ee47c>] EFLAGS: 00000297 CPU: 0 EIP is at __read_lock_failed+0x8/0x10 EAX: c4f38810 EBX: c4f38810 ECX: 00000000 EDX: c04cc22e ESI: fb0000e0 EDI: 00000011 EBP: 0f02000a ESP: c4e3faa0 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 CR0: 8005003b CR2: 44618a40 CR3: 04e37000 CR4: 000006d0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 [<c02311f8>] ? _raw_read_lock+0x23/0x25 [<c0390666>] ? ip_check_mc+0x1c/0x83 [<c036d478>] ? ip_route_input+0x229/0xe92 [<c022e2e4>] ? trace_hardirqs_on_thunk+0xc/0x10 [<c0104c9c>] ? do_IRQ+0x69/0x7d [<c0102e64>] ? restore_nocheck_notrace+0x0/0xe [<c036fdba>] ? ip_rcv+0x227/0x505 [<c0358764>] ? netif_receive_skb+0xfe/0x2b3 [<c03588d2>] ? netif_receive_skb+0x26c/0x2b3 [<c035af31>] ? process_backlog+0x73/0xbd [<c035a8cd>] ? net_rx_action+0xc1/0x1ae [<c01218a8>] ? __do_softirq+0x7b/0xef [<c0121953>] ? 
do_softirq+0x37/0x4d [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c0122037>] ? local_bh_enable+0x96/0xab [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c012181e>] ? _local_bh_enable+0x79/0x88 [<c035fcb8>] ? neigh_resolve_output+0x20f/0x239 [<c0373118>] ? ip_finish_output+0x1df/0x209 [<c0373364>] ? ip_dev_loopback_xmit+0x62/0x66 [<c0371db5>] ? ip_local_out+0x15/0x17 [<c0372013>] ? ip_push_pending_frames+0x25c/0x2bb [<c03891b8>] ? udp_push_pending_frames+0x2bb/0x30e [<c038a189>] ? udp_sendmsg+0x413/0x51d [<c038a1a9>] ? udp_sendmsg+0x433/0x51d [<c038f927>] ? inet_sendmsg+0x35/0x3f [<c034f092>] ? sock_sendmsg+0xb8/0xd1 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c022e6de>] ? copy_from_user+0x32/0x5e [<c022e6de>] ? copy_from_user+0x32/0x5e [<c034f238>] ? sys_sendmsg+0x18d/0x1f0 [<c0175e90>] ? pipe_write+0x3cb/0x3d7 [<c0170347>] ? do_sync_write+0xbe/0x105 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c03503b2>] ? sys_socketcall+0x176/0x1b0 [<c01085ea>] ? syscall_trace_enter+0x6c/0x7b [<c0102e1a>] ? syscall_call+0x7/0xb Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-08-14 07:15:57 +08:00
in_dev = ip_mc_find_dev(net, imr);
if (!imr->imr_ifindex && !imr->imr_address.s_addr && !in_dev) {
igmp: fix the problem when mc leave group The problem was triggered by these steps: 1) create socket, bind and then setsockopt for add mc group. mreq.imr_multiaddr.s_addr = inet_addr("255.0.0.37"); mreq.imr_interface.s_addr = inet_addr("192.168.1.2"); setsockopt(sockfd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)); 2) drop the mc group for this socket. mreq.imr_multiaddr.s_addr = inet_addr("255.0.0.37"); mreq.imr_interface.s_addr = inet_addr("0.0.0.0"); setsockopt(sockfd, IPPROTO_IP, IP_DROP_MEMBERSHIP, &mreq, sizeof(mreq)); 3) and then drop the socket, I found the mc group was still used by the dev: netstat -g Interface RefCnt Group --------------- ------ --------------------- eth2 1 255.0.0.37 Normally even though the IP_DROP_MEMBERSHIP return error, the mc group still need to be released for the netdev when drop the socket, but this process was broken when route default is NULL, the reason is that: The ip_mc_leave_group() will choose the in_dev by the imr_interface.s_addr, if input addr is NULL, the default route dev will be chosen, then the ifindex is got from the dev, then polling the inet->mc_list and return -ENODEV, but if the default route dev is NULL, the in_dev and ifIndex is both NULL, when polling the inet->mc_list, the mc group will be released from the mc_list, but the dev didn't dec the refcnt for this mc group, so when dropping the socket, the mc_list is NULL and the dev still keep this group. v1->v2: According Hideaki's suggestion, we should align with IPv6 (RFC3493) and BSDs, so I add the checking for the in_dev before polling the mc_list, make sure when we remove the mc group, dec the refcnt to the real dev which was using the mc address. The problem would never happened again. Signed-off-by: Ding Tianhong <dingtianhong@huawei.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-07-02 13:50:48 +08:00
ret = -ENODEV;
goto out;
}
ifindex = imr->imr_ifindex;
for (imlp = &inet->mc_list;
(iml = rtnl_dereference(*imlp)) != NULL;
imlp = &iml->next_rcu) {
if (iml->multi.imr_multiaddr.s_addr != group)
continue;
if (ifindex) {
if (iml->multi.imr_ifindex != ifindex)
continue;
} else if (imr->imr_address.s_addr && imr->imr_address.s_addr !=
iml->multi.imr_address.s_addr)
continue;
(void) ip_mc_leave_src(sk, iml, in_dev);
*imlp = iml->next_rcu;
if (in_dev)
ip_mc_dec_group(in_dev, group);
/* decrease mem now to avoid the memleak warning */
atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
kfree_rcu(iml, rcu);
return 0;
}
igmp: fix the problem when mc leave group The problem was triggered by these steps: 1) create socket, bind and then setsockopt for add mc group. mreq.imr_multiaddr.s_addr = inet_addr("255.0.0.37"); mreq.imr_interface.s_addr = inet_addr("192.168.1.2"); setsockopt(sockfd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)); 2) drop the mc group for this socket. mreq.imr_multiaddr.s_addr = inet_addr("255.0.0.37"); mreq.imr_interface.s_addr = inet_addr("0.0.0.0"); setsockopt(sockfd, IPPROTO_IP, IP_DROP_MEMBERSHIP, &mreq, sizeof(mreq)); 3) and then drop the socket, I found the mc group was still used by the dev: netstat -g Interface RefCnt Group --------------- ------ --------------------- eth2 1 255.0.0.37 Normally even though the IP_DROP_MEMBERSHIP return error, the mc group still need to be released for the netdev when drop the socket, but this process was broken when route default is NULL, the reason is that: The ip_mc_leave_group() will choose the in_dev by the imr_interface.s_addr, if input addr is NULL, the default route dev will be chosen, then the ifindex is got from the dev, then polling the inet->mc_list and return -ENODEV, but if the default route dev is NULL, the in_dev and ifIndex is both NULL, when polling the inet->mc_list, the mc group will be released from the mc_list, but the dev didn't dec the refcnt for this mc group, so when dropping the socket, the mc_list is NULL and the dev still keep this group. v1->v2: According Hideaki's suggestion, we should align with IPv6 (RFC3493) and BSDs, so I add the checking for the in_dev before polling the mc_list, make sure when we remove the mc group, dec the refcnt to the real dev which was using the mc address. The problem would never happened again. Signed-off-by: Ding Tianhong <dingtianhong@huawei.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-07-02 13:50:48 +08:00
out:
return ret;
}
EXPORT_SYMBOL(ip_mc_leave_group);
int ip_mc_source(int add, int omode, struct sock *sk, struct
ip_mreq_source *mreqs, int ifindex)
{
int err;
struct ip_mreqn imr;
__be32 addr = mreqs->imr_multiaddr;
struct ip_mc_socklist *pmc;
struct in_device *in_dev = NULL;
struct inet_sock *inet = inet_sk(sk);
struct ip_sf_socklist *psl;
netns: Fix crash by making igmp per namespace This patch makes the multicast socket to be per namespace. When a network namespace is created, other than the init_net and a multicast packet is received, the kernel goes to a hang or a kernel panic. How to reproduce ? * create a child network namespace * create a pair virtual device veth * ip link add type veth * move one side to the pair network device to the child namespace * ip link set netns <childpid> dev veth1 * ping -I veth0 224.0.0.1 The bug appears because the function ip_mc_init_dev does not initialize the different multicast fields as it exits because it is not the init_net. BUG: soft lockup - CPU#0 stuck for 61s! [avahi-daemon:2695] Modules linked in: irq event stamp: 50350 hardirqs last enabled at (50349): [<c03ee949>] _spin_unlock_irqrestore+0x34/0x39 hardirqs last disabled at (50350): [<c03ec639>] schedule+0x9f/0x5ff softirqs last enabled at (45712): [<c0374d4b>] ip_setsockopt+0x8e7/0x909 softirqs last disabled at (45710): [<c03ee682>] _spin_lock_bh+0x8/0x27 Pid: 2695, comm: avahi-daemon Not tainted (2.6.27-rc2-00029-g0872073 #3) EIP: 0060:[<c03ee47c>] EFLAGS: 00000297 CPU: 0 EIP is at __read_lock_failed+0x8/0x10 EAX: c4f38810 EBX: c4f38810 ECX: 00000000 EDX: c04cc22e ESI: fb0000e0 EDI: 00000011 EBP: 0f02000a ESP: c4e3faa0 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 CR0: 8005003b CR2: 44618a40 CR3: 04e37000 CR4: 000006d0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 [<c02311f8>] ? _raw_read_lock+0x23/0x25 [<c0390666>] ? ip_check_mc+0x1c/0x83 [<c036d478>] ? ip_route_input+0x229/0xe92 [<c022e2e4>] ? trace_hardirqs_on_thunk+0xc/0x10 [<c0104c9c>] ? do_IRQ+0x69/0x7d [<c0102e64>] ? restore_nocheck_notrace+0x0/0xe [<c036fdba>] ? ip_rcv+0x227/0x505 [<c0358764>] ? netif_receive_skb+0xfe/0x2b3 [<c03588d2>] ? netif_receive_skb+0x26c/0x2b3 [<c035af31>] ? process_backlog+0x73/0xbd [<c035a8cd>] ? net_rx_action+0xc1/0x1ae [<c01218a8>] ? __do_softirq+0x7b/0xef [<c0121953>] ? 
do_softirq+0x37/0x4d [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c0122037>] ? local_bh_enable+0x96/0xab [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c012181e>] ? _local_bh_enable+0x79/0x88 [<c035fcb8>] ? neigh_resolve_output+0x20f/0x239 [<c0373118>] ? ip_finish_output+0x1df/0x209 [<c0373364>] ? ip_dev_loopback_xmit+0x62/0x66 [<c0371db5>] ? ip_local_out+0x15/0x17 [<c0372013>] ? ip_push_pending_frames+0x25c/0x2bb [<c03891b8>] ? udp_push_pending_frames+0x2bb/0x30e [<c038a189>] ? udp_sendmsg+0x413/0x51d [<c038a1a9>] ? udp_sendmsg+0x433/0x51d [<c038f927>] ? inet_sendmsg+0x35/0x3f [<c034f092>] ? sock_sendmsg+0xb8/0xd1 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c022e6de>] ? copy_from_user+0x32/0x5e [<c022e6de>] ? copy_from_user+0x32/0x5e [<c034f238>] ? sys_sendmsg+0x18d/0x1f0 [<c0175e90>] ? pipe_write+0x3cb/0x3d7 [<c0170347>] ? do_sync_write+0xbe/0x105 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c03503b2>] ? sys_socketcall+0x176/0x1b0 [<c01085ea>] ? syscall_trace_enter+0x6c/0x7b [<c0102e1a>] ? syscall_call+0x7/0xb Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-08-14 07:15:57 +08:00
struct net *net = sock_net(sk);
int leavegroup = 0;
int i, j, rv;
if (!ipv4_is_multicast(addr))
return -EINVAL;
ASSERT_RTNL();
imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr;
imr.imr_address.s_addr = mreqs->imr_interface;
imr.imr_ifindex = ifindex;
netns: Fix crash by making igmp per namespace This patch makes the multicast socket to be per namespace. When a network namespace is created, other than the init_net and a multicast packet is received, the kernel goes to a hang or a kernel panic. How to reproduce ? * create a child network namespace * create a pair virtual device veth * ip link add type veth * move one side to the pair network device to the child namespace * ip link set netns <childpid> dev veth1 * ping -I veth0 224.0.0.1 The bug appears because the function ip_mc_init_dev does not initialize the different multicast fields as it exits because it is not the init_net. BUG: soft lockup - CPU#0 stuck for 61s! [avahi-daemon:2695] Modules linked in: irq event stamp: 50350 hardirqs last enabled at (50349): [<c03ee949>] _spin_unlock_irqrestore+0x34/0x39 hardirqs last disabled at (50350): [<c03ec639>] schedule+0x9f/0x5ff softirqs last enabled at (45712): [<c0374d4b>] ip_setsockopt+0x8e7/0x909 softirqs last disabled at (45710): [<c03ee682>] _spin_lock_bh+0x8/0x27 Pid: 2695, comm: avahi-daemon Not tainted (2.6.27-rc2-00029-g0872073 #3) EIP: 0060:[<c03ee47c>] EFLAGS: 00000297 CPU: 0 EIP is at __read_lock_failed+0x8/0x10 EAX: c4f38810 EBX: c4f38810 ECX: 00000000 EDX: c04cc22e ESI: fb0000e0 EDI: 00000011 EBP: 0f02000a ESP: c4e3faa0 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 CR0: 8005003b CR2: 44618a40 CR3: 04e37000 CR4: 000006d0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 [<c02311f8>] ? _raw_read_lock+0x23/0x25 [<c0390666>] ? ip_check_mc+0x1c/0x83 [<c036d478>] ? ip_route_input+0x229/0xe92 [<c022e2e4>] ? trace_hardirqs_on_thunk+0xc/0x10 [<c0104c9c>] ? do_IRQ+0x69/0x7d [<c0102e64>] ? restore_nocheck_notrace+0x0/0xe [<c036fdba>] ? ip_rcv+0x227/0x505 [<c0358764>] ? netif_receive_skb+0xfe/0x2b3 [<c03588d2>] ? netif_receive_skb+0x26c/0x2b3 [<c035af31>] ? process_backlog+0x73/0xbd [<c035a8cd>] ? net_rx_action+0xc1/0x1ae [<c01218a8>] ? __do_softirq+0x7b/0xef [<c0121953>] ? 
do_softirq+0x37/0x4d [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c0122037>] ? local_bh_enable+0x96/0xab [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c012181e>] ? _local_bh_enable+0x79/0x88 [<c035fcb8>] ? neigh_resolve_output+0x20f/0x239 [<c0373118>] ? ip_finish_output+0x1df/0x209 [<c0373364>] ? ip_dev_loopback_xmit+0x62/0x66 [<c0371db5>] ? ip_local_out+0x15/0x17 [<c0372013>] ? ip_push_pending_frames+0x25c/0x2bb [<c03891b8>] ? udp_push_pending_frames+0x2bb/0x30e [<c038a189>] ? udp_sendmsg+0x413/0x51d [<c038a1a9>] ? udp_sendmsg+0x433/0x51d [<c038f927>] ? inet_sendmsg+0x35/0x3f [<c034f092>] ? sock_sendmsg+0xb8/0xd1 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c022e6de>] ? copy_from_user+0x32/0x5e [<c022e6de>] ? copy_from_user+0x32/0x5e [<c034f238>] ? sys_sendmsg+0x18d/0x1f0 [<c0175e90>] ? pipe_write+0x3cb/0x3d7 [<c0170347>] ? do_sync_write+0xbe/0x105 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c03503b2>] ? sys_socketcall+0x176/0x1b0 [<c01085ea>] ? syscall_trace_enter+0x6c/0x7b [<c0102e1a>] ? syscall_call+0x7/0xb Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-08-14 07:15:57 +08:00
in_dev = ip_mc_find_dev(net, &imr);
if (!in_dev) {
err = -ENODEV;
goto done;
}
err = -EADDRNOTAVAIL;
for_each_pmc_rtnl(inet, pmc) {
if ((pmc->multi.imr_multiaddr.s_addr ==
imr.imr_multiaddr.s_addr) &&
(pmc->multi.imr_ifindex == imr.imr_ifindex))
break;
}
if (!pmc) { /* must have a prior join */
err = -EINVAL;
goto done;
}
/* if a source filter was set, must be the same mode as before */
if (pmc->sflist) {
if (pmc->sfmode != omode) {
err = -EINVAL;
goto done;
}
} else if (pmc->sfmode != omode) {
/* allow mode switches for empty-set filters */
ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0);
ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, pmc->sfmode, 0,
NULL, 0);
pmc->sfmode = omode;
}
psl = rtnl_dereference(pmc->sflist);
if (!add) {
if (!psl)
goto done; /* err = -EADDRNOTAVAIL */
rv = !0;
for (i = 0; i < psl->sl_count; i++) {
rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
sizeof(__be32));
if (rv == 0)
break;
}
if (rv) /* source not found */
goto done; /* err = -EADDRNOTAVAIL */
/* special case - (INCLUDE, empty) == LEAVE_GROUP */
if (psl->sl_count == 1 && omode == MCAST_INCLUDE) {
leavegroup = 1;
goto done;
}
/* update the interface filter */
ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
&mreqs->imr_sourceaddr, 1);
for (j = i+1; j < psl->sl_count; j++)
psl->sl_addr[j-1] = psl->sl_addr[j];
psl->sl_count--;
err = 0;
goto done;
}
/* else, add a new source to the filter */
if (psl && psl->sl_count >= net->ipv4.sysctl_igmp_max_msf) {
err = -ENOBUFS;
goto done;
}
if (!psl || psl->sl_count == psl->sl_max) {
struct ip_sf_socklist *newpsl;
int count = IP_SFBLOCK;
if (psl)
count += psl->sl_max;
newpsl = sock_kmalloc(sk, IP_SFLSIZE(count), GFP_KERNEL);
if (!newpsl) {
err = -ENOBUFS;
goto done;
}
newpsl->sl_max = count;
newpsl->sl_count = count - IP_SFBLOCK;
if (psl) {
for (i = 0; i < psl->sl_count; i++)
newpsl->sl_addr[i] = psl->sl_addr[i];
/* decrease mem now to avoid the memleak warning */
atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
kfree_rcu(psl, rcu);
}
rcu_assign_pointer(pmc->sflist, newpsl);
psl = newpsl;
}
rv = 1; /* > 0 for insert logic below if sl_count is 0 */
for (i = 0; i < psl->sl_count; i++) {
rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
sizeof(__be32));
if (rv == 0)
break;
}
if (rv == 0) /* address already there is an error */
goto done;
for (j = psl->sl_count-1; j >= i; j--)
psl->sl_addr[j+1] = psl->sl_addr[j];
psl->sl_addr[i] = mreqs->imr_sourceaddr;
psl->sl_count++;
err = 0;
/* update the interface list */
ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
&mreqs->imr_sourceaddr, 1);
done:
if (leavegroup)
err = ip_mc_leave_group(sk, &imr);
return err;
}
int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
{
int err = 0;
struct ip_mreqn imr;
__be32 addr = msf->imsf_multiaddr;
struct ip_mc_socklist *pmc;
struct in_device *in_dev;
struct inet_sock *inet = inet_sk(sk);
struct ip_sf_socklist *newpsl, *psl;
netns: Fix crash by making igmp per namespace This patch makes the multicast socket to be per namespace. When a network namespace is created, other than the init_net and a multicast packet is received, the kernel goes to a hang or a kernel panic. How to reproduce ? * create a child network namespace * create a pair virtual device veth * ip link add type veth * move one side to the pair network device to the child namespace * ip link set netns <childpid> dev veth1 * ping -I veth0 224.0.0.1 The bug appears because the function ip_mc_init_dev does not initialize the different multicast fields as it exits because it is not the init_net. BUG: soft lockup - CPU#0 stuck for 61s! [avahi-daemon:2695] Modules linked in: irq event stamp: 50350 hardirqs last enabled at (50349): [<c03ee949>] _spin_unlock_irqrestore+0x34/0x39 hardirqs last disabled at (50350): [<c03ec639>] schedule+0x9f/0x5ff softirqs last enabled at (45712): [<c0374d4b>] ip_setsockopt+0x8e7/0x909 softirqs last disabled at (45710): [<c03ee682>] _spin_lock_bh+0x8/0x27 Pid: 2695, comm: avahi-daemon Not tainted (2.6.27-rc2-00029-g0872073 #3) EIP: 0060:[<c03ee47c>] EFLAGS: 00000297 CPU: 0 EIP is at __read_lock_failed+0x8/0x10 EAX: c4f38810 EBX: c4f38810 ECX: 00000000 EDX: c04cc22e ESI: fb0000e0 EDI: 00000011 EBP: 0f02000a ESP: c4e3faa0 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 CR0: 8005003b CR2: 44618a40 CR3: 04e37000 CR4: 000006d0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 [<c02311f8>] ? _raw_read_lock+0x23/0x25 [<c0390666>] ? ip_check_mc+0x1c/0x83 [<c036d478>] ? ip_route_input+0x229/0xe92 [<c022e2e4>] ? trace_hardirqs_on_thunk+0xc/0x10 [<c0104c9c>] ? do_IRQ+0x69/0x7d [<c0102e64>] ? restore_nocheck_notrace+0x0/0xe [<c036fdba>] ? ip_rcv+0x227/0x505 [<c0358764>] ? netif_receive_skb+0xfe/0x2b3 [<c03588d2>] ? netif_receive_skb+0x26c/0x2b3 [<c035af31>] ? process_backlog+0x73/0xbd [<c035a8cd>] ? net_rx_action+0xc1/0x1ae [<c01218a8>] ? __do_softirq+0x7b/0xef [<c0121953>] ? 
do_softirq+0x37/0x4d [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c0122037>] ? local_bh_enable+0x96/0xab [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c012181e>] ? _local_bh_enable+0x79/0x88 [<c035fcb8>] ? neigh_resolve_output+0x20f/0x239 [<c0373118>] ? ip_finish_output+0x1df/0x209 [<c0373364>] ? ip_dev_loopback_xmit+0x62/0x66 [<c0371db5>] ? ip_local_out+0x15/0x17 [<c0372013>] ? ip_push_pending_frames+0x25c/0x2bb [<c03891b8>] ? udp_push_pending_frames+0x2bb/0x30e [<c038a189>] ? udp_sendmsg+0x413/0x51d [<c038a1a9>] ? udp_sendmsg+0x433/0x51d [<c038f927>] ? inet_sendmsg+0x35/0x3f [<c034f092>] ? sock_sendmsg+0xb8/0xd1 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c022e6de>] ? copy_from_user+0x32/0x5e [<c022e6de>] ? copy_from_user+0x32/0x5e [<c034f238>] ? sys_sendmsg+0x18d/0x1f0 [<c0175e90>] ? pipe_write+0x3cb/0x3d7 [<c0170347>] ? do_sync_write+0xbe/0x105 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c03503b2>] ? sys_socketcall+0x176/0x1b0 [<c01085ea>] ? syscall_trace_enter+0x6c/0x7b [<c0102e1a>] ? syscall_call+0x7/0xb Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-08-14 07:15:57 +08:00
struct net *net = sock_net(sk);
int leavegroup = 0;
if (!ipv4_is_multicast(addr))
return -EINVAL;
if (msf->imsf_fmode != MCAST_INCLUDE &&
msf->imsf_fmode != MCAST_EXCLUDE)
return -EINVAL;
ASSERT_RTNL();
imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
imr.imr_address.s_addr = msf->imsf_interface;
imr.imr_ifindex = ifindex;
netns: Fix crash by making igmp per namespace This patch makes the multicast socket to be per namespace. When a network namespace is created, other than the init_net and a multicast packet is received, the kernel goes to a hang or a kernel panic. How to reproduce ? * create a child network namespace * create a pair virtual device veth * ip link add type veth * move one side to the pair network device to the child namespace * ip link set netns <childpid> dev veth1 * ping -I veth0 224.0.0.1 The bug appears because the function ip_mc_init_dev does not initialize the different multicast fields as it exits because it is not the init_net. BUG: soft lockup - CPU#0 stuck for 61s! [avahi-daemon:2695] Modules linked in: irq event stamp: 50350 hardirqs last enabled at (50349): [<c03ee949>] _spin_unlock_irqrestore+0x34/0x39 hardirqs last disabled at (50350): [<c03ec639>] schedule+0x9f/0x5ff softirqs last enabled at (45712): [<c0374d4b>] ip_setsockopt+0x8e7/0x909 softirqs last disabled at (45710): [<c03ee682>] _spin_lock_bh+0x8/0x27 Pid: 2695, comm: avahi-daemon Not tainted (2.6.27-rc2-00029-g0872073 #3) EIP: 0060:[<c03ee47c>] EFLAGS: 00000297 CPU: 0 EIP is at __read_lock_failed+0x8/0x10 EAX: c4f38810 EBX: c4f38810 ECX: 00000000 EDX: c04cc22e ESI: fb0000e0 EDI: 00000011 EBP: 0f02000a ESP: c4e3faa0 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 CR0: 8005003b CR2: 44618a40 CR3: 04e37000 CR4: 000006d0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 [<c02311f8>] ? _raw_read_lock+0x23/0x25 [<c0390666>] ? ip_check_mc+0x1c/0x83 [<c036d478>] ? ip_route_input+0x229/0xe92 [<c022e2e4>] ? trace_hardirqs_on_thunk+0xc/0x10 [<c0104c9c>] ? do_IRQ+0x69/0x7d [<c0102e64>] ? restore_nocheck_notrace+0x0/0xe [<c036fdba>] ? ip_rcv+0x227/0x505 [<c0358764>] ? netif_receive_skb+0xfe/0x2b3 [<c03588d2>] ? netif_receive_skb+0x26c/0x2b3 [<c035af31>] ? process_backlog+0x73/0xbd [<c035a8cd>] ? net_rx_action+0xc1/0x1ae [<c01218a8>] ? __do_softirq+0x7b/0xef [<c0121953>] ? 
do_softirq+0x37/0x4d [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c0122037>] ? local_bh_enable+0x96/0xab [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c012181e>] ? _local_bh_enable+0x79/0x88 [<c035fcb8>] ? neigh_resolve_output+0x20f/0x239 [<c0373118>] ? ip_finish_output+0x1df/0x209 [<c0373364>] ? ip_dev_loopback_xmit+0x62/0x66 [<c0371db5>] ? ip_local_out+0x15/0x17 [<c0372013>] ? ip_push_pending_frames+0x25c/0x2bb [<c03891b8>] ? udp_push_pending_frames+0x2bb/0x30e [<c038a189>] ? udp_sendmsg+0x413/0x51d [<c038a1a9>] ? udp_sendmsg+0x433/0x51d [<c038f927>] ? inet_sendmsg+0x35/0x3f [<c034f092>] ? sock_sendmsg+0xb8/0xd1 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c022e6de>] ? copy_from_user+0x32/0x5e [<c022e6de>] ? copy_from_user+0x32/0x5e [<c034f238>] ? sys_sendmsg+0x18d/0x1f0 [<c0175e90>] ? pipe_write+0x3cb/0x3d7 [<c0170347>] ? do_sync_write+0xbe/0x105 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c03503b2>] ? sys_socketcall+0x176/0x1b0 [<c01085ea>] ? syscall_trace_enter+0x6c/0x7b [<c0102e1a>] ? syscall_call+0x7/0xb Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-08-14 07:15:57 +08:00
in_dev = ip_mc_find_dev(net, &imr);
if (!in_dev) {
err = -ENODEV;
goto done;
}
/* special case - (INCLUDE, empty) == LEAVE_GROUP */
if (msf->imsf_fmode == MCAST_INCLUDE && msf->imsf_numsrc == 0) {
leavegroup = 1;
goto done;
}
for_each_pmc_rtnl(inet, pmc) {
if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
pmc->multi.imr_ifindex == imr.imr_ifindex)
break;
}
if (!pmc) { /* must have a prior join */
err = -EINVAL;
goto done;
}
if (msf->imsf_numsrc) {
newpsl = sock_kmalloc(sk, IP_SFLSIZE(msf->imsf_numsrc),
GFP_KERNEL);
if (!newpsl) {
err = -ENOBUFS;
goto done;
}
newpsl->sl_max = newpsl->sl_count = msf->imsf_numsrc;
memcpy(newpsl->sl_addr, msf->imsf_slist,
msf->imsf_numsrc * sizeof(msf->imsf_slist[0]));
err = ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
msf->imsf_fmode, newpsl->sl_count, newpsl->sl_addr, 0);
if (err) {
sock_kfree_s(sk, newpsl, IP_SFLSIZE(newpsl->sl_max));
goto done;
}
} else {
newpsl = NULL;
(void) ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
msf->imsf_fmode, 0, NULL, 0);
}
psl = rtnl_dereference(pmc->sflist);
if (psl) {
(void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
psl->sl_count, psl->sl_addr, 0);
/* decrease mem now to avoid the memleak warning */
atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
kfree_rcu(psl, rcu);
} else
(void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
0, NULL, 0);
rcu_assign_pointer(pmc->sflist, newpsl);
pmc->sfmode = msf->imsf_fmode;
err = 0;
done:
if (leavegroup)
err = ip_mc_leave_group(sk, &imr);
return err;
}
int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
struct ip_msfilter __user *optval, int __user *optlen)
{
int err, len, count, copycount;
struct ip_mreqn imr;
__be32 addr = msf->imsf_multiaddr;
struct ip_mc_socklist *pmc;
struct in_device *in_dev;
struct inet_sock *inet = inet_sk(sk);
struct ip_sf_socklist *psl;
netns: Fix crash by making igmp per namespace This patch makes the multicast socket to be per namespace. When a network namespace is created, other than the init_net and a multicast packet is received, the kernel goes to a hang or a kernel panic. How to reproduce ? * create a child network namespace * create a pair virtual device veth * ip link add type veth * move one side to the pair network device to the child namespace * ip link set netns <childpid> dev veth1 * ping -I veth0 224.0.0.1 The bug appears because the function ip_mc_init_dev does not initialize the different multicast fields as it exits because it is not the init_net. BUG: soft lockup - CPU#0 stuck for 61s! [avahi-daemon:2695] Modules linked in: irq event stamp: 50350 hardirqs last enabled at (50349): [<c03ee949>] _spin_unlock_irqrestore+0x34/0x39 hardirqs last disabled at (50350): [<c03ec639>] schedule+0x9f/0x5ff softirqs last enabled at (45712): [<c0374d4b>] ip_setsockopt+0x8e7/0x909 softirqs last disabled at (45710): [<c03ee682>] _spin_lock_bh+0x8/0x27 Pid: 2695, comm: avahi-daemon Not tainted (2.6.27-rc2-00029-g0872073 #3) EIP: 0060:[<c03ee47c>] EFLAGS: 00000297 CPU: 0 EIP is at __read_lock_failed+0x8/0x10 EAX: c4f38810 EBX: c4f38810 ECX: 00000000 EDX: c04cc22e ESI: fb0000e0 EDI: 00000011 EBP: 0f02000a ESP: c4e3faa0 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 CR0: 8005003b CR2: 44618a40 CR3: 04e37000 CR4: 000006d0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 [<c02311f8>] ? _raw_read_lock+0x23/0x25 [<c0390666>] ? ip_check_mc+0x1c/0x83 [<c036d478>] ? ip_route_input+0x229/0xe92 [<c022e2e4>] ? trace_hardirqs_on_thunk+0xc/0x10 [<c0104c9c>] ? do_IRQ+0x69/0x7d [<c0102e64>] ? restore_nocheck_notrace+0x0/0xe [<c036fdba>] ? ip_rcv+0x227/0x505 [<c0358764>] ? netif_receive_skb+0xfe/0x2b3 [<c03588d2>] ? netif_receive_skb+0x26c/0x2b3 [<c035af31>] ? process_backlog+0x73/0xbd [<c035a8cd>] ? net_rx_action+0xc1/0x1ae [<c01218a8>] ? __do_softirq+0x7b/0xef [<c0121953>] ? 
do_softirq+0x37/0x4d [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c0122037>] ? local_bh_enable+0x96/0xab [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c012181e>] ? _local_bh_enable+0x79/0x88 [<c035fcb8>] ? neigh_resolve_output+0x20f/0x239 [<c0373118>] ? ip_finish_output+0x1df/0x209 [<c0373364>] ? ip_dev_loopback_xmit+0x62/0x66 [<c0371db5>] ? ip_local_out+0x15/0x17 [<c0372013>] ? ip_push_pending_frames+0x25c/0x2bb [<c03891b8>] ? udp_push_pending_frames+0x2bb/0x30e [<c038a189>] ? udp_sendmsg+0x413/0x51d [<c038a1a9>] ? udp_sendmsg+0x433/0x51d [<c038f927>] ? inet_sendmsg+0x35/0x3f [<c034f092>] ? sock_sendmsg+0xb8/0xd1 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c022e6de>] ? copy_from_user+0x32/0x5e [<c022e6de>] ? copy_from_user+0x32/0x5e [<c034f238>] ? sys_sendmsg+0x18d/0x1f0 [<c0175e90>] ? pipe_write+0x3cb/0x3d7 [<c0170347>] ? do_sync_write+0xbe/0x105 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c03503b2>] ? sys_socketcall+0x176/0x1b0 [<c01085ea>] ? syscall_trace_enter+0x6c/0x7b [<c0102e1a>] ? syscall_call+0x7/0xb Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-08-14 07:15:57 +08:00
struct net *net = sock_net(sk);
ASSERT_RTNL();
if (!ipv4_is_multicast(addr))
return -EINVAL;
imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
imr.imr_address.s_addr = msf->imsf_interface;
imr.imr_ifindex = 0;
netns: Fix crash by making igmp per namespace This patch makes the multicast socket to be per namespace. When a network namespace is created, other than the init_net and a multicast packet is received, the kernel goes to a hang or a kernel panic. How to reproduce ? * create a child network namespace * create a pair virtual device veth * ip link add type veth * move one side to the pair network device to the child namespace * ip link set netns <childpid> dev veth1 * ping -I veth0 224.0.0.1 The bug appears because the function ip_mc_init_dev does not initialize the different multicast fields as it exits because it is not the init_net. BUG: soft lockup - CPU#0 stuck for 61s! [avahi-daemon:2695] Modules linked in: irq event stamp: 50350 hardirqs last enabled at (50349): [<c03ee949>] _spin_unlock_irqrestore+0x34/0x39 hardirqs last disabled at (50350): [<c03ec639>] schedule+0x9f/0x5ff softirqs last enabled at (45712): [<c0374d4b>] ip_setsockopt+0x8e7/0x909 softirqs last disabled at (45710): [<c03ee682>] _spin_lock_bh+0x8/0x27 Pid: 2695, comm: avahi-daemon Not tainted (2.6.27-rc2-00029-g0872073 #3) EIP: 0060:[<c03ee47c>] EFLAGS: 00000297 CPU: 0 EIP is at __read_lock_failed+0x8/0x10 EAX: c4f38810 EBX: c4f38810 ECX: 00000000 EDX: c04cc22e ESI: fb0000e0 EDI: 00000011 EBP: 0f02000a ESP: c4e3faa0 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 CR0: 8005003b CR2: 44618a40 CR3: 04e37000 CR4: 000006d0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 [<c02311f8>] ? _raw_read_lock+0x23/0x25 [<c0390666>] ? ip_check_mc+0x1c/0x83 [<c036d478>] ? ip_route_input+0x229/0xe92 [<c022e2e4>] ? trace_hardirqs_on_thunk+0xc/0x10 [<c0104c9c>] ? do_IRQ+0x69/0x7d [<c0102e64>] ? restore_nocheck_notrace+0x0/0xe [<c036fdba>] ? ip_rcv+0x227/0x505 [<c0358764>] ? netif_receive_skb+0xfe/0x2b3 [<c03588d2>] ? netif_receive_skb+0x26c/0x2b3 [<c035af31>] ? process_backlog+0x73/0xbd [<c035a8cd>] ? net_rx_action+0xc1/0x1ae [<c01218a8>] ? __do_softirq+0x7b/0xef [<c0121953>] ? 
do_softirq+0x37/0x4d [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c0122037>] ? local_bh_enable+0x96/0xab [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c012181e>] ? _local_bh_enable+0x79/0x88 [<c035fcb8>] ? neigh_resolve_output+0x20f/0x239 [<c0373118>] ? ip_finish_output+0x1df/0x209 [<c0373364>] ? ip_dev_loopback_xmit+0x62/0x66 [<c0371db5>] ? ip_local_out+0x15/0x17 [<c0372013>] ? ip_push_pending_frames+0x25c/0x2bb [<c03891b8>] ? udp_push_pending_frames+0x2bb/0x30e [<c038a189>] ? udp_sendmsg+0x413/0x51d [<c038a1a9>] ? udp_sendmsg+0x433/0x51d [<c038f927>] ? inet_sendmsg+0x35/0x3f [<c034f092>] ? sock_sendmsg+0xb8/0xd1 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c022e6de>] ? copy_from_user+0x32/0x5e [<c022e6de>] ? copy_from_user+0x32/0x5e [<c034f238>] ? sys_sendmsg+0x18d/0x1f0 [<c0175e90>] ? pipe_write+0x3cb/0x3d7 [<c0170347>] ? do_sync_write+0xbe/0x105 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c03503b2>] ? sys_socketcall+0x176/0x1b0 [<c01085ea>] ? syscall_trace_enter+0x6c/0x7b [<c0102e1a>] ? syscall_call+0x7/0xb Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-08-14 07:15:57 +08:00
in_dev = ip_mc_find_dev(net, &imr);
if (!in_dev) {
err = -ENODEV;
goto done;
}
err = -EADDRNOTAVAIL;
for_each_pmc_rtnl(inet, pmc) {
if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
pmc->multi.imr_ifindex == imr.imr_ifindex)
break;
}
if (!pmc) /* must have a prior join */
goto done;
msf->imsf_fmode = pmc->sfmode;
psl = rtnl_dereference(pmc->sflist);
if (!psl) {
len = 0;
count = 0;
} else {
count = psl->sl_count;
}
copycount = count < msf->imsf_numsrc ? count : msf->imsf_numsrc;
len = copycount * sizeof(psl->sl_addr[0]);
msf->imsf_numsrc = count;
if (put_user(IP_MSFILTER_SIZE(copycount), optlen) ||
copy_to_user(optval, msf, IP_MSFILTER_SIZE(0))) {
return -EFAULT;
}
if (len &&
copy_to_user(&optval->imsf_slist[0], psl->sl_addr, len))
return -EFAULT;
return 0;
done:
return err;
}
/*
 * Report a joined group's source filter to user space (group_filter API).
 *
 * @sk:     socket whose membership list is searched
 * @gsf:    kernel copy of the request; gf_fmode and gf_numsrc are updated
 * @optval: user buffer that receives the header and the source list
 * @optlen: user location that receives the size of the data reported
 *
 * Asserts that RTNL is held.  Returns 0 on success, -EINVAL when the group
 * is not an AF_INET multicast address, -EADDRNOTAVAIL when the socket has
 * no membership for that group on gf_interface, and -EFAULT when a copy to
 * user space fails.
 */
int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
	struct group_filter __user *optval, int __user *optlen)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sockaddr_in *group = (struct sockaddr_in *)&gsf->gf_group;
	struct ip_mc_socklist *pmc;
	struct ip_sf_socklist *psl;
	int count, copycount, i;
	__be32 addr;

	ASSERT_RTNL();

	if (group->sin_family != AF_INET)
		return -EINVAL;
	addr = group->sin_addr.s_addr;
	if (!ipv4_is_multicast(addr))
		return -EINVAL;

	/* Locate the membership; a prior join is mandatory. */
	for_each_pmc_rtnl(inet, pmc) {
		if (pmc->multi.imr_multiaddr.s_addr != addr)
			continue;
		if (pmc->multi.imr_ifindex == gsf->gf_interface)
			break;
	}
	if (!pmc)
		return -EADDRNOTAVAIL;

	gsf->gf_fmode = pmc->sfmode;
	psl = rtnl_dereference(pmc->sflist);
	if (psl)
		count = psl->sl_count;
	else
		count = 0;

	/* Copy no more entries than the caller asked for, report the total. */
	copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
	gsf->gf_numsrc = count;

	if (put_user(GROUP_FILTER_SIZE(copycount), optlen))
		return -EFAULT;
	if (copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0)))
		return -EFAULT;

	for (i = 0; i < copycount; i++) {
		struct sockaddr_storage ss;
		struct sockaddr_in *src = (struct sockaddr_in *)&ss;

		/* Zero the whole slot so no stack bytes reach user space. */
		memset(&ss, 0, sizeof(ss));
		src->sin_family = AF_INET;
		src->sin_addr.s_addr = psl->sl_addr[i];
		if (copy_to_user(&optval->gf_slist[i], &ss, sizeof(ss)))
			return -EFAULT;
	}
	return 0;
}
/*
 * Decide whether the socket's multicast source filter lets a packet with
 * the given <source, destination, interface> through.
 *
 * @sk:       receiving socket
 * @loc_addr: destination (group) address of the packet
 * @rmt_addr: source address of the packet
 * @dif:      index of the interface the packet belongs to
 *
 * Returns 1 for non-multicast destinations.  When the socket holds no
 * membership for <loc_addr, dif> the result is inet->mc_all.  With a
 * membership but no source list the result is 1 only in MCAST_EXCLUDE
 * mode.  Otherwise 0 is returned when an MCAST_INCLUDE list lacks
 * rmt_addr or an MCAST_EXCLUDE list contains it, and 1 in all other cases.
 */
int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ip_mc_socklist *pmc;
	struct ip_sf_socklist *psl;
	int allow = 1;
	int idx;

	if (!ipv4_is_multicast(loc_addr))
		return allow;

	rcu_read_lock();
	for_each_pmc_rcu(inet, pmc) {
		if (pmc->multi.imr_multiaddr.s_addr != loc_addr)
			continue;
		if (pmc->multi.imr_ifindex == dif)
			break;
	}
	if (!pmc) {
		allow = inet->mc_all;
		goto unlock;
	}

	psl = rcu_dereference(pmc->sflist);
	if (!psl) {
		allow = (pmc->sfmode == MCAST_EXCLUDE);
		goto unlock;
	}

	/* idx ends up equal to sl_count when rmt_addr is not listed. */
	for (idx = 0; idx < psl->sl_count; idx++) {
		if (psl->sl_addr[idx] == rmt_addr)
			break;
	}

	if (pmc->sfmode == MCAST_INCLUDE && idx >= psl->sl_count)
		allow = 0;
	else if (pmc->sfmode == MCAST_EXCLUDE && idx < psl->sl_count)
		allow = 0;
	else
		allow = 1;
unlock:
	rcu_read_unlock();
	return allow;
}
/*
* A socket is closing.
*/
void ip_mc_drop_socket(struct sock *sk)
{
struct inet_sock *inet = inet_sk(sk);
struct ip_mc_socklist *iml;
netns: Fix crash by making igmp per namespace This patch makes the multicast socket to be per namespace. When a network namespace is created, other than the init_net and a multicast packet is received, the kernel goes to a hang or a kernel panic. How to reproduce ? * create a child network namespace * create a pair virtual device veth * ip link add type veth * move one side to the pair network device to the child namespace * ip link set netns <childpid> dev veth1 * ping -I veth0 224.0.0.1 The bug appears because the function ip_mc_init_dev does not initialize the different multicast fields as it exits because it is not the init_net. BUG: soft lockup - CPU#0 stuck for 61s! [avahi-daemon:2695] Modules linked in: irq event stamp: 50350 hardirqs last enabled at (50349): [<c03ee949>] _spin_unlock_irqrestore+0x34/0x39 hardirqs last disabled at (50350): [<c03ec639>] schedule+0x9f/0x5ff softirqs last enabled at (45712): [<c0374d4b>] ip_setsockopt+0x8e7/0x909 softirqs last disabled at (45710): [<c03ee682>] _spin_lock_bh+0x8/0x27 Pid: 2695, comm: avahi-daemon Not tainted (2.6.27-rc2-00029-g0872073 #3) EIP: 0060:[<c03ee47c>] EFLAGS: 00000297 CPU: 0 EIP is at __read_lock_failed+0x8/0x10 EAX: c4f38810 EBX: c4f38810 ECX: 00000000 EDX: c04cc22e ESI: fb0000e0 EDI: 00000011 EBP: 0f02000a ESP: c4e3faa0 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 CR0: 8005003b CR2: 44618a40 CR3: 04e37000 CR4: 000006d0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 [<c02311f8>] ? _raw_read_lock+0x23/0x25 [<c0390666>] ? ip_check_mc+0x1c/0x83 [<c036d478>] ? ip_route_input+0x229/0xe92 [<c022e2e4>] ? trace_hardirqs_on_thunk+0xc/0x10 [<c0104c9c>] ? do_IRQ+0x69/0x7d [<c0102e64>] ? restore_nocheck_notrace+0x0/0xe [<c036fdba>] ? ip_rcv+0x227/0x505 [<c0358764>] ? netif_receive_skb+0xfe/0x2b3 [<c03588d2>] ? netif_receive_skb+0x26c/0x2b3 [<c035af31>] ? process_backlog+0x73/0xbd [<c035a8cd>] ? net_rx_action+0xc1/0x1ae [<c01218a8>] ? __do_softirq+0x7b/0xef [<c0121953>] ? 
do_softirq+0x37/0x4d [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c0122037>] ? local_bh_enable+0x96/0xab [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c012181e>] ? _local_bh_enable+0x79/0x88 [<c035fcb8>] ? neigh_resolve_output+0x20f/0x239 [<c0373118>] ? ip_finish_output+0x1df/0x209 [<c0373364>] ? ip_dev_loopback_xmit+0x62/0x66 [<c0371db5>] ? ip_local_out+0x15/0x17 [<c0372013>] ? ip_push_pending_frames+0x25c/0x2bb [<c03891b8>] ? udp_push_pending_frames+0x2bb/0x30e [<c038a189>] ? udp_sendmsg+0x413/0x51d [<c038a1a9>] ? udp_sendmsg+0x433/0x51d [<c038f927>] ? inet_sendmsg+0x35/0x3f [<c034f092>] ? sock_sendmsg+0xb8/0xd1 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c022e6de>] ? copy_from_user+0x32/0x5e [<c022e6de>] ? copy_from_user+0x32/0x5e [<c034f238>] ? sys_sendmsg+0x18d/0x1f0 [<c0175e90>] ? pipe_write+0x3cb/0x3d7 [<c0170347>] ? do_sync_write+0xbe/0x105 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c03503b2>] ? sys_socketcall+0x176/0x1b0 [<c01085ea>] ? syscall_trace_enter+0x6c/0x7b [<c0102e1a>] ? syscall_call+0x7/0xb Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-08-14 07:15:57 +08:00
struct net *net = sock_net(sk);
if (!inet->mc_list)
return;
rtnl_lock();
while ((iml = rtnl_dereference(inet->mc_list)) != NULL) {
struct in_device *in_dev;
inet->mc_list = iml->next_rcu;
netns: Fix crash by making igmp per namespace This patch makes the multicast socket to be per namespace. When a network namespace is created, other than the init_net and a multicast packet is received, the kernel goes to a hang or a kernel panic. How to reproduce ? * create a child network namespace * create a pair virtual device veth * ip link add type veth * move one side to the pair network device to the child namespace * ip link set netns <childpid> dev veth1 * ping -I veth0 224.0.0.1 The bug appears because the function ip_mc_init_dev does not initialize the different multicast fields as it exits because it is not the init_net. BUG: soft lockup - CPU#0 stuck for 61s! [avahi-daemon:2695] Modules linked in: irq event stamp: 50350 hardirqs last enabled at (50349): [<c03ee949>] _spin_unlock_irqrestore+0x34/0x39 hardirqs last disabled at (50350): [<c03ec639>] schedule+0x9f/0x5ff softirqs last enabled at (45712): [<c0374d4b>] ip_setsockopt+0x8e7/0x909 softirqs last disabled at (45710): [<c03ee682>] _spin_lock_bh+0x8/0x27 Pid: 2695, comm: avahi-daemon Not tainted (2.6.27-rc2-00029-g0872073 #3) EIP: 0060:[<c03ee47c>] EFLAGS: 00000297 CPU: 0 EIP is at __read_lock_failed+0x8/0x10 EAX: c4f38810 EBX: c4f38810 ECX: 00000000 EDX: c04cc22e ESI: fb0000e0 EDI: 00000011 EBP: 0f02000a ESP: c4e3faa0 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 CR0: 8005003b CR2: 44618a40 CR3: 04e37000 CR4: 000006d0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 [<c02311f8>] ? _raw_read_lock+0x23/0x25 [<c0390666>] ? ip_check_mc+0x1c/0x83 [<c036d478>] ? ip_route_input+0x229/0xe92 [<c022e2e4>] ? trace_hardirqs_on_thunk+0xc/0x10 [<c0104c9c>] ? do_IRQ+0x69/0x7d [<c0102e64>] ? restore_nocheck_notrace+0x0/0xe [<c036fdba>] ? ip_rcv+0x227/0x505 [<c0358764>] ? netif_receive_skb+0xfe/0x2b3 [<c03588d2>] ? netif_receive_skb+0x26c/0x2b3 [<c035af31>] ? process_backlog+0x73/0xbd [<c035a8cd>] ? net_rx_action+0xc1/0x1ae [<c01218a8>] ? __do_softirq+0x7b/0xef [<c0121953>] ? 
do_softirq+0x37/0x4d [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c0122037>] ? local_bh_enable+0x96/0xab [<c035b50d>] ? dev_queue_xmit+0x3d4/0x40b [<c012181e>] ? _local_bh_enable+0x79/0x88 [<c035fcb8>] ? neigh_resolve_output+0x20f/0x239 [<c0373118>] ? ip_finish_output+0x1df/0x209 [<c0373364>] ? ip_dev_loopback_xmit+0x62/0x66 [<c0371db5>] ? ip_local_out+0x15/0x17 [<c0372013>] ? ip_push_pending_frames+0x25c/0x2bb [<c03891b8>] ? udp_push_pending_frames+0x2bb/0x30e [<c038a189>] ? udp_sendmsg+0x413/0x51d [<c038a1a9>] ? udp_sendmsg+0x433/0x51d [<c038f927>] ? inet_sendmsg+0x35/0x3f [<c034f092>] ? sock_sendmsg+0xb8/0xd1 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c022e6de>] ? copy_from_user+0x32/0x5e [<c022e6de>] ? copy_from_user+0x32/0x5e [<c034f238>] ? sys_sendmsg+0x18d/0x1f0 [<c0175e90>] ? pipe_write+0x3cb/0x3d7 [<c0170347>] ? do_sync_write+0xbe/0x105 [<c012d554>] ? autoremove_wake_function+0x0/0x2b [<c03503b2>] ? sys_socketcall+0x176/0x1b0 [<c01085ea>] ? syscall_trace_enter+0x6c/0x7b [<c0102e1a>] ? syscall_call+0x7/0xb Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2008-08-14 07:15:57 +08:00
in_dev = inetdev_by_index(net, iml->multi.imr_ifindex);
(void) ip_mc_leave_src(sk, iml, in_dev);
if (in_dev)
ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);
/* decrease mem now to avoid the memleak warning */
atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
kfree_rcu(iml, rcu);
}
rtnl_unlock();
}
/*
 * Check whether a device should accept a multicast packet.
 *
 * @in_dev:   IPv4 state of the receiving device
 * @mc_addr:  destination group address
 * @src_addr: packet source address (0 means "unspecified")
 * @proto:    IP protocol number of the packet
 *
 * Called with rcu_read_lock() held.
 *
 * Returns 0 when the device has not joined @mc_addr.  For a joined group,
 * IGMP packets and packets with an unspecified source are always accepted
 * (1).  Otherwise the per-source record decides: a listed source is
 * accepted when it has an INCLUDE count or when its EXCLUDE count differs
 * from the group's EXCLUDE count; an unlisted source is accepted when the
 * group has a non-zero EXCLUDE count.
 */
int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u8 proto)
{
	struct ip_mc_list *im;
	struct ip_mc_list __rcu **mc_hash;
	struct ip_sf_list *psf;
	int rv = 0;

	/* Prefer the hash table when the device has one, else walk the list. */
	mc_hash = rcu_dereference(in_dev->mc_hash);
	if (mc_hash) {
		u32 hash = hash_32((__force u32)mc_addr, MC_HASH_SZ_LOG);

		for (im = rcu_dereference(mc_hash[hash]);
		     im != NULL;
		     im = rcu_dereference(im->next_hash)) {
			if (im->multiaddr == mc_addr)
				break;
		}
	} else {
		for_each_pmc_rcu(in_dev, im) {
			if (im->multiaddr == mc_addr)
				break;
		}
	}

	if (im && proto == IPPROTO_IGMP) {
		rv = 1;
	} else if (im) {
		if (src_addr) {
			/*
			 * The source list is walked under im->lock, the same
			 * lock the mcfilter seq_file iterator in this file
			 * takes before reading im->sources.  RCU alone does
			 * not keep the ip_sf_list entries stable.
			 */
			spin_lock_bh(&im->lock);
			for (psf = im->sources; psf; psf = psf->sf_next) {
				if (psf->sf_inaddr == src_addr)
					break;
			}
			if (psf)
				rv = psf->sf_count[MCAST_INCLUDE] ||
					psf->sf_count[MCAST_EXCLUDE] !=
					im->sfcount[MCAST_EXCLUDE];
			else
				rv = im->sfcount[MCAST_EXCLUDE] != 0;
			spin_unlock_bh(&im->lock);
		} else
			rv = 1; /* unspecified source; tentatively allow */
	}
	return rv;
}
#if defined(CONFIG_PROC_FS)
/*
 * Cursor for the seq_file iterator behind the per-net "igmp" proc entry.
 * One instance is allocated per open file by igmp_mc_seq_open().
 */
struct igmp_mc_iter_state {
/* per-net seq_file private header */
struct seq_net_private p;
/* device the walk is currently positioned on */
struct net_device *dev;
/* IPv4 state of that device; NULL before the first hit and at the end */
struct in_device *in_dev;
};
/* Fetch the iterator cursor stored in a seq_file's private pointer. */
#define igmp_mc_seq_private(seq) ((struct igmp_mc_iter_state *)(seq)->private)
/*
 * Position the iterator on the first multicast group of the first device
 * in the namespace that has a non-empty group list.
 *
 * Records the matching in_device in the cursor and returns the group, or
 * returns NULL (with state->in_dev left NULL) when no device has groups.
 */
static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
{
	struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
	struct net *net = seq_file_net(seq);

	state->in_dev = NULL;
	for_each_netdev_rcu(net, state->dev) {
		struct in_device *idev = __in_dev_get_rcu(state->dev);
		struct ip_mc_list *head;

		if (!idev)
			continue;

		head = rcu_dereference(idev->mc_list);
		if (!head)
			continue;

		state->in_dev = idev;
		return head;
	}
	return NULL;
}
/*
 * Advance the iterator past @im.
 *
 * Follows the group's next_rcu link; when the current device's list is
 * exhausted, moves on to the following devices until one with a group is
 * found.  Returns NULL, and clears state->in_dev, once there are no more
 * devices.
 */
static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im)
{
	struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
	struct ip_mc_list *next = rcu_dereference(im->next_rcu);

	while (!next) {
		state->dev = next_net_device_rcu(state->dev);
		if (!state->dev) {
			state->in_dev = NULL;
			return NULL;
		}

		/* Devices without IPv4 state are simply stepped over. */
		state->in_dev = __in_dev_get_rcu(state->dev);
		if (state->in_dev)
			next = rcu_dereference(state->in_dev->mc_list);
	}
	return next;
}
/*
 * Return the group at zero-based position @pos of the walk, or NULL when
 * the walk has fewer than @pos + 1 entries.
 */
static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos)
{
	struct ip_mc_list *im = igmp_mc_get_first(seq);

	if (!im)
		return NULL;

	/* Step forward once per remaining position; stop early at the end. */
	for (; pos; --pos) {
		im = igmp_mc_get_next(seq, im);
		if (!im)
			break;
	}
	return pos ? NULL : im;
}
/*
 * seq_file ->start: enter the RCU read-side section and return the element
 * for *pos.  Position 0 is the header token; position N maps to group N-1.
 */
static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(rcu)
{
	rcu_read_lock();
	if (!*pos)
		return SEQ_START_TOKEN;
	return igmp_mc_get_idx(seq, *pos - 1);
}
/*
 * seq_file ->next: return the element following @v and bump *pos.  The
 * header token is followed by the first group of the walk.
 */
static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct ip_mc_list *im = (v == SEQ_START_TOKEN) ?
				igmp_mc_get_first(seq) :
				igmp_mc_get_next(seq, v);

	*pos += 1;
	return im;
}
/*
 * seq_file ->stop: forget the cursor's device pointers and leave the RCU
 * read-side section entered by igmp_mc_seq_start().
 */
static void igmp_mc_seq_stop(struct seq_file *seq, void *v)
	__releases(rcu)
{
	struct igmp_mc_iter_state *cursor = igmp_mc_seq_private(seq);

	cursor->dev = NULL;
	cursor->in_dev = NULL;
	rcu_read_unlock();
}
/*
 * seq_file ->show: print the column header for the start token, or one
 * line for the group @v.  A per-device summary line is printed just before
 * the first group of each device.  Always returns 0.
 */
static int igmp_mc_seq_show(struct seq_file *seq, void *v)
{
	struct igmp_mc_iter_state *state;
	struct ip_mc_list *im;
	char *querier;
	long delta;

	if (v == SEQ_START_TOKEN) {
		seq_puts(seq,
			 "Idx\tDevice : Count Querier\tGroup Users Timer\tReporter\n");
		return 0;
	}

	im = (struct ip_mc_list *)v;
	state = igmp_mc_seq_private(seq);

#ifdef CONFIG_IP_MULTICAST
	if (IGMP_V1_SEEN(state->in_dev))
		querier = "V1";
	else if (IGMP_V2_SEEN(state->in_dev))
		querier = "V2";
	else
		querier = "V3";
#else
	querier = "NONE";
#endif

	/* The device line is emitted only ahead of the list head. */
	if (rcu_access_pointer(state->in_dev->mc_list) == im) {
		seq_printf(seq, "%d\t%-10s: %5d %7s\n",
			   state->dev->ifindex, state->dev->name,
			   state->in_dev->mc_count, querier);
	}

	delta = im->timer.expires - jiffies;
	seq_printf(seq,
		   "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n",
		   im->multiaddr, im->users,
		   im->tm_running,
		   im->tm_running ? jiffies_delta_to_clock_t(delta) : 0,
		   im->reporter);
	return 0;
}
/* seq_file callbacks that walk and print the per-device group lists. */
static const struct seq_operations igmp_mc_seq_ops = {
.start = igmp_mc_seq_start,
.next = igmp_mc_seq_next,
.stop = igmp_mc_seq_stop,
.show = igmp_mc_seq_show,
};
/*
 * ->open for the "igmp" proc entry: set up a per-net seq_file with room
 * for one struct igmp_mc_iter_state.  Returns seq_open_net()'s result.
 */
static int igmp_mc_seq_open(struct inode *inode, struct file *file)
{
	const int state_size = sizeof(struct igmp_mc_iter_state);

	return seq_open_net(inode, file, &igmp_mc_seq_ops, state_size);
}
/* File operations installed on the "igmp" proc entry by igmp_net_init(). */
static const struct file_operations igmp_mc_seq_fops = {
.owner = THIS_MODULE,
.open = igmp_mc_seq_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_net,
};
/*
 * Cursor for the seq_file iterator behind the per-net "mcfilter" proc
 * entry.  One instance is allocated per open file by igmp_mcf_seq_open().
 */
struct igmp_mcf_iter_state {
/* per-net seq_file private header */
struct seq_net_private p;
/* device the walk is currently positioned on */
struct net_device *dev;
/* IPv4 state of that device */
struct in_device *idev;
/* group whose source list is being walked; its ->lock is held while set */
struct ip_mc_list *im;
};
/* Fetch the iterator cursor stored in a seq_file's private pointer. */
#define igmp_mcf_seq_private(seq) ((struct igmp_mcf_iter_state *)(seq)->private)
/*
 * Position the iterator on the first source-filter entry.
 *
 * For each device in the namespace only the head of its group list is
 * examined; the first such group that has sources wins.  On success the
 * group and in_device are stored in the cursor and the function returns
 * with that group's lock still held.  Returns NULL, with no lock held,
 * when nothing is found.
 */
static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
{
	struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
	struct net *net = seq_file_net(seq);

	state->idev = NULL;
	state->im = NULL;
	for_each_netdev_rcu(net, state->dev) {
		struct in_device *idev = __in_dev_get_rcu(state->dev);
		struct ip_sf_list *first_src;
		struct ip_mc_list *head;

		if (unlikely(!idev))
			continue;

		head = rcu_dereference(idev->mc_list);
		if (unlikely(!head))
			continue;

		spin_lock_bh(&head->lock);
		first_src = head->sources;
		if (likely(first_src)) {
			/* Keep head->lock; the iterator releases it later. */
			state->im = head;
			state->idev = idev;
			return first_src;
		}
		spin_unlock_bh(&head->lock);
	}
	return NULL;
}
/*
 * Advance the mcfilter iterator past @psf.
 *
 * Entered with state->im->lock held.  When the current group's source list
 * is exhausted the lock is dropped, the walk moves to the next group (and
 * to the following devices when the group list ends), and the new group's
 * lock is taken before its sources are read.  Returns the next source
 * entry with its group's lock held, or NULL with state->im == NULL and no
 * lock held once all devices have been visited.
 */
static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_list *psf)
{
struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
psf = psf->sf_next;
while (!psf) {
/* done with this group: release it before stepping on */
spin_unlock_bh(&state->im->lock);
state->im = state->im->next;
while (!state->im) {
state->dev = next_net_device_rcu(state->dev);
if (!state->dev) {
/* no more devices; state->im is NULL here, so stop() will not unlock */
state->idev = NULL;
goto out;
}
state->idev = __in_dev_get_rcu(state->dev);
if (!state->idev)
continue;
state->im = rcu_dereference(state->idev->mc_list);
}
/* the inner loop only falls through with state->im set, so this never fires */
if (!state->im)
break;
spin_lock_bh(&state->im->lock);
psf = state->im->sources;
}
out:
return psf;
}
/*
 * Return the source-filter entry at zero-based position @pos of the walk,
 * or NULL when the walk has fewer than @pos + 1 entries.
 */
static struct ip_sf_list *igmp_mcf_get_idx(struct seq_file *seq, loff_t pos)
{
	struct ip_sf_list *psf = igmp_mcf_get_first(seq);

	if (!psf)
		return NULL;

	/* Step forward once per remaining position; stop early at the end. */
	for (; pos; --pos) {
		psf = igmp_mcf_get_next(seq, psf);
		if (!psf)
			break;
	}
	return pos ? NULL : psf;
}
/*
 * seq_file ->start: enter the RCU read-side section and return the element
 * for *pos.  Position 0 is the header token; position N maps to entry N-1.
 */
static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(rcu)
{
	rcu_read_lock();
	if (!*pos)
		return SEQ_START_TOKEN;
	return igmp_mcf_get_idx(seq, *pos - 1);
}
/*
 * seq_file ->next: return the entry following @v and bump *pos.  The
 * header token is followed by the first source-filter entry of the walk.
 */
static void *igmp_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct ip_sf_list *psf = (v == SEQ_START_TOKEN) ?
				 igmp_mcf_get_first(seq) :
				 igmp_mcf_get_next(seq, v);

	*pos += 1;
	return psf;
}
/*
 * seq_file ->stop: release the lock of the group the iterator is parked
 * on (if any), clear the cursor and leave the RCU read-side section.
 */
static void igmp_mcf_seq_stop(struct seq_file *seq, void *v)
	__releases(rcu)
{
	struct igmp_mcf_iter_state *cursor = igmp_mcf_seq_private(seq);

	/* A set ->im means its lock is still held by the iterator. */
	if (likely(cursor->im)) {
		spin_unlock_bh(&cursor->im->lock);
		cursor->im = NULL;
	}
	cursor->idev = NULL;
	cursor->dev = NULL;
	rcu_read_unlock();
}
/*
 * seq_file ->show: print the column header for the start token, or one
 * line describing the source-filter entry @v (device, group address,
 * source address and its INCLUDE/EXCLUDE counts).  Always returns 0.
 */
static int igmp_mcf_seq_show(struct seq_file *seq, void *v)
{
	struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
	struct ip_sf_list *psf = (struct ip_sf_list *)v;

	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "Idx Device MCA SRC INC EXC\n");
		return 0;
	}

	seq_printf(seq,
		   "%3d %6.6s 0x%08x "
		   "0x%08x %6lu %6lu\n",
		   state->dev->ifindex, state->dev->name,
		   ntohl(state->im->multiaddr),
		   ntohl(psf->sf_inaddr),
		   psf->sf_count[MCAST_INCLUDE],
		   psf->sf_count[MCAST_EXCLUDE]);
	return 0;
}
/* seq_file callbacks that walk and print the per-group source filters. */
static const struct seq_operations igmp_mcf_seq_ops = {
.start = igmp_mcf_seq_start,
.next = igmp_mcf_seq_next,
.stop = igmp_mcf_seq_stop,
.show = igmp_mcf_seq_show,
};
/*
 * ->open for the "mcfilter" proc entry: set up a per-net seq_file with
 * room for one struct igmp_mcf_iter_state.  Returns seq_open_net()'s
 * result.
 */
static int igmp_mcf_seq_open(struct inode *inode, struct file *file)
{
	const int state_size = sizeof(struct igmp_mcf_iter_state);

	return seq_open_net(inode, file, &igmp_mcf_seq_ops, state_size);
}
/* File operations installed on the "mcfilter" proc entry by igmp_net_init(). */
static const struct file_operations igmp_mcf_seq_fops = {
.owner = THIS_MODULE,
.open = igmp_mcf_seq_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_net,
};
/*
 * Per-namespace IGMP setup.
 *
 * Creates the "igmp" and "mcfilter" proc entries, the kernel-internal
 * autojoin socket, and loads the default IGMP sysctl values.
 *
 * Returns 0 on success.  A failed proc entry creation yields -ENOMEM; a
 * failed socket creation yields the error reported by
 * inet_ctl_sock_create().  Everything created before the failure is
 * removed again.
 *
 * NOTE(review): this function sits inside the CONFIG_PROC_FS guard, so the
 * sysctl defaults below are only applied on kernels built with procfs.
 */
static int __net_init igmp_net_init(struct net *net)
{
	struct proc_dir_entry *pde;
	int err = -ENOMEM;

	pde = proc_create("igmp", S_IRUGO, net->proc_net, &igmp_mc_seq_fops);
	if (!pde)
		goto out_igmp;
	pde = proc_create("mcfilter", S_IRUGO, net->proc_net,
			  &igmp_mcf_seq_fops);
	if (!pde)
		goto out_mcfilter;
	err = inet_ctl_sock_create(&net->ipv4.mc_autojoin_sk, AF_INET,
				   SOCK_DGRAM, 0, net);
	if (err < 0) {
		pr_err("Failed to initialize the IGMP autojoin socket (err %d)\n",
		       err);
		goto out_sock;
	}

	/* Sysctl initialization */
	net->ipv4.sysctl_igmp_max_memberships = 20;
	net->ipv4.sysctl_igmp_max_msf = 10;
	/* IGMP reports for link-local multicast groups are enabled by default */
	net->ipv4.sysctl_igmp_llm_reports = 1;
	net->ipv4.sysctl_igmp_qrv = 2;
	return 0;

out_sock:
	remove_proc_entry("mcfilter", net->proc_net);
out_mcfilter:
	remove_proc_entry("igmp", net->proc_net);
out_igmp:
	/* Hand back the real failure reason instead of a blanket -ENOMEM. */
	return err;
}
/*
 * Per-namespace IGMP teardown: remove both proc entries, then destroy the
 * autojoin socket created by igmp_net_init().
 */
static void __net_exit igmp_net_exit(struct net *net)
{
	struct proc_dir_entry *proc_root = net->proc_net;

	remove_proc_entry("mcfilter", proc_root);
	remove_proc_entry("igmp", proc_root);
	inet_ctl_sock_destroy(net->ipv4.mc_autojoin_sk);
}
/* Per-namespace init/exit hooks, registered from igmp_mc_init(). */
static struct pernet_operations igmp_net_ops = {
.init = igmp_net_init,
.exit = igmp_net_exit,
};
#endif
/*
 * Netdevice notifier callback.  On NETDEV_RESEND_IGMP the device's groups
 * are rejoined via ip_mc_rejoin_groups(), provided the device has IPv4
 * state; every other event is ignored.  Always returns NOTIFY_DONE.
 */
static int igmp_netdev_event(struct notifier_block *this,
			     unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	if (event == NETDEV_RESEND_IGMP) {
		struct in_device *in_dev = __in_dev_get_rtnl(dev);

		if (in_dev)
			ip_mc_rejoin_groups(in_dev);
	}
	return NOTIFY_DONE;
}
/* Notifier block that routes netdevice events to igmp_netdev_event(). */
static struct notifier_block igmp_notifier = {
.notifier_call = igmp_netdev_event,
};
/*
 * Boot-time IGMP initialisation.
 *
 * With CONFIG_PROC_FS the per-namespace operations are registered first and
 * unregistered again if the netdevice notifier cannot be registered.
 * Without it only the notifier is registered.  Returns 0 on success or the
 * error from the failing registration.
 */
int __init igmp_mc_init(void)
{
#if defined(CONFIG_PROC_FS)
	int rc = register_pernet_subsys(&igmp_net_ops);

	if (rc)
		return rc;

	rc = register_netdevice_notifier(&igmp_notifier);
	if (rc)
		unregister_pernet_subsys(&igmp_net_ops);
	return rc;
#else
	return register_netdevice_notifier(&igmp_notifier);
#endif
}