linux/drivers/net/ethernet/freescale/gianfar.c

3749 lines
96 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-or-later
/* drivers/net/ethernet/freescale/gianfar.c
*
* Gianfar Ethernet Driver
* This driver is designed for the non-CPM ethernet controllers
* on the 85xx and 83xx family of integrated processors
* Based on 8260_io/fcc_enet.c
*
* Author: Andy Fleming
* Maintainer: Kumar Gala
* Modifier: Sandeep Gopalpet <sandeep.kumar@freescale.com>
*
* Copyright 2002-2009, 2011-2013 Freescale Semiconductor, Inc.
* Copyright 2007 MontaVista Software, Inc.
*
* Gianfar: AKA Lambda Draconis, "Dragon"
* RA 11 31 24.2
* Dec +69 19 52
* V 3.84
* B-V +1.62
*
* Theory of operation
*
* The driver is initialized through of_device. Configuration information
* is therefore conveyed through an OF-style device tree.
*
* The Gianfar Ethernet Controller uses a ring of buffer
* descriptors. The beginning is indicated by a register
* pointing to the physical address of the start of the ring.
* The end is determined by a "wrap" bit being set in the
* last descriptor of the ring.
*
* When a packet is received, the RXF bit in the
* IEVENT register is set, triggering an interrupt when the
* corresponding bit in the IMASK register is also set (if
* interrupt coalescing is active, then the interrupt may not
* happen immediately, but will wait until either a set number
* of frames or amount of time have passed). In NAPI, the
* interrupt handler will signal there is work to be done, and
* exit. This method will start at the last known empty
* descriptor, and process every subsequent descriptor until there
* are none left with data (NAPI will stop after a set number of
* packets to give time to other tasks, but will eventually
* process all the packets). The data arrives inside a
* pre-allocated skb, and so after the skb is passed up to the
* stack, a new skb must be allocated, and the address field in
* the buffer descriptor must be updated to indicate this new
* skb.
*
* When the kernel requests that a packet be transmitted, the
* driver starts where it left off last time, and points the
* descriptor at the buffer which was passed in. The driver
* then informs the DMA engine that there are packets ready to
* be transmitted. Once the controller is finished transmitting
* the packet, an interrupt may be triggered (under the same
* conditions as for reception, but depending on the TXF bit).
* The driver then cleans up the buffer.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/unistd.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <linux/if_vlan.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/of_address.h>
#include <linux/of_irq.h>
#include <linux/of_mdio.h>
#include <linux/of_platform.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/in.h>
#include <linux/net_tstamp.h>
#include <asm/io.h>
#ifdef CONFIG_PPC
#include <asm/reg.h>
#include <asm/mpc85xx.h>
#endif
#include <asm/irq.h>
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/dma-mapping.h>
#include <linux/crc32.h>
#include <linux/mii.h>
#include <linux/phy.h>
#include <linux/phy_fixed.h>
#include <linux/of.h>
#include <linux/of_net.h>
#include "gianfar.h"
#define TX_TIMEOUT (5*HZ)
MODULE_AUTHOR("Freescale Semiconductor, Inc");
MODULE_DESCRIPTION("Gianfar Ethernet Driver");
MODULE_LICENSE("GPL");
static void gfar_init_rxbdp(struct gfar_priv_rx_q *rx_queue, struct rxbd8 *bdp,
dma_addr_t buf)
{
u32 lstatus;
bdp->bufPtr = cpu_to_be32(buf);
lstatus = BD_LFLAG(RXBD_EMPTY | RXBD_INTERRUPT);
if (bdp == rx_queue->rx_bd_base + rx_queue->rx_ring_size - 1)
lstatus |= BD_LFLAG(RXBD_WRAP);
gfar_wmb();
bdp->lstatus = cpu_to_be32(lstatus);
}
static void gfar_init_tx_rx_base(struct gfar_private *priv)
{
struct gfar __iomem *regs = priv->gfargrp[0].regs;
gianfar: Fix compiler and sparse warnings commit fba4ed030cfae7efdb6b79a57b0c5a9d72c9de83 ("gianfar: Add Multiple Queue Support") introduced the following warnings: CHECK gianfar.c gianfar.c:333:8: warning: incorrect type in assignment (different address spaces) gianfar.c:333:8: expected unsigned int [usertype] *baddr gianfar.c:333:8: got unsigned int [noderef] <asn:2>*<noident> [... 67 lines skipped ...] gianfar.c:2565:3: warning: incorrect type in argument 1 (different type sizes) gianfar.c:2565:3: expected unsigned long const *addr gianfar.c:2565:3: got unsigned int *<noident> CC gianfar.o gianfar.c: In function 'gfar_probe': gianfar.c:985: warning: passing argument 1 of 'find_next_bit' from incompatible pointer type gianfar.c:985: warning: passing argument 1 of 'find_next_bit' from incompatible pointer type gianfar.c:993: warning: passing argument 1 of 'find_next_bit' from incompatible pointer type gianfar.c:993: warning: passing argument 1 of 'find_next_bit' from incompatible pointer type gianfar.c: In function 'gfar_configure_coalescing': gianfar.c:1680: warning: passing argument 1 of 'find_next_bit' from incompatible pointer type gianfar.c:1680: warning: passing argument 1 of 'find_next_bit' from incompatible pointer type gianfar.c:1688: warning: passing argument 1 of 'find_next_bit' from incompatible pointer type gianfar.c:1688: warning: passing argument 1 of 'find_next_bit' from incompatible pointer type gianfar.c: In function 'gfar_poll': gianfar.c:2565: warning: passing argument 1 of 'find_next_bit' from incompatible pointer type gianfar.c:2565: warning: passing argument 1 of 'find_next_bit' from incompatible pointer type gianfar.c:2566: warning: passing argument 2 of 'test_bit' from incompatible pointer type gianfar.c:2585: warning: passing argument 2 of 'set_bit' from incompatible pointer type Following warnings left unfixed (looks like sparse doesn't like locks in loops, so __acquires/__releases() doesn't help): gianfar.c:441:40: warning: context imbalance in 'lock_rx_qs': wrong count at exit gianfar.c:441:40: context '<noident>': wanted 0, got 1 gianfar.c:449:40: warning: context imbalance in 'lock_tx_qs': wrong count at exit gianfar.c:449:40: context '<noident>': wanted 0, got 1 gianfar.c:458:3: warning: context imbalance in 'unlock_rx_qs': __context__ statement expected different context gianfar.c:458:3: context '<noident>': wanted >= 0, got -1 gianfar.c:466:3: warning: context imbalance in 'unlock_tx_qs': __context__ statement expected different context gianfar.c:466:3: context '<noident>': wanted >= 0, got -1 Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2009-11-04 20:53:00 +08:00
u32 __iomem *baddr;
int i;
baddr = &regs->tbase0;
for (i = 0; i < priv->num_tx_queues; i++) {
gfar_write(baddr, priv->tx_queue[i]->tx_bd_dma_base);
baddr += 2;
}
baddr = &regs->rbase0;
for (i = 0; i < priv->num_rx_queues; i++) {
gfar_write(baddr, priv->rx_queue[i]->rx_bd_dma_base);
baddr += 2;
}
}
static void gfar_init_rqprm(struct gfar_private *priv)
{
struct gfar __iomem *regs = priv->gfargrp[0].regs;
u32 __iomem *baddr;
int i;
baddr = &regs->rqprm0;
for (i = 0; i < priv->num_rx_queues; i++) {
gfar_write(baddr, priv->rx_queue[i]->rx_ring_size |
(DEFAULT_RX_LFC_THR << FBTHR_SHIFT));
baddr++;
}
}
static void gfar_rx_offload_en(struct gfar_private *priv)
{
gianfar: Fix and cleanup Rx FCB indication This fixes a less obvious error on one hand, and prevents futher similar errors by disambiguating and optimizing RxFCB indication, on the other hand. The error consists in NETIF_F_HW_VLAN_TX flag being used as an indication of Rx FCB insertion. This happened as soon gfar_uses_fcb(), which despite its name indicates Rx FCB insertion, started incorporating is_vlan_on(). is_vlan_on(), on the other hand, is also a misleading construct because we need to differentiate b/w hw VLAN extraction/VLEX (marked by VLAN_RX flag) and hw VLAN insertion/VLINS (VLAN_TX flag), which are different mechanisms using different types of FCBs. The hw spec for the RxFCB feature is as follows: In the case of RxBD rings, FCBs (Frame Control Block) are inserted by the eTSEC whenever RCTRL[PRSDEP] is set to a non-zero value. Only one FCB is inserted per frame (in the buffer pointed to by the RxBD with bit F set). TOE acceleration for receive is enabled for all rx frames in this case. This patch introduces priv->uses_rxfcb field to quickly signal RxFCB insertion in accordance with the specification above. The dependency on FSL_GIANFAR_DEV_HAS_TIMER was also eliminated as another source of confusion. The actual dependency is to priv->hwts_rx_en. Upon changing priv->hwts_rx_en via IOCTL, the gfar device is being restarted and on init_mac() the priv->hwts_rx_en flag determines RxFCB insertion, and rctrl is programmed accordingly. The patch takes care of this case too. Though maybe not as self documenting as the inlining version uses_fcb(), priv->uses_rxfcb has the main purpose to quickly signal, on the hot path, that the incoming frame has a *Rx* FCB block inserted which needs to be pulled out before passing the skb to the stack. This is a performance critical operation, it needs to happen fast, that's why uses_rxfcb is placed in the first cacheline of gfar_private. This is also why a cached rctrl cannot be used instead: 1) because we don't have 32 bits available in the first cacheline of gfar_priv (but only 16); 2) bit operations are expensive on the hot path. Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-02-14 13:00:07 +08:00
/* set this when rx hw offload (TOE) functions are being used */
priv->uses_rxfcb = 0;
gianfar: Fix on-the-fly vlan and mtu updates The RCTRL and TCTRL registers should not be changed on-the-fly, while the controller is running, otherwise unexpected behaviour occurs. But that's exactly what gfar_vlan_mode() does, updating the VLAN acceleration bits inside RCTRL/TCTRL. The attempt to lock these operations doesn't help, but only adds to the confusion. There's also a dependency for Rx FCB insertion (activating /de-activating the TOE offload block on Rx) which might change the required rx buffer size. This makes matters worse as gfar_vlan_mode() ends up calling gfar_change_mtu(), though the MTU size remains the same. Note that there are other situations that may affect the required rx buffer size, like changing RXCSUM or rx hw timestamping, but errorneously the rx buffer size is not recomputed/ updated in the process. To fix this, do the vlan updates properly inside the MAC reset and reconfiguration procedure, which takes care of the rx buffer size dependecy and the rx TOE block (PRSDEP) activation/deactivation as well (in the correct order). As a consequence, MTU/ rx buff size updates are done now by the same MAC reset and reconfig procedure, so that out of context updates to MAXFRM, MRBLR, and MACCFG inside change_mtu() are no longer needed. The rx buffer size dependecy to Rx FCB is now handled for the other cases too (RXCSUM and rx hw timestamping). Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-02-24 18:13:43 +08:00
if (priv->ndev->features & (NETIF_F_RXCSUM | NETIF_F_HW_VLAN_CTAG_RX))
priv->uses_rxfcb = 1;
if (priv->hwts_rx_en || priv->rx_filer_enable)
gianfar: Fix on-the-fly vlan and mtu updates The RCTRL and TCTRL registers should not be changed on-the-fly, while the controller is running, otherwise unexpected behaviour occurs. But that's exactly what gfar_vlan_mode() does, updating the VLAN acceleration bits inside RCTRL/TCTRL. The attempt to lock these operations doesn't help, but only adds to the confusion. There's also a dependency for Rx FCB insertion (activating /de-activating the TOE offload block on Rx) which might change the required rx buffer size. This makes matters worse as gfar_vlan_mode() ends up calling gfar_change_mtu(), though the MTU size remains the same. Note that there are other situations that may affect the required rx buffer size, like changing RXCSUM or rx hw timestamping, but errorneously the rx buffer size is not recomputed/ updated in the process. To fix this, do the vlan updates properly inside the MAC reset and reconfiguration procedure, which takes care of the rx buffer size dependecy and the rx TOE block (PRSDEP) activation/deactivation as well (in the correct order). As a consequence, MTU/ rx buff size updates are done now by the same MAC reset and reconfig procedure, so that out of context updates to MAXFRM, MRBLR, and MACCFG inside change_mtu() are no longer needed. The rx buffer size dependecy to Rx FCB is now handled for the other cases too (RXCSUM and rx hw timestamping). Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-02-24 18:13:43 +08:00
priv->uses_rxfcb = 1;
}
static void gfar_mac_rx_config(struct gfar_private *priv)
{
struct gfar __iomem *regs = priv->gfargrp[0].regs;
u32 rctrl = 0;
if (priv->rx_filer_enable) {
rctrl |= RCTRL_FILREN | RCTRL_PRSDEP_INIT;
/* Program the RIR0 reg with the required distribution */
gianfar: Use Single-Queue polling for "fsl,etsec2" For the "fsl,etsec2" compatible models the driver currently supports 8 Tx and Rx DMA rings (aka HW queues). However, there are only 2 pairs of Rx/Tx interrupt lines, as these controllers are integrated in low power SoCs with 2 CPUs at most. As a result, there are at most 2 NAPI instances that have to service multiple Tx and Rx queues for these devices. This complicates the NAPI polling routine having to iterate over the mutiple Rx/Tx queues hooked to the same interrupt lines. And there's also an overhead at HW level, as the controller needs to service all the 8 Tx rings in a round robin manner. The combined overhead shows up for multi parallel Tx flows transmitted by the kernel stack, when the driver usually starts returning NETDEV_TX_BUSY leading to NETDEV WATCHDOG Tx timeout triggering if the Tx path is congested for too long. As an alternative, this patch makes the driver support only one Tx/Rx DMA ring per NAPI instance (per interrupt group or pair of Tx/Rx interrupt lines) by default. The simplified single queue polling routine (gfar_poll_sq) will be the default napi poll routine for the etsec2 devices too. Some adjustments needed to be made to link the Tx/Rx HW queues with each NAPI instance (2 in this case). The gfar_poll_sq() is already successfully used by older SQ_SG_MODE (single interrupt group) controllers. This patch fixes Tx timeout triggering under heavy Tx traffic load (i.e. iperf -c -P 8) for the "fsl,etsec2" (currently the only MQ_MG_MODE devices). There's also a significant memory footprint reduction by supporting 2 Rx/Tx DMA rings (at most), instead of 8, for these devices. Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-07 20:42:46 +08:00
if (priv->poll_mode == GFAR_SQ_POLLING)
gfar_write(&regs->rir0, DEFAULT_2RXQ_RIR0);
else /* GFAR_MQ_POLLING */
gfar_write(&regs->rir0, DEFAULT_8RXQ_RIR0);
}
/* Restore PROMISC mode */
if (priv->ndev->flags & IFF_PROMISC)
rctrl |= RCTRL_PROM;
gianfar: Fix on-the-fly vlan and mtu updates The RCTRL and TCTRL registers should not be changed on-the-fly, while the controller is running, otherwise unexpected behaviour occurs. But that's exactly what gfar_vlan_mode() does, updating the VLAN acceleration bits inside RCTRL/TCTRL. The attempt to lock these operations doesn't help, but only adds to the confusion. There's also a dependency for Rx FCB insertion (activating /de-activating the TOE offload block on Rx) which might change the required rx buffer size. This makes matters worse as gfar_vlan_mode() ends up calling gfar_change_mtu(), though the MTU size remains the same. Note that there are other situations that may affect the required rx buffer size, like changing RXCSUM or rx hw timestamping, but errorneously the rx buffer size is not recomputed/ updated in the process. To fix this, do the vlan updates properly inside the MAC reset and reconfiguration procedure, which takes care of the rx buffer size dependecy and the rx TOE block (PRSDEP) activation/deactivation as well (in the correct order). As a consequence, MTU/ rx buff size updates are done now by the same MAC reset and reconfig procedure, so that out of context updates to MAXFRM, MRBLR, and MACCFG inside change_mtu() are no longer needed. The rx buffer size dependecy to Rx FCB is now handled for the other cases too (RXCSUM and rx hw timestamping). Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-02-24 18:13:43 +08:00
if (priv->ndev->features & NETIF_F_RXCSUM)
rctrl |= RCTRL_CHECKSUMMING;
gianfar: Fix on-the-fly vlan and mtu updates The RCTRL and TCTRL registers should not be changed on-the-fly, while the controller is running, otherwise unexpected behaviour occurs. But that's exactly what gfar_vlan_mode() does, updating the VLAN acceleration bits inside RCTRL/TCTRL. The attempt to lock these operations doesn't help, but only adds to the confusion. There's also a dependency for Rx FCB insertion (activating /de-activating the TOE offload block on Rx) which might change the required rx buffer size. This makes matters worse as gfar_vlan_mode() ends up calling gfar_change_mtu(), though the MTU size remains the same. Note that there are other situations that may affect the required rx buffer size, like changing RXCSUM or rx hw timestamping, but errorneously the rx buffer size is not recomputed/ updated in the process. To fix this, do the vlan updates properly inside the MAC reset and reconfiguration procedure, which takes care of the rx buffer size dependecy and the rx TOE block (PRSDEP) activation/deactivation as well (in the correct order). As a consequence, MTU/ rx buff size updates are done now by the same MAC reset and reconfig procedure, so that out of context updates to MAXFRM, MRBLR, and MACCFG inside change_mtu() are no longer needed. The rx buffer size dependecy to Rx FCB is now handled for the other cases too (RXCSUM and rx hw timestamping). Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-02-24 18:13:43 +08:00
if (priv->extended_hash)
rctrl |= RCTRL_EXTHASH | RCTRL_EMEN;
if (priv->padding) {
rctrl &= ~RCTRL_PAL_MASK;
rctrl |= RCTRL_PADDING(priv->padding);
}
/* Enable HW time stamping if requested from user space */
gianfar: Fix on-the-fly vlan and mtu updates The RCTRL and TCTRL registers should not be changed on-the-fly, while the controller is running, otherwise unexpected behaviour occurs. But that's exactly what gfar_vlan_mode() does, updating the VLAN acceleration bits inside RCTRL/TCTRL. The attempt to lock these operations doesn't help, but only adds to the confusion. There's also a dependency for Rx FCB insertion (activating /de-activating the TOE offload block on Rx) which might change the required rx buffer size. This makes matters worse as gfar_vlan_mode() ends up calling gfar_change_mtu(), though the MTU size remains the same. Note that there are other situations that may affect the required rx buffer size, like changing RXCSUM or rx hw timestamping, but errorneously the rx buffer size is not recomputed/ updated in the process. To fix this, do the vlan updates properly inside the MAC reset and reconfiguration procedure, which takes care of the rx buffer size dependecy and the rx TOE block (PRSDEP) activation/deactivation as well (in the correct order). As a consequence, MTU/ rx buff size updates are done now by the same MAC reset and reconfig procedure, so that out of context updates to MAXFRM, MRBLR, and MACCFG inside change_mtu() are no longer needed. The rx buffer size dependecy to Rx FCB is now handled for the other cases too (RXCSUM and rx hw timestamping). Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-02-24 18:13:43 +08:00
if (priv->hwts_rx_en)
rctrl |= RCTRL_PRSDEP_INIT | RCTRL_TS_ENABLE;
gianfar: Fix on-the-fly vlan and mtu updates The RCTRL and TCTRL registers should not be changed on-the-fly, while the controller is running, otherwise unexpected behaviour occurs. But that's exactly what gfar_vlan_mode() does, updating the VLAN acceleration bits inside RCTRL/TCTRL. The attempt to lock these operations doesn't help, but only adds to the confusion. There's also a dependency for Rx FCB insertion (activating /de-activating the TOE offload block on Rx) which might change the required rx buffer size. This makes matters worse as gfar_vlan_mode() ends up calling gfar_change_mtu(), though the MTU size remains the same. Note that there are other situations that may affect the required rx buffer size, like changing RXCSUM or rx hw timestamping, but errorneously the rx buffer size is not recomputed/ updated in the process. To fix this, do the vlan updates properly inside the MAC reset and reconfiguration procedure, which takes care of the rx buffer size dependecy and the rx TOE block (PRSDEP) activation/deactivation as well (in the correct order). As a consequence, MTU/ rx buff size updates are done now by the same MAC reset and reconfig procedure, so that out of context updates to MAXFRM, MRBLR, and MACCFG inside change_mtu() are no longer needed. The rx buffer size dependecy to Rx FCB is now handled for the other cases too (RXCSUM and rx hw timestamping). Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-02-24 18:13:43 +08:00
if (priv->ndev->features & NETIF_F_HW_VLAN_CTAG_RX)
rctrl |= RCTRL_VLEX | RCTRL_PRSDEP_INIT;
/* Clear the LFC bit */
gfar_write(&regs->rctrl, rctrl);
/* Init flow control threshold values */
gfar_init_rqprm(priv);
gfar_write(&regs->ptv, DEFAULT_LFC_PTVVAL);
rctrl |= RCTRL_LFC;
/* Init rctrl based on our settings */
gfar_write(&regs->rctrl, rctrl);
}
static void gfar_mac_tx_config(struct gfar_private *priv)
{
struct gfar __iomem *regs = priv->gfargrp[0].regs;
u32 tctrl = 0;
if (priv->ndev->features & NETIF_F_IP_CSUM)
tctrl |= TCTRL_INIT_CSUM;
if (priv->prio_sched_en)
tctrl |= TCTRL_TXSCHED_PRIO;
else {
tctrl |= TCTRL_TXSCHED_WRRS;
gfar_write(&regs->tr03wt, DEFAULT_WRRS_WEIGHT);
gfar_write(&regs->tr47wt, DEFAULT_WRRS_WEIGHT);
}
gianfar: Fix on-the-fly vlan and mtu updates The RCTRL and TCTRL registers should not be changed on-the-fly, while the controller is running, otherwise unexpected behaviour occurs. But that's exactly what gfar_vlan_mode() does, updating the VLAN acceleration bits inside RCTRL/TCTRL. The attempt to lock these operations doesn't help, but only adds to the confusion. There's also a dependency for Rx FCB insertion (activating /de-activating the TOE offload block on Rx) which might change the required rx buffer size. This makes matters worse as gfar_vlan_mode() ends up calling gfar_change_mtu(), though the MTU size remains the same. Note that there are other situations that may affect the required rx buffer size, like changing RXCSUM or rx hw timestamping, but errorneously the rx buffer size is not recomputed/ updated in the process. To fix this, do the vlan updates properly inside the MAC reset and reconfiguration procedure, which takes care of the rx buffer size dependecy and the rx TOE block (PRSDEP) activation/deactivation as well (in the correct order). As a consequence, MTU/ rx buff size updates are done now by the same MAC reset and reconfig procedure, so that out of context updates to MAXFRM, MRBLR, and MACCFG inside change_mtu() are no longer needed. The rx buffer size dependecy to Rx FCB is now handled for the other cases too (RXCSUM and rx hw timestamping). Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-02-24 18:13:43 +08:00
if (priv->ndev->features & NETIF_F_HW_VLAN_CTAG_TX)
tctrl |= TCTRL_VLINS;
gfar_write(&regs->tctrl, tctrl);
}
static void gfar_configure_coalescing(struct gfar_private *priv,
unsigned long tx_mask, unsigned long rx_mask)
{
struct gfar __iomem *regs = priv->gfargrp[0].regs;
u32 __iomem *baddr;
if (priv->mode == MQ_MG_MODE) {
int i = 0;
baddr = &regs->txic0;
for_each_set_bit(i, &tx_mask, priv->num_tx_queues) {
gfar_write(baddr + i, 0);
if (likely(priv->tx_queue[i]->txcoalescing))
gfar_write(baddr + i, priv->tx_queue[i]->txic);
}
baddr = &regs->rxic0;
for_each_set_bit(i, &rx_mask, priv->num_rx_queues) {
gfar_write(baddr + i, 0);
if (likely(priv->rx_queue[i]->rxcoalescing))
gfar_write(baddr + i, priv->rx_queue[i]->rxic);
}
} else {
/* Backward compatible case -- even if we enable
* multiple queues, there's only single reg to program
*/
gfar_write(&regs->txic, 0);
if (likely(priv->tx_queue[0]->txcoalescing))
gfar_write(&regs->txic, priv->tx_queue[0]->txic);
gfar_write(&regs->rxic, 0);
if (unlikely(priv->rx_queue[0]->rxcoalescing))
gfar_write(&regs->rxic, priv->rx_queue[0]->rxic);
}
}
static void gfar_configure_coalescing_all(struct gfar_private *priv)
{
gfar_configure_coalescing(priv, 0xFF, 0xFF);
}
static struct net_device_stats *gfar_get_stats(struct net_device *dev)
{
struct gfar_private *priv = netdev_priv(dev);
unsigned long rx_packets = 0, rx_bytes = 0, rx_dropped = 0;
unsigned long tx_packets = 0, tx_bytes = 0;
int i;
for (i = 0; i < priv->num_rx_queues; i++) {
rx_packets += priv->rx_queue[i]->stats.rx_packets;
rx_bytes += priv->rx_queue[i]->stats.rx_bytes;
rx_dropped += priv->rx_queue[i]->stats.rx_dropped;
}
dev->stats.rx_packets = rx_packets;
dev->stats.rx_bytes = rx_bytes;
dev->stats.rx_dropped = rx_dropped;
for (i = 0; i < priv->num_tx_queues; i++) {
tx_bytes += priv->tx_queue[i]->stats.tx_bytes;
tx_packets += priv->tx_queue[i]->stats.tx_packets;
}
dev->stats.tx_bytes = tx_bytes;
dev->stats.tx_packets = tx_packets;
return &dev->stats;
}
/* Set the appropriate hash bit for the given addr */
/* The algorithm works like so:
* 1) Take the Destination Address (ie the multicast address), and
* do a CRC on it (little endian), and reverse the bits of the
* result.
* 2) Use the 8 most significant bits as a hash into a 256-entry
* table. The table is controlled through 8 32-bit registers:
* gaddr0-7. gaddr0's MSB is entry 0, and gaddr7's LSB is
* gaddr7. This means that the 3 most significant bits in the
* hash index which gaddr register to use, and the 5 other bits
* indicate which bit (assuming an IBM numbering scheme, which
* for PowerPC (tm) is usually the case) in the register holds
* the entry.
*/
static void gfar_set_hash_for_addr(struct net_device *dev, u8 *addr)
{
u32 tempval;
struct gfar_private *priv = netdev_priv(dev);
u32 result = ether_crc(ETH_ALEN, addr);
int width = priv->hash_width;
u8 whichbit = (result >> (32 - width)) & 0x1f;
u8 whichreg = result >> (32 - width + 5);
u32 value = (1 << (31-whichbit));
tempval = gfar_read(priv->hash_regs[whichreg]);
tempval |= value;
gfar_write(priv->hash_regs[whichreg], tempval);
}
/* There are multiple MAC Address register pairs on some controllers
* This function sets the numth pair to a given address
*/
static void gfar_set_mac_for_addr(struct net_device *dev, int num,
const u8 *addr)
{
struct gfar_private *priv = netdev_priv(dev);
struct gfar __iomem *regs = priv->gfargrp[0].regs;
u32 tempval;
u32 __iomem *macptr = &regs->macstnaddr1;
macptr += num*2;
/* For a station address of 0x12345678ABCD in transmission
* order (BE), MACnADDR1 is set to 0xCDAB7856 and
* MACnADDR2 is set to 0x34120000.
*/
tempval = (addr[5] << 24) | (addr[4] << 16) |
(addr[3] << 8) | addr[2];
gfar_write(macptr, tempval);
tempval = (addr[1] << 24) | (addr[0] << 16);
gfar_write(macptr+1, tempval);
}
static int gfar_set_mac_addr(struct net_device *dev, void *p)
{
eth_mac_addr(dev, p);
gfar_set_mac_for_addr(dev, 0, dev->dev_addr);
return 0;
}
static void gfar_ints_disable(struct gfar_private *priv)
{
int i;
for (i = 0; i < priv->num_grps; i++) {
struct gfar __iomem *regs = priv->gfargrp[i].regs;
/* Clear IEVENT */
gfar_write(&regs->ievent, IEVENT_INIT_CLEAR);
/* Initialize IMASK */
gfar_write(&regs->imask, IMASK_INIT_CLEAR);
}
}
static void gfar_ints_enable(struct gfar_private *priv)
{
int i;
for (i = 0; i < priv->num_grps; i++) {
struct gfar __iomem *regs = priv->gfargrp[i].regs;
/* Unmask the interrupts we look for */
gfar_write(&regs->imask, IMASK_DEFAULT);
}
}
static int gfar_alloc_tx_queues(struct gfar_private *priv)
{
int i;
for (i = 0; i < priv->num_tx_queues; i++) {
priv->tx_queue[i] = kzalloc(sizeof(struct gfar_priv_tx_q),
GFP_KERNEL);
if (!priv->tx_queue[i])
return -ENOMEM;
priv->tx_queue[i]->tx_skbuff = NULL;
priv->tx_queue[i]->qindex = i;
priv->tx_queue[i]->dev = priv->ndev;
spin_lock_init(&(priv->tx_queue[i]->txlock));
}
return 0;
}
static int gfar_alloc_rx_queues(struct gfar_private *priv)
{
int i;
for (i = 0; i < priv->num_rx_queues; i++) {
priv->rx_queue[i] = kzalloc(sizeof(struct gfar_priv_rx_q),
GFP_KERNEL);
if (!priv->rx_queue[i])
return -ENOMEM;
priv->rx_queue[i]->qindex = i;
priv->rx_queue[i]->ndev = priv->ndev;
}
return 0;
}
static void gfar_free_tx_queues(struct gfar_private *priv)
{
int i;
for (i = 0; i < priv->num_tx_queues; i++)
kfree(priv->tx_queue[i]);
}
static void gfar_free_rx_queues(struct gfar_private *priv)
{
int i;
for (i = 0; i < priv->num_rx_queues; i++)
kfree(priv->rx_queue[i]);
}
static void unmap_group_regs(struct gfar_private *priv)
{
int i;
for (i = 0; i < MAXGROUPS; i++)
if (priv->gfargrp[i].regs)
iounmap(priv->gfargrp[i].regs);
}
static void free_gfar_dev(struct gfar_private *priv)
{
int i, j;
for (i = 0; i < priv->num_grps; i++)
for (j = 0; j < GFAR_NUM_IRQS; j++) {
kfree(priv->gfargrp[i].irqinfo[j]);
priv->gfargrp[i].irqinfo[j] = NULL;
}
free_netdev(priv->ndev);
}
static void disable_napi(struct gfar_private *priv)
{
int i;
gianfar: Separate out the Tx interrupt handling (Tx NAPI) There are some concurrency issues on devices w/ 2 CPUs related to the handling of Rx and Tx interrupts. eTSEC has separate interrupt lines for Rx and Tx but a single imask register to mask these interrupts and a single NAPI instance to handle both Rx and Tx work. As a result, the Rx and Tx ISRs are identical, both are invoking gfar_schedule_cleanup(), however both handlers can be entered at the same time when the Rx and Tx interrupts are taken by different CPUs. In this case spurrious interrupts (SPU) show up (in /proc/interrupts) indicating a concurrency issue. Also, Tx overruns followed by Tx timeout have been observed under heavy Tx traffic load. To address these issues, the schedule cleanup ISR part has been changed to handle the Rx and Tx interrupts independently. The patch adds a separate NAPI poll routine for Tx cleanup to be triggerred independently by the Tx confirmation interrupts only. Existing poll functions are modified to handle only the Rx path processing. The Tx poll routine does not need a budget, since Tx processing doesn't consume NAPI budget, and hence it is registered with minimum NAPI weight. NAPI scheduling does not require locking since there are different NAPI instances between the Rx and Tx confirmation paths now. So, the patch fixes the occurence of spurrious Rx/Tx interrupts. Tx overruns also occur less frequently now. Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-07 20:42:45 +08:00
for (i = 0; i < priv->num_grps; i++) {
napi_disable(&priv->gfargrp[i].napi_rx);
napi_disable(&priv->gfargrp[i].napi_tx);
}
}
static void enable_napi(struct gfar_private *priv)
{
int i;
gianfar: Separate out the Tx interrupt handling (Tx NAPI) There are some concurrency issues on devices w/ 2 CPUs related to the handling of Rx and Tx interrupts. eTSEC has separate interrupt lines for Rx and Tx but a single imask register to mask these interrupts and a single NAPI instance to handle both Rx and Tx work. As a result, the Rx and Tx ISRs are identical, both are invoking gfar_schedule_cleanup(), however both handlers can be entered at the same time when the Rx and Tx interrupts are taken by different CPUs. In this case spurrious interrupts (SPU) show up (in /proc/interrupts) indicating a concurrency issue. Also, Tx overruns followed by Tx timeout have been observed under heavy Tx traffic load. To address these issues, the schedule cleanup ISR part has been changed to handle the Rx and Tx interrupts independently. The patch adds a separate NAPI poll routine for Tx cleanup to be triggerred independently by the Tx confirmation interrupts only. Existing poll functions are modified to handle only the Rx path processing. The Tx poll routine does not need a budget, since Tx processing doesn't consume NAPI budget, and hence it is registered with minimum NAPI weight. NAPI scheduling does not require locking since there are different NAPI instances between the Rx and Tx confirmation paths now. So, the patch fixes the occurence of spurrious Rx/Tx interrupts. Tx overruns also occur less frequently now. Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-07 20:42:45 +08:00
for (i = 0; i < priv->num_grps; i++) {
napi_enable(&priv->gfargrp[i].napi_rx);
napi_enable(&priv->gfargrp[i].napi_tx);
}
}
static int gfar_parse_group(struct device_node *np,
struct gfar_private *priv, const char *model)
{
struct gfar_priv_grp *grp = &priv->gfargrp[priv->num_grps];
int i;
gianfar: dont conditionally alloc Rx/Err irq structs Commit ee873fda3bec7c668407b837fc5519eb961fcd37 "gianfar: Pack struct gfar_priv_grp into three cachelines" causes the following null dereference at driver init on sbc8548: libphy: Freescale PowerQUICC MII Bus: probed Unable to handle kernel paging request for data at address 0x00000000 Faulting instruction address: 0xc01d6a38 Oops: Kernel access of bad area, sig: 11 [#1] [...] NIP [c01d6a38] gfar_parse_group+0x228/0x280 LR [c01d6a34] gfar_parse_group+0x224/0x280 Call Trace: [ef82dd60] [c01d6a34] gfar_parse_group+0x224/0x280 (unreliable) [ef82dd90] [c01d73a4] gfar_probe+0x284/0xfe0 The reason is that the commit also changed the allocation of the Rx and error handling irq structs to be skipped for !MQ_MG_MODE. In the !MQ_MG_MODE case, only the Tx irq struct is allocated. Digging further, we see that MQ_MG_MODE is set only if we find the OF compatible string "fsl,etsec2". A quick grep in the dts directory shows lots of boards that support Rx/Tx/Err, but without this specific compat string. And hence they go after the unallocated Rx/Error structs and cause the above oops. Hence such a change can not be deployed until all the dts files are updated and sufficiently deployed. Further, the optimization is of limited value, since the kmalloc'd struct in question has only a single unsigned int, and an (IFNAMSIZ + 6) sized string. Note that no changes to the freeing code are needed here, as it already did an unconditional free of Rx/Tx/Error gfar_irqinfo. Cc: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-02-04 17:49:42 +08:00
for (i = 0; i < GFAR_NUM_IRQS; i++) {
grp->irqinfo[i] = kzalloc(sizeof(struct gfar_irqinfo),
GFP_KERNEL);
if (!grp->irqinfo[i])
return -ENOMEM;
}
grp->regs = of_iomap(np, 0);
if (!grp->regs)
return -ENOMEM;
gfar_irq(grp, TX)->irq = irq_of_parse_and_map(np, 0);
/* If we aren't the FEC we have multiple interrupts */
if (model && strcasecmp(model, "FEC")) {
gfar_irq(grp, RX)->irq = irq_of_parse_and_map(np, 1);
gfar_irq(grp, ER)->irq = irq_of_parse_and_map(np, 2);
if (!gfar_irq(grp, TX)->irq ||
!gfar_irq(grp, RX)->irq ||
!gfar_irq(grp, ER)->irq)
return -EINVAL;
}
grp->priv = priv;
spin_lock_init(&grp->grplock);
if (priv->mode == MQ_MG_MODE) {
u32 rxq_mask, txq_mask;
int ret;
grp->rx_bit_map = (DEFAULT_MAPPING >> priv->num_grps);
grp->tx_bit_map = (DEFAULT_MAPPING >> priv->num_grps);
ret = of_property_read_u32(np, "fsl,rx-bit-map", &rxq_mask);
if (!ret) {
grp->rx_bit_map = rxq_mask ?
rxq_mask : (DEFAULT_MAPPING >> priv->num_grps);
}
ret = of_property_read_u32(np, "fsl,tx-bit-map", &txq_mask);
if (!ret) {
grp->tx_bit_map = txq_mask ?
txq_mask : (DEFAULT_MAPPING >> priv->num_grps);
}
gianfar: Use Single-Queue polling for "fsl,etsec2" For the "fsl,etsec2" compatible models the driver currently supports 8 Tx and Rx DMA rings (aka HW queues). However, there are only 2 pairs of Rx/Tx interrupt lines, as these controllers are integrated in low power SoCs with 2 CPUs at most. As a result, there are at most 2 NAPI instances that have to service multiple Tx and Rx queues for these devices. This complicates the NAPI polling routine having to iterate over the mutiple Rx/Tx queues hooked to the same interrupt lines. And there's also an overhead at HW level, as the controller needs to service all the 8 Tx rings in a round robin manner. The combined overhead shows up for multi parallel Tx flows transmitted by the kernel stack, when the driver usually starts returning NETDEV_TX_BUSY leading to NETDEV WATCHDOG Tx timeout triggering if the Tx path is congested for too long. As an alternative, this patch makes the driver support only one Tx/Rx DMA ring per NAPI instance (per interrupt group or pair of Tx/Rx interrupt lines) by default. The simplified single queue polling routine (gfar_poll_sq) will be the default napi poll routine for the etsec2 devices too. Some adjustments needed to be made to link the Tx/Rx HW queues with each NAPI instance (2 in this case). The gfar_poll_sq() is already successfully used by older SQ_SG_MODE (single interrupt group) controllers. This patch fixes Tx timeout triggering under heavy Tx traffic load (i.e. iperf -c -P 8) for the "fsl,etsec2" (currently the only MQ_MG_MODE devices). There's also a significant memory footprint reduction by supporting 2 Rx/Tx DMA rings (at most), instead of 8, for these devices. Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-07 20:42:46 +08:00
if (priv->poll_mode == GFAR_SQ_POLLING) {
/* One Q per interrupt group: Q0 to G0, Q1 to G1 */
grp->rx_bit_map = (DEFAULT_MAPPING >> priv->num_grps);
grp->tx_bit_map = (DEFAULT_MAPPING >> priv->num_grps);
}
} else {
grp->rx_bit_map = 0xFF;
grp->tx_bit_map = 0xFF;
}
/* bit_map's MSB is q0 (from q0 to q7) but, for_each_set_bit parses
* right to left, so we need to revert the 8 bits to get the q index
*/
grp->rx_bit_map = bitrev8(grp->rx_bit_map);
grp->tx_bit_map = bitrev8(grp->tx_bit_map);
/* Calculate RSTAT, TSTAT, RQUEUE and TQUEUE values,
* also assign queues to groups
*/
for_each_set_bit(i, &grp->rx_bit_map, priv->num_rx_queues) {
gianfar: Use Single-Queue polling for "fsl,etsec2" For the "fsl,etsec2" compatible models the driver currently supports 8 Tx and Rx DMA rings (aka HW queues). However, there are only 2 pairs of Rx/Tx interrupt lines, as these controllers are integrated in low power SoCs with 2 CPUs at most. As a result, there are at most 2 NAPI instances that have to service multiple Tx and Rx queues for these devices. This complicates the NAPI polling routine having to iterate over the mutiple Rx/Tx queues hooked to the same interrupt lines. And there's also an overhead at HW level, as the controller needs to service all the 8 Tx rings in a round robin manner. The combined overhead shows up for multi parallel Tx flows transmitted by the kernel stack, when the driver usually starts returning NETDEV_TX_BUSY leading to NETDEV WATCHDOG Tx timeout triggering if the Tx path is congested for too long. As an alternative, this patch makes the driver support only one Tx/Rx DMA ring per NAPI instance (per interrupt group or pair of Tx/Rx interrupt lines) by default. The simplified single queue polling routine (gfar_poll_sq) will be the default napi poll routine for the etsec2 devices too. Some adjustments needed to be made to link the Tx/Rx HW queues with each NAPI instance (2 in this case). The gfar_poll_sq() is already successfully used by older SQ_SG_MODE (single interrupt group) controllers. This patch fixes Tx timeout triggering under heavy Tx traffic load (i.e. iperf -c -P 8) for the "fsl,etsec2" (currently the only MQ_MG_MODE devices). There's also a significant memory footprint reduction by supporting 2 Rx/Tx DMA rings (at most), instead of 8, for these devices. Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-07 20:42:46 +08:00
if (!grp->rx_queue)
grp->rx_queue = priv->rx_queue[i];
grp->num_rx_queues++;
grp->rstat |= (RSTAT_CLEAR_RHALT >> i);
priv->rqueue |= ((RQUEUE_EN0 | RQUEUE_EX0) >> i);
priv->rx_queue[i]->grp = grp;
}
for_each_set_bit(i, &grp->tx_bit_map, priv->num_tx_queues) {
gianfar: Use Single-Queue polling for "fsl,etsec2" For the "fsl,etsec2" compatible models the driver currently supports 8 Tx and Rx DMA rings (aka HW queues). However, there are only 2 pairs of Rx/Tx interrupt lines, as these controllers are integrated in low power SoCs with 2 CPUs at most. As a result, there are at most 2 NAPI instances that have to service multiple Tx and Rx queues for these devices. This complicates the NAPI polling routine having to iterate over the mutiple Rx/Tx queues hooked to the same interrupt lines. And there's also an overhead at HW level, as the controller needs to service all the 8 Tx rings in a round robin manner. The combined overhead shows up for multi parallel Tx flows transmitted by the kernel stack, when the driver usually starts returning NETDEV_TX_BUSY leading to NETDEV WATCHDOG Tx timeout triggering if the Tx path is congested for too long. As an alternative, this patch makes the driver support only one Tx/Rx DMA ring per NAPI instance (per interrupt group or pair of Tx/Rx interrupt lines) by default. The simplified single queue polling routine (gfar_poll_sq) will be the default napi poll routine for the etsec2 devices too. Some adjustments needed to be made to link the Tx/Rx HW queues with each NAPI instance (2 in this case). The gfar_poll_sq() is already successfully used by older SQ_SG_MODE (single interrupt group) controllers. This patch fixes Tx timeout triggering under heavy Tx traffic load (i.e. iperf -c -P 8) for the "fsl,etsec2" (currently the only MQ_MG_MODE devices). There's also a significant memory footprint reduction by supporting 2 Rx/Tx DMA rings (at most), instead of 8, for these devices. Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-07 20:42:46 +08:00
if (!grp->tx_queue)
grp->tx_queue = priv->tx_queue[i];
grp->num_tx_queues++;
grp->tstat |= (TSTAT_CLEAR_THALT >> i);
priv->tqueue |= (TQUEUE_EN0 >> i);
priv->tx_queue[i]->grp = grp;
}
priv->num_grps++;
return 0;
}
static int gfar_of_group_count(struct device_node *np)
{
struct device_node *child;
int num = 0;
for_each_available_child_of_node(np, child)
if (of_node_name_eq(child, "queue-group"))
num++;
return num;
}
/* Reads the controller's registers to determine what interface
* connects it to the PHY.
*/
static phy_interface_t gfar_get_interface(struct net_device *dev)
{
struct gfar_private *priv = netdev_priv(dev);
struct gfar __iomem *regs = priv->gfargrp[0].regs;
u32 ecntrl;
ecntrl = gfar_read(&regs->ecntrl);
if (ecntrl & ECNTRL_SGMII_MODE)
return PHY_INTERFACE_MODE_SGMII;
if (ecntrl & ECNTRL_TBI_MODE) {
if (ecntrl & ECNTRL_REDUCED_MODE)
return PHY_INTERFACE_MODE_RTBI;
else
return PHY_INTERFACE_MODE_TBI;
}
if (ecntrl & ECNTRL_REDUCED_MODE) {
if (ecntrl & ECNTRL_REDUCED_MII_MODE) {
return PHY_INTERFACE_MODE_RMII;
}
else {
phy_interface_t interface = priv->interface;
/* This isn't autodetected right now, so it must
* be set by the device tree or platform code.
*/
if (interface == PHY_INTERFACE_MODE_RGMII_ID)
return PHY_INTERFACE_MODE_RGMII_ID;
return PHY_INTERFACE_MODE_RGMII;
}
}
if (priv->device_flags & FSL_GIANFAR_DEV_HAS_GIGABIT)
return PHY_INTERFACE_MODE_GMII;
return PHY_INTERFACE_MODE_MII;
}
static int gfar_of_init(struct platform_device *ofdev, struct net_device **pdev)
{
const char *model;
const void *mac_addr;
int err = 0, i;
phy_interface_t interface;
struct net_device *dev = NULL;
struct gfar_private *priv = NULL;
struct device_node *np = ofdev->dev.of_node;
struct device_node *child = NULL;
u32 stash_len = 0;
u32 stash_idx = 0;
unsigned int num_tx_qs, num_rx_qs;
unsigned short mode, poll_mode;
if (!np)
return -ENODEV;
if (of_device_is_compatible(np, "fsl,etsec2")) {
mode = MQ_MG_MODE;
poll_mode = GFAR_SQ_POLLING;
} else {
mode = SQ_SG_MODE;
poll_mode = GFAR_SQ_POLLING;
}
if (mode == SQ_SG_MODE) {
gianfar: Use Single-Queue polling for "fsl,etsec2" For the "fsl,etsec2" compatible models the driver currently supports 8 Tx and Rx DMA rings (aka HW queues). However, there are only 2 pairs of Rx/Tx interrupt lines, as these controllers are integrated in low power SoCs with 2 CPUs at most. As a result, there are at most 2 NAPI instances that have to service multiple Tx and Rx queues for these devices. This complicates the NAPI polling routine having to iterate over the mutiple Rx/Tx queues hooked to the same interrupt lines. And there's also an overhead at HW level, as the controller needs to service all the 8 Tx rings in a round robin manner. The combined overhead shows up for multi parallel Tx flows transmitted by the kernel stack, when the driver usually starts returning NETDEV_TX_BUSY leading to NETDEV WATCHDOG Tx timeout triggering if the Tx path is congested for too long. As an alternative, this patch makes the driver support only one Tx/Rx DMA ring per NAPI instance (per interrupt group or pair of Tx/Rx interrupt lines) by default. The simplified single queue polling routine (gfar_poll_sq) will be the default napi poll routine for the etsec2 devices too. Some adjustments needed to be made to link the Tx/Rx HW queues with each NAPI instance (2 in this case). The gfar_poll_sq() is already successfully used by older SQ_SG_MODE (single interrupt group) controllers. This patch fixes Tx timeout triggering under heavy Tx traffic load (i.e. iperf -c -P 8) for the "fsl,etsec2" (currently the only MQ_MG_MODE devices). There's also a significant memory footprint reduction by supporting 2 Rx/Tx DMA rings (at most), instead of 8, for these devices. Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-07 20:42:46 +08:00
num_tx_qs = 1;
num_rx_qs = 1;
} else { /* MQ_MG_MODE */
/* get the actual number of supported groups */
unsigned int num_grps = gfar_of_group_count(np);
if (num_grps == 0 || num_grps > MAXGROUPS) {
dev_err(&ofdev->dev, "Invalid # of int groups(%d)\n",
num_grps);
pr_err("Cannot do alloc_etherdev, aborting\n");
return -EINVAL;
}
if (poll_mode == GFAR_SQ_POLLING) {
num_tx_qs = num_grps; /* one txq per int group */
num_rx_qs = num_grps; /* one rxq per int group */
gianfar: Use Single-Queue polling for "fsl,etsec2" For the "fsl,etsec2" compatible models the driver currently supports 8 Tx and Rx DMA rings (aka HW queues). However, there are only 2 pairs of Rx/Tx interrupt lines, as these controllers are integrated in low power SoCs with 2 CPUs at most. As a result, there are at most 2 NAPI instances that have to service multiple Tx and Rx queues for these devices. This complicates the NAPI polling routine having to iterate over the mutiple Rx/Tx queues hooked to the same interrupt lines. And there's also an overhead at HW level, as the controller needs to service all the 8 Tx rings in a round robin manner. The combined overhead shows up for multi parallel Tx flows transmitted by the kernel stack, when the driver usually starts returning NETDEV_TX_BUSY leading to NETDEV WATCHDOG Tx timeout triggering if the Tx path is congested for too long. As an alternative, this patch makes the driver support only one Tx/Rx DMA ring per NAPI instance (per interrupt group or pair of Tx/Rx interrupt lines) by default. The simplified single queue polling routine (gfar_poll_sq) will be the default napi poll routine for the etsec2 devices too. Some adjustments needed to be made to link the Tx/Rx HW queues with each NAPI instance (2 in this case). The gfar_poll_sq() is already successfully used by older SQ_SG_MODE (single interrupt group) controllers. This patch fixes Tx timeout triggering under heavy Tx traffic load (i.e. iperf -c -P 8) for the "fsl,etsec2" (currently the only MQ_MG_MODE devices). There's also a significant memory footprint reduction by supporting 2 Rx/Tx DMA rings (at most), instead of 8, for these devices. Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-07 20:42:46 +08:00
} else { /* GFAR_MQ_POLLING */
u32 tx_queues, rx_queues;
int ret;
/* parse the num of HW tx and rx queues */
ret = of_property_read_u32(np, "fsl,num_tx_queues",
&tx_queues);
num_tx_qs = ret ? 1 : tx_queues;
ret = of_property_read_u32(np, "fsl,num_rx_queues",
&rx_queues);
num_rx_qs = ret ? 1 : rx_queues;
gianfar: Use Single-Queue polling for "fsl,etsec2" For the "fsl,etsec2" compatible models the driver currently supports 8 Tx and Rx DMA rings (aka HW queues). However, there are only 2 pairs of Rx/Tx interrupt lines, as these controllers are integrated in low power SoCs with 2 CPUs at most. As a result, there are at most 2 NAPI instances that have to service multiple Tx and Rx queues for these devices. This complicates the NAPI polling routine having to iterate over the mutiple Rx/Tx queues hooked to the same interrupt lines. And there's also an overhead at HW level, as the controller needs to service all the 8 Tx rings in a round robin manner. The combined overhead shows up for multi parallel Tx flows transmitted by the kernel stack, when the driver usually starts returning NETDEV_TX_BUSY leading to NETDEV WATCHDOG Tx timeout triggering if the Tx path is congested for too long. As an alternative, this patch makes the driver support only one Tx/Rx DMA ring per NAPI instance (per interrupt group or pair of Tx/Rx interrupt lines) by default. The simplified single queue polling routine (gfar_poll_sq) will be the default napi poll routine for the etsec2 devices too. Some adjustments needed to be made to link the Tx/Rx HW queues with each NAPI instance (2 in this case). The gfar_poll_sq() is already successfully used by older SQ_SG_MODE (single interrupt group) controllers. This patch fixes Tx timeout triggering under heavy Tx traffic load (i.e. iperf -c -P 8) for the "fsl,etsec2" (currently the only MQ_MG_MODE devices). There's also a significant memory footprint reduction by supporting 2 Rx/Tx DMA rings (at most), instead of 8, for these devices. Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-07 20:42:46 +08:00
}
}
if (num_tx_qs > MAX_TX_QS) {
pr_err("num_tx_qs(=%d) greater than MAX_TX_QS(=%d)\n",
num_tx_qs, MAX_TX_QS);
pr_err("Cannot do alloc_etherdev, aborting\n");
return -EINVAL;
}
if (num_rx_qs > MAX_RX_QS) {
pr_err("num_rx_qs(=%d) greater than MAX_RX_QS(=%d)\n",
num_rx_qs, MAX_RX_QS);
pr_err("Cannot do alloc_etherdev, aborting\n");
return -EINVAL;
}
*pdev = alloc_etherdev_mq(sizeof(*priv), num_tx_qs);
dev = *pdev;
if (NULL == dev)
return -ENOMEM;
priv = netdev_priv(dev);
priv->ndev = dev;
priv->mode = mode;
priv->poll_mode = poll_mode;
priv->num_tx_queues = num_tx_qs;
netif_set_real_num_rx_queues(dev, num_rx_qs);
priv->num_rx_queues = num_rx_qs;
err = gfar_alloc_tx_queues(priv);
if (err)
goto tx_alloc_failed;
err = gfar_alloc_rx_queues(priv);
if (err)
goto rx_alloc_failed;
err = of_property_read_string(np, "model", &model);
if (err) {
pr_err("Device model property missing, aborting\n");
goto rx_alloc_failed;
}
/* Init Rx queue filer rule set linked list */
INIT_LIST_HEAD(&priv->rx_list.list);
priv->rx_list.count = 0;
mutex_init(&priv->rx_queue_access);
for (i = 0; i < MAXGROUPS; i++)
priv->gfargrp[i].regs = NULL;
/* Parse and initialize group specific information */
if (priv->mode == MQ_MG_MODE) {
for_each_available_child_of_node(np, child) {
if (!of_node_name_eq(child, "queue-group"))
continue;
err = gfar_parse_group(child, priv, model);
if (err) {
of_node_put(child);
goto err_grp_init;
}
}
} else { /* SQ_SG_MODE */
err = gfar_parse_group(np, priv, model);
if (err)
goto err_grp_init;
}
if (of_property_read_bool(np, "bd-stash")) {
priv->device_flags |= FSL_GIANFAR_DEV_HAS_BD_STASHING;
priv->bd_stash_en = 1;
}
err = of_property_read_u32(np, "rx-stash-len", &stash_len);
if (err == 0)
priv->rx_stash_size = stash_len;
err = of_property_read_u32(np, "rx-stash-idx", &stash_idx);
if (err == 0)
priv->rx_stash_index = stash_idx;
if (stash_len || stash_idx)
priv->device_flags |= FSL_GIANFAR_DEV_HAS_BUF_STASHING;
mac_addr = of_get_mac_address(np);
if (!IS_ERR(mac_addr)) {
ether_addr_copy(dev->dev_addr, mac_addr);
} else {
eth_hw_addr_random(dev);
dev_info(&ofdev->dev, "Using random MAC address: %pM\n", dev->dev_addr);
}
if (model && !strcasecmp(model, "TSEC"))
priv->device_flags |= FSL_GIANFAR_DEV_HAS_GIGABIT |
FSL_GIANFAR_DEV_HAS_COALESCE |
FSL_GIANFAR_DEV_HAS_RMON |
FSL_GIANFAR_DEV_HAS_MULTI_INTR;
if (model && !strcasecmp(model, "eTSEC"))
priv->device_flags |= FSL_GIANFAR_DEV_HAS_GIGABIT |
FSL_GIANFAR_DEV_HAS_COALESCE |
FSL_GIANFAR_DEV_HAS_RMON |
FSL_GIANFAR_DEV_HAS_MULTI_INTR |
FSL_GIANFAR_DEV_HAS_CSUM |
FSL_GIANFAR_DEV_HAS_VLAN |
FSL_GIANFAR_DEV_HAS_MAGIC_PACKET |
FSL_GIANFAR_DEV_HAS_EXTENDED_HASH |
FSL_GIANFAR_DEV_HAS_TIMER |
FSL_GIANFAR_DEV_HAS_RX_FILER;
/* Use PHY connection type from the DT node if one is specified there.
* rgmii-id really needs to be specified. Other types can be
* detected by hardware
*/
err = of_get_phy_mode(np, &interface);
if (!err)
priv->interface = interface;
else
priv->interface = gfar_get_interface(dev);
if (of_find_property(np, "fsl,magic-packet", NULL))
priv->device_flags |= FSL_GIANFAR_DEV_HAS_MAGIC_PACKET;
if (of_get_property(np, "fsl,wake-on-filer", NULL))
priv->device_flags |= FSL_GIANFAR_DEV_HAS_WAKE_ON_FILER;
priv->phy_node = of_parse_phandle(np, "phy-handle", 0);
/* In the case of a fixed PHY, the DT node associated
* to the PHY is the Ethernet MAC DT node.
*/
if (!priv->phy_node && of_phy_is_fixed_link(np)) {
err = of_phy_register_fixed_link(np);
if (err)
goto err_grp_init;
priv->phy_node = of_node_get(np);
}
/* Find the TBI PHY. If it's not there, we don't support SGMII */
priv->tbi_node = of_parse_phandle(np, "tbi-handle", 0);
return 0;
err_grp_init:
unmap_group_regs(priv);
rx_alloc_failed:
gfar_free_rx_queues(priv);
tx_alloc_failed:
gfar_free_tx_queues(priv);
free_gfar_dev(priv);
return err;
}
gianfar: Fix compiler and sparse warnings commit fba4ed030cfae7efdb6b79a57b0c5a9d72c9de83 ("gianfar: Add Multiple Queue Support") introduced the following warnings: CHECK gianfar.c gianfar.c:333:8: warning: incorrect type in assignment (different address spaces) gianfar.c:333:8: expected unsigned int [usertype] *baddr gianfar.c:333:8: got unsigned int [noderef] <asn:2>*<noident> [... 67 lines skipped ...] gianfar.c:2565:3: warning: incorrect type in argument 1 (different type sizes) gianfar.c:2565:3: expected unsigned long const *addr gianfar.c:2565:3: got unsigned int *<noident> CC gianfar.o gianfar.c: In function 'gfar_probe': gianfar.c:985: warning: passing argument 1 of 'find_next_bit' from incompatible pointer type gianfar.c:985: warning: passing argument 1 of 'find_next_bit' from incompatible pointer type gianfar.c:993: warning: passing argument 1 of 'find_next_bit' from incompatible pointer type gianfar.c:993: warning: passing argument 1 of 'find_next_bit' from incompatible pointer type gianfar.c: In function 'gfar_configure_coalescing': gianfar.c:1680: warning: passing argument 1 of 'find_next_bit' from incompatible pointer type gianfar.c:1680: warning: passing argument 1 of 'find_next_bit' from incompatible pointer type gianfar.c:1688: warning: passing argument 1 of 'find_next_bit' from incompatible pointer type gianfar.c:1688: warning: passing argument 1 of 'find_next_bit' from incompatible pointer type gianfar.c: In function 'gfar_poll': gianfar.c:2565: warning: passing argument 1 of 'find_next_bit' from incompatible pointer type gianfar.c:2565: warning: passing argument 1 of 'find_next_bit' from incompatible pointer type gianfar.c:2566: warning: passing argument 2 of 'test_bit' from incompatible pointer type gianfar.c:2585: warning: passing argument 2 of 'set_bit' from incompatible pointer type Following warnings left unfixed (looks like sparse doesn't like locks in loops, so __acquires/__releases() doesn't help): gianfar.c:441:40: warning: context imbalance in 'lock_rx_qs': wrong count at exit gianfar.c:441:40: context '<noident>': wanted 0, got 1 gianfar.c:449:40: warning: context imbalance in 'lock_tx_qs': wrong count at exit gianfar.c:449:40: context '<noident>': wanted 0, got 1 gianfar.c:458:3: warning: context imbalance in 'unlock_rx_qs': __context__ statement expected different context gianfar.c:458:3: context '<noident>': wanted >= 0, got -1 gianfar.c:466:3: warning: context imbalance in 'unlock_tx_qs': __context__ statement expected different context gianfar.c:466:3: context '<noident>': wanted >= 0, got -1 Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2009-11-04 20:53:00 +08:00
static u32 cluster_entry_per_class(struct gfar_private *priv, u32 rqfar,
u32 class)
{
u32 rqfpr = FPR_FILER_MASK;
u32 rqfcr = 0x0;
rqfar--;
rqfcr = RQFCR_CLE | RQFCR_PID_MASK | RQFCR_CMP_EXACT;
priv->ftp_rqfpr[rqfar] = rqfpr;
priv->ftp_rqfcr[rqfar] = rqfcr;
gfar_write_filer(priv, rqfar, rqfcr, rqfpr);
rqfar--;
rqfcr = RQFCR_CMP_NOMATCH;
priv->ftp_rqfpr[rqfar] = rqfpr;
priv->ftp_rqfcr[rqfar] = rqfcr;
gfar_write_filer(priv, rqfar, rqfcr, rqfpr);
rqfar--;
rqfcr = RQFCR_CMP_EXACT | RQFCR_PID_PARSE | RQFCR_CLE | RQFCR_AND;
rqfpr = class;
priv->ftp_rqfcr[rqfar] = rqfcr;
priv->ftp_rqfpr[rqfar] = rqfpr;
gfar_write_filer(priv, rqfar, rqfcr, rqfpr);
rqfar--;
rqfcr = RQFCR_CMP_EXACT | RQFCR_PID_MASK | RQFCR_AND;
rqfpr = class;
priv->ftp_rqfcr[rqfar] = rqfcr;
priv->ftp_rqfpr[rqfar] = rqfpr;
gfar_write_filer(priv, rqfar, rqfcr, rqfpr);
return rqfar;
}
static void gfar_init_filer_table(struct gfar_private *priv)
{
int i = 0x0;
u32 rqfar = MAX_FILER_IDX;
u32 rqfcr = 0x0;
u32 rqfpr = FPR_FILER_MASK;
/* Default rule */
rqfcr = RQFCR_CMP_MATCH;
priv->ftp_rqfcr[rqfar] = rqfcr;
priv->ftp_rqfpr[rqfar] = rqfpr;
gfar_write_filer(priv, rqfar, rqfcr, rqfpr);
rqfar = cluster_entry_per_class(priv, rqfar, RQFPR_IPV6);
rqfar = cluster_entry_per_class(priv, rqfar, RQFPR_IPV6 | RQFPR_UDP);
rqfar = cluster_entry_per_class(priv, rqfar, RQFPR_IPV6 | RQFPR_TCP);
rqfar = cluster_entry_per_class(priv, rqfar, RQFPR_IPV4);
rqfar = cluster_entry_per_class(priv, rqfar, RQFPR_IPV4 | RQFPR_UDP);
rqfar = cluster_entry_per_class(priv, rqfar, RQFPR_IPV4 | RQFPR_TCP);
/* cur_filer_idx indicated the first non-masked rule */
priv->cur_filer_idx = rqfar;
/* Rest are masked rules */
rqfcr = RQFCR_CMP_NOMATCH;
for (i = 0; i < rqfar; i++) {
priv->ftp_rqfcr[i] = rqfcr;
priv->ftp_rqfpr[i] = rqfpr;
gfar_write_filer(priv, i, rqfcr, rqfpr);
}
}
#ifdef CONFIG_PPC
static void __gfar_detect_errata_83xx(struct gfar_private *priv)
{
unsigned int pvr = mfspr(SPRN_PVR);
unsigned int svr = mfspr(SPRN_SVR);
unsigned int mod = (svr >> 16) & 0xfff6; /* w/o E suffix */
unsigned int rev = svr & 0xffff;
/* MPC8313 Rev 2.0 and higher; All MPC837x */
if ((pvr == 0x80850010 && mod == 0x80b0 && rev >= 0x0020) ||
(pvr == 0x80861010 && (mod & 0xfff9) == 0x80c0))
priv->errata |= GFAR_ERRATA_74;
/* MPC8313 and MPC837x all rev */
if ((pvr == 0x80850010 && mod == 0x80b0) ||
(pvr == 0x80861010 && (mod & 0xfff9) == 0x80c0))
priv->errata |= GFAR_ERRATA_76;
/* MPC8313 Rev < 2.0 */
if (pvr == 0x80850010 && mod == 0x80b0 && rev < 0x0020)
priv->errata |= GFAR_ERRATA_12;
}
static void __gfar_detect_errata_85xx(struct gfar_private *priv)
{
unsigned int svr = mfspr(SPRN_SVR);
if ((SVR_SOC_VER(svr) == SVR_8548) && (SVR_REV(svr) == 0x20))
priv->errata |= GFAR_ERRATA_12;
/* P2020/P1010 Rev 1; MPC8548 Rev 2 */
if (((SVR_SOC_VER(svr) == SVR_P2020) && (SVR_REV(svr) < 0x20)) ||
((SVR_SOC_VER(svr) == SVR_P2010) && (SVR_REV(svr) < 0x20)) ||
((SVR_SOC_VER(svr) == SVR_8548) && (SVR_REV(svr) < 0x31)))
priv->errata |= GFAR_ERRATA_76; /* aka eTSEC 20 */
}
#endif
static void gfar_detect_errata(struct gfar_private *priv)
{
struct device *dev = &priv->ofdev->dev;
/* no plans to fix */
priv->errata |= GFAR_ERRATA_A002;
#ifdef CONFIG_PPC
if (pvr_version_is(PVR_VER_E500V1) || pvr_version_is(PVR_VER_E500V2))
__gfar_detect_errata_85xx(priv);
else /* non-mpc85xx parts, i.e. e300 core based */
__gfar_detect_errata_83xx(priv);
#endif
if (priv->errata)
dev_info(dev, "enabled errata workarounds, flags: 0x%x\n",
priv->errata);
}
static void gfar_init_addr_hash_table(struct gfar_private *priv)
{
struct gfar __iomem *regs = priv->gfargrp[0].regs;
if (priv->device_flags & FSL_GIANFAR_DEV_HAS_EXTENDED_HASH) {
priv->extended_hash = 1;
priv->hash_width = 9;
priv->hash_regs[0] = &regs->igaddr0;
priv->hash_regs[1] = &regs->igaddr1;
priv->hash_regs[2] = &regs->igaddr2;
priv->hash_regs[3] = &regs->igaddr3;
priv->hash_regs[4] = &regs->igaddr4;
priv->hash_regs[5] = &regs->igaddr5;
priv->hash_regs[6] = &regs->igaddr6;
priv->hash_regs[7] = &regs->igaddr7;
priv->hash_regs[8] = &regs->gaddr0;
priv->hash_regs[9] = &regs->gaddr1;
priv->hash_regs[10] = &regs->gaddr2;
priv->hash_regs[11] = &regs->gaddr3;
priv->hash_regs[12] = &regs->gaddr4;
priv->hash_regs[13] = &regs->gaddr5;
priv->hash_regs[14] = &regs->gaddr6;
priv->hash_regs[15] = &regs->gaddr7;
} else {
priv->extended_hash = 0;
priv->hash_width = 8;
priv->hash_regs[0] = &regs->gaddr0;
priv->hash_regs[1] = &regs->gaddr1;
priv->hash_regs[2] = &regs->gaddr2;
priv->hash_regs[3] = &regs->gaddr3;
priv->hash_regs[4] = &regs->gaddr4;
priv->hash_regs[5] = &regs->gaddr5;
priv->hash_regs[6] = &regs->gaddr6;
priv->hash_regs[7] = &regs->gaddr7;
}
}
static int __gfar_is_rx_idle(struct gfar_private *priv)
{
u32 res;
/* Normaly TSEC should not hang on GRS commands, so we should
* actually wait for IEVENT_GRSC flag.
*/
if (!gfar_has_errata(priv, GFAR_ERRATA_A002))
return 0;
/* Read the eTSEC register at offset 0xD1C. If bits 7-14 are
* the same as bits 23-30, the eTSEC Rx is assumed to be idle
* and the Rx can be safely reset.
*/
res = gfar_read((void __iomem *)priv->gfargrp[0].regs + 0xd1c);
res &= 0x7f807f80;
if ((res & 0xffff) == (res >> 16))
return 1;
return 0;
}
/* Halt the receive and transmit queues */
static void gfar_halt_nodisable(struct gfar_private *priv)
{
struct gfar __iomem *regs = priv->gfargrp[0].regs;
u32 tempval;
unsigned int timeout;
int stopped;
gfar_ints_disable(priv);
if (gfar_is_dma_stopped(priv))
return;
/* Stop the DMA, and wait for it to stop */
tempval = gfar_read(&regs->dmactrl);
tempval |= (DMACTRL_GRS | DMACTRL_GTS);
gfar_write(&regs->dmactrl, tempval);
retry:
timeout = 1000;
while (!(stopped = gfar_is_dma_stopped(priv)) && timeout) {
cpu_relax();
timeout--;
gianfar: Separate out the Tx interrupt handling (Tx NAPI) There are some concurrency issues on devices w/ 2 CPUs related to the handling of Rx and Tx interrupts. eTSEC has separate interrupt lines for Rx and Tx but a single imask register to mask these interrupts and a single NAPI instance to handle both Rx and Tx work. As a result, the Rx and Tx ISRs are identical, both are invoking gfar_schedule_cleanup(), however both handlers can be entered at the same time when the Rx and Tx interrupts are taken by different CPUs. In this case spurrious interrupts (SPU) show up (in /proc/interrupts) indicating a concurrency issue. Also, Tx overruns followed by Tx timeout have been observed under heavy Tx traffic load. To address these issues, the schedule cleanup ISR part has been changed to handle the Rx and Tx interrupts independently. The patch adds a separate NAPI poll routine for Tx cleanup to be triggerred independently by the Tx confirmation interrupts only. Existing poll functions are modified to handle only the Rx path processing. The Tx poll routine does not need a budget, since Tx processing doesn't consume NAPI budget, and hence it is registered with minimum NAPI weight. NAPI scheduling does not require locking since there are different NAPI instances between the Rx and Tx confirmation paths now. So, the patch fixes the occurence of spurrious Rx/Tx interrupts. Tx overruns also occur less frequently now. Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-07 20:42:45 +08:00
}
if (!timeout)
stopped = gfar_is_dma_stopped(priv);
if (!stopped && !gfar_is_rx_dma_stopped(priv) &&
!__gfar_is_rx_idle(priv))
goto retry;
}
/* Halt the receive and transmit queues */
static void gfar_halt(struct gfar_private *priv)
{
struct gfar __iomem *regs = priv->gfargrp[0].regs;
u32 tempval;
/* Dissable the Rx/Tx hw queues */
gfar_write(&regs->rqueue, 0);
gfar_write(&regs->tqueue, 0);
mdelay(10);
gfar_halt_nodisable(priv);
/* Disable Rx/Tx DMA */
tempval = gfar_read(&regs->maccfg1);
tempval &= ~(MACCFG1_RX_EN | MACCFG1_TX_EN);
gfar_write(&regs->maccfg1, tempval);
}
static void free_skb_tx_queue(struct gfar_priv_tx_q *tx_queue)
{
struct txbd8 *txbdp;
struct gfar_private *priv = netdev_priv(tx_queue->dev);
int i, j;
txbdp = tx_queue->tx_bd_base;
for (i = 0; i < tx_queue->tx_ring_size; i++) {
if (!tx_queue->tx_skbuff[i])
continue;
gianfar: Fix device reset races (oops) for Tx The device reset procedure, stop_gfar()/startup_gfar(), has concurrency issues. "Kernel access of bad area" oopses show up during Tx timeout device reset or other reset cases (like changing MTU) that happen while the interface still has traffic. The oopses happen in start_xmit and clean_tx_ring when accessing tx_queue-> tx_skbuff which is NULL. The race comes from de-allocating the tx_skbuff while transmission and napi processing are still active. Though the Tx queues get temoprarily stopped when Tx timeout occurs, they get re-enabled as a result of Tx congestion handling inside the napi context (see clean_tx_ring()). Not disabling the napi during reset is also a bug, because clean_tx_ring() will try to access tx_skbuff while it is being de-alloc'ed and re-alloc'ed. To fix this, stop_gfar() needs to disable napi processing after stopping the Tx queues. However, in order to prevent clean_tx_ring() to re-enable the Tx queue before the napi gets disabled, the device state DOWN has been introduced. It prevents the Tx congestion management from re-enabling the de-congested Tx queue while the device is brought down. An additional locking state, RESETTING, has been introduced to prevent simultaneous resets or to prevent configuring the device while it is resetting. The bogus 'rxlock's (for each Rx queue) have been removed since their purpose is not justified, as they don't prevent nor are suited to prevent device reset/reconfig races (such as this one). Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-02-24 18:13:45 +08:00
dma_unmap_single(priv->dev, be32_to_cpu(txbdp->bufPtr),
be16_to_cpu(txbdp->length), DMA_TO_DEVICE);
txbdp->lstatus = 0;
for (j = 0; j < skb_shinfo(tx_queue->tx_skbuff[i])->nr_frags;
j++) {
txbdp++;
dma_unmap_page(priv->dev, be32_to_cpu(txbdp->bufPtr),
be16_to_cpu(txbdp->length),
DMA_TO_DEVICE);
}
txbdp++;
dev_kfree_skb_any(tx_queue->tx_skbuff[i]);
tx_queue->tx_skbuff[i] = NULL;
}
kfree(tx_queue->tx_skbuff);
tx_queue->tx_skbuff = NULL;
}
static void free_skb_rx_queue(struct gfar_priv_rx_q *rx_queue)
{
int i;
struct rxbd8 *rxbdp = rx_queue->rx_bd_base;
dev_kfree_skb(rx_queue->skb);
for (i = 0; i < rx_queue->rx_ring_size; i++) {
struct gfar_rx_buff *rxb = &rx_queue->rx_buff[i];
rxbdp->lstatus = 0;
rxbdp->bufPtr = 0;
rxbdp++;
if (!rxb->page)
continue;
dma_unmap_page(rx_queue->dev, rxb->dma,
PAGE_SIZE, DMA_FROM_DEVICE);
__free_page(rxb->page);
rxb->page = NULL;
}
kfree(rx_queue->rx_buff);
rx_queue->rx_buff = NULL;
}
/* If there are any tx skbs or rx skbs still around, free them.
* Then free tx_skbuff and rx_skbuff
*/
static void free_skb_resources(struct gfar_private *priv)
{
struct gfar_priv_tx_q *tx_queue = NULL;
struct gfar_priv_rx_q *rx_queue = NULL;
int i;
/* Go through all the buffer descriptors and free their data buffers */
for (i = 0; i < priv->num_tx_queues; i++) {
struct netdev_queue *txq;
tx_queue = priv->tx_queue[i];
txq = netdev_get_tx_queue(tx_queue->dev, tx_queue->qindex);
if (tx_queue->tx_skbuff)
free_skb_tx_queue(tx_queue);
netdev_tx_reset_queue(txq);
}
for (i = 0; i < priv->num_rx_queues; i++) {
rx_queue = priv->rx_queue[i];
if (rx_queue->rx_buff)
free_skb_rx_queue(rx_queue);
}
dma_free_coherent(priv->dev,
sizeof(struct txbd8) * priv->total_tx_ring_size +
sizeof(struct rxbd8) * priv->total_rx_ring_size,
priv->tx_queue[0]->tx_bd_base,
priv->tx_queue[0]->tx_bd_dma_base);
}
void stop_gfar(struct net_device *dev)
{
struct gfar_private *priv = netdev_priv(dev);
netif_tx_stop_all_queues(dev);
smp_mb__before_atomic();
set_bit(GFAR_DOWN, &priv->state);
smp_mb__after_atomic();
disable_napi(priv);
/* disable ints and gracefully shut down Rx/Tx DMA */
gfar_halt(priv);
phy_stop(dev->phydev);
free_skb_resources(priv);
}
static void gfar_start(struct gfar_private *priv)
{
struct gfar __iomem *regs = priv->gfargrp[0].regs;
u32 tempval;
int i = 0;
/* Enable Rx/Tx hw queues */
gfar_write(&regs->rqueue, priv->rqueue);
gfar_write(&regs->tqueue, priv->tqueue);
/* Initialize DMACTRL to have WWR and WOP */
tempval = gfar_read(&regs->dmactrl);
tempval |= DMACTRL_INIT_SETTINGS;
gfar_write(&regs->dmactrl, tempval);
/* Make sure we aren't stopped */
tempval = gfar_read(&regs->dmactrl);
tempval &= ~(DMACTRL_GRS | DMACTRL_GTS);
gfar_write(&regs->dmactrl, tempval);
for (i = 0; i < priv->num_grps; i++) {
regs = priv->gfargrp[i].regs;
/* Clear THLT/RHLT, so that the DMA starts polling now */
gfar_write(&regs->tstat, priv->gfargrp[i].tstat);
gfar_write(&regs->rstat, priv->gfargrp[i].rstat);
}
/* Enable Rx/Tx DMA */
tempval = gfar_read(&regs->maccfg1);
tempval |= (MACCFG1_RX_EN | MACCFG1_TX_EN);
gfar_write(&regs->maccfg1, tempval);
gfar_ints_enable(priv);
netif_trans_update(priv->ndev); /* prevent tx timeout */
}
static bool gfar_new_page(struct gfar_priv_rx_q *rxq, struct gfar_rx_buff *rxb)
{
struct page *page;
dma_addr_t addr;
page = dev_alloc_page();
if (unlikely(!page))
return false;
addr = dma_map_page(rxq->dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE);
if (unlikely(dma_mapping_error(rxq->dev, addr))) {
__free_page(page);
return false;
}
rxb->dma = addr;
rxb->page = page;
rxb->page_offset = 0;
return true;
}
static void gfar_rx_alloc_err(struct gfar_priv_rx_q *rx_queue)
{
struct gfar_private *priv = netdev_priv(rx_queue->ndev);
struct gfar_extra_stats *estats = &priv->extra_stats;
netdev_err(rx_queue->ndev, "Can't alloc RX buffers\n");
atomic64_inc(&estats->rx_alloc_err);
}
static void gfar_alloc_rx_buffs(struct gfar_priv_rx_q *rx_queue,
int alloc_cnt)
{
struct rxbd8 *bdp;
struct gfar_rx_buff *rxb;
int i;
i = rx_queue->next_to_use;
bdp = &rx_queue->rx_bd_base[i];
rxb = &rx_queue->rx_buff[i];
while (alloc_cnt--) {
/* try reuse page */
if (unlikely(!rxb->page)) {
if (unlikely(!gfar_new_page(rx_queue, rxb))) {
gfar_rx_alloc_err(rx_queue);
break;
}
}
/* Setup the new RxBD */
gfar_init_rxbdp(rx_queue, bdp,
rxb->dma + rxb->page_offset + RXBUF_ALIGNMENT);
/* Update to the next pointer */
bdp++;
rxb++;
if (unlikely(++i == rx_queue->rx_ring_size)) {
i = 0;
bdp = rx_queue->rx_bd_base;
rxb = rx_queue->rx_buff;
}
}
rx_queue->next_to_use = i;
rx_queue->next_to_alloc = i;
}
static void gfar_init_bds(struct net_device *ndev)
{
struct gfar_private *priv = netdev_priv(ndev);
struct gfar __iomem *regs = priv->gfargrp[0].regs;
struct gfar_priv_tx_q *tx_queue = NULL;
struct gfar_priv_rx_q *rx_queue = NULL;
struct txbd8 *txbdp;
u32 __iomem *rfbptr;
int i, j;
for (i = 0; i < priv->num_tx_queues; i++) {
tx_queue = priv->tx_queue[i];
/* Initialize some variables in our dev structure */
tx_queue->num_txbdfree = tx_queue->tx_ring_size;
tx_queue->dirty_tx = tx_queue->tx_bd_base;
tx_queue->cur_tx = tx_queue->tx_bd_base;
tx_queue->skb_curtx = 0;
tx_queue->skb_dirtytx = 0;
/* Initialize Transmit Descriptor Ring */
txbdp = tx_queue->tx_bd_base;
for (j = 0; j < tx_queue->tx_ring_size; j++) {
txbdp->lstatus = 0;
txbdp->bufPtr = 0;
txbdp++;
}
/* Set the last descriptor in the ring to indicate wrap */
txbdp--;
txbdp->status = cpu_to_be16(be16_to_cpu(txbdp->status) |
TXBD_WRAP);
}
rfbptr = &regs->rfbptr0;
for (i = 0; i < priv->num_rx_queues; i++) {
rx_queue = priv->rx_queue[i];
rx_queue->next_to_clean = 0;
rx_queue->next_to_use = 0;
rx_queue->next_to_alloc = 0;
/* make sure next_to_clean != next_to_use after this
* by leaving at least 1 unused descriptor
*/
gfar_alloc_rx_buffs(rx_queue, gfar_rxbd_unused(rx_queue));
rx_queue->rfbptr = rfbptr;
rfbptr += 2;
}
}
static int gfar_alloc_skb_resources(struct net_device *ndev)
{
void *vaddr;
dma_addr_t addr;
int i, j;
struct gfar_private *priv = netdev_priv(ndev);
struct device *dev = priv->dev;
struct gfar_priv_tx_q *tx_queue = NULL;
struct gfar_priv_rx_q *rx_queue = NULL;
priv->total_tx_ring_size = 0;
for (i = 0; i < priv->num_tx_queues; i++)
priv->total_tx_ring_size += priv->tx_queue[i]->tx_ring_size;
priv->total_rx_ring_size = 0;
for (i = 0; i < priv->num_rx_queues; i++)
priv->total_rx_ring_size += priv->rx_queue[i]->rx_ring_size;
/* Allocate memory for the buffer descriptors */
vaddr = dma_alloc_coherent(dev,
(priv->total_tx_ring_size *
sizeof(struct txbd8)) +
(priv->total_rx_ring_size *
sizeof(struct rxbd8)),
&addr, GFP_KERNEL);
if (!vaddr)
return -ENOMEM;
for (i = 0; i < priv->num_tx_queues; i++) {
tx_queue = priv->tx_queue[i];
tx_queue->tx_bd_base = vaddr;
tx_queue->tx_bd_dma_base = addr;
tx_queue->dev = ndev;
/* enet DMA only understands physical addresses */
addr += sizeof(struct txbd8) * tx_queue->tx_ring_size;
vaddr += sizeof(struct txbd8) * tx_queue->tx_ring_size;
}
/* Start the rx descriptor ring where the tx ring leaves off */
for (i = 0; i < priv->num_rx_queues; i++) {
rx_queue = priv->rx_queue[i];
rx_queue->rx_bd_base = vaddr;
rx_queue->rx_bd_dma_base = addr;
rx_queue->ndev = ndev;
rx_queue->dev = dev;
addr += sizeof(struct rxbd8) * rx_queue->rx_ring_size;
vaddr += sizeof(struct rxbd8) * rx_queue->rx_ring_size;
}
/* Setup the skbuff rings */
for (i = 0; i < priv->num_tx_queues; i++) {
tx_queue = priv->tx_queue[i];
tx_queue->tx_skbuff =
kmalloc_array(tx_queue->tx_ring_size,
sizeof(*tx_queue->tx_skbuff),
GFP_KERNEL);
if (!tx_queue->tx_skbuff)
goto cleanup;
for (j = 0; j < tx_queue->tx_ring_size; j++)
tx_queue->tx_skbuff[j] = NULL;
}
for (i = 0; i < priv->num_rx_queues; i++) {
rx_queue = priv->rx_queue[i];
rx_queue->rx_buff = kcalloc(rx_queue->rx_ring_size,
sizeof(*rx_queue->rx_buff),
GFP_KERNEL);
if (!rx_queue->rx_buff)
goto cleanup;
}
gfar_init_bds(ndev);
return 0;
cleanup:
free_skb_resources(priv);
return -ENOMEM;
}
/* Bring the controller up and running */
int startup_gfar(struct net_device *ndev)
{
struct gfar_private *priv = netdev_priv(ndev);
int err;
gfar_mac_reset(priv);
err = gfar_alloc_skb_resources(ndev);
if (err)
return err;
gfar_init_tx_rx_base(priv);
smp_mb__before_atomic();
clear_bit(GFAR_DOWN, &priv->state);
smp_mb__after_atomic();
/* Start Rx/Tx DMA and enable the interrupts */
gfar_start(priv);
/* force link state update after mac reset */
priv->oldlink = 0;
priv->oldspeed = 0;
priv->oldduplex = -1;
phy_start(ndev->phydev);
enable_napi(priv);
netif_tx_wake_all_queues(ndev);
return 0;
}
static u32 gfar_get_flowctrl_cfg(struct gfar_private *priv)
{
struct net_device *ndev = priv->ndev;
struct phy_device *phydev = ndev->phydev;
u32 val = 0;
if (!phydev->duplex)
return val;
gianfar: Fix race in TBI/SerDes configuration The init_phy() function attaches to the PHY, then configures the SerDes<->TBI link (in SGMII mode). The TBI is on the MDIO bus with the PHY (sort of) and is accessed via the gianfar's MDIO registers, using the functions gfar_local_mdio_read/write(), which don't do any locking. The previously attached PHY will start a work-queue on a timer, and probably an irq handler as well, which will talk to the PHY and thus use the MDIO bus. This uses phy_read/write(), which have locking, but not against the gfar_local_mdio versions. The result is that PHY code will try to use the MDIO bus at the same time as the SerDes setup code, corrupting the transfers. Setting up the SerDes before attaching to the PHY will insure that there is no race between the SerDes code and *our* PHY, but doesn't fix everything. Typically the PHYs for all gianfar devices are on the same MDIO bus, which is associated with the first gianfar device. This means that the first gianfar's SerDes code could corrupt the MDIO transfers for a different gianfar's PHY. The lock used by phy_read/write() is contained in the mii_bus structure, which is pointed to by the PHY. This is difficult to access from the gianfar drivers, as there is no link between a gianfar device and the mii_bus which shares the same MDIO registers. As far as the device layer and drivers are concerned they are two unrelated devices (which happen to share registers). Generally all gianfar devices' PHYs will be on the bus associated with the first gianfar. But this might not be the case, so simply locking the gianfar's PHY's mii bus might not lock the mii bus that the SerDes setup code is going to use. We solve this by having the code that creates the gianfar platform device look in the device tree for an mdio device that shares the gianfar's registers. If one is found the ID of its platform device is saved in the gianfar's platform data. A new function in the gianfar mii code, gfar_get_miibus(), can use the bus ID to search through the platform devices for a gianfar_mdio device with the right ID. The platform device's driver data is the mii_bus structure, which the SerDes setup code can use to lock the current bus. Signed-off-by: Trent Piepho <tpiepho@freescale.com> CC: Andy Fleming <afleming@freescale.com> Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
2008-10-31 09:17:06 +08:00
if (!priv->pause_aneg_en) {
if (priv->tx_pause_en)
val |= MACCFG1_TX_FLOW;
if (priv->rx_pause_en)
val |= MACCFG1_RX_FLOW;
} else {
u16 lcl_adv, rmt_adv;
u8 flowctrl;
/* get link partner capabilities */
rmt_adv = 0;
if (phydev->pause)
rmt_adv = LPA_PAUSE_CAP;
if (phydev->asym_pause)
rmt_adv |= LPA_PAUSE_ASYM;
lcl_adv = linkmode_adv_to_lcl_adv_t(phydev->advertising);
flowctrl = mii_resolve_flowctrl_fdx(lcl_adv, rmt_adv);
if (flowctrl & FLOW_CTRL_TX)
val |= MACCFG1_TX_FLOW;
if (flowctrl & FLOW_CTRL_RX)
val |= MACCFG1_RX_FLOW;
}
return val;
}
static noinline void gfar_update_link_state(struct gfar_private *priv)
{
struct gfar __iomem *regs = priv->gfargrp[0].regs;
struct net_device *ndev = priv->ndev;
struct phy_device *phydev = ndev->phydev;
struct gfar_priv_rx_q *rx_queue = NULL;
int i;
if (unlikely(test_bit(GFAR_RESETTING, &priv->state)))
return;
if (phydev->link) {
u32 tempval1 = gfar_read(&regs->maccfg1);
u32 tempval = gfar_read(&regs->maccfg2);
u32 ecntrl = gfar_read(&regs->ecntrl);
u32 tx_flow_oldval = (tempval1 & MACCFG1_TX_FLOW);
if (phydev->duplex != priv->oldduplex) {
if (!(phydev->duplex))
tempval &= ~(MACCFG2_FULL_DUPLEX);
else
tempval |= MACCFG2_FULL_DUPLEX;
priv->oldduplex = phydev->duplex;
}
if (phydev->speed != priv->oldspeed) {
switch (phydev->speed) {
case 1000:
tempval =
((tempval & ~(MACCFG2_IF)) | MACCFG2_GMII);
ecntrl &= ~(ECNTRL_R100);
break;
case 100:
case 10:
tempval =
((tempval & ~(MACCFG2_IF)) | MACCFG2_MII);
/* Reduced mode distinguishes
* between 10 and 100
*/
if (phydev->speed == SPEED_100)
ecntrl |= ECNTRL_R100;
else
ecntrl &= ~(ECNTRL_R100);
break;
default:
netif_warn(priv, link, priv->ndev,
"Ack! Speed (%d) is not 10/100/1000!\n",
phydev->speed);
break;
}
priv->oldspeed = phydev->speed;
}
tempval1 &= ~(MACCFG1_TX_FLOW | MACCFG1_RX_FLOW);
tempval1 |= gfar_get_flowctrl_cfg(priv);
/* Turn last free buffer recording on */
if ((tempval1 & MACCFG1_TX_FLOW) && !tx_flow_oldval) {
for (i = 0; i < priv->num_rx_queues; i++) {
u32 bdp_dma;
rx_queue = priv->rx_queue[i];
bdp_dma = gfar_rxbd_dma_lastfree(rx_queue);
gfar_write(rx_queue->rfbptr, bdp_dma);
}
priv->tx_actual_en = 1;
}
if (unlikely(!(tempval1 & MACCFG1_TX_FLOW) && tx_flow_oldval))
priv->tx_actual_en = 0;
gfar_write(&regs->maccfg1, tempval1);
gfar_write(&regs->maccfg2, tempval);
gfar_write(&regs->ecntrl, ecntrl);
if (!priv->oldlink)
priv->oldlink = 1;
} else if (priv->oldlink) {
priv->oldlink = 0;
priv->oldspeed = 0;
priv->oldduplex = -1;
}
if (netif_msg_link(priv))
phy_print_status(phydev);
}
/* Called every time the controller might need to be made
* aware of new link state. The PHY code conveys this
* information through variables in the phydev structure, and this
* function converts those variables into the appropriate
* register values, and can bring down the device if needed.
*/
static void adjust_link(struct net_device *dev)
{
struct gfar_private *priv = netdev_priv(dev);
struct phy_device *phydev = dev->phydev;
if (unlikely(phydev->link != priv->oldlink ||
(phydev->link && (phydev->duplex != priv->oldduplex ||
phydev->speed != priv->oldspeed))))
gfar_update_link_state(priv);
}
/* Initialize TBI PHY interface for communicating with the
* SERDES lynx PHY on the chip. We communicate with this PHY
* through the MDIO bus on each controller, treating it as a
* "normal" PHY at the address found in the TBIPA register. We assume
* that the TBIPA register is valid. Either the MDIO bus code will set
* it to a value that doesn't conflict with other PHYs on the bus, or the
* value doesn't matter, as there are no other PHYs on the bus.
*/
static void gfar_configure_serdes(struct net_device *dev)
{
struct gfar_private *priv = netdev_priv(dev);
struct phy_device *tbiphy;
if (!priv->tbi_node) {
dev_warn(&dev->dev, "error: SGMII mode requires that the "
"device tree specify a tbi-handle\n");
return;
}
tbiphy = of_phy_find_device(priv->tbi_node);
if (!tbiphy) {
dev_err(&dev->dev, "error: Could not get TBI device\n");
return;
}
/* If the link is already up, we must already be ok, and don't need to
* configure and reset the TBI<->SerDes link. Maybe U-Boot configured
* everything for us? Resetting it takes the link down and requires
* several seconds for it to come back.
*/
if (phy_read(tbiphy, MII_BMSR) & BMSR_LSTATUS) {
put_device(&tbiphy->mdio.dev);
return;
}
/* Single clk mode, mii mode off(for serdes communication) */
phy_write(tbiphy, MII_TBICON, TBICON_CLK_SELECT);
phy_write(tbiphy, MII_ADVERTISE,
ADVERTISE_1000XFULL | ADVERTISE_1000XPAUSE |
ADVERTISE_1000XPSE_ASYM);
phy_write(tbiphy, MII_BMCR,
BMCR_ANENABLE | BMCR_ANRESTART | BMCR_FULLDPLX |
BMCR_SPEED1000);
put_device(&tbiphy->mdio.dev);
}
/* Initializes driver's PHY state, and attaches to the PHY.
* Returns 0 on success.
*/
static int init_phy(struct net_device *dev)
{
__ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
struct gfar_private *priv = netdev_priv(dev);
phy_interface_t interface = priv->interface;
struct phy_device *phydev;
struct ethtool_eee edata;
linkmode_set_bit_array(phy_10_100_features_array,
ARRAY_SIZE(phy_10_100_features_array),
mask);
linkmode_set_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, mask);
linkmode_set_bit(ETHTOOL_LINK_MODE_MII_BIT, mask);
if (priv->device_flags & FSL_GIANFAR_DEV_HAS_GIGABIT)
linkmode_set_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, mask);
priv->oldlink = 0;
priv->oldspeed = 0;
priv->oldduplex = -1;
phydev = of_phy_connect(dev, priv->phy_node, &adjust_link, 0,
interface);
if (!phydev) {
dev_err(&dev->dev, "could not attach to PHY\n");
return -ENODEV;
}
if (interface == PHY_INTERFACE_MODE_SGMII)
gfar_configure_serdes(dev);
/* Remove any features not supported by the controller */
linkmode_and(phydev->supported, phydev->supported, mask);
linkmode_copy(phydev->advertising, phydev->supported);
/* Add support for flow control */
phy_support_asym_pause(phydev);
/* disable EEE autoneg, EEE not supported by eTSEC */
memset(&edata, 0, sizeof(struct ethtool_eee));
phy_ethtool_set_eee(phydev, &edata);
gianfar: Fix device reset races (oops) for Tx The device reset procedure, stop_gfar()/startup_gfar(), has concurrency issues. "Kernel access of bad area" oopses show up during Tx timeout device reset or other reset cases (like changing MTU) that happen while the interface still has traffic. The oopses happen in start_xmit and clean_tx_ring when accessing tx_queue-> tx_skbuff which is NULL. The race comes from de-allocating the tx_skbuff while transmission and napi processing are still active. Though the Tx queues get temoprarily stopped when Tx timeout occurs, they get re-enabled as a result of Tx congestion handling inside the napi context (see clean_tx_ring()). Not disabling the napi during reset is also a bug, because clean_tx_ring() will try to access tx_skbuff while it is being de-alloc'ed and re-alloc'ed. To fix this, stop_gfar() needs to disable napi processing after stopping the Tx queues. However, in order to prevent clean_tx_ring() to re-enable the Tx queue before the napi gets disabled, the device state DOWN has been introduced. It prevents the Tx congestion management from re-enabling the de-congested Tx queue while the device is brought down. An additional locking state, RESETTING, has been introduced to prevent simultaneous resets or to prevent configuring the device while it is resetting. The bogus 'rxlock's (for each Rx queue) have been removed since their purpose is not justified, as they don't prevent nor are suited to prevent device reset/reconfig races (such as this one). Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-02-24 18:13:45 +08:00
return 0;
}
static inline struct txfcb *gfar_add_fcb(struct sk_buff *skb)
{
struct txfcb *fcb = skb_push(skb, GMAC_FCB_LEN);
memset(fcb, 0, GMAC_FCB_LEN);
return fcb;
}
static inline void gfar_tx_checksum(struct sk_buff *skb, struct txfcb *fcb,
int fcb_length)
{
/* If we're here, it's a IP packet with a TCP or UDP
* payload. We set it to checksum, using a pseudo-header
* we provide
*/
u8 flags = TXFCB_DEFAULT;
/* Tell the controller what the protocol is
* And provide the already calculated phcs
*/
if (ip_hdr(skb)->protocol == IPPROTO_UDP) {
flags |= TXFCB_UDP;
fcb->phcs = (__force __be16)(udp_hdr(skb)->check);
} else
fcb->phcs = (__force __be16)(tcp_hdr(skb)->check);
/* l3os is the distance between the start of the
* frame (skb->data) and the start of the IP hdr.
* l4os is the distance between the start of the
* l3 hdr and the l4 hdr
*/
fcb->l3os = (u8)(skb_network_offset(skb) - fcb_length);
fcb->l4os = skb_network_header_len(skb);
fcb->flags = flags;
}
static inline void gfar_tx_vlan(struct sk_buff *skb, struct txfcb *fcb)
{
fcb->flags |= TXFCB_VLN;
fcb->vlctl = cpu_to_be16(skb_vlan_tag_get(skb));
}
static inline struct txbd8 *skip_txbd(struct txbd8 *bdp, int stride,
struct txbd8 *base, int ring_size)
{
struct txbd8 *new_bd = bdp + stride;
return (new_bd >= (base + ring_size)) ? (new_bd - ring_size) : new_bd;
}
static inline struct txbd8 *next_txbd(struct txbd8 *bdp, struct txbd8 *base,
int ring_size)
{
return skip_txbd(bdp, 1, base, ring_size);
}
/* eTSEC12: csum generation not supported for some fcb offsets */
static inline bool gfar_csum_errata_12(struct gfar_private *priv,
unsigned long fcb_addr)
{
return (gfar_has_errata(priv, GFAR_ERRATA_12) &&
(fcb_addr % 0x20) > 0x18);
}
/* eTSEC76: csum generation for frames larger than 2500 may
* cause excess delays before start of transmission
*/
static inline bool gfar_csum_errata_76(struct gfar_private *priv,
unsigned int len)
{
return (gfar_has_errata(priv, GFAR_ERRATA_76) &&
(len > 2500));
}
/* This is called by the kernel when a frame is ready for transmission.
* It is pointed to by the dev->hard_start_xmit function pointer
*/
static netdev_tx_t gfar_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct gfar_private *priv = netdev_priv(dev);
struct gfar_priv_tx_q *tx_queue = NULL;
struct netdev_queue *txq;
struct gfar __iomem *regs = NULL;
struct txfcb *fcb = NULL;
struct txbd8 *txbdp, *txbdp_start, *base, *txbdp_tstamp = NULL;
u32 lstatus;
skb_frag_t *frag;
int i, rq = 0;
int do_tstamp, do_csum, do_vlan;
u32 bufaddr;
unsigned int nr_frags, nr_txbds, bytes_sent, fcb_len = 0;
rq = skb->queue_mapping;
tx_queue = priv->tx_queue[rq];
txq = netdev_get_tx_queue(dev, rq);
base = tx_queue->tx_bd_base;
regs = tx_queue->grp->regs;
do_csum = (CHECKSUM_PARTIAL == skb->ip_summed);
do_vlan = skb_vlan_tag_present(skb);
do_tstamp = (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) &&
priv->hwts_tx_en;
if (do_csum || do_vlan)
fcb_len = GMAC_FCB_LEN;
/* check if time stamp should be generated */
if (unlikely(do_tstamp))
fcb_len = GMAC_FCB_LEN + GMAC_TXPAL_LEN;
/* make space for additional header when fcb is needed */
gianfar: Replace skb_realloc_headroom with skb_cow_head for PTP When PTP timestamping is enabled on Tx, the controller inserts the Tx timestamp at the beginning of the frame buffer, between SFD and the L2 frame header. This means that the skb provided by the stack is required to have enough headroom otherwise a new skb needs to be created by the driver to accommodate the timestamp inserted by h/w. Up until now the driver was relying on skb_realloc_headroom() to create new skbs to accommodate PTP frames. Turns out that this method is not reliable in this context at least, as skb_realloc_headroom() for PTP frames can cause random crashes, mostly in subsequent skb_*() calls, when multiple concurrent TCP streams are run at the same time with the PTP flow on the same device (as seen in James' report). I also noticed that when the system is loaded by sending multiple TCP streams, the driver receives cloned skbs in large numbers. skb_cow_head() instead proves to be stable in this scenario, and not only handles cloned skbs too but it's also more efficient and widely used in other drivers. The commit introducing skb_realloc_headroom in the driver goes back to 2009, commit 93c1285c5d92 ("gianfar: reallocate skb when headroom is not enough for fcb"). For practical purposes I'm referencing a newer commit (from 2012) that brings the code to its current structure (and fixes the PTP case). Fixes: 9c4886e5e63b ("gianfar: Fix invalid TX frames returned on error queue when time stamping") Reported-by: James Jurack <james.jurack@ametek.com> Suggested-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: Claudiu Manoil <claudiu.manoil@nxp.com> Link: https://lore.kernel.org/r/20201029081057.8506-1-claudiu.manoil@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-10-29 16:10:56 +08:00
if (fcb_len) {
if (unlikely(skb_cow_head(skb, fcb_len))) {
dev->stats.tx_errors++;
dev_kfree_skb_any(skb);
return NETDEV_TX_OK;
}
}
/* total number of fragments in the SKB */
nr_frags = skb_shinfo(skb)->nr_frags;
/* calculate the required number of TxBDs for this skb */
if (unlikely(do_tstamp))
nr_txbds = nr_frags + 2;
else
nr_txbds = nr_frags + 1;
/* check if there is space to queue this packet */
if (nr_txbds > tx_queue->num_txbdfree) {
/* no space, stop the queue */
netif_tx_stop_queue(txq);
dev->stats.tx_fifo_errors++;
return NETDEV_TX_BUSY;
}
/* Update transmit stats */
bytes_sent = skb->len;
tx_queue->stats.tx_bytes += bytes_sent;
/* keep Tx bytes on wire for BQL accounting */
GFAR_CB(skb)->bytes_sent = bytes_sent;
tx_queue->stats.tx_packets++;
txbdp = txbdp_start = tx_queue->cur_tx;
lstatus = be32_to_cpu(txbdp->lstatus);
/* Add TxPAL between FCB and frame if required */
if (unlikely(do_tstamp)) {
skb_push(skb, GMAC_TXPAL_LEN);
memset(skb->data, 0, GMAC_TXPAL_LEN);
}
/* Add TxFCB if required */
if (fcb_len) {
fcb = gfar_add_fcb(skb);
lstatus |= BD_LFLAG(TXBD_TOE);
}
/* Set up checksumming */
if (do_csum) {
gfar_tx_checksum(skb, fcb, fcb_len);
if (unlikely(gfar_csum_errata_12(priv, (unsigned long)fcb)) ||
unlikely(gfar_csum_errata_76(priv, skb->len))) {
__skb_pull(skb, GMAC_FCB_LEN);
skb_checksum_help(skb);
if (do_vlan || do_tstamp) {
/* put back a new fcb for vlan/tstamp TOE */
fcb = gfar_add_fcb(skb);
} else {
/* Tx TOE not used */
lstatus &= ~(BD_LFLAG(TXBD_TOE));
fcb = NULL;
}
}
}
if (do_vlan)
gfar_tx_vlan(skb, fcb);
gianfar: Fix dma check map error when DMA_API_DEBUG is enabled We need to use dma_mapping_error() to check the dma address returned by dma_map_single/page(). Otherwise we would get warning like this: WARNING: at lib/dma-debug.c:1140 Modules linked in: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 3.18.0-rc2-next-20141029 #196 task: c0834300 ti: effe6000 task.ti: c0874000 NIP: c02b2c98 LR: c02b2c98 CTR: c030abc4 REGS: effe7d70 TRAP: 0700 Not tainted (3.18.0-rc2-next-20141029) MSR: 00021000 <CE,ME> CR: 22044022 XER: 20000000 GPR00: c02b2c98 effe7e20 c0834300 00000098 00021000 00000000 c030b898 00000003 GPR08: 00000001 00000000 00000001 749eec9d 22044022 1001abe0 00000020 ef278678 GPR16: ef278670 ef278668 ef278660 070a8040 c087f99c c08cdc60 00029000 c0840d44 GPR24: c08be6e8 c0840000 effe7e78 ef041340 00000600 ef114e10 00000000 c08be6e0 NIP [c02b2c98] check_unmap+0x51c/0x9e4 LR [c02b2c98] check_unmap+0x51c/0x9e4 Call Trace: [effe7e20] [c02b2c98] check_unmap+0x51c/0x9e4 (unreliable) [effe7e70] [c02b31d8] debug_dma_unmap_page+0x78/0x8c [effe7ed0] [c03d1640] gfar_clean_rx_ring+0x208/0x488 [effe7f40] [c03d1a9c] gfar_poll_rx_sq+0x3c/0xa8 [effe7f60] [c04f8714] net_rx_action+0xc0/0x178 [effe7f90] [c00435a0] __do_softirq+0x100/0x1fc [effe7fe0] [c0043958] irq_exit+0xa4/0xc8 [effe7ff0] [c000d14c] call_do_irq+0x24/0x3c [c0875e90] [c00048a0] do_IRQ+0x8c/0xf8 [c0875eb0] [c000ed10] ret_from_except+0x0/0x18 For TX, we need to unmap the pages which has already been mapped and free the skb before return. For RX, move the dma mapping and error check to gfar_new_skb(). We would reuse the original skb in the rx ring when either allocating skb failure or dma mapping error. Signed-off-by: Kevin Hao <haokexin@gmail.com> Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Reviewed-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-12-11 14:08:41 +08:00
bufaddr = dma_map_single(priv->dev, skb->data, skb_headlen(skb),
DMA_TO_DEVICE);
if (unlikely(dma_mapping_error(priv->dev, bufaddr)))
goto dma_map_err;
txbdp_start->bufPtr = cpu_to_be32(bufaddr);
/* Time stamp insertion requires one additional TxBD */
if (unlikely(do_tstamp))
txbdp_tstamp = txbdp = next_txbd(txbdp, base,
tx_queue->tx_ring_size);
if (likely(!nr_frags)) {
if (likely(!do_tstamp))
lstatus |= BD_LFLAG(TXBD_LAST | TXBD_INTERRUPT);
} else {
u32 lstatus_start = lstatus;
/* Place the fragment addresses and lengths into the TxBDs */
frag = &skb_shinfo(skb)->frags[0];
for (i = 0; i < nr_frags; i++, frag++) {
unsigned int size;
/* Point at the next BD, wrapping as needed */
txbdp = next_txbd(txbdp, base, tx_queue->tx_ring_size);
size = skb_frag_size(frag);
lstatus = be32_to_cpu(txbdp->lstatus) | size |
BD_LFLAG(TXBD_READY);
/* Handle the last BD specially */
if (i == nr_frags - 1)
lstatus |= BD_LFLAG(TXBD_LAST | TXBD_INTERRUPT);
bufaddr = skb_frag_dma_map(priv->dev, frag, 0,
size, DMA_TO_DEVICE);
gianfar: Fix dma check map error when DMA_API_DEBUG is enabled We need to use dma_mapping_error() to check the dma address returned by dma_map_single/page(). Otherwise we would get warning like this: WARNING: at lib/dma-debug.c:1140 Modules linked in: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 3.18.0-rc2-next-20141029 #196 task: c0834300 ti: effe6000 task.ti: c0874000 NIP: c02b2c98 LR: c02b2c98 CTR: c030abc4 REGS: effe7d70 TRAP: 0700 Not tainted (3.18.0-rc2-next-20141029) MSR: 00021000 <CE,ME> CR: 22044022 XER: 20000000 GPR00: c02b2c98 effe7e20 c0834300 00000098 00021000 00000000 c030b898 00000003 GPR08: 00000001 00000000 00000001 749eec9d 22044022 1001abe0 00000020 ef278678 GPR16: ef278670 ef278668 ef278660 070a8040 c087f99c c08cdc60 00029000 c0840d44 GPR24: c08be6e8 c0840000 effe7e78 ef041340 00000600 ef114e10 00000000 c08be6e0 NIP [c02b2c98] check_unmap+0x51c/0x9e4 LR [c02b2c98] check_unmap+0x51c/0x9e4 Call Trace: [effe7e20] [c02b2c98] check_unmap+0x51c/0x9e4 (unreliable) [effe7e70] [c02b31d8] debug_dma_unmap_page+0x78/0x8c [effe7ed0] [c03d1640] gfar_clean_rx_ring+0x208/0x488 [effe7f40] [c03d1a9c] gfar_poll_rx_sq+0x3c/0xa8 [effe7f60] [c04f8714] net_rx_action+0xc0/0x178 [effe7f90] [c00435a0] __do_softirq+0x100/0x1fc [effe7fe0] [c0043958] irq_exit+0xa4/0xc8 [effe7ff0] [c000d14c] call_do_irq+0x24/0x3c [c0875e90] [c00048a0] do_IRQ+0x8c/0xf8 [c0875eb0] [c000ed10] ret_from_except+0x0/0x18 For TX, we need to unmap the pages which has already been mapped and free the skb before return. For RX, move the dma mapping and error check to gfar_new_skb(). We would reuse the original skb in the rx ring when either allocating skb failure or dma mapping error. Signed-off-by: Kevin Hao <haokexin@gmail.com> Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Reviewed-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-12-11 14:08:41 +08:00
if (unlikely(dma_mapping_error(priv->dev, bufaddr)))
goto dma_map_err;
/* set the TxBD length and buffer pointer */
txbdp->bufPtr = cpu_to_be32(bufaddr);
txbdp->lstatus = cpu_to_be32(lstatus);
}
lstatus = lstatus_start;
}
/* If time stamping is requested one additional TxBD must be set up. The
* first TxBD points to the FCB and must have a data length of
* GMAC_FCB_LEN. The second TxBD points to the actual frame data with
* the full frame length.
*/
if (unlikely(do_tstamp)) {
u32 lstatus_ts = be32_to_cpu(txbdp_tstamp->lstatus);
bufaddr = be32_to_cpu(txbdp_start->bufPtr);
bufaddr += fcb_len;
lstatus_ts |= BD_LFLAG(TXBD_READY) |
(skb_headlen(skb) - fcb_len);
if (!nr_frags)
lstatus_ts |= BD_LFLAG(TXBD_LAST | TXBD_INTERRUPT);
txbdp_tstamp->bufPtr = cpu_to_be32(bufaddr);
txbdp_tstamp->lstatus = cpu_to_be32(lstatus_ts);
lstatus |= BD_LFLAG(TXBD_CRC | TXBD_READY) | GMAC_FCB_LEN;
/* Setup tx hardware time stamping */
skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS;
fcb->ptp = 1;
} else {
lstatus |= BD_LFLAG(TXBD_CRC | TXBD_READY) | skb_headlen(skb);
}
netdev_tx_sent_queue(txq, bytes_sent);
gfar_wmb();
txbdp_start->lstatus = cpu_to_be32(lstatus);
gfar_wmb(); /* force lstatus write before tx_skbuff */
gianfar: Fix TX ring processing on SMP machines Starting with commit a3bc1f11e9b867a4f49505 ("gianfar: Revive SKB recycling") gianfar driver sooner or later stops transmitting any packets on SMP machines. start_xmit() prepares new skb for transmitting, generally it does three things: 1. sets up all BDs (marks them ready to send), except the first one. 2. stores skb into tx_queue->tx_skbuff so that clean_tx_ring() would cleanup it later. 3. sets up the first BD, i.e. marks it ready. Here is what clean_tx_ring() does: 1. reads skbs from tx_queue->tx_skbuff 2. checks if the *last* BD is ready. If it's still ready [to send] then it it isn't transmitted, so clean_tx_ring() returns. Otherwise it actually cleanups BDs. All is OK. Now, if there is just one BD, code flow: - start_xmit(): stores skb into tx_skbuff. Note that the first BD (which is also the last one) isn't marked as ready, yet. - clean_tx_ring(): sees that skb is not null, *and* its lstatus says that it is NOT ready (like if BD was sent), so it cleans it up (bad!) - start_xmit(): marks BD as ready [to send], but it's too late. We can fix this simply by reordering lstatus/tx_skbuff writes. Reported-by: Martyn Welch <martyn.welch@ge.com> Bisected-by: Paul Gortmaker <paul.gortmaker@windriver.com> Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com> Tested-by: Paul Gortmaker <paul.gortmaker@windriver.com> Tested-by: Martyn Welch <martyn.welch@ge.com> Cc: Sandeep Gopalpet <Sandeep.Kumar@freescale.com> Cc: Stable <stable@vger.kernel.org> [2.6.33] Signed-off-by: David S. Miller <davem@davemloft.net>
2010-03-03 16:18:58 +08:00
tx_queue->tx_skbuff[tx_queue->skb_curtx] = skb;
/* Update the current skb pointer to the next entry we will use
* (wrapping if necessary)
*/
tx_queue->skb_curtx = (tx_queue->skb_curtx + 1) &
TX_RING_MOD_MASK(tx_queue->tx_ring_size);
tx_queue->cur_tx = next_txbd(txbdp, base, tx_queue->tx_ring_size);
/* We can work in parallel with gfar_clean_tx_ring(), except
* when modifying num_txbdfree. Note that we didn't grab the lock
* when we were reading the num_txbdfree and checking for available
* space, that's because outside of this function it can only grow.
*/
spin_lock_bh(&tx_queue->txlock);
/* reduce TxBD free count */
tx_queue->num_txbdfree -= (nr_txbds);
spin_unlock_bh(&tx_queue->txlock);
/* If the next BD still needs to be cleaned up, then the bds
* are full. We need to tell the kernel to stop sending us stuff.
*/
if (!tx_queue->num_txbdfree) {
netif_tx_stop_queue(txq);
dev->stats.tx_fifo_errors++;
}
/* Tell the DMA to go go go */
gfar_write(&regs->tstat, TSTAT_CLEAR_THALT >> tx_queue->qindex);
return NETDEV_TX_OK;
gianfar: Fix dma check map error when DMA_API_DEBUG is enabled We need to use dma_mapping_error() to check the dma address returned by dma_map_single/page(). Otherwise we would get warning like this: WARNING: at lib/dma-debug.c:1140 Modules linked in: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 3.18.0-rc2-next-20141029 #196 task: c0834300 ti: effe6000 task.ti: c0874000 NIP: c02b2c98 LR: c02b2c98 CTR: c030abc4 REGS: effe7d70 TRAP: 0700 Not tainted (3.18.0-rc2-next-20141029) MSR: 00021000 <CE,ME> CR: 22044022 XER: 20000000 GPR00: c02b2c98 effe7e20 c0834300 00000098 00021000 00000000 c030b898 00000003 GPR08: 00000001 00000000 00000001 749eec9d 22044022 1001abe0 00000020 ef278678 GPR16: ef278670 ef278668 ef278660 070a8040 c087f99c c08cdc60 00029000 c0840d44 GPR24: c08be6e8 c0840000 effe7e78 ef041340 00000600 ef114e10 00000000 c08be6e0 NIP [c02b2c98] check_unmap+0x51c/0x9e4 LR [c02b2c98] check_unmap+0x51c/0x9e4 Call Trace: [effe7e20] [c02b2c98] check_unmap+0x51c/0x9e4 (unreliable) [effe7e70] [c02b31d8] debug_dma_unmap_page+0x78/0x8c [effe7ed0] [c03d1640] gfar_clean_rx_ring+0x208/0x488 [effe7f40] [c03d1a9c] gfar_poll_rx_sq+0x3c/0xa8 [effe7f60] [c04f8714] net_rx_action+0xc0/0x178 [effe7f90] [c00435a0] __do_softirq+0x100/0x1fc [effe7fe0] [c0043958] irq_exit+0xa4/0xc8 [effe7ff0] [c000d14c] call_do_irq+0x24/0x3c [c0875e90] [c00048a0] do_IRQ+0x8c/0xf8 [c0875eb0] [c000ed10] ret_from_except+0x0/0x18 For TX, we need to unmap the pages which has already been mapped and free the skb before return. For RX, move the dma mapping and error check to gfar_new_skb(). We would reuse the original skb in the rx ring when either allocating skb failure or dma mapping error. Signed-off-by: Kevin Hao <haokexin@gmail.com> Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Reviewed-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-12-11 14:08:41 +08:00
dma_map_err:
txbdp = next_txbd(txbdp_start, base, tx_queue->tx_ring_size);
if (do_tstamp)
txbdp = next_txbd(txbdp, base, tx_queue->tx_ring_size);
for (i = 0; i < nr_frags; i++) {
lstatus = be32_to_cpu(txbdp->lstatus);
gianfar: Fix dma check map error when DMA_API_DEBUG is enabled We need to use dma_mapping_error() to check the dma address returned by dma_map_single/page(). Otherwise we would get warning like this: WARNING: at lib/dma-debug.c:1140 Modules linked in: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 3.18.0-rc2-next-20141029 #196 task: c0834300 ti: effe6000 task.ti: c0874000 NIP: c02b2c98 LR: c02b2c98 CTR: c030abc4 REGS: effe7d70 TRAP: 0700 Not tainted (3.18.0-rc2-next-20141029) MSR: 00021000 <CE,ME> CR: 22044022 XER: 20000000 GPR00: c02b2c98 effe7e20 c0834300 00000098 00021000 00000000 c030b898 00000003 GPR08: 00000001 00000000 00000001 749eec9d 22044022 1001abe0 00000020 ef278678 GPR16: ef278670 ef278668 ef278660 070a8040 c087f99c c08cdc60 00029000 c0840d44 GPR24: c08be6e8 c0840000 effe7e78 ef041340 00000600 ef114e10 00000000 c08be6e0 NIP [c02b2c98] check_unmap+0x51c/0x9e4 LR [c02b2c98] check_unmap+0x51c/0x9e4 Call Trace: [effe7e20] [c02b2c98] check_unmap+0x51c/0x9e4 (unreliable) [effe7e70] [c02b31d8] debug_dma_unmap_page+0x78/0x8c [effe7ed0] [c03d1640] gfar_clean_rx_ring+0x208/0x488 [effe7f40] [c03d1a9c] gfar_poll_rx_sq+0x3c/0xa8 [effe7f60] [c04f8714] net_rx_action+0xc0/0x178 [effe7f90] [c00435a0] __do_softirq+0x100/0x1fc [effe7fe0] [c0043958] irq_exit+0xa4/0xc8 [effe7ff0] [c000d14c] call_do_irq+0x24/0x3c [c0875e90] [c00048a0] do_IRQ+0x8c/0xf8 [c0875eb0] [c000ed10] ret_from_except+0x0/0x18 For TX, we need to unmap the pages which has already been mapped and free the skb before return. For RX, move the dma mapping and error check to gfar_new_skb(). We would reuse the original skb in the rx ring when either allocating skb failure or dma mapping error. Signed-off-by: Kevin Hao <haokexin@gmail.com> Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Reviewed-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-12-11 14:08:41 +08:00
if (!(lstatus & BD_LFLAG(TXBD_READY)))
break;
lstatus &= ~BD_LFLAG(TXBD_READY);
txbdp->lstatus = cpu_to_be32(lstatus);
bufaddr = be32_to_cpu(txbdp->bufPtr);
dma_unmap_page(priv->dev, bufaddr, be16_to_cpu(txbdp->length),
gianfar: Fix dma check map error when DMA_API_DEBUG is enabled We need to use dma_mapping_error() to check the dma address returned by dma_map_single/page(). Otherwise we would get warning like this: WARNING: at lib/dma-debug.c:1140 Modules linked in: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 3.18.0-rc2-next-20141029 #196 task: c0834300 ti: effe6000 task.ti: c0874000 NIP: c02b2c98 LR: c02b2c98 CTR: c030abc4 REGS: effe7d70 TRAP: 0700 Not tainted (3.18.0-rc2-next-20141029) MSR: 00021000 <CE,ME> CR: 22044022 XER: 20000000 GPR00: c02b2c98 effe7e20 c0834300 00000098 00021000 00000000 c030b898 00000003 GPR08: 00000001 00000000 00000001 749eec9d 22044022 1001abe0 00000020 ef278678 GPR16: ef278670 ef278668 ef278660 070a8040 c087f99c c08cdc60 00029000 c0840d44 GPR24: c08be6e8 c0840000 effe7e78 ef041340 00000600 ef114e10 00000000 c08be6e0 NIP [c02b2c98] check_unmap+0x51c/0x9e4 LR [c02b2c98] check_unmap+0x51c/0x9e4 Call Trace: [effe7e20] [c02b2c98] check_unmap+0x51c/0x9e4 (unreliable) [effe7e70] [c02b31d8] debug_dma_unmap_page+0x78/0x8c [effe7ed0] [c03d1640] gfar_clean_rx_ring+0x208/0x488 [effe7f40] [c03d1a9c] gfar_poll_rx_sq+0x3c/0xa8 [effe7f60] [c04f8714] net_rx_action+0xc0/0x178 [effe7f90] [c00435a0] __do_softirq+0x100/0x1fc [effe7fe0] [c0043958] irq_exit+0xa4/0xc8 [effe7ff0] [c000d14c] call_do_irq+0x24/0x3c [c0875e90] [c00048a0] do_IRQ+0x8c/0xf8 [c0875eb0] [c000ed10] ret_from_except+0x0/0x18 For TX, we need to unmap the pages which has already been mapped and free the skb before return. For RX, move the dma mapping and error check to gfar_new_skb(). We would reuse the original skb in the rx ring when either allocating skb failure or dma mapping error. Signed-off-by: Kevin Hao <haokexin@gmail.com> Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Reviewed-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-12-11 14:08:41 +08:00
DMA_TO_DEVICE);
txbdp = next_txbd(txbdp, base, tx_queue->tx_ring_size);
}
gfar_wmb();
dev_kfree_skb_any(skb);
return NETDEV_TX_OK;
}
/* Changes the mac address if the controller is not running. */
static int gfar_set_mac_address(struct net_device *dev)
{
gfar_set_mac_for_addr(dev, 0, dev->dev_addr);
return 0;
}
static int gfar_change_mtu(struct net_device *dev, int new_mtu)
{
struct gfar_private *priv = netdev_priv(dev);
gianfar: Fix device reset races (oops) for Tx The device reset procedure, stop_gfar()/startup_gfar(), has concurrency issues. "Kernel access of bad area" oopses show up during Tx timeout device reset or other reset cases (like changing MTU) that happen while the interface still has traffic. The oopses happen in start_xmit and clean_tx_ring when accessing tx_queue-> tx_skbuff which is NULL. The race comes from de-allocating the tx_skbuff while transmission and napi processing are still active. Though the Tx queues get temoprarily stopped when Tx timeout occurs, they get re-enabled as a result of Tx congestion handling inside the napi context (see clean_tx_ring()). Not disabling the napi during reset is also a bug, because clean_tx_ring() will try to access tx_skbuff while it is being de-alloc'ed and re-alloc'ed. To fix this, stop_gfar() needs to disable napi processing after stopping the Tx queues. However, in order to prevent clean_tx_ring() to re-enable the Tx queue before the napi gets disabled, the device state DOWN has been introduced. It prevents the Tx congestion management from re-enabling the de-congested Tx queue while the device is brought down. An additional locking state, RESETTING, has been introduced to prevent simultaneous resets or to prevent configuring the device while it is resetting. The bogus 'rxlock's (for each Rx queue) have been removed since their purpose is not justified, as they don't prevent nor are suited to prevent device reset/reconfig races (such as this one). Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-02-24 18:13:45 +08:00
while (test_and_set_bit_lock(GFAR_RESETTING, &priv->state))
cpu_relax();
gianfar: Fix on-the-fly vlan and mtu updates The RCTRL and TCTRL registers should not be changed on-the-fly, while the controller is running, otherwise unexpected behaviour occurs. But that's exactly what gfar_vlan_mode() does, updating the VLAN acceleration bits inside RCTRL/TCTRL. The attempt to lock these operations doesn't help, but only adds to the confusion. There's also a dependency for Rx FCB insertion (activating /de-activating the TOE offload block on Rx) which might change the required rx buffer size. This makes matters worse as gfar_vlan_mode() ends up calling gfar_change_mtu(), though the MTU size remains the same. Note that there are other situations that may affect the required rx buffer size, like changing RXCSUM or rx hw timestamping, but errorneously the rx buffer size is not recomputed/ updated in the process. To fix this, do the vlan updates properly inside the MAC reset and reconfiguration procedure, which takes care of the rx buffer size dependecy and the rx TOE block (PRSDEP) activation/deactivation as well (in the correct order). As a consequence, MTU/ rx buff size updates are done now by the same MAC reset and reconfig procedure, so that out of context updates to MAXFRM, MRBLR, and MACCFG inside change_mtu() are no longer needed. The rx buffer size dependecy to Rx FCB is now handled for the other cases too (RXCSUM and rx hw timestamping). Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-02-24 18:13:43 +08:00
if (dev->flags & IFF_UP)
stop_gfar(dev);
dev->mtu = new_mtu;
gianfar: Fix on-the-fly vlan and mtu updates The RCTRL and TCTRL registers should not be changed on-the-fly, while the controller is running, otherwise unexpected behaviour occurs. But that's exactly what gfar_vlan_mode() does, updating the VLAN acceleration bits inside RCTRL/TCTRL. The attempt to lock these operations doesn't help, but only adds to the confusion. There's also a dependency for Rx FCB insertion (activating /de-activating the TOE offload block on Rx) which might change the required rx buffer size. This makes matters worse as gfar_vlan_mode() ends up calling gfar_change_mtu(), though the MTU size remains the same. Note that there are other situations that may affect the required rx buffer size, like changing RXCSUM or rx hw timestamping, but errorneously the rx buffer size is not recomputed/ updated in the process. To fix this, do the vlan updates properly inside the MAC reset and reconfiguration procedure, which takes care of the rx buffer size dependecy and the rx TOE block (PRSDEP) activation/deactivation as well (in the correct order). As a consequence, MTU/ rx buff size updates are done now by the same MAC reset and reconfig procedure, so that out of context updates to MAXFRM, MRBLR, and MACCFG inside change_mtu() are no longer needed. The rx buffer size dependecy to Rx FCB is now handled for the other cases too (RXCSUM and rx hw timestamping). Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-02-24 18:13:43 +08:00
if (dev->flags & IFF_UP)
startup_gfar(dev);
gianfar: Fix device reset races (oops) for Tx The device reset procedure, stop_gfar()/startup_gfar(), has concurrency issues. "Kernel access of bad area" oopses show up during Tx timeout device reset or other reset cases (like changing MTU) that happen while the interface still has traffic. The oopses happen in start_xmit and clean_tx_ring when accessing tx_queue-> tx_skbuff which is NULL. The race comes from de-allocating the tx_skbuff while transmission and napi processing are still active. Though the Tx queues get temoprarily stopped when Tx timeout occurs, they get re-enabled as a result of Tx congestion handling inside the napi context (see clean_tx_ring()). Not disabling the napi during reset is also a bug, because clean_tx_ring() will try to access tx_skbuff while it is being de-alloc'ed and re-alloc'ed. To fix this, stop_gfar() needs to disable napi processing after stopping the Tx queues. However, in order to prevent clean_tx_ring() to re-enable the Tx queue before the napi gets disabled, the device state DOWN has been introduced. It prevents the Tx congestion management from re-enabling the de-congested Tx queue while the device is brought down. An additional locking state, RESETTING, has been introduced to prevent simultaneous resets or to prevent configuring the device while it is resetting. The bogus 'rxlock's (for each Rx queue) have been removed since their purpose is not justified, as they don't prevent nor are suited to prevent device reset/reconfig races (such as this one). Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-02-24 18:13:45 +08:00
clear_bit_unlock(GFAR_RESETTING, &priv->state);
return 0;
}
static void reset_gfar(struct net_device *ndev)
gianfar: Fix device reset races (oops) for Tx The device reset procedure, stop_gfar()/startup_gfar(), has concurrency issues. "Kernel access of bad area" oopses show up during Tx timeout device reset or other reset cases (like changing MTU) that happen while the interface still has traffic. The oopses happen in start_xmit and clean_tx_ring when accessing tx_queue-> tx_skbuff which is NULL. The race comes from de-allocating the tx_skbuff while transmission and napi processing are still active. Though the Tx queues get temoprarily stopped when Tx timeout occurs, they get re-enabled as a result of Tx congestion handling inside the napi context (see clean_tx_ring()). Not disabling the napi during reset is also a bug, because clean_tx_ring() will try to access tx_skbuff while it is being de-alloc'ed and re-alloc'ed. To fix this, stop_gfar() needs to disable napi processing after stopping the Tx queues. However, in order to prevent clean_tx_ring() to re-enable the Tx queue before the napi gets disabled, the device state DOWN has been introduced. It prevents the Tx congestion management from re-enabling the de-congested Tx queue while the device is brought down. An additional locking state, RESETTING, has been introduced to prevent simultaneous resets or to prevent configuring the device while it is resetting. The bogus 'rxlock's (for each Rx queue) have been removed since their purpose is not justified, as they don't prevent nor are suited to prevent device reset/reconfig races (such as this one). Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-02-24 18:13:45 +08:00
{
struct gfar_private *priv = netdev_priv(ndev);
while (test_and_set_bit_lock(GFAR_RESETTING, &priv->state))
cpu_relax();
stop_gfar(ndev);
startup_gfar(ndev);
clear_bit_unlock(GFAR_RESETTING, &priv->state);
}
/* gfar_reset_task gets scheduled when a packet has not been
* transmitted after a set amount of time.
* For now, assume that clearing out all the structures, and
* starting over will fix the problem.
*/
static void gfar_reset_task(struct work_struct *work)
{
struct gfar_private *priv = container_of(work, struct gfar_private,
reset_task);
gianfar: Fix device reset races (oops) for Tx The device reset procedure, stop_gfar()/startup_gfar(), has concurrency issues. "Kernel access of bad area" oopses show up during Tx timeout device reset or other reset cases (like changing MTU) that happen while the interface still has traffic. The oopses happen in start_xmit and clean_tx_ring when accessing tx_queue-> tx_skbuff which is NULL. The race comes from de-allocating the tx_skbuff while transmission and napi processing are still active. Though the Tx queues get temoprarily stopped when Tx timeout occurs, they get re-enabled as a result of Tx congestion handling inside the napi context (see clean_tx_ring()). Not disabling the napi during reset is also a bug, because clean_tx_ring() will try to access tx_skbuff while it is being de-alloc'ed and re-alloc'ed. To fix this, stop_gfar() needs to disable napi processing after stopping the Tx queues. However, in order to prevent clean_tx_ring() to re-enable the Tx queue before the napi gets disabled, the device state DOWN has been introduced. It prevents the Tx congestion management from re-enabling the de-congested Tx queue while the device is brought down. An additional locking state, RESETTING, has been introduced to prevent simultaneous resets or to prevent configuring the device while it is resetting. The bogus 'rxlock's (for each Rx queue) have been removed since their purpose is not justified, as they don't prevent nor are suited to prevent device reset/reconfig races (such as this one). Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-02-24 18:13:45 +08:00
reset_gfar(priv->ndev);
}
netdev: pass the stuck queue to the timeout handler This allows incrementing the correct timeout statistic without any mess. Down the road, devices can learn to reset just the specific queue. The patch was generated with the following script: use strict; use warnings; our $^I = '.bak'; my @work = ( ["arch/m68k/emu/nfeth.c", "nfeth_tx_timeout"], ["arch/um/drivers/net_kern.c", "uml_net_tx_timeout"], ["arch/um/drivers/vector_kern.c", "vector_net_tx_timeout"], ["arch/xtensa/platforms/iss/network.c", "iss_net_tx_timeout"], ["drivers/char/pcmcia/synclink_cs.c", "hdlcdev_tx_timeout"], ["drivers/infiniband/ulp/ipoib/ipoib_main.c", "ipoib_timeout"], ["drivers/infiniband/ulp/ipoib/ipoib_main.c", "ipoib_timeout"], ["drivers/message/fusion/mptlan.c", "mpt_lan_tx_timeout"], ["drivers/misc/sgi-xp/xpnet.c", "xpnet_dev_tx_timeout"], ["drivers/net/appletalk/cops.c", "cops_timeout"], ["drivers/net/arcnet/arcdevice.h", "arcnet_timeout"], ["drivers/net/arcnet/arcnet.c", "arcnet_timeout"], ["drivers/net/arcnet/com20020.c", "arcnet_timeout"], ["drivers/net/ethernet/3com/3c509.c", "el3_tx_timeout"], ["drivers/net/ethernet/3com/3c515.c", "corkscrew_timeout"], ["drivers/net/ethernet/3com/3c574_cs.c", "el3_tx_timeout"], ["drivers/net/ethernet/3com/3c589_cs.c", "el3_tx_timeout"], ["drivers/net/ethernet/3com/3c59x.c", "vortex_tx_timeout"], ["drivers/net/ethernet/3com/3c59x.c", "vortex_tx_timeout"], ["drivers/net/ethernet/3com/typhoon.c", "typhoon_tx_timeout"], ["drivers/net/ethernet/8390/8390.h", "ei_tx_timeout"], ["drivers/net/ethernet/8390/8390.h", "eip_tx_timeout"], ["drivers/net/ethernet/8390/8390.c", "ei_tx_timeout"], ["drivers/net/ethernet/8390/8390p.c", "eip_tx_timeout"], ["drivers/net/ethernet/8390/ax88796.c", "ax_ei_tx_timeout"], ["drivers/net/ethernet/8390/axnet_cs.c", "axnet_tx_timeout"], ["drivers/net/ethernet/8390/etherh.c", "__ei_tx_timeout"], ["drivers/net/ethernet/8390/hydra.c", "__ei_tx_timeout"], ["drivers/net/ethernet/8390/mac8390.c", "__ei_tx_timeout"], ["drivers/net/ethernet/8390/mcf8390.c", "__ei_tx_timeout"], ["drivers/net/ethernet/8390/lib8390.c", "__ei_tx_timeout"], ["drivers/net/ethernet/8390/ne2k-pci.c", "ei_tx_timeout"], ["drivers/net/ethernet/8390/pcnet_cs.c", "ei_tx_timeout"], ["drivers/net/ethernet/8390/smc-ultra.c", "ei_tx_timeout"], ["drivers/net/ethernet/8390/wd.c", "ei_tx_timeout"], ["drivers/net/ethernet/8390/zorro8390.c", "__ei_tx_timeout"], ["drivers/net/ethernet/adaptec/starfire.c", "tx_timeout"], ["drivers/net/ethernet/agere/et131x.c", "et131x_tx_timeout"], ["drivers/net/ethernet/allwinner/sun4i-emac.c", "emac_timeout"], ["drivers/net/ethernet/alteon/acenic.c", "ace_watchdog"], ["drivers/net/ethernet/amazon/ena/ena_netdev.c", "ena_tx_timeout"], ["drivers/net/ethernet/amd/7990.h", "lance_tx_timeout"], ["drivers/net/ethernet/amd/7990.c", "lance_tx_timeout"], ["drivers/net/ethernet/amd/a2065.c", "lance_tx_timeout"], ["drivers/net/ethernet/amd/am79c961a.c", "am79c961_timeout"], ["drivers/net/ethernet/amd/amd8111e.c", "amd8111e_tx_timeout"], ["drivers/net/ethernet/amd/ariadne.c", "ariadne_tx_timeout"], ["drivers/net/ethernet/amd/atarilance.c", "lance_tx_timeout"], ["drivers/net/ethernet/amd/au1000_eth.c", "au1000_tx_timeout"], ["drivers/net/ethernet/amd/declance.c", "lance_tx_timeout"], ["drivers/net/ethernet/amd/lance.c", "lance_tx_timeout"], ["drivers/net/ethernet/amd/mvme147.c", "lance_tx_timeout"], ["drivers/net/ethernet/amd/ni65.c", "ni65_timeout"], ["drivers/net/ethernet/amd/nmclan_cs.c", "mace_tx_timeout"], ["drivers/net/ethernet/amd/pcnet32.c", "pcnet32_tx_timeout"], ["drivers/net/ethernet/amd/sunlance.c", "lance_tx_timeout"], ["drivers/net/ethernet/amd/xgbe/xgbe-drv.c", "xgbe_tx_timeout"], ["drivers/net/ethernet/apm/xgene-v2/main.c", "xge_timeout"], ["drivers/net/ethernet/apm/xgene/xgene_enet_main.c", "xgene_enet_timeout"], ["drivers/net/ethernet/apple/macmace.c", "mace_tx_timeout"], ["drivers/net/ethernet/atheros/ag71xx.c", "ag71xx_tx_timeout"], ["drivers/net/ethernet/atheros/alx/main.c", "alx_tx_timeout"], ["drivers/net/ethernet/atheros/atl1c/atl1c_main.c", "atl1c_tx_timeout"], ["drivers/net/ethernet/atheros/atl1e/atl1e_main.c", "atl1e_tx_timeout"], ["drivers/net/ethernet/atheros/atlx/atl.c", "atlx_tx_timeout"], ["drivers/net/ethernet/atheros/atlx/atl1.c", "atlx_tx_timeout"], ["drivers/net/ethernet/atheros/atlx/atl2.c", "atl2_tx_timeout"], ["drivers/net/ethernet/broadcom/b44.c", "b44_tx_timeout"], ["drivers/net/ethernet/broadcom/bcmsysport.c", "bcm_sysport_tx_timeout"], ["drivers/net/ethernet/broadcom/bnx2.c", "bnx2_tx_timeout"], ["drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h", "bnx2x_tx_timeout"], ["drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c", "bnx2x_tx_timeout"], ["drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c", "bnx2x_tx_timeout"], ["drivers/net/ethernet/broadcom/bnxt/bnxt.c", "bnxt_tx_timeout"], ["drivers/net/ethernet/broadcom/genet/bcmgenet.c", "bcmgenet_timeout"], ["drivers/net/ethernet/broadcom/sb1250-mac.c", "sbmac_tx_timeout"], ["drivers/net/ethernet/broadcom/tg3.c", "tg3_tx_timeout"], ["drivers/net/ethernet/calxeda/xgmac.c", "xgmac_tx_timeout"], ["drivers/net/ethernet/cavium/liquidio/lio_main.c", "liquidio_tx_timeout"], ["drivers/net/ethernet/cavium/liquidio/lio_vf_main.c", "liquidio_tx_timeout"], ["drivers/net/ethernet/cavium/liquidio/lio_vf_rep.c", "lio_vf_rep_tx_timeout"], ["drivers/net/ethernet/cavium/thunder/nicvf_main.c", "nicvf_tx_timeout"], ["drivers/net/ethernet/cirrus/cs89x0.c", "net_timeout"], ["drivers/net/ethernet/cisco/enic/enic_main.c", "enic_tx_timeout"], ["drivers/net/ethernet/cisco/enic/enic_main.c", "enic_tx_timeout"], ["drivers/net/ethernet/cortina/gemini.c", "gmac_tx_timeout"], ["drivers/net/ethernet/davicom/dm9000.c", "dm9000_timeout"], ["drivers/net/ethernet/dec/tulip/de2104x.c", "de_tx_timeout"], ["drivers/net/ethernet/dec/tulip/tulip_core.c", "tulip_tx_timeout"], ["drivers/net/ethernet/dec/tulip/winbond-840.c", "tx_timeout"], ["drivers/net/ethernet/dlink/dl2k.c", "rio_tx_timeout"], ["drivers/net/ethernet/dlink/sundance.c", "tx_timeout"], ["drivers/net/ethernet/emulex/benet/be_main.c", "be_tx_timeout"], ["drivers/net/ethernet/ethoc.c", "ethoc_tx_timeout"], ["drivers/net/ethernet/faraday/ftgmac100.c", "ftgmac100_tx_timeout"], ["drivers/net/ethernet/fealnx.c", "fealnx_tx_timeout"], ["drivers/net/ethernet/freescale/dpaa/dpaa_eth.c", "dpaa_tx_timeout"], ["drivers/net/ethernet/freescale/fec_main.c", "fec_timeout"], ["drivers/net/ethernet/freescale/fec_mpc52xx.c", "mpc52xx_fec_tx_timeout"], ["drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c", "fs_timeout"], ["drivers/net/ethernet/freescale/gianfar.c", "gfar_timeout"], ["drivers/net/ethernet/freescale/ucc_geth.c", "ucc_geth_timeout"], ["drivers/net/ethernet/fujitsu/fmvj18x_cs.c", "fjn_tx_timeout"], ["drivers/net/ethernet/google/gve/gve_main.c", "gve_tx_timeout"], ["drivers/net/ethernet/hisilicon/hip04_eth.c", "hip04_timeout"], ["drivers/net/ethernet/hisilicon/hix5hd2_gmac.c", "hix5hd2_net_timeout"], ["drivers/net/ethernet/hisilicon/hns/hns_enet.c", "hns_nic_net_timeout"], ["drivers/net/ethernet/hisilicon/hns3/hns3_enet.c", "hns3_nic_net_timeout"], ["drivers/net/ethernet/huawei/hinic/hinic_main.c", "hinic_tx_timeout"], ["drivers/net/ethernet/i825xx/82596.c", "i596_tx_timeout"], ["drivers/net/ethernet/i825xx/ether1.c", "ether1_timeout"], ["drivers/net/ethernet/i825xx/lib82596.c", "i596_tx_timeout"], ["drivers/net/ethernet/i825xx/sun3_82586.c", "sun3_82586_timeout"], ["drivers/net/ethernet/ibm/ehea/ehea_main.c", "ehea_tx_watchdog"], ["drivers/net/ethernet/ibm/emac/core.c", "emac_tx_timeout"], ["drivers/net/ethernet/ibm/emac/core.c", "emac_tx_timeout"], ["drivers/net/ethernet/ibm/ibmvnic.c", "ibmvnic_tx_timeout"], ["drivers/net/ethernet/intel/e100.c", "e100_tx_timeout"], ["drivers/net/ethernet/intel/e1000/e1000_main.c", "e1000_tx_timeout"], ["drivers/net/ethernet/intel/e1000e/netdev.c", "e1000_tx_timeout"], ["drivers/net/ethernet/intel/fm10k/fm10k_netdev.c", "fm10k_tx_timeout"], ["drivers/net/ethernet/intel/i40e/i40e_main.c", "i40e_tx_timeout"], ["drivers/net/ethernet/intel/iavf/iavf_main.c", "iavf_tx_timeout"], ["drivers/net/ethernet/intel/ice/ice_main.c", "ice_tx_timeout"], ["drivers/net/ethernet/intel/ice/ice_main.c", "ice_tx_timeout"], ["drivers/net/ethernet/intel/igb/igb_main.c", "igb_tx_timeout"], ["drivers/net/ethernet/intel/igbvf/netdev.c", "igbvf_tx_timeout"], ["drivers/net/ethernet/intel/ixgb/ixgb_main.c", "ixgb_tx_timeout"], ["drivers/net/ethernet/intel/ixgbe/ixgbe_debugfs.c", "adapter->netdev->netdev_ops->ndo_tx_timeout(adapter->netdev);"], ["drivers/net/ethernet/intel/ixgbe/ixgbe_main.c", "ixgbe_tx_timeout"], ["drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c", "ixgbevf_tx_timeout"], ["drivers/net/ethernet/jme.c", "jme_tx_timeout"], ["drivers/net/ethernet/korina.c", "korina_tx_timeout"], ["drivers/net/ethernet/lantiq_etop.c", "ltq_etop_tx_timeout"], ["drivers/net/ethernet/marvell/mv643xx_eth.c", "mv643xx_eth_tx_timeout"], ["drivers/net/ethernet/marvell/pxa168_eth.c", "pxa168_eth_tx_timeout"], ["drivers/net/ethernet/marvell/skge.c", "skge_tx_timeout"], ["drivers/net/ethernet/marvell/sky2.c", "sky2_tx_timeout"], ["drivers/net/ethernet/marvell/sky2.c", "sky2_tx_timeout"], ["drivers/net/ethernet/mediatek/mtk_eth_soc.c", "mtk_tx_timeout"], ["drivers/net/ethernet/mellanox/mlx4/en_netdev.c", "mlx4_en_tx_timeout"], ["drivers/net/ethernet/mellanox/mlx4/en_netdev.c", "mlx4_en_tx_timeout"], ["drivers/net/ethernet/mellanox/mlx5/core/en_main.c", "mlx5e_tx_timeout"], ["drivers/net/ethernet/micrel/ks8842.c", "ks8842_tx_timeout"], ["drivers/net/ethernet/micrel/ksz884x.c", "netdev_tx_timeout"], ["drivers/net/ethernet/microchip/enc28j60.c", "enc28j60_tx_timeout"], ["drivers/net/ethernet/microchip/encx24j600.c", "encx24j600_tx_timeout"], ["drivers/net/ethernet/natsemi/sonic.h", "sonic_tx_timeout"], ["drivers/net/ethernet/natsemi/sonic.c", "sonic_tx_timeout"], ["drivers/net/ethernet/natsemi/jazzsonic.c", "sonic_tx_timeout"], ["drivers/net/ethernet/natsemi/macsonic.c", "sonic_tx_timeout"], ["drivers/net/ethernet/natsemi/natsemi.c", "ns_tx_timeout"], ["drivers/net/ethernet/natsemi/ns83820.c", "ns83820_tx_timeout"], ["drivers/net/ethernet/natsemi/xtsonic.c", "sonic_tx_timeout"], ["drivers/net/ethernet/neterion/s2io.h", "s2io_tx_watchdog"], ["drivers/net/ethernet/neterion/s2io.c", "s2io_tx_watchdog"], ["drivers/net/ethernet/neterion/vxge/vxge-main.c", "vxge_tx_watchdog"], ["drivers/net/ethernet/netronome/nfp/nfp_net_common.c", "nfp_net_tx_timeout"], ["drivers/net/ethernet/nvidia/forcedeth.c", "nv_tx_timeout"], ["drivers/net/ethernet/nvidia/forcedeth.c", "nv_tx_timeout"], ["drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c", "pch_gbe_tx_timeout"], ["drivers/net/ethernet/packetengines/hamachi.c", "hamachi_tx_timeout"], ["drivers/net/ethernet/packetengines/yellowfin.c", "yellowfin_tx_timeout"], ["drivers/net/ethernet/pensando/ionic/ionic_lif.c", "ionic_tx_timeout"], ["drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c", "netxen_tx_timeout"], ["drivers/net/ethernet/qlogic/qla3xxx.c", "ql3xxx_tx_timeout"], ["drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c", "qlcnic_tx_timeout"], ["drivers/net/ethernet/qualcomm/emac/emac.c", "emac_tx_timeout"], ["drivers/net/ethernet/qualcomm/qca_spi.c", "qcaspi_netdev_tx_timeout"], ["drivers/net/ethernet/qualcomm/qca_uart.c", "qcauart_netdev_tx_timeout"], ["drivers/net/ethernet/rdc/r6040.c", "r6040_tx_timeout"], ["drivers/net/ethernet/realtek/8139cp.c", "cp_tx_timeout"], ["drivers/net/ethernet/realtek/8139too.c", "rtl8139_tx_timeout"], ["drivers/net/ethernet/realtek/atp.c", "tx_timeout"], ["drivers/net/ethernet/realtek/r8169_main.c", "rtl8169_tx_timeout"], ["drivers/net/ethernet/renesas/ravb_main.c", "ravb_tx_timeout"], ["drivers/net/ethernet/renesas/sh_eth.c", "sh_eth_tx_timeout"], ["drivers/net/ethernet/renesas/sh_eth.c", "sh_eth_tx_timeout"], ["drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c", "sxgbe_tx_timeout"], ["drivers/net/ethernet/seeq/ether3.c", "ether3_timeout"], ["drivers/net/ethernet/seeq/sgiseeq.c", "timeout"], ["drivers/net/ethernet/sfc/efx.c", "efx_watchdog"], ["drivers/net/ethernet/sfc/falcon/efx.c", "ef4_watchdog"], ["drivers/net/ethernet/sgi/ioc3-eth.c", "ioc3_timeout"], ["drivers/net/ethernet/sgi/meth.c", "meth_tx_timeout"], ["drivers/net/ethernet/silan/sc92031.c", "sc92031_tx_timeout"], ["drivers/net/ethernet/sis/sis190.c", "sis190_tx_timeout"], ["drivers/net/ethernet/sis/sis900.c", "sis900_tx_timeout"], ["drivers/net/ethernet/smsc/epic100.c", "epic_tx_timeout"], ["drivers/net/ethernet/smsc/smc911x.c", "smc911x_timeout"], ["drivers/net/ethernet/smsc/smc9194.c", "smc_timeout"], ["drivers/net/ethernet/smsc/smc91c92_cs.c", "smc_tx_timeout"], ["drivers/net/ethernet/smsc/smc91x.c", "smc_timeout"], ["drivers/net/ethernet/stmicro/stmmac/stmmac_main.c", "stmmac_tx_timeout"], ["drivers/net/ethernet/sun/cassini.c", "cas_tx_timeout"], ["drivers/net/ethernet/sun/ldmvsw.c", "sunvnet_tx_timeout_common"], ["drivers/net/ethernet/sun/niu.c", "niu_tx_timeout"], ["drivers/net/ethernet/sun/sunbmac.c", "bigmac_tx_timeout"], ["drivers/net/ethernet/sun/sungem.c", "gem_tx_timeout"], ["drivers/net/ethernet/sun/sunhme.c", "happy_meal_tx_timeout"], ["drivers/net/ethernet/sun/sunqe.c", "qe_tx_timeout"], ["drivers/net/ethernet/sun/sunvnet.c", "sunvnet_tx_timeout_common"], ["drivers/net/ethernet/sun/sunvnet_common.c", "sunvnet_tx_timeout_common"], ["drivers/net/ethernet/sun/sunvnet_common.h", "sunvnet_tx_timeout_common"], ["drivers/net/ethernet/synopsys/dwc-xlgmac-net.c", "xlgmac_tx_timeout"], ["drivers/net/ethernet/ti/cpmac.c", "cpmac_tx_timeout"], ["drivers/net/ethernet/ti/cpsw.c", "cpsw_ndo_tx_timeout"], ["drivers/net/ethernet/ti/cpsw_priv.c", "cpsw_ndo_tx_timeout"], ["drivers/net/ethernet/ti/cpsw_priv.h", "cpsw_ndo_tx_timeout"], ["drivers/net/ethernet/ti/davinci_emac.c", "emac_dev_tx_timeout"], ["drivers/net/ethernet/ti/netcp_core.c", "netcp_ndo_tx_timeout"], ["drivers/net/ethernet/ti/tlan.c", "tlan_tx_timeout"], ["drivers/net/ethernet/toshiba/ps3_gelic_net.h", "gelic_net_tx_timeout"], ["drivers/net/ethernet/toshiba/ps3_gelic_net.c", "gelic_net_tx_timeout"], ["drivers/net/ethernet/toshiba/ps3_gelic_wireless.c", "gelic_net_tx_timeout"], ["drivers/net/ethernet/toshiba/spider_net.c", "spider_net_tx_timeout"], ["drivers/net/ethernet/toshiba/tc35815.c", "tc35815_tx_timeout"], ["drivers/net/ethernet/via/via-rhine.c", "rhine_tx_timeout"], ["drivers/net/ethernet/wiznet/w5100.c", "w5100_tx_timeout"], ["drivers/net/ethernet/wiznet/w5300.c", "w5300_tx_timeout"], ["drivers/net/ethernet/xilinx/xilinx_emaclite.c", "xemaclite_tx_timeout"], ["drivers/net/ethernet/xircom/xirc2ps_cs.c", "xirc_tx_timeout"], ["drivers/net/fjes/fjes_main.c", "fjes_tx_retry"], ["drivers/net/slip/slip.c", "sl_tx_timeout"], ["include/linux/usb/usbnet.h", "usbnet_tx_timeout"], ["drivers/net/usb/aqc111.c", "usbnet_tx_timeout"], ["drivers/net/usb/asix_devices.c", "usbnet_tx_timeout"], ["drivers/net/usb/asix_devices.c", "usbnet_tx_timeout"], ["drivers/net/usb/asix_devices.c", "usbnet_tx_timeout"], ["drivers/net/usb/ax88172a.c", "usbnet_tx_timeout"], ["drivers/net/usb/ax88179_178a.c", "usbnet_tx_timeout"], ["drivers/net/usb/catc.c", "catc_tx_timeout"], ["drivers/net/usb/cdc_mbim.c", "usbnet_tx_timeout"], ["drivers/net/usb/cdc_ncm.c", "usbnet_tx_timeout"], ["drivers/net/usb/dm9601.c", "usbnet_tx_timeout"], ["drivers/net/usb/hso.c", "hso_net_tx_timeout"], ["drivers/net/usb/int51x1.c", "usbnet_tx_timeout"], ["drivers/net/usb/ipheth.c", "ipheth_tx_timeout"], ["drivers/net/usb/kaweth.c", "kaweth_tx_timeout"], ["drivers/net/usb/lan78xx.c", "lan78xx_tx_timeout"], ["drivers/net/usb/mcs7830.c", "usbnet_tx_timeout"], ["drivers/net/usb/pegasus.c", "pegasus_tx_timeout"], ["drivers/net/usb/qmi_wwan.c", "usbnet_tx_timeout"], ["drivers/net/usb/r8152.c", "rtl8152_tx_timeout"], ["drivers/net/usb/rndis_host.c", "usbnet_tx_timeout"], ["drivers/net/usb/rtl8150.c", "rtl8150_tx_timeout"], ["drivers/net/usb/sierra_net.c", "usbnet_tx_timeout"], ["drivers/net/usb/smsc75xx.c", "usbnet_tx_timeout"], ["drivers/net/usb/smsc95xx.c", "usbnet_tx_timeout"], ["drivers/net/usb/sr9700.c", "usbnet_tx_timeout"], ["drivers/net/usb/sr9800.c", "usbnet_tx_timeout"], ["drivers/net/usb/usbnet.c", "usbnet_tx_timeout"], ["drivers/net/vmxnet3/vmxnet3_drv.c", "vmxnet3_tx_timeout"], ["drivers/net/wan/cosa.c", "cosa_net_timeout"], ["drivers/net/wan/farsync.c", "fst_tx_timeout"], ["drivers/net/wan/fsl_ucc_hdlc.c", "uhdlc_tx_timeout"], ["drivers/net/wan/lmc/lmc_main.c", "lmc_driver_timeout"], ["drivers/net/wan/x25_asy.c", "x25_asy_timeout"], ["drivers/net/wimax/i2400m/netdev.c", "i2400m_tx_timeout"], ["drivers/net/wireless/intel/ipw2x00/ipw2100.c", "ipw2100_tx_timeout"], ["drivers/net/wireless/intersil/hostap/hostap_main.c", "prism2_tx_timeout"], ["drivers/net/wireless/intersil/hostap/hostap_main.c", "prism2_tx_timeout"], ["drivers/net/wireless/intersil/hostap/hostap_main.c", "prism2_tx_timeout"], ["drivers/net/wireless/intersil/orinoco/main.c", "orinoco_tx_timeout"], ["drivers/net/wireless/intersil/orinoco/orinoco_usb.c", "orinoco_tx_timeout"], ["drivers/net/wireless/intersil/orinoco/orinoco.h", "orinoco_tx_timeout"], ["drivers/net/wireless/intersil/prism54/islpci_dev.c", "islpci_eth_tx_timeout"], ["drivers/net/wireless/intersil/prism54/islpci_eth.c", "islpci_eth_tx_timeout"], ["drivers/net/wireless/intersil/prism54/islpci_eth.h", "islpci_eth_tx_timeout"], ["drivers/net/wireless/marvell/mwifiex/main.c", "mwifiex_tx_timeout"], ["drivers/net/wireless/quantenna/qtnfmac/core.c", "qtnf_netdev_tx_timeout"], ["drivers/net/wireless/quantenna/qtnfmac/core.h", "qtnf_netdev_tx_timeout"], ["drivers/net/wireless/rndis_wlan.c", "usbnet_tx_timeout"], ["drivers/net/wireless/wl3501_cs.c", "wl3501_tx_timeout"], ["drivers/net/wireless/zydas/zd1201.c", "zd1201_tx_timeout"], ["drivers/s390/net/qeth_core.h", "qeth_tx_timeout"], ["drivers/s390/net/qeth_core_main.c", "qeth_tx_timeout"], ["drivers/s390/net/qeth_l2_main.c", "qeth_tx_timeout"], ["drivers/s390/net/qeth_l2_main.c", "qeth_tx_timeout"], ["drivers/s390/net/qeth_l3_main.c", "qeth_tx_timeout"], ["drivers/s390/net/qeth_l3_main.c", "qeth_tx_timeout"], ["drivers/staging/ks7010/ks_wlan_net.c", "ks_wlan_tx_timeout"], ["drivers/staging/qlge/qlge_main.c", "qlge_tx_timeout"], ["drivers/staging/rtl8192e/rtl8192e/rtl_core.c", "_rtl92e_tx_timeout"], ["drivers/staging/rtl8192u/r8192U_core.c", "tx_timeout"], ["drivers/staging/unisys/visornic/visornic_main.c", "visornic_xmit_timeout"], ["drivers/staging/wlan-ng/p80211netdev.c", "p80211knetdev_tx_timeout"], ["drivers/tty/n_gsm.c", "gsm_mux_net_tx_timeout"], ["drivers/tty/synclink.c", "hdlcdev_tx_timeout"], ["drivers/tty/synclink_gt.c", "hdlcdev_tx_timeout"], ["drivers/tty/synclinkmp.c", "hdlcdev_tx_timeout"], ["net/atm/lec.c", "lec_tx_timeout"], ["net/bluetooth/bnep/netdev.c", "bnep_net_timeout"] ); for my $p (@work) { my @pair = @$p; my $file = $pair[0]; my $func = $pair[1]; print STDERR $file , ": ", $func,"\n"; our @ARGV = ($file); while (<ARGV>) { if (m/($func\s*\(struct\s+net_device\s+\*[A-Za-z_]?[A-Za-z-0-9_]*)(\))/) { print STDERR "found $1+$2 in $file\n"; } if (s/($func\s*\(struct\s+net_device\s+\*[A-Za-z_]?[A-Za-z-0-9_]*)(\))/$1, unsigned int txqueue$2/) { print STDERR "$func found in $file\n"; } print; } } where the list of files and functions is simply from: git grep ndo_tx_timeout, with manual addition of headers in the rare cases where the function is from a header, then manually changing the few places which actually call ndo_tx_timeout. Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Heiner Kallweit <hkallweit1@gmail.com> Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com> Acked-by: Shannon Nelson <snelson@pensando.io> Reviewed-by: Martin Habets <mhabets@solarflare.com> changes from v9: fixup a forward declaration changes from v9: more leftovers from v3 change changes from v8: fix up a missing direct call to timeout rebased on net-next changes from v7: fixup leftovers from v3 change changes from v6: fix typo in rtl driver changes from v5: add missing files (allow any net device argument name) changes from v4: add a missing driver header changes from v3: change queue # to unsigned Changes from v2: added headers Changes from v1: Fix errors found by kbuild: generalize the pattern a bit, to pick up a couple of instances missed by the previous version. Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-10 22:23:51 +08:00
static void gfar_timeout(struct net_device *dev, unsigned int txqueue)
{
struct gfar_private *priv = netdev_priv(dev);
dev->stats.tx_errors++;
schedule_work(&priv->reset_task);
}
static int gfar_hwtstamp_set(struct net_device *netdev, struct ifreq *ifr)
{
struct hwtstamp_config config;
struct gfar_private *priv = netdev_priv(netdev);
if (copy_from_user(&config, ifr->ifr_data, sizeof(config)))
return -EFAULT;
/* reserved for future extensions */
if (config.flags)
return -EINVAL;
switch (config.tx_type) {
case HWTSTAMP_TX_OFF:
priv->hwts_tx_en = 0;
break;
case HWTSTAMP_TX_ON:
if (!(priv->device_flags & FSL_GIANFAR_DEV_HAS_TIMER))
return -ERANGE;
priv->hwts_tx_en = 1;
break;
default:
return -ERANGE;
}
switch (config.rx_filter) {
case HWTSTAMP_FILTER_NONE:
if (priv->hwts_rx_en) {
priv->hwts_rx_en = 0;
reset_gfar(netdev);
}
break;
default:
if (!(priv->device_flags & FSL_GIANFAR_DEV_HAS_TIMER))
return -ERANGE;
if (!priv->hwts_rx_en) {
priv->hwts_rx_en = 1;
reset_gfar(netdev);
}
config.rx_filter = HWTSTAMP_FILTER_ALL;
break;
}
return copy_to_user(ifr->ifr_data, &config, sizeof(config)) ?
-EFAULT : 0;
}
static int gfar_hwtstamp_get(struct net_device *netdev, struct ifreq *ifr)
{
struct hwtstamp_config config;
struct gfar_private *priv = netdev_priv(netdev);
config.flags = 0;
config.tx_type = priv->hwts_tx_en ? HWTSTAMP_TX_ON : HWTSTAMP_TX_OFF;
config.rx_filter = (priv->hwts_rx_en ?
HWTSTAMP_FILTER_ALL : HWTSTAMP_FILTER_NONE);
return copy_to_user(ifr->ifr_data, &config, sizeof(config)) ?
-EFAULT : 0;
}
static int gfar_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
{
struct phy_device *phydev = dev->phydev;
if (!netif_running(dev))
return -EINVAL;
if (cmd == SIOCSHWTSTAMP)
return gfar_hwtstamp_set(dev, rq);
if (cmd == SIOCGHWTSTAMP)
return gfar_hwtstamp_get(dev, rq);
if (!phydev)
return -ENODEV;
return phy_mii_ioctl(phydev, rq, cmd);
}
/* Interrupt Handler for Transmit complete */
static void gfar_clean_tx_ring(struct gfar_priv_tx_q *tx_queue)
{
struct net_device *dev = tx_queue->dev;
struct netdev_queue *txq;
struct gfar_private *priv = netdev_priv(dev);
struct txbd8 *bdp, *next = NULL;
struct txbd8 *lbdp = NULL;
struct txbd8 *base = tx_queue->tx_bd_base;
struct sk_buff *skb;
int skb_dirtytx;
int tx_ring_size = tx_queue->tx_ring_size;
int frags = 0, nr_txbds = 0;
int i;
int howmany = 0;
int tqi = tx_queue->qindex;
unsigned int bytes_sent = 0;
u32 lstatus;
size_t buflen;
txq = netdev_get_tx_queue(dev, tqi);
bdp = tx_queue->dirty_tx;
skb_dirtytx = tx_queue->skb_dirtytx;
while ((skb = tx_queue->tx_skbuff[skb_dirtytx])) {
gianfar: Fix TX timestamping with a stacked DSA driver The driver wrongly assumes that it is the only entity that can set the SKBTX_IN_PROGRESS bit of the current skb. Therefore, in the gfar_clean_tx_ring function, where the TX timestamp is collected if necessary, the aforementioned bit is used to discriminate whether or not the TX timestamp should be delivered to the socket's error queue. But a stacked driver such as a DSA switch can also set the SKBTX_IN_PROGRESS bit, which is actually exactly what it should do in order to denote that the hardware timestamping process is undergoing. Therefore, gianfar would misinterpret the "in progress" bit as being its own, and deliver a second skb clone in the socket's error queue, completely throwing off a PTP process which is not expecting to receive it, _even though_ TX timestamping is not enabled for gianfar. There have been discussions [0] as to whether non-MAC drivers need or not to set SKBTX_IN_PROGRESS at all (whose purpose is to avoid sending 2 timestamps, a sw and a hw one, to applications which only expect one). But as of this patch, there are at least 2 PTP drivers that would break in conjunction with gianfar: the sja1105 DSA switch and the felix switch, by way of its ocelot core driver. So regardless of that conclusion, fix the gianfar driver to not do stuff based on flags set by others and not intended for it. [0]: https://www.spinics.net/lists/netdev/msg619699.html Fixes: f0ee7acfcdd4 ("gianfar: Add hardware TX timestamping support") Signed-off-by: Vladimir Oltean <olteanv@gmail.com> Acked-by: Richard Cochran <richardcochran@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-28 21:30:45 +08:00
bool do_tstamp;
do_tstamp = (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) &&
priv->hwts_tx_en;
frags = skb_shinfo(skb)->nr_frags;
/* When time stamping, one additional TxBD must be freed.
* Also, we need to dma_unmap_single() the TxPAL.
*/
gianfar: Fix TX timestamping with a stacked DSA driver The driver wrongly assumes that it is the only entity that can set the SKBTX_IN_PROGRESS bit of the current skb. Therefore, in the gfar_clean_tx_ring function, where the TX timestamp is collected if necessary, the aforementioned bit is used to discriminate whether or not the TX timestamp should be delivered to the socket's error queue. But a stacked driver such as a DSA switch can also set the SKBTX_IN_PROGRESS bit, which is actually exactly what it should do in order to denote that the hardware timestamping process is undergoing. Therefore, gianfar would misinterpret the "in progress" bit as being its own, and deliver a second skb clone in the socket's error queue, completely throwing off a PTP process which is not expecting to receive it, _even though_ TX timestamping is not enabled for gianfar. There have been discussions [0] as to whether non-MAC drivers need or not to set SKBTX_IN_PROGRESS at all (whose purpose is to avoid sending 2 timestamps, a sw and a hw one, to applications which only expect one). But as of this patch, there are at least 2 PTP drivers that would break in conjunction with gianfar: the sja1105 DSA switch and the felix switch, by way of its ocelot core driver. So regardless of that conclusion, fix the gianfar driver to not do stuff based on flags set by others and not intended for it. [0]: https://www.spinics.net/lists/netdev/msg619699.html Fixes: f0ee7acfcdd4 ("gianfar: Add hardware TX timestamping support") Signed-off-by: Vladimir Oltean <olteanv@gmail.com> Acked-by: Richard Cochran <richardcochran@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-28 21:30:45 +08:00
if (unlikely(do_tstamp))
nr_txbds = frags + 2;
else
nr_txbds = frags + 1;
lbdp = skip_txbd(bdp, nr_txbds - 1, base, tx_ring_size);
lstatus = be32_to_cpu(lbdp->lstatus);
/* Only clean completed frames */
if ((lstatus & BD_LFLAG(TXBD_READY)) &&
(lstatus & BD_LENGTH_MASK))
break;
gianfar: Fix TX timestamping with a stacked DSA driver The driver wrongly assumes that it is the only entity that can set the SKBTX_IN_PROGRESS bit of the current skb. Therefore, in the gfar_clean_tx_ring function, where the TX timestamp is collected if necessary, the aforementioned bit is used to discriminate whether or not the TX timestamp should be delivered to the socket's error queue. But a stacked driver such as a DSA switch can also set the SKBTX_IN_PROGRESS bit, which is actually exactly what it should do in order to denote that the hardware timestamping process is undergoing. Therefore, gianfar would misinterpret the "in progress" bit as being its own, and deliver a second skb clone in the socket's error queue, completely throwing off a PTP process which is not expecting to receive it, _even though_ TX timestamping is not enabled for gianfar. There have been discussions [0] as to whether non-MAC drivers need or not to set SKBTX_IN_PROGRESS at all (whose purpose is to avoid sending 2 timestamps, a sw and a hw one, to applications which only expect one). But as of this patch, there are at least 2 PTP drivers that would break in conjunction with gianfar: the sja1105 DSA switch and the felix switch, by way of its ocelot core driver. So regardless of that conclusion, fix the gianfar driver to not do stuff based on flags set by others and not intended for it. [0]: https://www.spinics.net/lists/netdev/msg619699.html Fixes: f0ee7acfcdd4 ("gianfar: Add hardware TX timestamping support") Signed-off-by: Vladimir Oltean <olteanv@gmail.com> Acked-by: Richard Cochran <richardcochran@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-28 21:30:45 +08:00
if (unlikely(do_tstamp)) {
next = next_txbd(bdp, base, tx_ring_size);
buflen = be16_to_cpu(next->length) +
GMAC_FCB_LEN + GMAC_TXPAL_LEN;
} else
buflen = be16_to_cpu(bdp->length);
dma_unmap_single(priv->dev, be32_to_cpu(bdp->bufPtr),
buflen, DMA_TO_DEVICE);
gianfar: Fix TX timestamping with a stacked DSA driver The driver wrongly assumes that it is the only entity that can set the SKBTX_IN_PROGRESS bit of the current skb. Therefore, in the gfar_clean_tx_ring function, where the TX timestamp is collected if necessary, the aforementioned bit is used to discriminate whether or not the TX timestamp should be delivered to the socket's error queue. But a stacked driver such as a DSA switch can also set the SKBTX_IN_PROGRESS bit, which is actually exactly what it should do in order to denote that the hardware timestamping process is undergoing. Therefore, gianfar would misinterpret the "in progress" bit as being its own, and deliver a second skb clone in the socket's error queue, completely throwing off a PTP process which is not expecting to receive it, _even though_ TX timestamping is not enabled for gianfar. There have been discussions [0] as to whether non-MAC drivers need or not to set SKBTX_IN_PROGRESS at all (whose purpose is to avoid sending 2 timestamps, a sw and a hw one, to applications which only expect one). But as of this patch, there are at least 2 PTP drivers that would break in conjunction with gianfar: the sja1105 DSA switch and the felix switch, by way of its ocelot core driver. So regardless of that conclusion, fix the gianfar driver to not do stuff based on flags set by others and not intended for it. [0]: https://www.spinics.net/lists/netdev/msg619699.html Fixes: f0ee7acfcdd4 ("gianfar: Add hardware TX timestamping support") Signed-off-by: Vladimir Oltean <olteanv@gmail.com> Acked-by: Richard Cochran <richardcochran@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-28 21:30:45 +08:00
if (unlikely(do_tstamp)) {
struct skb_shared_hwtstamps shhwtstamps;
u64 *ns = (u64 *)(((uintptr_t)skb->data + 0x10) &
~0x7UL);
memset(&shhwtstamps, 0, sizeof(shhwtstamps));
shhwtstamps.hwtstamp = ns_to_ktime(be64_to_cpu(*ns));
skb_pull(skb, GMAC_FCB_LEN + GMAC_TXPAL_LEN);
skb_tstamp_tx(skb, &shhwtstamps);
gfar_clear_txbd_status(bdp);
bdp = next;
}
gfar_clear_txbd_status(bdp);
bdp = next_txbd(bdp, base, tx_ring_size);
for (i = 0; i < frags; i++) {
dma_unmap_page(priv->dev, be32_to_cpu(bdp->bufPtr),
be16_to_cpu(bdp->length),
DMA_TO_DEVICE);
gfar_clear_txbd_status(bdp);
bdp = next_txbd(bdp, base, tx_ring_size);
}
bytes_sent += GFAR_CB(skb)->bytes_sent;
dev_kfree_skb_any(skb);
tx_queue->tx_skbuff[skb_dirtytx] = NULL;
skb_dirtytx = (skb_dirtytx + 1) &
TX_RING_MOD_MASK(tx_ring_size);
howmany++;
spin_lock(&tx_queue->txlock);
tx_queue->num_txbdfree += nr_txbds;
spin_unlock(&tx_queue->txlock);
}
/* If we freed a buffer, we can restart transmission, if necessary */
gianfar: Fix device reset races (oops) for Tx The device reset procedure, stop_gfar()/startup_gfar(), has concurrency issues. "Kernel access of bad area" oopses show up during Tx timeout device reset or other reset cases (like changing MTU) that happen while the interface still has traffic. The oopses happen in start_xmit and clean_tx_ring when accessing tx_queue-> tx_skbuff which is NULL. The race comes from de-allocating the tx_skbuff while transmission and napi processing are still active. Though the Tx queues get temoprarily stopped when Tx timeout occurs, they get re-enabled as a result of Tx congestion handling inside the napi context (see clean_tx_ring()). Not disabling the napi during reset is also a bug, because clean_tx_ring() will try to access tx_skbuff while it is being de-alloc'ed and re-alloc'ed. To fix this, stop_gfar() needs to disable napi processing after stopping the Tx queues. However, in order to prevent clean_tx_ring() to re-enable the Tx queue before the napi gets disabled, the device state DOWN has been introduced. It prevents the Tx congestion management from re-enabling the de-congested Tx queue while the device is brought down. An additional locking state, RESETTING, has been introduced to prevent simultaneous resets or to prevent configuring the device while it is resetting. The bogus 'rxlock's (for each Rx queue) have been removed since their purpose is not justified, as they don't prevent nor are suited to prevent device reset/reconfig races (such as this one). Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-02-24 18:13:45 +08:00
if (tx_queue->num_txbdfree &&
netif_tx_queue_stopped(txq) &&
!(test_bit(GFAR_DOWN, &priv->state)))
netif_wake_subqueue(priv->ndev, tqi);
/* Update dirty indicators */
tx_queue->skb_dirtytx = skb_dirtytx;
tx_queue->dirty_tx = bdp;
netdev_tx_completed_queue(txq, howmany, bytes_sent);
}
static void count_errors(u32 lstatus, struct net_device *ndev)
{
struct gfar_private *priv = netdev_priv(ndev);
struct net_device_stats *stats = &ndev->stats;
struct gfar_extra_stats *estats = &priv->extra_stats;
/* If the packet was truncated, none of the other errors matter */
if (lstatus & BD_LFLAG(RXBD_TRUNCATED)) {
stats->rx_length_errors++;
atomic64_inc(&estats->rx_trunc);
return;
}
/* Count the errors, if there were any */
if (lstatus & BD_LFLAG(RXBD_LARGE | RXBD_SHORT)) {
stats->rx_length_errors++;
if (lstatus & BD_LFLAG(RXBD_LARGE))
atomic64_inc(&estats->rx_large);
else
atomic64_inc(&estats->rx_short);
}
if (lstatus & BD_LFLAG(RXBD_NONOCTET)) {
stats->rx_frame_errors++;
atomic64_inc(&estats->rx_nonoctet);
}
if (lstatus & BD_LFLAG(RXBD_CRCERR)) {
atomic64_inc(&estats->rx_crcerr);
stats->rx_crc_errors++;
}
if (lstatus & BD_LFLAG(RXBD_OVERRUN)) {
atomic64_inc(&estats->rx_overrun);
stats->rx_over_errors++;
}
}
static irqreturn_t gfar_receive(int irq, void *grp_id)
{
gianfar: Separate out the Tx interrupt handling (Tx NAPI) There are some concurrency issues on devices w/ 2 CPUs related to the handling of Rx and Tx interrupts. eTSEC has separate interrupt lines for Rx and Tx but a single imask register to mask these interrupts and a single NAPI instance to handle both Rx and Tx work. As a result, the Rx and Tx ISRs are identical, both are invoking gfar_schedule_cleanup(), however both handlers can be entered at the same time when the Rx and Tx interrupts are taken by different CPUs. In this case spurrious interrupts (SPU) show up (in /proc/interrupts) indicating a concurrency issue. Also, Tx overruns followed by Tx timeout have been observed under heavy Tx traffic load. To address these issues, the schedule cleanup ISR part has been changed to handle the Rx and Tx interrupts independently. The patch adds a separate NAPI poll routine for Tx cleanup to be triggerred independently by the Tx confirmation interrupts only. Existing poll functions are modified to handle only the Rx path processing. The Tx poll routine does not need a budget, since Tx processing doesn't consume NAPI budget, and hence it is registered with minimum NAPI weight. NAPI scheduling does not require locking since there are different NAPI instances between the Rx and Tx confirmation paths now. So, the patch fixes the occurence of spurrious Rx/Tx interrupts. Tx overruns also occur less frequently now. Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-07 20:42:45 +08:00
struct gfar_priv_grp *grp = (struct gfar_priv_grp *)grp_id;
unsigned long flags;
u32 imask, ievent;
ievent = gfar_read(&grp->regs->ievent);
if (unlikely(ievent & IEVENT_FGPI)) {
gfar_write(&grp->regs->ievent, IEVENT_FGPI);
return IRQ_HANDLED;
}
gianfar: Separate out the Tx interrupt handling (Tx NAPI) There are some concurrency issues on devices w/ 2 CPUs related to the handling of Rx and Tx interrupts. eTSEC has separate interrupt lines for Rx and Tx but a single imask register to mask these interrupts and a single NAPI instance to handle both Rx and Tx work. As a result, the Rx and Tx ISRs are identical, both are invoking gfar_schedule_cleanup(), however both handlers can be entered at the same time when the Rx and Tx interrupts are taken by different CPUs. In this case spurrious interrupts (SPU) show up (in /proc/interrupts) indicating a concurrency issue. Also, Tx overruns followed by Tx timeout have been observed under heavy Tx traffic load. To address these issues, the schedule cleanup ISR part has been changed to handle the Rx and Tx interrupts independently. The patch adds a separate NAPI poll routine for Tx cleanup to be triggerred independently by the Tx confirmation interrupts only. Existing poll functions are modified to handle only the Rx path processing. The Tx poll routine does not need a budget, since Tx processing doesn't consume NAPI budget, and hence it is registered with minimum NAPI weight. NAPI scheduling does not require locking since there are different NAPI instances between the Rx and Tx confirmation paths now. So, the patch fixes the occurence of spurrious Rx/Tx interrupts. Tx overruns also occur less frequently now. Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-07 20:42:45 +08:00
if (likely(napi_schedule_prep(&grp->napi_rx))) {
spin_lock_irqsave(&grp->grplock, flags);
imask = gfar_read(&grp->regs->imask);
imask &= IMASK_RX_DISABLED;
gfar_write(&grp->regs->imask, imask);
spin_unlock_irqrestore(&grp->grplock, flags);
__napi_schedule(&grp->napi_rx);
} else {
/* Clear IEVENT, so interrupts aren't called again
* because of the packets that have already arrived.
*/
gfar_write(&grp->regs->ievent, IEVENT_RX_MASK);
}
return IRQ_HANDLED;
}
/* Interrupt Handler for Transmit complete */
static irqreturn_t gfar_transmit(int irq, void *grp_id)
{
struct gfar_priv_grp *grp = (struct gfar_priv_grp *)grp_id;
unsigned long flags;
u32 imask;
if (likely(napi_schedule_prep(&grp->napi_tx))) {
spin_lock_irqsave(&grp->grplock, flags);
imask = gfar_read(&grp->regs->imask);
imask &= IMASK_TX_DISABLED;
gfar_write(&grp->regs->imask, imask);
spin_unlock_irqrestore(&grp->grplock, flags);
__napi_schedule(&grp->napi_tx);
} else {
/* Clear IEVENT, so interrupts aren't called again
* because of the packets that have already arrived.
*/
gfar_write(&grp->regs->ievent, IEVENT_TX_MASK);
}
return IRQ_HANDLED;
}
static bool gfar_add_rx_frag(struct gfar_rx_buff *rxb, u32 lstatus,
struct sk_buff *skb, bool first)
{
gianfar: prevent integer wrapping in the rx handler When the frame check sequence (FCS) is split across the last two frames of a fragmented packet, part of the FCS gets counted twice, once when subtracting the FCS, and again when subtracting the previously received data. For example, if 1602 bytes are received, and the first fragment contains the first 1600 bytes (including the first two bytes of the FCS), and the second fragment contains the last two bytes of the FCS: 'skb->len == 1600' from the first fragment size = lstatus & BD_LENGTH_MASK; # 1602 size -= ETH_FCS_LEN; # 1598 size -= skb->len; # -2 Since the size is unsigned, it wraps around and causes a BUG later in the packet handling, as shown below: kernel BUG at ./include/linux/skbuff.h:2068! Oops: Exception in kernel mode, sig: 5 [#1] ... NIP [c021ec60] skb_pull+0x24/0x44 LR [c01e2fbc] gfar_clean_rx_ring+0x498/0x690 Call Trace: [df7edeb0] [c01e2c1c] gfar_clean_rx_ring+0xf8/0x690 (unreliable) [df7edf20] [c01e33a8] gfar_poll_rx_sq+0x3c/0x9c [df7edf40] [c023352c] net_rx_action+0x21c/0x274 [df7edf90] [c0329000] __do_softirq+0xd8/0x240 [df7edff0] [c000c108] call_do_irq+0x24/0x3c [c0597e90] [c00041dc] do_IRQ+0x64/0xc4 [c0597eb0] [c000d920] ret_from_except+0x0/0x18 --- interrupt: 501 at arch_cpu_idle+0x24/0x5c Change the size to a signed integer and then trim off any part of the FCS that was received prior to the last fragment. Fixes: 6c389fc931bc ("gianfar: fix size of scatter-gathered frames") Signed-off-by: Andy Spencer <aspencer@spacex.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-01-26 11:37:50 +08:00
int size = lstatus & BD_LENGTH_MASK;
struct page *page = rxb->page;
if (likely(first)) {
skb_put(skb, size);
} else {
/* the last fragments' length contains the full frame length */
if (lstatus & BD_LFLAG(RXBD_LAST))
size -= skb->len;
skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page,
rxb->page_offset + RXBUF_ALIGNMENT,
size, GFAR_RXB_TRUESIZE);
}
/* try reuse page */
if (unlikely(page_count(page) != 1 || page_is_pfmemalloc(page)))
return false;
/* change offset to the other half */
rxb->page_offset ^= GFAR_RXB_TRUESIZE;
2016-03-18 05:19:26 +08:00
page_ref_inc(page);
return true;
}
static void gfar_reuse_rx_page(struct gfar_priv_rx_q *rxq,
struct gfar_rx_buff *old_rxb)
{
struct gfar_rx_buff *new_rxb;
u16 nta = rxq->next_to_alloc;
new_rxb = &rxq->rx_buff[nta];
/* find next buf that can reuse a page */
nta++;
rxq->next_to_alloc = (nta < rxq->rx_ring_size) ? nta : 0;
/* copy page reference */
*new_rxb = *old_rxb;
/* sync for use by the device */
dma_sync_single_range_for_device(rxq->dev, old_rxb->dma,
old_rxb->page_offset,
GFAR_RXB_TRUESIZE, DMA_FROM_DEVICE);
}
static struct sk_buff *gfar_get_next_rxbuff(struct gfar_priv_rx_q *rx_queue,
u32 lstatus, struct sk_buff *skb)
{
struct gfar_rx_buff *rxb = &rx_queue->rx_buff[rx_queue->next_to_clean];
struct page *page = rxb->page;
bool first = false;
if (likely(!skb)) {
void *buff_addr = page_address(page) + rxb->page_offset;
skb = build_skb(buff_addr, GFAR_SKBFRAG_SIZE);
if (unlikely(!skb)) {
gfar_rx_alloc_err(rx_queue);
return NULL;
}
skb_reserve(skb, RXBUF_ALIGNMENT);
first = true;
}
dma_sync_single_range_for_cpu(rx_queue->dev, rxb->dma, rxb->page_offset,
GFAR_RXB_TRUESIZE, DMA_FROM_DEVICE);
if (gfar_add_rx_frag(rxb, lstatus, skb, first)) {
/* reuse the free half of the page */
gfar_reuse_rx_page(rx_queue, rxb);
} else {
/* page cannot be reused, unmap it */
dma_unmap_page(rx_queue->dev, rxb->dma,
PAGE_SIZE, DMA_FROM_DEVICE);
}
/* clear rxb content */
rxb->page = NULL;
return skb;
}
static inline void gfar_rx_checksum(struct sk_buff *skb, struct rxfcb *fcb)
{
/* If valid headers were found, and valid sums
* were verified, then we tell the kernel that no
* checksumming is necessary. Otherwise, it is [FIXME]
*/
if ((be16_to_cpu(fcb->flags) & RXFCB_CSUM_MASK) ==
(RXFCB_CIP | RXFCB_CTU))
skb->ip_summed = CHECKSUM_UNNECESSARY;
else
skb_checksum_none_assert(skb);
}
/* gfar_process_frame() -- handle one incoming packet if skb isn't NULL. */
static void gfar_process_frame(struct net_device *ndev, struct sk_buff *skb)
{
struct gfar_private *priv = netdev_priv(ndev);
struct rxfcb *fcb = NULL;
/* fcb is at the beginning if exists */
fcb = (struct rxfcb *)skb->data;
/* Remove the FCB from the skb
* Remove the padded bytes, if there are any
*/
if (priv->uses_rxfcb)
skb_pull(skb, GMAC_FCB_LEN);
/* Get receive timestamp from the skb */
if (priv->hwts_rx_en) {
struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
u64 *ns = (u64 *) skb->data;
memset(shhwtstamps, 0, sizeof(*shhwtstamps));
shhwtstamps->hwtstamp = ns_to_ktime(be64_to_cpu(*ns));
}
if (priv->padding)
skb_pull(skb, priv->padding);
/* Trim off the FCS */
pskb_trim(skb, skb->len - ETH_FCS_LEN);
if (ndev->features & NETIF_F_RXCSUM)
gfar_rx_checksum(skb, fcb);
/* There's need to check for NETIF_F_HW_VLAN_CTAG_RX here.
* Even if vlan rx accel is disabled, on some chips
* RXFCB_VLN is pseudo randomly set.
*/
if (ndev->features & NETIF_F_HW_VLAN_CTAG_RX &&
be16_to_cpu(fcb->flags) & RXFCB_VLN)
__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
be16_to_cpu(fcb->vlctl));
}
/* gfar_clean_rx_ring() -- Processes each frame in the rx ring
* until the budget/quota has been reached. Returns the number
* of frames handled
*/
static int gfar_clean_rx_ring(struct gfar_priv_rx_q *rx_queue,
int rx_work_limit)
{
struct net_device *ndev = rx_queue->ndev;
struct gfar_private *priv = netdev_priv(ndev);
struct rxbd8 *bdp;
int i, howmany = 0;
struct sk_buff *skb = rx_queue->skb;
int cleaned_cnt = gfar_rxbd_unused(rx_queue);
unsigned int total_bytes = 0, total_pkts = 0;
/* Get the first full descriptor */
i = rx_queue->next_to_clean;
while (rx_work_limit--) {
u32 lstatus;
if (cleaned_cnt >= GFAR_RX_BUFF_ALLOC) {
gfar_alloc_rx_buffs(rx_queue, cleaned_cnt);
cleaned_cnt = 0;
}
bdp = &rx_queue->rx_bd_base[i];
lstatus = be32_to_cpu(bdp->lstatus);
if (lstatus & BD_LFLAG(RXBD_EMPTY))
break;
/* order rx buffer descriptor reads */
rmb();
/* fetch next to clean buffer from the ring */
skb = gfar_get_next_rxbuff(rx_queue, lstatus, skb);
if (unlikely(!skb))
break;
cleaned_cnt++;
howmany++;
if (unlikely(++i == rx_queue->rx_ring_size))
i = 0;
rx_queue->next_to_clean = i;
/* fetch next buffer if not the last in frame */
if (!(lstatus & BD_LFLAG(RXBD_LAST)))
continue;
gianfar: Fix oversized packets handling Issuing the following command on host: $ ifconfig eth2 mtu 1600 ; ping 10.0.0.27 -s 1485 -c 1 Makes some boards (tested with MPC8315 rev 1.1 and MPC8313 rev 1.0) oops like this: skb_over_panic: text:c0195914 len:1537 put:1537 head:c79e4800 data:c79e4880 tail:0xc79e4e81 end:0xc79e4e80 dev:eth1 ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:127! Oops: Exception in kernel mode, sig: 5 [#1] MPC831x RDB last sysfs file: /sys/kernel/uevent_seqnum Modules linked in: NIP: c01c1840 LR: c01c1840 CTR: c016d918 [...] NIP [c01c1840] skb_over_panic+0x48/0x5c LR [c01c1840] skb_over_panic+0x48/0x5c Call Trace: [c0339d50] [c01c1840] skb_over_panic+0x48/0x5c (unreliable) [c0339d60] [c01c3020] skb_put+0x5c/0x60 [c0339d70] [c0195914] gfar_clean_rx_ring+0x25c/0x3d0 [c0339dc0] [c01976e8] gfar_poll+0x170/0x1bc Dumped buffer descriptors showed that eTSEC's length/truncation logic sometimes passes oversized packets, i.e. for the above ICMP packet the following two buffer descriptors may become ready: status=1400 length=1536 status=1800 length=1541 So, it seems that gianfar actually receives the whole big frame, and it tries to place the packet into two BDs. This situation confuses the driver, and so the skb_put() sanity check fails. This patch fixes the issue by adding an appropriate check, i.e. the driver should not try to process frames with buffer descriptor's length over rx_buffer_size (i.e. maxfrm and mrblr). Note that sometimes eTSEC works correctly, i.e. in the second (last) buffer descriptor bits 'truncated' and 'crcerr' are set, and so there's no oops. Though I couldn't find any logic when it works correctly and when not. Signed-off-by: Anton Vorontsov <avorontsov@mvista.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2010-06-11 18:51:03 +08:00
if (unlikely(lstatus & BD_LFLAG(RXBD_ERR))) {
count_errors(lstatus, ndev);
/* discard faulty buffer */
dev_kfree_skb(skb);
skb = NULL;
rx_queue->stats.rx_dropped++;
continue;
}
gfar_process_frame(ndev, skb);
/* Increment the number of packets */
total_pkts++;
total_bytes += skb->len;
skb_record_rx_queue(skb, rx_queue->qindex);
skb->protocol = eth_type_trans(skb, ndev);
/* Send the packet up the stack */
napi_gro_receive(&rx_queue->grp->napi_rx, skb);
skb = NULL;
}
/* Store incomplete frames for completion */
rx_queue->skb = skb;
rx_queue->stats.rx_packets += total_pkts;
rx_queue->stats.rx_bytes += total_bytes;
if (cleaned_cnt)
gfar_alloc_rx_buffs(rx_queue, cleaned_cnt);
/* Update Last Free RxBD pointer for LFC */
if (unlikely(priv->tx_actual_en)) {
u32 bdp_dma = gfar_rxbd_dma_lastfree(rx_queue);
gfar_write(rx_queue->rfbptr, bdp_dma);
}
return howmany;
}
gianfar: Separate out the Tx interrupt handling (Tx NAPI) There are some concurrency issues on devices w/ 2 CPUs related to the handling of Rx and Tx interrupts. eTSEC has separate interrupt lines for Rx and Tx but a single imask register to mask these interrupts and a single NAPI instance to handle both Rx and Tx work. As a result, the Rx and Tx ISRs are identical, both are invoking gfar_schedule_cleanup(), however both handlers can be entered at the same time when the Rx and Tx interrupts are taken by different CPUs. In this case spurrious interrupts (SPU) show up (in /proc/interrupts) indicating a concurrency issue. Also, Tx overruns followed by Tx timeout have been observed under heavy Tx traffic load. To address these issues, the schedule cleanup ISR part has been changed to handle the Rx and Tx interrupts independently. The patch adds a separate NAPI poll routine for Tx cleanup to be triggerred independently by the Tx confirmation interrupts only. Existing poll functions are modified to handle only the Rx path processing. The Tx poll routine does not need a budget, since Tx processing doesn't consume NAPI budget, and hence it is registered with minimum NAPI weight. NAPI scheduling does not require locking since there are different NAPI instances between the Rx and Tx confirmation paths now. So, the patch fixes the occurence of spurrious Rx/Tx interrupts. Tx overruns also occur less frequently now. Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-07 20:42:45 +08:00
static int gfar_poll_rx_sq(struct napi_struct *napi, int budget)
{
struct gfar_priv_grp *gfargrp =
gianfar: Separate out the Tx interrupt handling (Tx NAPI) There are some concurrency issues on devices w/ 2 CPUs related to the handling of Rx and Tx interrupts. eTSEC has separate interrupt lines for Rx and Tx but a single imask register to mask these interrupts and a single NAPI instance to handle both Rx and Tx work. As a result, the Rx and Tx ISRs are identical, both are invoking gfar_schedule_cleanup(), however both handlers can be entered at the same time when the Rx and Tx interrupts are taken by different CPUs. In this case spurrious interrupts (SPU) show up (in /proc/interrupts) indicating a concurrency issue. Also, Tx overruns followed by Tx timeout have been observed under heavy Tx traffic load. To address these issues, the schedule cleanup ISR part has been changed to handle the Rx and Tx interrupts independently. The patch adds a separate NAPI poll routine for Tx cleanup to be triggerred independently by the Tx confirmation interrupts only. Existing poll functions are modified to handle only the Rx path processing. The Tx poll routine does not need a budget, since Tx processing doesn't consume NAPI budget, and hence it is registered with minimum NAPI weight. NAPI scheduling does not require locking since there are different NAPI instances between the Rx and Tx confirmation paths now. So, the patch fixes the occurence of spurrious Rx/Tx interrupts. Tx overruns also occur less frequently now. Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-07 20:42:45 +08:00
container_of(napi, struct gfar_priv_grp, napi_rx);
struct gfar __iomem *regs = gfargrp->regs;
gianfar: Use Single-Queue polling for "fsl,etsec2" For the "fsl,etsec2" compatible models the driver currently supports 8 Tx and Rx DMA rings (aka HW queues). However, there are only 2 pairs of Rx/Tx interrupt lines, as these controllers are integrated in low power SoCs with 2 CPUs at most. As a result, there are at most 2 NAPI instances that have to service multiple Tx and Rx queues for these devices. This complicates the NAPI polling routine having to iterate over the mutiple Rx/Tx queues hooked to the same interrupt lines. And there's also an overhead at HW level, as the controller needs to service all the 8 Tx rings in a round robin manner. The combined overhead shows up for multi parallel Tx flows transmitted by the kernel stack, when the driver usually starts returning NETDEV_TX_BUSY leading to NETDEV WATCHDOG Tx timeout triggering if the Tx path is congested for too long. As an alternative, this patch makes the driver support only one Tx/Rx DMA ring per NAPI instance (per interrupt group or pair of Tx/Rx interrupt lines) by default. The simplified single queue polling routine (gfar_poll_sq) will be the default napi poll routine for the etsec2 devices too. Some adjustments needed to be made to link the Tx/Rx HW queues with each NAPI instance (2 in this case). The gfar_poll_sq() is already successfully used by older SQ_SG_MODE (single interrupt group) controllers. This patch fixes Tx timeout triggering under heavy Tx traffic load (i.e. iperf -c -P 8) for the "fsl,etsec2" (currently the only MQ_MG_MODE devices). There's also a significant memory footprint reduction by supporting 2 Rx/Tx DMA rings (at most), instead of 8, for these devices. Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-07 20:42:46 +08:00
struct gfar_priv_rx_q *rx_queue = gfargrp->rx_queue;
int work_done = 0;
/* Clear IEVENT, so interrupts aren't called again
* because of the packets that have already arrived
*/
gianfar: Separate out the Tx interrupt handling (Tx NAPI) There are some concurrency issues on devices w/ 2 CPUs related to the handling of Rx and Tx interrupts. eTSEC has separate interrupt lines for Rx and Tx but a single imask register to mask these interrupts and a single NAPI instance to handle both Rx and Tx work. As a result, the Rx and Tx ISRs are identical, both are invoking gfar_schedule_cleanup(), however both handlers can be entered at the same time when the Rx and Tx interrupts are taken by different CPUs. In this case spurrious interrupts (SPU) show up (in /proc/interrupts) indicating a concurrency issue. Also, Tx overruns followed by Tx timeout have been observed under heavy Tx traffic load. To address these issues, the schedule cleanup ISR part has been changed to handle the Rx and Tx interrupts independently. The patch adds a separate NAPI poll routine for Tx cleanup to be triggerred independently by the Tx confirmation interrupts only. Existing poll functions are modified to handle only the Rx path processing. The Tx poll routine does not need a budget, since Tx processing doesn't consume NAPI budget, and hence it is registered with minimum NAPI weight. NAPI scheduling does not require locking since there are different NAPI instances between the Rx and Tx confirmation paths now. So, the patch fixes the occurence of spurrious Rx/Tx interrupts. Tx overruns also occur less frequently now. Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-07 20:42:45 +08:00
gfar_write(&regs->ievent, IEVENT_RX_MASK);
work_done = gfar_clean_rx_ring(rx_queue, budget);
if (work_done < budget) {
gianfar: Separate out the Tx interrupt handling (Tx NAPI) There are some concurrency issues on devices w/ 2 CPUs related to the handling of Rx and Tx interrupts. eTSEC has separate interrupt lines for Rx and Tx but a single imask register to mask these interrupts and a single NAPI instance to handle both Rx and Tx work. As a result, the Rx and Tx ISRs are identical, both are invoking gfar_schedule_cleanup(), however both handlers can be entered at the same time when the Rx and Tx interrupts are taken by different CPUs. In this case spurrious interrupts (SPU) show up (in /proc/interrupts) indicating a concurrency issue. Also, Tx overruns followed by Tx timeout have been observed under heavy Tx traffic load. To address these issues, the schedule cleanup ISR part has been changed to handle the Rx and Tx interrupts independently. The patch adds a separate NAPI poll routine for Tx cleanup to be triggerred independently by the Tx confirmation interrupts only. Existing poll functions are modified to handle only the Rx path processing. The Tx poll routine does not need a budget, since Tx processing doesn't consume NAPI budget, and hence it is registered with minimum NAPI weight. NAPI scheduling does not require locking since there are different NAPI instances between the Rx and Tx confirmation paths now. So, the patch fixes the occurence of spurrious Rx/Tx interrupts. Tx overruns also occur less frequently now. Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-07 20:42:45 +08:00
u32 imask;
napi_complete_done(napi, work_done);
/* Clear the halt bit in RSTAT */
gfar_write(&regs->rstat, gfargrp->rstat);
gianfar: Separate out the Tx interrupt handling (Tx NAPI) There are some concurrency issues on devices w/ 2 CPUs related to the handling of Rx and Tx interrupts. eTSEC has separate interrupt lines for Rx and Tx but a single imask register to mask these interrupts and a single NAPI instance to handle both Rx and Tx work. As a result, the Rx and Tx ISRs are identical, both are invoking gfar_schedule_cleanup(), however both handlers can be entered at the same time when the Rx and Tx interrupts are taken by different CPUs. In this case spurrious interrupts (SPU) show up (in /proc/interrupts) indicating a concurrency issue. Also, Tx overruns followed by Tx timeout have been observed under heavy Tx traffic load. To address these issues, the schedule cleanup ISR part has been changed to handle the Rx and Tx interrupts independently. The patch adds a separate NAPI poll routine for Tx cleanup to be triggerred independently by the Tx confirmation interrupts only. Existing poll functions are modified to handle only the Rx path processing. The Tx poll routine does not need a budget, since Tx processing doesn't consume NAPI budget, and hence it is registered with minimum NAPI weight. NAPI scheduling does not require locking since there are different NAPI instances between the Rx and Tx confirmation paths now. So, the patch fixes the occurence of spurrious Rx/Tx interrupts. Tx overruns also occur less frequently now. Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-07 20:42:45 +08:00
spin_lock_irq(&gfargrp->grplock);
imask = gfar_read(&regs->imask);
imask |= IMASK_RX_DEFAULT;
gfar_write(&regs->imask, imask);
spin_unlock_irq(&gfargrp->grplock);
}
return work_done;
}
gianfar: Separate out the Tx interrupt handling (Tx NAPI) There are some concurrency issues on devices w/ 2 CPUs related to the handling of Rx and Tx interrupts. eTSEC has separate interrupt lines for Rx and Tx but a single imask register to mask these interrupts and a single NAPI instance to handle both Rx and Tx work. As a result, the Rx and Tx ISRs are identical, both are invoking gfar_schedule_cleanup(), however both handlers can be entered at the same time when the Rx and Tx interrupts are taken by different CPUs. In this case spurrious interrupts (SPU) show up (in /proc/interrupts) indicating a concurrency issue. Also, Tx overruns followed by Tx timeout have been observed under heavy Tx traffic load. To address these issues, the schedule cleanup ISR part has been changed to handle the Rx and Tx interrupts independently. The patch adds a separate NAPI poll routine for Tx cleanup to be triggerred independently by the Tx confirmation interrupts only. Existing poll functions are modified to handle only the Rx path processing. The Tx poll routine does not need a budget, since Tx processing doesn't consume NAPI budget, and hence it is registered with minimum NAPI weight. NAPI scheduling does not require locking since there are different NAPI instances between the Rx and Tx confirmation paths now. So, the patch fixes the occurence of spurrious Rx/Tx interrupts. Tx overruns also occur less frequently now. Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-07 20:42:45 +08:00
static int gfar_poll_tx_sq(struct napi_struct *napi, int budget)
{
struct gfar_priv_grp *gfargrp =
gianfar: Separate out the Tx interrupt handling (Tx NAPI) There are some concurrency issues on devices w/ 2 CPUs related to the handling of Rx and Tx interrupts. eTSEC has separate interrupt lines for Rx and Tx but a single imask register to mask these interrupts and a single NAPI instance to handle both Rx and Tx work. As a result, the Rx and Tx ISRs are identical, both are invoking gfar_schedule_cleanup(), however both handlers can be entered at the same time when the Rx and Tx interrupts are taken by different CPUs. In this case spurrious interrupts (SPU) show up (in /proc/interrupts) indicating a concurrency issue. Also, Tx overruns followed by Tx timeout have been observed under heavy Tx traffic load. To address these issues, the schedule cleanup ISR part has been changed to handle the Rx and Tx interrupts independently. The patch adds a separate NAPI poll routine for Tx cleanup to be triggerred independently by the Tx confirmation interrupts only. Existing poll functions are modified to handle only the Rx path processing. The Tx poll routine does not need a budget, since Tx processing doesn't consume NAPI budget, and hence it is registered with minimum NAPI weight. NAPI scheduling does not require locking since there are different NAPI instances between the Rx and Tx confirmation paths now. So, the patch fixes the occurence of spurrious Rx/Tx interrupts. Tx overruns also occur less frequently now. Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-07 20:42:45 +08:00
container_of(napi, struct gfar_priv_grp, napi_tx);
struct gfar __iomem *regs = gfargrp->regs;
gianfar: Use Single-Queue polling for "fsl,etsec2" For the "fsl,etsec2" compatible models the driver currently supports 8 Tx and Rx DMA rings (aka HW queues). However, there are only 2 pairs of Rx/Tx interrupt lines, as these controllers are integrated in low power SoCs with 2 CPUs at most. As a result, there are at most 2 NAPI instances that have to service multiple Tx and Rx queues for these devices. This complicates the NAPI polling routine having to iterate over the mutiple Rx/Tx queues hooked to the same interrupt lines. And there's also an overhead at HW level, as the controller needs to service all the 8 Tx rings in a round robin manner. The combined overhead shows up for multi parallel Tx flows transmitted by the kernel stack, when the driver usually starts returning NETDEV_TX_BUSY leading to NETDEV WATCHDOG Tx timeout triggering if the Tx path is congested for too long. As an alternative, this patch makes the driver support only one Tx/Rx DMA ring per NAPI instance (per interrupt group or pair of Tx/Rx interrupt lines) by default. The simplified single queue polling routine (gfar_poll_sq) will be the default napi poll routine for the etsec2 devices too. Some adjustments needed to be made to link the Tx/Rx HW queues with each NAPI instance (2 in this case). The gfar_poll_sq() is already successfully used by older SQ_SG_MODE (single interrupt group) controllers. This patch fixes Tx timeout triggering under heavy Tx traffic load (i.e. iperf -c -P 8) for the "fsl,etsec2" (currently the only MQ_MG_MODE devices). There's also a significant memory footprint reduction by supporting 2 Rx/Tx DMA rings (at most), instead of 8, for these devices. Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-07 20:42:46 +08:00
struct gfar_priv_tx_q *tx_queue = gfargrp->tx_queue;
gianfar: Separate out the Tx interrupt handling (Tx NAPI) There are some concurrency issues on devices w/ 2 CPUs related to the handling of Rx and Tx interrupts. eTSEC has separate interrupt lines for Rx and Tx but a single imask register to mask these interrupts and a single NAPI instance to handle both Rx and Tx work. As a result, the Rx and Tx ISRs are identical, both are invoking gfar_schedule_cleanup(), however both handlers can be entered at the same time when the Rx and Tx interrupts are taken by different CPUs. In this case spurrious interrupts (SPU) show up (in /proc/interrupts) indicating a concurrency issue. Also, Tx overruns followed by Tx timeout have been observed under heavy Tx traffic load. To address these issues, the schedule cleanup ISR part has been changed to handle the Rx and Tx interrupts independently. The patch adds a separate NAPI poll routine for Tx cleanup to be triggerred independently by the Tx confirmation interrupts only. Existing poll functions are modified to handle only the Rx path processing. The Tx poll routine does not need a budget, since Tx processing doesn't consume NAPI budget, and hence it is registered with minimum NAPI weight. NAPI scheduling does not require locking since there are different NAPI instances between the Rx and Tx confirmation paths now. So, the patch fixes the occurence of spurrious Rx/Tx interrupts. Tx overruns also occur less frequently now. Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-07 20:42:45 +08:00
u32 imask;
/* Clear IEVENT, so interrupts aren't called again
* because of the packets that have already arrived
*/
gfar_write(&regs->ievent, IEVENT_TX_MASK);
/* run Tx cleanup to completion */
if (tx_queue->tx_skbuff[tx_queue->skb_dirtytx])
gfar_clean_tx_ring(tx_queue);
napi_complete(napi);
spin_lock_irq(&gfargrp->grplock);
imask = gfar_read(&regs->imask);
imask |= IMASK_TX_DEFAULT;
gfar_write(&regs->imask, imask);
spin_unlock_irq(&gfargrp->grplock);
return 0;
}
static int gfar_poll_rx(struct napi_struct *napi, int budget)
{
struct gfar_priv_grp *gfargrp =
container_of(napi, struct gfar_priv_grp, napi_rx);
struct gfar_private *priv = gfargrp->priv;
struct gfar __iomem *regs = gfargrp->regs;
struct gfar_priv_rx_q *rx_queue = NULL;
int work_done = 0, work_done_per_q = 0;
int i, budget_per_q = 0;
unsigned long rstat_rxf;
int num_act_queues;
/* Clear IEVENT, so interrupts aren't called again
* because of the packets that have already arrived
*/
gianfar: Separate out the Tx interrupt handling (Tx NAPI) There are some concurrency issues on devices w/ 2 CPUs related to the handling of Rx and Tx interrupts. eTSEC has separate interrupt lines for Rx and Tx but a single imask register to mask these interrupts and a single NAPI instance to handle both Rx and Tx work. As a result, the Rx and Tx ISRs are identical, both are invoking gfar_schedule_cleanup(), however both handlers can be entered at the same time when the Rx and Tx interrupts are taken by different CPUs. In this case spurrious interrupts (SPU) show up (in /proc/interrupts) indicating a concurrency issue. Also, Tx overruns followed by Tx timeout have been observed under heavy Tx traffic load. To address these issues, the schedule cleanup ISR part has been changed to handle the Rx and Tx interrupts independently. The patch adds a separate NAPI poll routine for Tx cleanup to be triggerred independently by the Tx confirmation interrupts only. Existing poll functions are modified to handle only the Rx path processing. The Tx poll routine does not need a budget, since Tx processing doesn't consume NAPI budget, and hence it is registered with minimum NAPI weight. NAPI scheduling does not require locking since there are different NAPI instances between the Rx and Tx confirmation paths now. So, the patch fixes the occurence of spurrious Rx/Tx interrupts. Tx overruns also occur less frequently now. Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-07 20:42:45 +08:00
gfar_write(&regs->ievent, IEVENT_RX_MASK);
rstat_rxf = gfar_read(&regs->rstat) & RSTAT_RXF_MASK;
num_act_queues = bitmap_weight(&rstat_rxf, MAX_RX_QS);
if (num_act_queues)
budget_per_q = budget/num_act_queues;
gianfar: Simplify MQ polling to avoid soft lockup Under certain low traffic conditions, the single core devices with multiple Rx/Tx queues (MQ mode) may reach soft lockup due to gfar_poll not returning in proper time. The following exception was obtained using iperf on a 100Mbit half-duplex link, for a p1010 single core device: BUG: soft lockup - CPU#0 stuck for 23s! [iperf:2847] Modules linked in: CPU: 0 PID: 2847 Comm: iperf Not tainted 3.12.0-rc3 #16 task: e8bf8000 ti: eeb16000 task.ti: ee646000 NIP: c0255b6c LR: c0367ae8 CTR: c0461c18 REGS: eeb17e70 TRAP: 0901 Not tainted (3.12.0-rc3) MSR: 00029000 <CE,EE,ME> CR: 44228428 XER: 20000000 GPR00: c0367ad4 eeb17f20 e8bf8000 ee01f4b4 00000008 ffffffff ffffffff 00000000 GPR08: 000000c0 00000008 000000ff ffffffc0 000193fe NIP [c0255b6c] find_next_bit+0xb8/0xc4 LR [c0367ae8] gfar_poll+0xc8/0x1d8 Call Trace: [eeb17f20] [c0367ad4] gfar_poll+0xb4/0x1d8 (unreliable) [eeb17f70] [c0422100] net_rx_action+0xa4/0x158 [eeb17fa0] [c003ec6c] __do_softirq+0xcc/0x17c [eeb17ff0] [c000c28c] call_do_softirq+0x24/0x3c [ee647cc0] [c0004660] do_softirq+0x6c/0x94 [ee647ce0] [c003eb9c] local_bh_enable+0x9c/0xa0 [ee647cf0] [c0454fe8] tcp_prequeue_process+0xa4/0xdc [ee647d10] [c0457e44] tcp_recvmsg+0x498/0x96c [ee647d80] [c047b630] inet_recvmsg+0x40/0x64 [ee647da0] [c040ca8c] sock_recvmsg+0x90/0xc0 [ee647e30] [c040edb8] SyS_recvfrom+0x98/0xfc To prevent this, the outer while() loop has been removed allowing gfar_poll() to return faster even if there's still budget left. Also, there's no need to recompute the budget per Rx queue anymore. Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-14 22:05:09 +08:00
for_each_set_bit(i, &gfargrp->rx_bit_map, priv->num_rx_queues) {
/* skip queue if not active */
if (!(rstat_rxf & (RSTAT_CLEAR_RXF0 >> i)))
continue;
gianfar: Simplify MQ polling to avoid soft lockup Under certain low traffic conditions, the single core devices with multiple Rx/Tx queues (MQ mode) may reach soft lockup due to gfar_poll not returning in proper time. The following exception was obtained using iperf on a 100Mbit half-duplex link, for a p1010 single core device: BUG: soft lockup - CPU#0 stuck for 23s! [iperf:2847] Modules linked in: CPU: 0 PID: 2847 Comm: iperf Not tainted 3.12.0-rc3 #16 task: e8bf8000 ti: eeb16000 task.ti: ee646000 NIP: c0255b6c LR: c0367ae8 CTR: c0461c18 REGS: eeb17e70 TRAP: 0901 Not tainted (3.12.0-rc3) MSR: 00029000 <CE,EE,ME> CR: 44228428 XER: 20000000 GPR00: c0367ad4 eeb17f20 e8bf8000 ee01f4b4 00000008 ffffffff ffffffff 00000000 GPR08: 000000c0 00000008 000000ff ffffffc0 000193fe NIP [c0255b6c] find_next_bit+0xb8/0xc4 LR [c0367ae8] gfar_poll+0xc8/0x1d8 Call Trace: [eeb17f20] [c0367ad4] gfar_poll+0xb4/0x1d8 (unreliable) [eeb17f70] [c0422100] net_rx_action+0xa4/0x158 [eeb17fa0] [c003ec6c] __do_softirq+0xcc/0x17c [eeb17ff0] [c000c28c] call_do_softirq+0x24/0x3c [ee647cc0] [c0004660] do_softirq+0x6c/0x94 [ee647ce0] [c003eb9c] local_bh_enable+0x9c/0xa0 [ee647cf0] [c0454fe8] tcp_prequeue_process+0xa4/0xdc [ee647d10] [c0457e44] tcp_recvmsg+0x498/0x96c [ee647d80] [c047b630] inet_recvmsg+0x40/0x64 [ee647da0] [c040ca8c] sock_recvmsg+0x90/0xc0 [ee647e30] [c040edb8] SyS_recvfrom+0x98/0xfc To prevent this, the outer while() loop has been removed allowing gfar_poll() to return faster even if there's still budget left. Also, there's no need to recompute the budget per Rx queue anymore. Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-14 22:05:09 +08:00
rx_queue = priv->rx_queue[i];
work_done_per_q =
gfar_clean_rx_ring(rx_queue, budget_per_q);
work_done += work_done_per_q;
/* finished processing this queue */
if (work_done_per_q < budget_per_q) {
/* clear active queue hw indication */
gfar_write(&regs->rstat,
RSTAT_CLEAR_RXF0 >> i);
num_act_queues--;
if (!num_act_queues)
break;
}
}
gianfar: Separate out the Tx interrupt handling (Tx NAPI) There are some concurrency issues on devices w/ 2 CPUs related to the handling of Rx and Tx interrupts. eTSEC has separate interrupt lines for Rx and Tx but a single imask register to mask these interrupts and a single NAPI instance to handle both Rx and Tx work. As a result, the Rx and Tx ISRs are identical, both are invoking gfar_schedule_cleanup(), however both handlers can be entered at the same time when the Rx and Tx interrupts are taken by different CPUs. In this case spurrious interrupts (SPU) show up (in /proc/interrupts) indicating a concurrency issue. Also, Tx overruns followed by Tx timeout have been observed under heavy Tx traffic load. To address these issues, the schedule cleanup ISR part has been changed to handle the Rx and Tx interrupts independently. The patch adds a separate NAPI poll routine for Tx cleanup to be triggerred independently by the Tx confirmation interrupts only. Existing poll functions are modified to handle only the Rx path processing. The Tx poll routine does not need a budget, since Tx processing doesn't consume NAPI budget, and hence it is registered with minimum NAPI weight. NAPI scheduling does not require locking since there are different NAPI instances between the Rx and Tx confirmation paths now. So, the patch fixes the occurence of spurrious Rx/Tx interrupts. Tx overruns also occur less frequently now. Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-07 20:42:45 +08:00
if (!num_act_queues) {
u32 imask;
napi_complete_done(napi, work_done);
gianfar: Simplify MQ polling to avoid soft lockup Under certain low traffic conditions, the single core devices with multiple Rx/Tx queues (MQ mode) may reach soft lockup due to gfar_poll not returning in proper time. The following exception was obtained using iperf on a 100Mbit half-duplex link, for a p1010 single core device: BUG: soft lockup - CPU#0 stuck for 23s! [iperf:2847] Modules linked in: CPU: 0 PID: 2847 Comm: iperf Not tainted 3.12.0-rc3 #16 task: e8bf8000 ti: eeb16000 task.ti: ee646000 NIP: c0255b6c LR: c0367ae8 CTR: c0461c18 REGS: eeb17e70 TRAP: 0901 Not tainted (3.12.0-rc3) MSR: 00029000 <CE,EE,ME> CR: 44228428 XER: 20000000 GPR00: c0367ad4 eeb17f20 e8bf8000 ee01f4b4 00000008 ffffffff ffffffff 00000000 GPR08: 000000c0 00000008 000000ff ffffffc0 000193fe NIP [c0255b6c] find_next_bit+0xb8/0xc4 LR [c0367ae8] gfar_poll+0xc8/0x1d8 Call Trace: [eeb17f20] [c0367ad4] gfar_poll+0xb4/0x1d8 (unreliable) [eeb17f70] [c0422100] net_rx_action+0xa4/0x158 [eeb17fa0] [c003ec6c] __do_softirq+0xcc/0x17c [eeb17ff0] [c000c28c] call_do_softirq+0x24/0x3c [ee647cc0] [c0004660] do_softirq+0x6c/0x94 [ee647ce0] [c003eb9c] local_bh_enable+0x9c/0xa0 [ee647cf0] [c0454fe8] tcp_prequeue_process+0xa4/0xdc [ee647d10] [c0457e44] tcp_recvmsg+0x498/0x96c [ee647d80] [c047b630] inet_recvmsg+0x40/0x64 [ee647da0] [c040ca8c] sock_recvmsg+0x90/0xc0 [ee647e30] [c040edb8] SyS_recvfrom+0x98/0xfc To prevent this, the outer while() loop has been removed allowing gfar_poll() to return faster even if there's still budget left. Also, there's no need to recompute the budget per Rx queue anymore. Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-10-14 22:05:09 +08:00
/* Clear the halt bit in RSTAT */
gfar_write(&regs->rstat, gfargrp->rstat);
gianfar: Separate out the Tx interrupt handling (Tx NAPI) There are some concurrency issues on devices w/ 2 CPUs related to the handling of Rx and Tx interrupts. eTSEC has separate interrupt lines for Rx and Tx but a single imask register to mask these interrupts and a single NAPI instance to handle both Rx and Tx work. As a result, the Rx and Tx ISRs are identical, both are invoking gfar_schedule_cleanup(), however both handlers can be entered at the same time when the Rx and Tx interrupts are taken by different CPUs. In this case spurrious interrupts (SPU) show up (in /proc/interrupts) indicating a concurrency issue. Also, Tx overruns followed by Tx timeout have been observed under heavy Tx traffic load. To address these issues, the schedule cleanup ISR part has been changed to handle the Rx and Tx interrupts independently. The patch adds a separate NAPI poll routine for Tx cleanup to be triggerred independently by the Tx confirmation interrupts only. Existing poll functions are modified to handle only the Rx path processing. The Tx poll routine does not need a budget, since Tx processing doesn't consume NAPI budget, and hence it is registered with minimum NAPI weight. NAPI scheduling does not require locking since there are different NAPI instances between the Rx and Tx confirmation paths now. So, the patch fixes the occurence of spurrious Rx/Tx interrupts. Tx overruns also occur less frequently now. Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-07 20:42:45 +08:00
spin_lock_irq(&gfargrp->grplock);
imask = gfar_read(&regs->imask);
imask |= IMASK_RX_DEFAULT;
gfar_write(&regs->imask, imask);
spin_unlock_irq(&gfargrp->grplock);
}
return work_done;
}
gianfar: Separate out the Tx interrupt handling (Tx NAPI) There are some concurrency issues on devices w/ 2 CPUs related to the handling of Rx and Tx interrupts. eTSEC has separate interrupt lines for Rx and Tx but a single imask register to mask these interrupts and a single NAPI instance to handle both Rx and Tx work. As a result, the Rx and Tx ISRs are identical, both are invoking gfar_schedule_cleanup(), however both handlers can be entered at the same time when the Rx and Tx interrupts are taken by different CPUs. In this case spurrious interrupts (SPU) show up (in /proc/interrupts) indicating a concurrency issue. Also, Tx overruns followed by Tx timeout have been observed under heavy Tx traffic load. To address these issues, the schedule cleanup ISR part has been changed to handle the Rx and Tx interrupts independently. The patch adds a separate NAPI poll routine for Tx cleanup to be triggerred independently by the Tx confirmation interrupts only. Existing poll functions are modified to handle only the Rx path processing. The Tx poll routine does not need a budget, since Tx processing doesn't consume NAPI budget, and hence it is registered with minimum NAPI weight. NAPI scheduling does not require locking since there are different NAPI instances between the Rx and Tx confirmation paths now. So, the patch fixes the occurence of spurrious Rx/Tx interrupts. Tx overruns also occur less frequently now. Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-07 20:42:45 +08:00
static int gfar_poll_tx(struct napi_struct *napi, int budget)
{
struct gfar_priv_grp *gfargrp =
container_of(napi, struct gfar_priv_grp, napi_tx);
struct gfar_private *priv = gfargrp->priv;
struct gfar __iomem *regs = gfargrp->regs;
struct gfar_priv_tx_q *tx_queue = NULL;
int has_tx_work = 0;
int i;
/* Clear IEVENT, so interrupts aren't called again
* because of the packets that have already arrived
*/
gfar_write(&regs->ievent, IEVENT_TX_MASK);
for_each_set_bit(i, &gfargrp->tx_bit_map, priv->num_tx_queues) {
tx_queue = priv->tx_queue[i];
/* run Tx cleanup to completion */
if (tx_queue->tx_skbuff[tx_queue->skb_dirtytx]) {
gfar_clean_tx_ring(tx_queue);
has_tx_work = 1;
}
}
if (!has_tx_work) {
u32 imask;
napi_complete(napi);
spin_lock_irq(&gfargrp->grplock);
imask = gfar_read(&regs->imask);
imask |= IMASK_TX_DEFAULT;
gfar_write(&regs->imask, imask);
spin_unlock_irq(&gfargrp->grplock);
}
return 0;
}
/* GFAR error interrupt handler */
static irqreturn_t gfar_error(int irq, void *grp_id)
{
struct gfar_priv_grp *gfargrp = grp_id;
struct gfar __iomem *regs = gfargrp->regs;
struct gfar_private *priv= gfargrp->priv;
struct net_device *dev = priv->ndev;
/* Save ievent for future reference */
u32 events = gfar_read(&regs->ievent);
/* Clear IEVENT */
gfar_write(&regs->ievent, events & IEVENT_ERR_MASK);
/* Magic Packet is not an error. */
if ((priv->device_flags & FSL_GIANFAR_DEV_HAS_MAGIC_PACKET) &&
(events & IEVENT_MAG))
events &= ~IEVENT_MAG;
/* Hmm... */
if (netif_msg_rx_err(priv) || netif_msg_tx_err(priv))
netdev_dbg(dev,
"error interrupt (ievent=0x%08x imask=0x%08x)\n",
events, gfar_read(&regs->imask));
/* Update the error counters */
if (events & IEVENT_TXE) {
dev->stats.tx_errors++;
if (events & IEVENT_LC)
dev->stats.tx_window_errors++;
if (events & IEVENT_CRL)
dev->stats.tx_aborted_errors++;
if (events & IEVENT_XFUN) {
netif_dbg(priv, tx_err, dev,
"TX FIFO underrun, packet dropped\n");
dev->stats.tx_dropped++;
atomic64_inc(&priv->extra_stats.tx_underrun);
schedule_work(&priv->reset_task);
}
netif_dbg(priv, tx_err, dev, "Transmit Error\n");
}
if (events & IEVENT_BSY) {
dev->stats.rx_over_errors++;
atomic64_inc(&priv->extra_stats.rx_bsy);
netif_dbg(priv, rx_err, dev, "busy error (rstat: %x)\n",
gfar_read(&regs->rstat));
}
if (events & IEVENT_BABR) {
dev->stats.rx_errors++;
atomic64_inc(&priv->extra_stats.rx_babr);
netif_dbg(priv, rx_err, dev, "babbling RX error\n");
}
if (events & IEVENT_EBERR) {
atomic64_inc(&priv->extra_stats.eberr);
netif_dbg(priv, rx_err, dev, "bus error\n");
}
if (events & IEVENT_RXC)
netif_dbg(priv, rx_status, dev, "control frame\n");
if (events & IEVENT_BABT) {
atomic64_inc(&priv->extra_stats.tx_babt);
netif_dbg(priv, tx_err, dev, "babbling TX error\n");
}
return IRQ_HANDLED;
}
/* The interrupt handler for devices with one interrupt */
static irqreturn_t gfar_interrupt(int irq, void *grp_id)
{
struct gfar_priv_grp *gfargrp = grp_id;
/* Save ievent for future reference */
u32 events = gfar_read(&gfargrp->regs->ievent);
/* Check for reception */
if (events & IEVENT_RX_MASK)
gfar_receive(irq, grp_id);
/* Check for transmit completion */
if (events & IEVENT_TX_MASK)
gfar_transmit(irq, grp_id);
/* Check for errors */
if (events & IEVENT_ERR_MASK)
gfar_error(irq, grp_id);
return IRQ_HANDLED;
}
gianfar: Separate out the Tx interrupt handling (Tx NAPI) There are some concurrency issues on devices w/ 2 CPUs related to the handling of Rx and Tx interrupts. eTSEC has separate interrupt lines for Rx and Tx but a single imask register to mask these interrupts and a single NAPI instance to handle both Rx and Tx work. As a result, the Rx and Tx ISRs are identical, both are invoking gfar_schedule_cleanup(), however both handlers can be entered at the same time when the Rx and Tx interrupts are taken by different CPUs. In this case spurrious interrupts (SPU) show up (in /proc/interrupts) indicating a concurrency issue. Also, Tx overruns followed by Tx timeout have been observed under heavy Tx traffic load. To address these issues, the schedule cleanup ISR part has been changed to handle the Rx and Tx interrupts independently. The patch adds a separate NAPI poll routine for Tx cleanup to be triggerred independently by the Tx confirmation interrupts only. Existing poll functions are modified to handle only the Rx path processing. The Tx poll routine does not need a budget, since Tx processing doesn't consume NAPI budget, and hence it is registered with minimum NAPI weight. NAPI scheduling does not require locking since there are different NAPI instances between the Rx and Tx confirmation paths now. So, the patch fixes the occurence of spurrious Rx/Tx interrupts. Tx overruns also occur less frequently now. Signed-off-by: Claudiu Manoil <claudiu.manoil@freescale.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-07 20:42:45 +08:00
#ifdef CONFIG_NET_POLL_CONTROLLER
/* Polling 'interrupt' - used by things like netconsole to send skbs
* without having to re-enable interrupts. It's not called while
* the interrupt routine is executing.
*/
static void gfar_netpoll(struct net_device *dev)
{
struct gfar_private *priv = netdev_priv(dev);
int i;
/* If the device has multiple interrupts, run tx/rx */
if (priv->device_flags & FSL_GIANFAR_DEV_HAS_MULTI_INTR) {
for (i = 0; i < priv->num_grps; i++) {
struct gfar_priv_grp *grp = &priv->gfargrp[i];
disable_irq(gfar_irq(grp, TX)->irq);
disable_irq(gfar_irq(grp, RX)->irq);
disable_irq(gfar_irq(grp, ER)->irq);
gfar_interrupt(gfar_irq(grp, TX)->irq, grp);
enable_irq(gfar_irq(grp, ER)->irq);
enable_irq(gfar_irq(grp, RX)->irq);
enable_irq(gfar_irq(grp, TX)->irq);
}
} else {
for (i = 0; i < priv->num_grps; i++) {
struct gfar_priv_grp *grp = &priv->gfargrp[i];
disable_irq(gfar_irq(grp, TX)->irq);
gfar_interrupt(gfar_irq(grp, TX)->irq, grp);
enable_irq(gfar_irq(grp, TX)->irq);
gianfar: Fix build with CONFIG_NET_POLL_CONTROLLER=y commit 46ceb60ca80fa07703bc6eb8f4651f900dff5a82 ("gianfar: Add Multiple group Support") introduced the following build error with CONFIG_NET_POLL_CONTROLLER=y: CC ggianfar.o ggianfar.c: In function 'gfar_netpoll': ggianfar.c:2653: error: invalid storage class for function 'gfar_interrupt' ggianfar.c:2652: warning: ISO C90 forbids mixed declarations and code ggianfar.c:2681: error: invalid storage class for function 'adjust_link' ggianfar.c:2764: error: invalid storage class for function 'gfar_set_multi' ggianfar.c:2855: error: invalid storage class for function 'gfar_clear_exact_match' ggianfar.c:2877: error: invalid storage class for function 'gfar_set_hash_for_addr' ggianfar.c:2898: error: invalid storage class for function 'gfar_set_mac_for_addr' ggianfar.c:2922: error: invalid storage class for function 'gfar_error' ggianfar.c:3020: warning: ISO C90 forbids mixed declarations and code ggianfar.c:3032: error: invalid storage class for function 'gfar_init' ggianfar.c:3037: error: invalid storage class for function 'gfar_exit' ggianfar.c:3041: error: initializer element is not constant ggianfar.c:3042: error: initializer element is not constant ggianfar.c:3042: warning: ISO C90 forbids mixed declarations and code ggianfar.c:3042: error: expected declaration or statement at end of input make[1]: *** [ggianfar.o] Error 1 This patch fixes the issue. Reported-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2009-12-09 18:52:19 +08:00
}
}
}
#endif
static void free_grp_irqs(struct gfar_priv_grp *grp)
{
free_irq(gfar_irq(grp, TX)->irq, grp);
free_irq(gfar_irq(grp, RX)->irq, grp);
free_irq(gfar_irq(grp, ER)->irq, grp);
}
static int register_grp_irqs(struct gfar_priv_grp *grp)
{
struct gfar_private *priv = grp->priv;
struct net_device *dev = priv->ndev;
int err;
/* If the device has multiple interrupts, register for
* them. Otherwise, only register for the one
*/
if (priv->device_flags & FSL_GIANFAR_DEV_HAS_MULTI_INTR) {
/* Install our interrupt handlers for Error,
* Transmit, and Receive
*/
err = request_irq(gfar_irq(grp, ER)->irq, gfar_error, 0,
gfar_irq(grp, ER)->name, grp);
if (err < 0) {
netif_err(priv, intr, dev, "Can't get IRQ %d\n",
gfar_irq(grp, ER)->irq);
goto err_irq_fail;
}
enable_irq_wake(gfar_irq(grp, ER)->irq);
err = request_irq(gfar_irq(grp, TX)->irq, gfar_transmit, 0,
gfar_irq(grp, TX)->name, grp);
if (err < 0) {
netif_err(priv, intr, dev, "Can't get IRQ %d\n",
gfar_irq(grp, TX)->irq);
goto tx_irq_fail;
}
err = request_irq(gfar_irq(grp, RX)->irq, gfar_receive, 0,
gfar_irq(grp, RX)->name, grp);
if (err < 0) {
netif_err(priv, intr, dev, "Can't get IRQ %d\n",
gfar_irq(grp, RX)->irq);
goto rx_irq_fail;
}
enable_irq_wake(gfar_irq(grp, RX)->irq);
} else {
err = request_irq(gfar_irq(grp, TX)->irq, gfar_interrupt, 0,
gfar_irq(grp, TX)->name, grp);
if (err < 0) {
netif_err(priv, intr, dev, "Can't get IRQ %d\n",
gfar_irq(grp, TX)->irq);
goto err_irq_fail;
}
enable_irq_wake(gfar_irq(grp, TX)->irq);
}
return 0;
rx_irq_fail:
free_irq(gfar_irq(grp, TX)->irq, grp);
tx_irq_fail:
free_irq(gfar_irq(grp, ER)->irq, grp);
err_irq_fail:
return err;
}
static void gfar_free_irq(struct gfar_private *priv)
{
int i;
/* Free the IRQs */
if (priv->device_flags & FSL_GIANFAR_DEV_HAS_MULTI_INTR) {
for (i = 0; i < priv->num_grps; i++)
free_grp_irqs(&priv->gfargrp[i]);
} else {
for (i = 0; i < priv->num_grps; i++)
free_irq(gfar_irq(&priv->gfargrp[i], TX)->irq,
&priv->gfargrp[i]);
}
}
static int gfar_request_irq(struct gfar_private *priv)
{
int err, i, j;
for (i = 0; i < priv->num_grps; i++) {
err = register_grp_irqs(&priv->gfargrp[i]);
if (err) {
for (j = 0; j < i; j++)
free_grp_irqs(&priv->gfargrp[j]);
return err;
}
}
return 0;
}
/* Called when something needs to use the ethernet device
* Returns 0 for success.
*/
static int gfar_enet_open(struct net_device *dev)
{
struct gfar_private *priv = netdev_priv(dev);
int err;
err = init_phy(dev);
if (err)
return err;
err = gfar_request_irq(priv);
if (err)
return err;
err = startup_gfar(dev);
if (err)
return err;
return err;
}
/* Stops the kernel queue, and halts the controller */
static int gfar_close(struct net_device *dev)
{
struct gfar_private *priv = netdev_priv(dev);
cancel_work_sync(&priv->reset_task);
stop_gfar(dev);
/* Disconnect from the PHY */
phy_disconnect(dev->phydev);
gfar_free_irq(priv);
return 0;
}
/* Clears each of the exact match registers to zero, so they
* don't interfere with normal reception
*/
static void gfar_clear_exact_match(struct net_device *dev)
{
int idx;
static const u8 zero_arr[ETH_ALEN] = {0, 0, 0, 0, 0, 0};
for (idx = 1; idx < GFAR_EM_NUM + 1; idx++)
gfar_set_mac_for_addr(dev, idx, zero_arr);
}
/* Update the hash table based on the current list of multicast
* addresses we subscribe to. Also, change the promiscuity of
* the device based on the flags (this function is called
* whenever dev->flags is changed
*/
static void gfar_set_multi(struct net_device *dev)
{
struct netdev_hw_addr *ha;
struct gfar_private *priv = netdev_priv(dev);
struct gfar __iomem *regs = priv->gfargrp[0].regs;
u32 tempval;
if (dev->flags & IFF_PROMISC) {
/* Set RCTRL to PROM */
tempval = gfar_read(&regs->rctrl);
tempval |= RCTRL_PROM;
gfar_write(&regs->rctrl, tempval);
} else {
/* Set RCTRL to not PROM */
tempval = gfar_read(&regs->rctrl);
tempval &= ~(RCTRL_PROM);
gfar_write(&regs->rctrl, tempval);
}
if (dev->flags & IFF_ALLMULTI) {
/* Set the hash to rx all multicast frames */
gfar_write(&regs->igaddr0, 0xffffffff);
gfar_write(&regs->igaddr1, 0xffffffff);
gfar_write(&regs->igaddr2, 0xffffffff);
gfar_write(&regs->igaddr3, 0xffffffff);
gfar_write(&regs->igaddr4, 0xffffffff);
gfar_write(&regs->igaddr5, 0xffffffff);
gfar_write(&regs->igaddr6, 0xffffffff);
gfar_write(&regs->igaddr7, 0xffffffff);
gfar_write(&regs->gaddr0, 0xffffffff);
gfar_write(&regs->gaddr1, 0xffffffff);
gfar_write(&regs->gaddr2, 0xffffffff);
gfar_write(&regs->gaddr3, 0xffffffff);
gfar_write(&regs->gaddr4, 0xffffffff);
gfar_write(&regs->gaddr5, 0xffffffff);
gfar_write(&regs->gaddr6, 0xffffffff);
gfar_write(&regs->gaddr7, 0xffffffff);
} else {
int em_num;
int idx;
/* zero out the hash */
gfar_write(&regs->igaddr0, 0x0);
gfar_write(&regs->igaddr1, 0x0);
gfar_write(&regs->igaddr2, 0x0);
gfar_write(&regs->igaddr3, 0x0);
gfar_write(&regs->igaddr4, 0x0);
gfar_write(&regs->igaddr5, 0x0);
gfar_write(&regs->igaddr6, 0x0);
gfar_write(&regs->igaddr7, 0x0);
gfar_write(&regs->gaddr0, 0x0);
gfar_write(&regs->gaddr1, 0x0);
gfar_write(&regs->gaddr2, 0x0);
gfar_write(&regs->gaddr3, 0x0);
gfar_write(&regs->gaddr4, 0x0);
gfar_write(&regs->gaddr5, 0x0);
gfar_write(&regs->gaddr6, 0x0);
gfar_write(&regs->gaddr7, 0x0);
/* If we have extended hash tables, we need to
* clear the exact match registers to prepare for
* setting them
*/
if (priv->extended_hash) {
em_num = GFAR_EM_NUM + 1;
gfar_clear_exact_match(dev);
idx = 1;
} else {
idx = 0;
em_num = 0;
}
if (netdev_mc_empty(dev))
return;
/* Parse the list, and set the appropriate bits */
netdev_for_each_mc_addr(ha, dev) {
if (idx < em_num) {
gfar_set_mac_for_addr(dev, idx, ha->addr);
idx++;
} else
gfar_set_hash_for_addr(dev, ha->addr);
}
}
}
void gfar_mac_reset(struct gfar_private *priv)
{
struct gfar __iomem *regs = priv->gfargrp[0].regs;
u32 tempval;
/* Reset MAC layer */
gfar_write(&regs->maccfg1, MACCFG1_SOFT_RESET);
/* We need to delay at least 3 TX clocks */
udelay(3);
/* the soft reset bit is not self-resetting, so we need to
* clear it before resuming normal operation
*/
gfar_write(&regs->maccfg1, 0);
udelay(3);
gfar_rx_offload_en(priv);
/* Initialize the max receive frame/buffer lengths */
gfar_write(&regs->maxfrm, GFAR_JUMBO_FRAME_SIZE);
gfar_write(&regs->mrblr, GFAR_RXB_SIZE);
/* Initialize the Minimum Frame Length Register */
gfar_write(&regs->minflr, MINFLR_INIT_SETTINGS);
/* Initialize MACCFG2. */
tempval = MACCFG2_INIT_SETTINGS;
/* eTSEC74 erratum: Rx frames of length MAXFRM or MAXFRM-1
* are marked as truncated. Avoid this by MACCFG2[Huge Frame]=1,
* and by checking RxBD[LG] and discarding larger than MAXFRM.
*/
if (gfar_has_errata(priv, GFAR_ERRATA_74))
tempval |= MACCFG2_HUGEFRAME | MACCFG2_LENGTHCHECK;
gfar_write(&regs->maccfg2, tempval);
/* Clear mac addr hash registers */
gfar_write(&regs->igaddr0, 0);
gfar_write(&regs->igaddr1, 0);
gfar_write(&regs->igaddr2, 0);
gfar_write(&regs->igaddr3, 0);
gfar_write(&regs->igaddr4, 0);
gfar_write(&regs->igaddr5, 0);
gfar_write(&regs->igaddr6, 0);
gfar_write(&regs->igaddr7, 0);
gfar_write(&regs->gaddr0, 0);
gfar_write(&regs->gaddr1, 0);
gfar_write(&regs->gaddr2, 0);
gfar_write(&regs->gaddr3, 0);
gfar_write(&regs->gaddr4, 0);
gfar_write(&regs->gaddr5, 0);
gfar_write(&regs->gaddr6, 0);
gfar_write(&regs->gaddr7, 0);
if (priv->extended_hash)
gfar_clear_exact_match(priv->ndev);
gfar_mac_rx_config(priv);
gfar_mac_tx_config(priv);
gfar_set_mac_address(priv->ndev);
gfar_set_multi(priv->ndev);
/* clear ievent and imask before configuring coalescing */
gfar_ints_disable(priv);
/* Configure the coalescing support */
gfar_configure_coalescing_all(priv);
}
static void gfar_hw_init(struct gfar_private *priv)
{
struct gfar __iomem *regs = priv->gfargrp[0].regs;
u32 attrs;
/* Stop the DMA engine now, in case it was running before
* (The firmware could have used it, and left it running).
*/
gfar_halt(priv);
gfar_mac_reset(priv);
/* Zero out the rmon mib registers if it has them */
if (priv->device_flags & FSL_GIANFAR_DEV_HAS_RMON) {
memset_io(&(regs->rmon), 0, sizeof(struct rmon_mib));
/* Mask off the CAM interrupts */
gfar_write(&regs->rmon.cam1, 0xffffffff);
gfar_write(&regs->rmon.cam2, 0xffffffff);
}
/* Initialize ECNTRL */
gfar_write(&regs->ecntrl, ECNTRL_INIT_SETTINGS);
/* Set the extraction length and index */
attrs = ATTRELI_EL(priv->rx_stash_size) |
ATTRELI_EI(priv->rx_stash_index);
gfar_write(&regs->attreli, attrs);
/* Start with defaults, and add stashing
* depending on driver parameters
*/
attrs = ATTR_INIT_SETTINGS;
if (priv->bd_stash_en)
attrs |= ATTR_BDSTASH;
if (priv->rx_stash_size != 0)
attrs |= ATTR_BUFSTASH;
gfar_write(&regs->attr, attrs);
/* FIFO configs */
gfar_write(&regs->fifo_tx_thr, DEFAULT_FIFO_TX_THR);
gfar_write(&regs->fifo_tx_starve, DEFAULT_FIFO_TX_STARVE);
gfar_write(&regs->fifo_tx_starve_shutoff, DEFAULT_FIFO_TX_STARVE_OFF);
/* Program the interrupt steering regs, only for MG devices */
if (priv->num_grps > 1)
gfar_write_isrg(priv);
}
static const struct net_device_ops gfar_netdev_ops = {
.ndo_open = gfar_enet_open,
.ndo_start_xmit = gfar_start_xmit,
.ndo_stop = gfar_close,
.ndo_change_mtu = gfar_change_mtu,
.ndo_set_features = gfar_set_features,
.ndo_set_rx_mode = gfar_set_multi,
.ndo_tx_timeout = gfar_timeout,
.ndo_do_ioctl = gfar_ioctl,
.ndo_get_stats = gfar_get_stats,
.ndo_change_carrier = fixed_phy_change_carrier,
.ndo_set_mac_address = gfar_set_mac_addr,
.ndo_validate_addr = eth_validate_addr,
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = gfar_netpoll,
#endif
};
/* Set up the ethernet device structure, private data,
* and anything else we need before we start
*/
static int gfar_probe(struct platform_device *ofdev)
{
struct device_node *np = ofdev->dev.of_node;
struct net_device *dev = NULL;
struct gfar_private *priv = NULL;
int err = 0, i;
err = gfar_of_init(ofdev, &dev);
if (err)
return err;
priv = netdev_priv(dev);
priv->ndev = dev;
priv->ofdev = ofdev;
priv->dev = &ofdev->dev;
SET_NETDEV_DEV(dev, &ofdev->dev);
INIT_WORK(&priv->reset_task, gfar_reset_task);
platform_set_drvdata(ofdev, priv);
gfar_detect_errata(priv);
/* Set the dev->base_addr to the gfar reg region */
dev->base_addr = (unsigned long) priv->gfargrp[0].regs;
/* Fill in the dev structure */
dev->watchdog_timeo = TX_TIMEOUT;
/* MTU range: 50 - 9586 */
dev->mtu = 1500;
dev->min_mtu = 50;
dev->max_mtu = GFAR_JUMBO_FRAME_SIZE - ETH_HLEN;
dev->netdev_ops = &gfar_netdev_ops;
dev->ethtool_ops = &gfar_ethtool_ops;
/* Register for napi ...We are registering NAPI for each grp */
for (i = 0; i < priv->num_grps; i++) {
if (priv->poll_mode == GFAR_SQ_POLLING) {
netif_napi_add(dev, &priv->gfargrp[i].napi_rx,
gfar_poll_rx_sq, GFAR_DEV_WEIGHT);
netif_tx_napi_add(dev, &priv->gfargrp[i].napi_tx,
gfar_poll_tx_sq, 2);
} else {
netif_napi_add(dev, &priv->gfargrp[i].napi_rx,
gfar_poll_rx, GFAR_DEV_WEIGHT);
netif_tx_napi_add(dev, &priv->gfargrp[i].napi_tx,
gfar_poll_tx, 2);
}
}
if (priv->device_flags & FSL_GIANFAR_DEV_HAS_CSUM) {
dev->hw_features = NETIF_F_IP_CSUM | NETIF_F_SG |
NETIF_F_RXCSUM;
dev->features |= NETIF_F_IP_CSUM | NETIF_F_SG |
NETIF_F_RXCSUM | NETIF_F_HIGHDMA;
}
if (priv->device_flags & FSL_GIANFAR_DEV_HAS_VLAN) {
dev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX |
NETIF_F_HW_VLAN_CTAG_RX;
dev->features |= NETIF_F_HW_VLAN_CTAG_RX;
}
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
gfar_init_addr_hash_table(priv);
/* Insert receive time stamps into padding alignment bytes, and
* plus 2 bytes padding to ensure the cpu alignment.
*/
if (priv->device_flags & FSL_GIANFAR_DEV_HAS_TIMER)
priv->padding = 8 + DEFAULT_PADDING;
if (dev->features & NETIF_F_IP_CSUM ||
priv->device_flags & FSL_GIANFAR_DEV_HAS_TIMER)
gianfar: Account for Tx PTP timestamp in the skb headroom When PTP timestamping is enabled on Tx, the controller inserts the Tx timestamp at the beginning of the frame buffer, between SFD and the L2 frame header. This means that the skb provided by the stack is required to have enough headroom otherwise a new skb needs to be created by the driver to accommodate the timestamp inserted by h/w. Up until now the driver was relying on the second option, using skb_realloc_headroom() to create a new skb to accommodate PTP frames. Turns out that this method is not reliable, as reallocation of skbs for PTP frames along with the required overhead (skb_set_owner_w, consume_skb) is causing random crashes in subsequent skb_*() calls, when multiple concurrent TCP streams are run at the same time on the same device (as seen in James' report). Note that these crashes don't occur with a single TCP stream, nor with multiple concurrent UDP streams, but only when multiple TCP streams are run concurrently with the PTP packet flow (doing skb reallocation). This patch enforces the first method, by requesting enough headroom from the stack to accommodate PTP frames, and so avoiding skb_realloc_headroom() & co, and the crashes no longer occur. There's no reason not to set needed_headroom to a large enough value to accommodate PTP frames, so in this regard this patch is a fix. Reported-by: James Jurack <james.jurack@ametek.com> Fixes: bee9e58c9e98 ("gianfar:don't add FCB length to hard_header_len") Signed-off-by: Claudiu Manoil <claudiu.manoil@nxp.com> Link: https://lore.kernel.org/r/20201020173605.1173-1-claudiu.manoil@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-10-21 01:36:05 +08:00
dev->needed_headroom = GMAC_FCB_LEN + GMAC_TXPAL_LEN;
/* Initializing some of the rx/tx queue level parameters */
for (i = 0; i < priv->num_tx_queues; i++) {
priv->tx_queue[i]->tx_ring_size = DEFAULT_TX_RING_SIZE;
priv->tx_queue[i]->num_txbdfree = DEFAULT_TX_RING_SIZE;
priv->tx_queue[i]->txcoalescing = DEFAULT_TX_COALESCE;
priv->tx_queue[i]->txic = DEFAULT_TXIC;
}
for (i = 0; i < priv->num_rx_queues; i++) {
priv->rx_queue[i]->rx_ring_size = DEFAULT_RX_RING_SIZE;
priv->rx_queue[i]->rxcoalescing = DEFAULT_RX_COALESCE;
priv->rx_queue[i]->rxic = DEFAULT_RXIC;
}
/* Always enable rx filer if available */
priv->rx_filer_enable =
(priv->device_flags & FSL_GIANFAR_DEV_HAS_RX_FILER) ? 1 : 0;
/* Enable most messages by default */
priv->msg_enable = (NETIF_MSG_IFUP << 1 ) - 1;
/* use pritority h/w tx queue scheduling for single queue devices */
if (priv->num_tx_queues == 1)
priv->prio_sched_en = 1;
set_bit(GFAR_DOWN, &priv->state);
gfar_hw_init(priv);
/* Carrier starts down, phylib will bring it up */
netif_carrier_off(dev);
err = register_netdev(dev);
if (err) {
pr_err("%s: Cannot register net device, aborting\n", dev->name);
goto register_fail;
}
if (priv->device_flags & FSL_GIANFAR_DEV_HAS_MAGIC_PACKET)
priv->wol_supported |= GFAR_WOL_MAGIC;
if ((priv->device_flags & FSL_GIANFAR_DEV_HAS_WAKE_ON_FILER) &&
priv->rx_filer_enable)
priv->wol_supported |= GFAR_WOL_FILER_UCAST;
device_set_wakeup_capable(&ofdev->dev, priv->wol_supported);
/* fill out IRQ number and name fields */
for (i = 0; i < priv->num_grps; i++) {
struct gfar_priv_grp *grp = &priv->gfargrp[i];
if (priv->device_flags & FSL_GIANFAR_DEV_HAS_MULTI_INTR) {
sprintf(gfar_irq(grp, TX)->name, "%s%s%c%s",
dev->name, "_g", '0' + i, "_tx");
sprintf(gfar_irq(grp, RX)->name, "%s%s%c%s",
dev->name, "_g", '0' + i, "_rx");
sprintf(gfar_irq(grp, ER)->name, "%s%s%c%s",
dev->name, "_g", '0' + i, "_er");
} else
strcpy(gfar_irq(grp, TX)->name, dev->name);
}
/* Initialize the filer table */
gfar_init_filer_table(priv);
/* Print out the device info */
netdev_info(dev, "mac: %pM\n", dev->dev_addr);
/* Even more device info helps when determining which kernel
* provided which set of benchmarks.
*/
netdev_info(dev, "Running with NAPI enabled\n");
for (i = 0; i < priv->num_rx_queues; i++)
netdev_info(dev, "RX BD ring size for Q[%d]: %d\n",
i, priv->rx_queue[i]->rx_ring_size);
for (i = 0; i < priv->num_tx_queues; i++)
netdev_info(dev, "TX BD ring size for Q[%d]: %d\n",
i, priv->tx_queue[i]->tx_ring_size);
return 0;
register_fail:
if (of_phy_is_fixed_link(np))
of_phy_deregister_fixed_link(np);
unmap_group_regs(priv);
gfar_free_rx_queues(priv);
gfar_free_tx_queues(priv);
of_node_put(priv->phy_node);
of_node_put(priv->tbi_node);
free_gfar_dev(priv);
return err;
}
static int gfar_remove(struct platform_device *ofdev)
{
struct gfar_private *priv = platform_get_drvdata(ofdev);
struct device_node *np = ofdev->dev.of_node;
of_node_put(priv->phy_node);
of_node_put(priv->tbi_node);
unregister_netdev(priv->ndev);
if (of_phy_is_fixed_link(np))
of_phy_deregister_fixed_link(np);
unmap_group_regs(priv);
gfar_free_rx_queues(priv);
gfar_free_tx_queues(priv);
free_gfar_dev(priv);
return 0;
}
#ifdef CONFIG_PM
static void __gfar_filer_disable(struct gfar_private *priv)
{
struct gfar __iomem *regs = priv->gfargrp[0].regs;
u32 temp;
temp = gfar_read(&regs->rctrl);
temp &= ~(RCTRL_FILREN | RCTRL_PRSDEP_INIT);
gfar_write(&regs->rctrl, temp);
}
static void __gfar_filer_enable(struct gfar_private *priv)
{
struct gfar __iomem *regs = priv->gfargrp[0].regs;
u32 temp;
temp = gfar_read(&regs->rctrl);
temp |= RCTRL_FILREN | RCTRL_PRSDEP_INIT;
gfar_write(&regs->rctrl, temp);
}
/* Filer rules implementing wol capabilities */
static void gfar_filer_config_wol(struct gfar_private *priv)
{
unsigned int i;
u32 rqfcr;
__gfar_filer_disable(priv);
/* clear the filer table, reject any packet by default */
rqfcr = RQFCR_RJE | RQFCR_CMP_MATCH;
for (i = 0; i <= MAX_FILER_IDX; i++)
gfar_write_filer(priv, i, rqfcr, 0);
i = 0;
if (priv->wol_opts & GFAR_WOL_FILER_UCAST) {
/* unicast packet, accept it */
struct net_device *ndev = priv->ndev;
/* get the default rx queue index */
u8 qindex = (u8)priv->gfargrp[0].rx_queue->qindex;
u32 dest_mac_addr = (ndev->dev_addr[0] << 16) |
(ndev->dev_addr[1] << 8) |
ndev->dev_addr[2];
rqfcr = (qindex << 10) | RQFCR_AND |
RQFCR_CMP_EXACT | RQFCR_PID_DAH;
gfar_write_filer(priv, i++, rqfcr, dest_mac_addr);
dest_mac_addr = (ndev->dev_addr[3] << 16) |
(ndev->dev_addr[4] << 8) |
ndev->dev_addr[5];
rqfcr = (qindex << 10) | RQFCR_GPI |
RQFCR_CMP_EXACT | RQFCR_PID_DAL;
gfar_write_filer(priv, i++, rqfcr, dest_mac_addr);
}
__gfar_filer_enable(priv);
}
static void gfar_filer_restore_table(struct gfar_private *priv)
{
u32 rqfcr, rqfpr;
unsigned int i;
__gfar_filer_disable(priv);
for (i = 0; i <= MAX_FILER_IDX; i++) {
rqfcr = priv->ftp_rqfcr[i];
rqfpr = priv->ftp_rqfpr[i];
gfar_write_filer(priv, i, rqfcr, rqfpr);
}
__gfar_filer_enable(priv);
}
/* gfar_start() for Rx only and with the FGPI filer interrupt enabled */
static void gfar_start_wol_filer(struct gfar_private *priv)
{
struct gfar __iomem *regs = priv->gfargrp[0].regs;
u32 tempval;
int i = 0;
/* Enable Rx hw queues */
gfar_write(&regs->rqueue, priv->rqueue);
/* Initialize DMACTRL to have WWR and WOP */
tempval = gfar_read(&regs->dmactrl);
tempval |= DMACTRL_INIT_SETTINGS;
gfar_write(&regs->dmactrl, tempval);
/* Make sure we aren't stopped */
tempval = gfar_read(&regs->dmactrl);
tempval &= ~DMACTRL_GRS;
gfar_write(&regs->dmactrl, tempval);
for (i = 0; i < priv->num_grps; i++) {
regs = priv->gfargrp[i].regs;
/* Clear RHLT, so that the DMA starts polling now */
gfar_write(&regs->rstat, priv->gfargrp[i].rstat);
/* enable the Filer General Purpose Interrupt */
gfar_write(&regs->imask, IMASK_FGPI);
}
/* Enable Rx DMA */
tempval = gfar_read(&regs->maccfg1);
tempval |= MACCFG1_RX_EN;
gfar_write(&regs->maccfg1, tempval);
}
static int gfar_suspend(struct device *dev)
{
struct gfar_private *priv = dev_get_drvdata(dev);
struct net_device *ndev = priv->ndev;
struct gfar __iomem *regs = priv->gfargrp[0].regs;
u32 tempval;
u16 wol = priv->wol_opts;
if (!netif_running(ndev))
return 0;
disable_napi(priv);
netif_tx_lock(ndev);
netif_device_detach(ndev);
netif_tx_unlock(ndev);
gfar_halt(priv);
if (wol & GFAR_WOL_MAGIC) {
/* Enable interrupt on Magic Packet */
gfar_write(&regs->imask, IMASK_MAG);
/* Enable Magic Packet mode */
tempval = gfar_read(&regs->maccfg2);
tempval |= MACCFG2_MPEN;
gfar_write(&regs->maccfg2, tempval);
/* re-enable the Rx block */
tempval = gfar_read(&regs->maccfg1);
tempval |= MACCFG1_RX_EN;
gfar_write(&regs->maccfg1, tempval);
} else if (wol & GFAR_WOL_FILER_UCAST) {
gfar_filer_config_wol(priv);
gfar_start_wol_filer(priv);
} else {
phy_stop(ndev->phydev);
}
return 0;
}
static int gfar_resume(struct device *dev)
{
struct gfar_private *priv = dev_get_drvdata(dev);
struct net_device *ndev = priv->ndev;
struct gfar __iomem *regs = priv->gfargrp[0].regs;
u32 tempval;
u16 wol = priv->wol_opts;
if (!netif_running(ndev))
return 0;
if (wol & GFAR_WOL_MAGIC) {
/* Disable Magic Packet mode */
tempval = gfar_read(&regs->maccfg2);
tempval &= ~MACCFG2_MPEN;
gfar_write(&regs->maccfg2, tempval);
} else if (wol & GFAR_WOL_FILER_UCAST) {
/* need to stop rx only, tx is already down */
gfar_halt(priv);
gfar_filer_restore_table(priv);
} else {
phy_start(ndev->phydev);
}
gfar_start(priv);
netif_device_attach(ndev);
enable_napi(priv);
return 0;
}
static int gfar_restore(struct device *dev)
{
struct gfar_private *priv = dev_get_drvdata(dev);
struct net_device *ndev = priv->ndev;
if (!netif_running(ndev)) {
netif_device_attach(ndev);
return 0;
}
gfar_init_bds(ndev);
gfar_mac_reset(priv);
gfar_init_tx_rx_base(priv);
gfar_start(priv);
priv->oldlink = 0;
priv->oldspeed = 0;
priv->oldduplex = -1;
if (ndev->phydev)
phy_start(ndev->phydev);
netif_device_attach(ndev);
enable_napi(priv);
return 0;
}
static const struct dev_pm_ops gfar_pm_ops = {
.suspend = gfar_suspend,
.resume = gfar_resume,
.freeze = gfar_suspend,
.thaw = gfar_resume,
.restore = gfar_restore,
};
#define GFAR_PM_OPS (&gfar_pm_ops)
#else
#define GFAR_PM_OPS NULL
#endif
static const struct of_device_id gfar_match[] =
{
{
.type = "network",
.compatible = "gianfar",
},
{
.compatible = "fsl,etsec2",
},
{},
};
MODULE_DEVICE_TABLE(of, gfar_match);
/* Structure for a device driver */
static struct platform_driver gfar_driver = {
.driver = {
.name = "fsl-gianfar",
.pm = GFAR_PM_OPS,
.of_match_table = gfar_match,
},
.probe = gfar_probe,
.remove = gfar_remove,
};
module_platform_driver(gfar_driver);