[PATCH 6/7] sfc: Fix search for flush completion events

Previous thread: [BUG?] iproute2/skbedit bug? by Denys Fedoryschenko on Wednesday, March 4, 2009 - 12:46 pm. (6 messages)

Next thread: [patch 1/1] sfc: modify allocation error message by akpm on Wednesday, March 4, 2009 - 1:03 pm. (2 messages)
From: Ben Hutchings
Date: Wednesday, March 4, 2009 - 12:51 pm

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/sfc/ethtool.c |    9 ++++++++-
 1 files changed, 8 insertions(+), 1 deletions(-)

diff --git a/drivers/net/sfc/ethtool.c b/drivers/net/sfc/ethtool.c
index 7b5924c..589d132 100644
--- a/drivers/net/sfc/ethtool.c
+++ b/drivers/net/sfc/ethtool.c
@@ -529,7 +529,14 @@ static int efx_ethtool_nway_reset(struct net_device *net_dev)
 {
 	struct efx_nic *efx = netdev_priv(net_dev);
 
-	return mii_nway_restart(&efx->mii);
+	if (efx->phy_op->mmds & DEV_PRESENT_BIT(MDIO_MMD_AN)) {
+		mdio_clause45_set_flag(efx, efx->mii.phy_id, MDIO_MMD_AN,
+				       MDIO_MMDREG_CTRL1,
+				       __ffs(BMCR_ANRESTART), true);
+		return 0;
+	}
+
+	return -EOPNOTSUPP;
 }
 
 static u32 efx_ethtool_get_link(struct net_device *net_dev)

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

--

From: Ben Hutchings
Date: Wednesday, March 4, 2009 - 12:52 pm

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
This is necessary because netif_carrier_off() doesn't stop the queue
immediately.

Ben.

 drivers/net/sfc/tx.c |    3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/drivers/net/sfc/tx.c b/drivers/net/sfc/tx.c
index da3e9ff..9f97e44 100644
--- a/drivers/net/sfc/tx.c
+++ b/drivers/net/sfc/tx.c
@@ -376,6 +376,9 @@ int efx_hard_start_xmit(struct sk_buff *skb, struct net_device *net_dev)
 	struct efx_nic *efx = netdev_priv(net_dev);
 	struct efx_tx_queue *tx_queue;
 
+	if (unlikely(efx->port_inhibited))
+		return NETDEV_TX_BUSY;
+
 	if (likely(skb->ip_summed == CHECKSUM_PARTIAL))
 		tx_queue = &efx->tx_queue[EFX_TX_QUEUE_OFFLOAD_CSUM];
 	else
-- 
1.5.5

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

--

From: David Miller
Date: Wednesday, March 4, 2009 - 6:55 pm

From: Ben Hutchings <bhutchings@solarflare.com>

Applied.
--

From: Ben Hutchings
Date: Wednesday, March 4, 2009 - 12:52 pm

If MAC switching fails, stop the port properly.

If PHY reinitialisation fails, clear the port_initialized flag.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/sfc/efx.c |    9 ++++++---
 1 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/net/sfc/efx.c b/drivers/net/sfc/efx.c
index 45df110..8fa68d8 100644
--- a/drivers/net/sfc/efx.c
+++ b/drivers/net/sfc/efx.c
@@ -557,6 +557,8 @@ static void efx_link_status_changed(struct efx_nic *efx)
 
 }
 
+static void efx_fini_port(struct efx_nic *efx);
+
 /* This call reinitialises the MAC to pick up new PHY settings. The
  * caller must hold the mac_lock */
 void __efx_reconfigure_port(struct efx_nic *efx)
@@ -592,8 +594,8 @@ void __efx_reconfigure_port(struct efx_nic *efx)
 
 fail:
 	EFX_ERR(efx, "failed to reconfigure MAC\n");
-	efx->phy_op->fini(efx);
-	efx->port_initialized = false;
+	efx->port_enabled = false;
+	efx_fini_port(efx);
 }
 
 /* Reinitialise the MAC to pick up new PHY settings, even if the port is
@@ -1667,7 +1669,8 @@ int efx_reset_up(struct efx_nic *efx, enum reset_type method,
 			rc = efx->phy_op->init(efx);
 			if (rc)
 				ok = false;
-		} else
+		}
+		if (!ok)
 			efx->port_initialized = false;
 	}
 

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

--

From: David Miller
Date: Wednesday, March 4, 2009 - 6:55 pm

From: Ben Hutchings <bhutchings@solarflare.com>

Applied.
--

From: Ben Hutchings
Date: Wednesday, March 4, 2009 - 12:53 pm

i2c_del_adapter() leaves dangling pointers in the structure.  If we
retry the NIC probe and pass the structure to i2c_add_adapter() again
it will lead to an oops unless we clear it first.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/sfc/falcon.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/drivers/net/sfc/falcon.c b/drivers/net/sfc/falcon.c
index 9e2f0f0..efd121c 100644
--- a/drivers/net/sfc/falcon.c
+++ b/drivers/net/sfc/falcon.c
@@ -3114,8 +3114,10 @@ void falcon_remove_nic(struct efx_nic *efx)
 	struct falcon_nic_data *nic_data = efx->nic_data;
 	int rc;
 
+	/* Remove I2C adapter and clear it in preparation for a retry */
 	rc = i2c_del_adapter(&efx->i2c_adap);
 	BUG_ON(rc);
+	memset(&efx->i2c_adap, 0, sizeof(efx->i2c_adap));
 
 	falcon_remove_spi_devices(efx);
 	falcon_free_buffer(efx, &efx->irq_status);

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

--

From: David Miller
Date: Wednesday, March 4, 2009 - 6:55 pm

From: Ben Hutchings <bhutchings@solarflare.com>

Applied.
--

From: Ben Hutchings
Date: Wednesday, March 4, 2009 - 12:53 pm

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/sfc/tx.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/sfc/tx.c b/drivers/net/sfc/tx.c
index 9f97e44..b1e1907 100644
--- a/drivers/net/sfc/tx.c
+++ b/drivers/net/sfc/tx.c
@@ -400,7 +400,7 @@ void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index)
 	 * separates the update of read_count from the test of
 	 * stopped. */
 	smp_mb();
-	if (unlikely(tx_queue->stopped)) {
+	if (unlikely(tx_queue->stopped) && likely(efx->port_enabled)) {
 		fill_level = tx_queue->insert_count - tx_queue->read_count;
 		if (fill_level < EFX_NETDEV_TX_THRESHOLD(tx_queue)) {
 			EFX_BUG_ON_PARANOID(!efx_dev_registered(efx));

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

--

From: David Miller
Date: Wednesday, March 4, 2009 - 6:55 pm

From: Ben Hutchings <bhutchings@solarflare.com>

Applied.
--

From: Ben Hutchings
Date: Wednesday, March 4, 2009 - 1:01 pm

When flushing queues we disable normal interrupt and event handling and
poll event queue 0 looking for flush completions.  Unfortunately the
flush event polling loop fails to move past any other type of event.
This problem has not been observed in production hardware but appears to
be a possibility.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/sfc/falcon.c |   21 +++++++++------------
 1 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/drivers/net/sfc/falcon.c b/drivers/net/sfc/falcon.c
index efd121c..82c10f4 100644
--- a/drivers/net/sfc/falcon.c
+++ b/drivers/net/sfc/falcon.c
@@ -1187,31 +1187,29 @@ static void falcon_poll_flush_events(struct efx_nic *efx)
 	struct efx_channel *channel = &efx->channel[0];
 	struct efx_tx_queue *tx_queue;
 	struct efx_rx_queue *rx_queue;
-	unsigned int read_ptr, i;
+	unsigned int read_ptr = channel->eventq_read_ptr;
+	unsigned int end_ptr = (read_ptr - 1) & FALCON_EVQ_MASK;
 
-	read_ptr = channel->eventq_read_ptr;
-	for (i = 0; i < FALCON_EVQ_SIZE; ++i) {
+	do {
 		efx_qword_t *event = falcon_event(channel, read_ptr);
 		int ev_code, ev_sub_code, ev_queue;
 		bool ev_failed;
+
 		if (!falcon_event_present(event))
 			break;
 
 		ev_code = EFX_QWORD_FIELD(*event, EV_CODE);
-		if (ev_code != DRIVER_EV_DECODE)
-			continue;
-
 		ev_sub_code = EFX_QWORD_FIELD(*event, DRIVER_EV_SUB_CODE);
-		switch (ev_sub_code) {
-		case TX_DESCQ_FLS_DONE_EV_DECODE:
+		if (ev_code == DRIVER_EV_DECODE &&
+		    ev_sub_code == TX_DESCQ_FLS_DONE_EV_DECODE) {
 			ev_queue = EFX_QWORD_FIELD(*event,
 						   DRIVER_EV_TX_DESCQ_ID);
 			if (ev_queue < EFX_TX_QUEUE_COUNT) {
 				tx_queue = efx->tx_queue + ev_queue;
 				tx_queue->flushed = true;
 			}
-			break;
-		case RX_DESCQ_FLS_DONE_EV_DECODE:
+		} else if (ev_code == DRIVER_EV_DECODE &&
+			   ev_sub_code == RX_DESCQ_FLS_DONE_EV_DECODE) {
 			ev_queue = EFX_QWORD_FIELD(*event,
 						   DRIVER_EV_RX_DESCQ_ID);
 			ev_failed = ...
From: David Miller
Date: Wednesday, March 4, 2009 - 6:55 pm

From: Ben Hutchings <bhutchings@solarflare.com>

Applied.
--

From: Ben Hutchings
Date: Wednesday, March 4, 2009 - 1:01 pm

Make the error count a per-NIC variable.
Reset this the count after an hour if it has not reached the critical value.
Set the critical value back to 5.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 drivers/net/sfc/falcon.c |   23 +++++++++++++++++++----
 1 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/drivers/net/sfc/falcon.c b/drivers/net/sfc/falcon.c
index 82c10f4..2ae51fd 100644
--- a/drivers/net/sfc/falcon.c
+++ b/drivers/net/sfc/falcon.c
@@ -39,11 +39,16 @@
  * @next_buffer_table: First available buffer table id
  * @pci_dev2: The secondary PCI device if present
  * @i2c_data: Operations and state for I2C bit-bashing algorithm
+ * @int_error_count: Number of internal errors seen recently
+ * @int_error_expire: Time at which error count will be expired
  */
 struct falcon_nic_data {
 	unsigned next_buffer_table;
 	struct pci_dev *pci_dev2;
 	struct i2c_algo_bit_data i2c_data;
+
+	unsigned int_error_count;
+	unsigned long int_error_expire;
 };
 
 /**************************************************************************
@@ -119,8 +124,12 @@ MODULE_PARM_DESC(rx_xon_thresh_bytes, "RX fifo XON threshold");
 #define FALCON_EVQ_SIZE 4096
 #define FALCON_EVQ_MASK (FALCON_EVQ_SIZE - 1)
 
-/* Max number of internal errors. After this resets will not be performed */
-#define FALCON_MAX_INT_ERRORS 4
+/* If FALCON_MAX_INT_ERRORS internal errors occur within
+ * FALCON_INT_ERROR_EXPIRE seconds, we consider the NIC broken and
+ * disable it.
+ */
+#define FALCON_INT_ERROR_EXPIRE 3600
+#define FALCON_MAX_INT_ERRORS 5
 
 /* We poll for events every FLUSH_INTERVAL ms, and check FLUSH_POLL_COUNT times
  */
@@ -1374,7 +1383,6 @@ static irqreturn_t falcon_fatal_interrupt(struct efx_nic *efx)
 	efx_oword_t *int_ker = efx->irq_status.addr;
 	efx_oword_t fatal_intr;
 	int error, mem_perr;
-	static int n_int_errors;
 
 	falcon_read(efx, &fatal_intr, FATAL_INTR_REG_KER);
 	error = EFX_OWORD_FIELD(fatal_intr, INT_KER_ERROR);
@@ -1401,7 +1409,14 ...
From: David Miller
Date: Wednesday, March 4, 2009 - 6:55 pm

From: Ben Hutchings <bhutchings@solarflare.com>

Applied.
--

From: David Miller
Date: Wednesday, March 4, 2009 - 6:54 pm

From: Ben Hutchings <bhutchings@solarflare.com>

Applied.
--

Previous thread: [BUG?] iproute2/skbedit bug? by Denys Fedoryschenko on Wednesday, March 4, 2009 - 12:46 pm. (6 messages)

Next thread: [patch 1/1] sfc: modify allocation error message by akpm on Wednesday, March 4, 2009 - 1:03 pm. (2 messages)