Tom,

This patch series extends RFS to use hardware RX filters where available.
Depending on the number of hardware RX queues and their IRQs' affinity, this
should reduce the need for IPIs or at least get packets delivered to the
right NUMA node.

I've implemented the driver side of this for our hardware, though I don't
know whether you have any of that to test on. I would be very interested to
know how much this can help in the sort of cases where you use RFS.

Ben.

Ben Hutchings (4):
  IRQ: IRQ groups for multiqueue devices
  net: RPS: Enable hardware acceleration
  sfc: Implement RFS acceleration
  sfc/RFS/irq_group debug output

 drivers/net/sfc/efx.c     |  49 +++++++++++---
 drivers/net/sfc/efx.h     |   9 +++
 drivers/net/sfc/filter.c  | 109 +++++++++++++++++++++++++++++
 include/linux/irq.h       |  52 ++++++++++++++
 include/linux/netdevice.h |  29 +++++++-
 kernel/irq/manage.c       | 170 +++++++++++++++++++++++++++++++++++++++++++++
 net/core/dev.c            |  88 ++++++++++++++++++++++--
 7 files changed, 488 insertions(+), 18 deletions(-)

--
1.7.2.1

--
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

--
Allow drivers for multiqueue hardware with flow filter tables to
accelerate RFS. The driver must:
1. Set net_device::rx_irq_group to an irq_group of the RX completion
IRQs (in queue order). This will provide a mapping from CPUs to the
queues for which completions are handled nearest to them.
2. Implement net_device_ops::ndo_rx_flow_steer. This operation adds
or replaces a filter steering the given flow to the given RX queue, if
possible.
3. Periodically remove filters for which rps_may_expire_flow() returns
true.
---
The heuristic in rps_may_expire_flow() is quite possibly bogus. I'm not
even sure whether expiry of flows should be triggered by the driver or
from the RPS/RFS core. Any better ideas on how to do this?
Ben.
include/linux/netdevice.h | 29 +++++++++++++--
net/core/dev.c | 88 +++++++++++++++++++++++++++++++++++++++++---
2 files changed, 108 insertions(+), 9 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f7f1302..897118f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -524,14 +524,16 @@ struct rps_map {
#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + (_num * sizeof(u16)))
/*
- * The rps_dev_flow structure contains the mapping of a flow to a CPU and the
- * tail pointer for that CPU's input queue at the time of last enqueue.
+ * The rps_dev_flow structure contains the mapping of a flow to a CPU, the
+ * tail pointer for that CPU's input queue at the time of last enqueue, and
+ * a hardware filter index.
*/
struct rps_dev_flow {
u16 cpu;
- u16 fill;
+ u16 filter;
unsigned int last_qtail;
};
+#define RPS_NO_FILTER 0xffff
/*
* The rps_dev_flow_table structure contains a table of flow mappings.
@@ -581,6 +583,9 @@ static inline void rps_reset_sock_flow(struct rps_sock_flow_table *table,
extern struct rps_sock_flow_table *rps_sock_flow_table;
+extern bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
+ u32 flow_id, u16 ...
---
This depends on today's patch series for net-next-2.6.
Ben.
drivers/net/sfc/efx.c | 49 ++++++++++++++++++----
drivers/net/sfc/efx.h | 9 ++++
drivers/net/sfc/filter.c | 100 ++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 149 insertions(+), 9 deletions(-)
diff --git a/drivers/net/sfc/efx.c b/drivers/net/sfc/efx.c
index 8a51c41..aea6283 100644
--- a/drivers/net/sfc/efx.c
+++ b/drivers/net/sfc/efx.c
@@ -127,6 +127,8 @@ static int napi_weight = 64;
* monitor. On Falcon-based NICs, this will:
* - Check the on-board hardware monitor;
* - Poll the link state and reconfigure the hardware as necessary.
+ * If RFS is enabled, this will scan part of the RX IP filter table and
+ * remove filters for inactive flows.
*/
unsigned int efx_monitor_interval = 1 * HZ;
@@ -1174,7 +1176,7 @@ static int efx_wanted_channels(void)
/* Probe the number and type of interrupts we are able to obtain, and
* the resulting numbers of channels and RX queues.
*/
-static void efx_probe_interrupts(struct efx_nic *efx)
+static int efx_probe_interrupts(struct efx_nic *efx)
{
int max_channels =
min_t(int, efx->type->phys_addr_channels, EFX_MAX_CHANNELS);
@@ -1216,6 +1218,17 @@ static void efx_probe_interrupts(struct efx_nic *efx)
efx->n_tx_channels = efx->n_channels;
efx->n_rx_channels = efx->n_channels;
}
+#ifdef CONFIG_RPS
+ efx->net_dev->rx_irq_group =
+ alloc_irq_group(efx->n_rx_channels, GFP_KERNEL);
+ if (!efx->net_dev->rx_irq_group) {
+ pci_disable_msix(efx->pci_dev);
+ return -ENOMEM;
+ }
+ for (i = 0; i < efx->n_rx_channels; i++)
+ irq_group_add(efx->net_dev->rx_irq_group,
+ xentries[i].vector);
+#endif
for (i = 0; i < n_channels; i++)
efx_get_channel(efx, i)->irq =
xentries[i].vector;
@@ -1249,6 +1262,8 @@ static void efx_probe_interrupts(struct efx_nic *efx)
efx->n_tx_channels = 1;
efx->legacy_irq = efx->pci_dev->irq;
}
+
+ return 0;
}
static void ...

Just some logging I found useful.
Ben.
---
drivers/net/sfc/filter.c | 11 ++++++++++-
kernel/irq/manage.c | 21 +++++++++++++++++++++
2 files changed, 31 insertions(+), 1 deletions(-)
diff --git a/drivers/net/sfc/filter.c b/drivers/net/sfc/filter.c
index 349b5d1..db7fa46 100644
--- a/drivers/net/sfc/filter.c
+++ b/drivers/net/sfc/filter.c
@@ -506,6 +506,11 @@ int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
rc = efx_filter_insert_filter(efx, &spec, true);
if (rc >= 0)
state->rps_flow_id[rc] = flow_id;
+ netif_info(efx, rx_status, efx->net_dev,
+ "steering %s %pI4:%u:%pI4:%u to queue %u [flow %u filter %d]\n",
+ (ip->protocol == IPPROTO_TCP) ? "TCP" : "UDP",
+ &ip->saddr, ntohs(ports[0]), &ip->daddr, ntohs(ports[1]),
+ rxq_index, flow_id, rc);
return rc;
}
@@ -529,8 +534,12 @@ void efx_filter_rfs_expire(struct efx_nic *efx)
table->spec[index].priority == EFX_FILTER_PRI_HINT &&
rps_may_expire_flow(efx->net_dev,
table->spec[index].dmaq_id,
- state->rps_flow_id[index], index))
+ state->rps_flow_id[index], index)) {
+ netif_info(efx, rx_status, efx->net_dev,
+ "expiring filter %d [flow %u]\n",
+ index, state->rps_flow_id[index]);
efx_filter_table_clear_entry(efx, table, index);
+ }
index = (index + 1) & mask;
}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 3f2b1a9..7199dde 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -129,6 +129,21 @@ static bool irq_group_copy_neigh(struct irq_group *group, int cpu,
return false;
}
+static void print_irq_group(const struct irq_group *group, const char *prefix)
+{
+ unsigned index;
+ int cpu;
+
+ pr_info("irq_group %p, %s:\n", group, prefix);
+
+ for_each_possible_cpu(cpu) {
+ index = group->closest[cpu].index;
+ pr_info("cpu %d -> index %u (IRQ %u; distance %u)\n",
+ cpu, index, group->irq[index]->irq,
+ group->closest[cpu].dist);
+ }
+}
+
/* Update the ...

Thanks Ben, this does look interesting. We'll try to take a look.

On Mon, Sep 20, 2010 at 12:01 PM, Ben Hutchings --
